In [1]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import pickle
from tqdm import tqdm

# Data Processing

In [2]:
###############################################################################
# Fix the random seeds so results can be compared exactly with the given ones
###############################################################################

seed = 0  # Do not change!
random.seed(seed)
np.random.seed(seed)

In [3]:
# Absolute directory that holds every training input file.
base_path = '/opt/ml/input/data/train'
data_path, genre_path, writer_path, director_path = (
    os.path.join(base_path, file_name)
    for file_name in ('train_ratings.csv', "genres.tsv", "writers.tsv", "directors.tsv")
)
director_path

'/opt/ml/input/data/train/directors.tsv'

In [4]:
# 평점 데이터
ratings_df = pd.read_csv(data_path)
print(f'ratings_df의 shape {ratings_df.shape}')
ratings_df.columns = ['user', 'item', 'time']
ratings_df=ratings_df.drop(columns=['time'])
ratings_df['rating']=1 # 상호작용이 있기 때문에 1로 넣는다
ratings_df.head(5)

ratings_df의 shape (5154471, 3)


Unnamed: 0,user,item,rating
0,11,4643,1
1,11,170,1
2,11,531,1
3,11,616,1
4,11,2140,1


In [5]:
def neg_sample(rating_df, num_negative):
    """Append randomly sampled negative (rating 0) rows for every user.

    For each user, `num_negative` distinct items the user has no row for are
    drawn with `np.random.choice` (global NumPy RNG), so seed it beforehand
    for reproducible output.

    Parameters
        rating_df: DataFrame with at least the columns "user", "item", "rating".
        num_negative: number of negative items to draw per user.

    Return
        New DataFrame: the rows of `rating_df` followed by the sampled
        negatives. The index is not reset, and the id columns keep the dtype
        of the input instead of being cast to float.

    Raises
        ValueError: (from `np.random.choice`) when a user has fewer than
        `num_negative` unseen items.
    """
    #-- Create negative instances
    print(f"[INFO] Create Nagetive instances")

    items = set(rating_df.loc[:, 'item'])

    # Collect per-user chunks and concatenate once at the end; stacking onto a
    # growing array inside the loop re-copies all earlier rows on every user.
    neg_user_chunks, neg_item_chunks = [], []
    for u, user_seen_list in rating_df.groupby('user')['item']:
        #-- Items this user has not interacted with
        unseen_items = list(items - set(user_seen_list))

        #-- Choose num_negative distinct unseen items
        neg_item_chunks.append(np.random.choice(unseen_items, num_negative, replace=False))
        neg_user_chunks.append(np.full(num_negative, u))

    if not neg_user_chunks:
        # No users at all: nothing to sample, hand back an independent copy
        return rating_df.copy()

    #-- Negative rows: ["user", "item", "rating"] with rating = 0
    neg_rating_df = pd.DataFrame({
        "user": np.concatenate(neg_user_chunks),
        "item": np.concatenate(neg_item_chunks),
        "rating": 0,
    })
    rating_df = pd.concat([rating_df, neg_rating_df], axis=0, sort=False)

    return rating_df

In [6]:
# Number of negative (rating 0) items sampled for every user
NUM_NEGATIVE=50
print(f'(Before) {ratings_df.shape}')
# NOTE: this rebinds ratings_df, so re-running the cell samples negatives again on top of the previous result
ratings_df=neg_sample(ratings_df,NUM_NEGATIVE) # original note: takes 5 minutes or less
print(f'(After) {ratings_df.shape}')
ratings_df.head()

(Before) (5154471, 3)
[INFO] Create Nagetive instances


KeyboardInterrupt: 

In [7]:
# Genre data
genre_df = pd.read_csv(genre_path,delimiter="\t")
genre_list=np.unique(genre_df.loc[:,'genre'].to_numpy())
print(genre_df.head(5))
print("장르 종류 : ", genre_list)

# genre name -> integer category code (position in the sorted unique genre list)
item2onegenre={genre: code for code, genre in enumerate(genre_list)}
print("장르->카테고리 숫자 : ",item2onegenre)

# Encode the whole column in one vectorized pass instead of one .loc write per row
genre_df['genre']=genre_df['genre'].map(item2onegenre)
genre_df.head()

   item     genre
0   318     Crime
1   318     Drama
2  2571    Action
3  2571    Sci-Fi
4  2571  Thriller
장르 종류 :  ['Action' 'Adventure' 'Animation' 'Children' 'Comedy' 'Crime'
 'Documentary' 'Drama' 'Fantasy' 'Film-Noir' 'Horror' 'Musical' 'Mystery'
 'Romance' 'Sci-Fi' 'Thriller' 'War' 'Western']
장르->카테고리 숫자 :  {'Action': 0, 'Adventure': 1, 'Animation': 2, 'Children': 3, 'Comedy': 4, 'Crime': 5, 'Documentary': 6, 'Drama': 7, 'Fantasy': 8, 'Film-Noir': 9, 'Horror': 10, 'Musical': 11, 'Mystery': 12, 'Romance': 13, 'Sci-Fi': 14, 'Thriller': 15, 'War': 16, 'Western': 17}


Unnamed: 0,item,genre
0,318,5
1,318,7
2,2571,0
3,2571,14
4,2571,15


In [8]:
# 장르 1개 데이터
item_list=genre_df.loc[:,'item'].to_numpy()
item_list=np.unique(item_list)
print('길이 : ',len(item_list),item_list[-10:])

one_genre_list=[]
for i in item_list:
    temp=genre_df[genre_df.loc[:,'item']==i]
    one_genre_list.append(temp.iloc[0,:].to_numpy().tolist())
one_genre_df=pd.DataFrame(data=one_genre_list,columns=['item','genre'])
one_genre_df.head(5)

길이 :  6807 [116823 117176 117533 117881 118696 118700 118900 118997 119141 119145]


Unnamed: 0,item,genre
0,1,1
1,2,1
2,3,4
3,4,4
4,5,4


In [9]:
# 작가 데이터
writer_df = pd.read_csv(writer_path,delimiter="\t")
print(f'writer_df의 shape {writer_df.shape}') #전체 
print(writer_df.head())
writer_list=writer_df.loc[:,'writer'].to_numpy()
writer_list=np.unique(writer_list)
print('unique한 writer_list길이 : ',len(writer_list),writer_list[-10:])

# 작가 1명만 데이터
item_list=writer_df.loc[:,'item'].to_numpy()
item_list=np.unique(item_list)
print('item_list 길이 : ',len(item_list),item_list[-10:])

one_writer_list=[]
for i in item_list:
    temp=writer_df[writer_df.loc[:,'item']==i]
    one_writer_list.append(temp.iloc[0,:].to_numpy().tolist()) # 첫번째 writer만 가져옴
one_writer_df=pd.DataFrame(data=one_writer_list,columns=['item','writer'])
one_writer_df.head(5)

writer_df의 shape (11306, 2)
   item     writer
0  1237  nm0000005
1  5147  nm0000005
2  7327  nm0000005
3  2068  nm0000005
4  7396  nm0000005
unique한 writer_list길이 :  2989 ['nm3890871' 'nm4160687' 'nm4611078' 'nm4950667' 'nm4951717' 'nm5022110'
 'nm5335213' 'nm5371819' 'nm5927607' 'nm5927608']
item_list 길이 :  5648 [115617 116161 116799 116823 117881 118696 118900 118997 119141 119145]


Unnamed: 0,item,writer
0,1,nm0004056
1,2,nm0378144
2,3,nm0425756
3,4,nm0060103
4,5,nm0329304


In [10]:
# writer id -> integer category code, e.g. {'nm0000005': 0, ...}
item2writer={writer: code for code, writer in enumerate(writer_list)}

# Replace the writer ids by their codes in a single pass (gives an integer column).
# A KeyError here means a value is not a known writer id, e.g. the cell was
# re-run on an already encoded column.
one_writer_df['writer']=[item2writer[writer] for writer in one_writer_df['writer']]

print(f'one_writer_df의 shape {one_writer_df.shape}')
one_writer_df.head()

one_writer_df의 shape (5648, 2)


Unnamed: 0,item,writer
0,1,296
1,2,1304
2,3,1435
3,4,466
4,5,1165


In [11]:
# 감독 데이터
director_df = pd.read_csv(director_path,delimiter="\t")
print(f'director_df의 shape {director_df.shape}')
print(director_df.head())

# 감독 1명만 데이터
item_list=director_df.loc[:,'item'].to_numpy()
item_list=np.unique(item_list)
print('item_list 길이 : ',len(item_list),item_list[-10:])

one_director_list=[]
for i in item_list:
    temp=director_df[director_df.loc[:,'item']==i]
    one_director_list.append(temp.iloc[0,:].to_numpy().tolist()) # 첫번째 writer만 가져옴
one_director_df=pd.DataFrame(data=one_director_list,columns=['item','director'])
one_director_df.head(5)

director_df의 shape (5905, 2)
   item   director
0  1237  nm0000005
1  5147  nm0000005
2  7327  nm0000005
3  2068  nm0000005
4  7396  nm0000005
item_list 길이 :  5503 [116797 116799 116823 117176 117881 118696 118900 118997 119141 119145]


Unnamed: 0,item,director
0,1,nm0005124
1,2,nm0002653
2,3,nm0222043
3,4,nm0001845
4,5,nm0796124


In [12]:
#{'nm0000005' : 0 , ...}
director_list=director_df.loc[:,'director'].to_numpy()
director_list=np.unique(director_list)
print('길이 : ',len(director_list),director_list[-10:])

item2director=dict()
for i in range(len(director_list)):
    item2director[director_list[i]]=i

for i in range(len(one_director_df)):
    one_director_df.loc[i,'director']=item2director[one_director_df.loc[i,'director']]

print(f'director_df의 shape {one_director_df.shape}')
# one_director_df.head()

길이 :  1340 ['nm2284484' 'nm2304017' 'nm2320658' 'nm2480587' 'nm2482088' 'nm2588606'
 'nm2648685' 'nm2676052' 'nm2879822' 'nm9054338']
director_df의 shape (5503, 2)


In [None]:
# Attach the per-item side features one after another (left joins keep every rating row)
df=ratings_df
for stage, feature_df in (
    ("rating+genre", one_genre_df),
    ("rating+genre+writer", one_writer_df),
    ("rating+genre+writer+director", one_director_df),
):
    df=pd.merge(df, feature_df, how='left', on='item')
    print(stage,df.shape)
df.head()

rating+genre (5154471, 4)
rating+genre+writer (5154471, 5)
rating+genre+writer+director (5154471, 6)


Unnamed: 0,user,item,rating,genre,writer,director
0,11,4643,1,0,574.0,47
1,11,170,1,0,,1111
2,11,531,1,3,279.0,280
3,11,616,1,2,400.0,1016
4,11,2140,1,1,174.0,84


## 결측치

In [14]:
# Count the missing values in every column of the merged frame
df.isnull().sum()

user             0
item             0
rating           0
genre            0
writer      584411
director    607148
dtype: int64

In [15]:
# Missing writer/director gets its own code: len(list), i.e. one past the last real code
df['writer']=df['writer'].fillna(len(writer_list)) # end value
df['director']=df['director'].fillna(len(director_list)) 
df.isnull().sum()

user        0
item        0
rating      0
genre       0
writer      0
director    0
dtype: int64

In [16]:
# The printed message (Korean) states that the pandas version must be > 1.0.5 for this to run
print(pd.__version__ ,' > 1.0.5 여야 실행됨')
# Cast every column to int64 (the merged columns that held NaN were filled in the previous cell)
df=df.astype(np.int64)
print(ratings_df.shape)
print('*'*20,'df.info()','*'*20)
print(df.info())
print('*'*20,'df.head())','*'*20)
print(df.head())

1.4.2  > 1.0.5 여야 실행됨
(6722471, 3)
******************** df.info() ********************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6722471 entries, 0 to 6722470
Data columns (total 6 columns):
 #   Column    Dtype
---  ------    -----
 0   user      int64
 1   item      int64
 2   rating    int64
 3   genre     int64
 4   writer    int64
 5   director  int64
dtypes: int64(6)
memory usage: 359.0 MB
None
******************** df.head()) ********************
   user  item  rating  genre  writer  director
0    11  4643       1      0     574        47
1    11   170       1      0    2989      1111
2    11   531       1      3     279       280
3    11   616       1      2     400      1016
4    11  2140       1      1     174        84


In [20]:
ffm_df=df.copy()

# col_dict: {column: {raw value: contiguous code}}. Kept so that the exact same
# encoding can be re-applied to other data (e.g. the inference frame); codes
# that only exist inside ffm_df cannot be reproduced later.
col_dict = dict()
# col_len: number of distinct categories per feature column (the model's field_dims)
col_len = list()

for col in ffm_df.columns:
    if col == "rating":
        continue
    # Convert once and reuse the result for the table, the size and the codes
    category = ffm_df[col].astype('category')
    categories = category.cat.categories
    col_dict[col] = {value: code for code, value in enumerate(categories)}
    col_len.append(len(categories))
    ffm_df[col] = category.cat.codes
print("[user수(len),item수,genre수,writer수,director수] : ",col_len)

[user수(len),item수,genre수,writer수,director수] :  [31360, 6807, 18, 2028, 1283]


In [21]:
print(ffm_df.shape)
# 80/20 random split; features are every column except 'rating', the target is 'rating'
train_X, test_X, train_y, test_y = train_test_split(
    ffm_df.loc[:, ffm_df.columns != 'rating'], ffm_df['rating'], test_size=0.2, random_state=seed)
print('학습 데이터 크기:', train_X.shape, train_y.shape)
print('테스트 데이터 크기:', test_X.shape, test_y.shape)

(6722471, 6)
학습 데이터 크기: (5377976, 5) (5377976,)
테스트 데이터 크기: (1344495, 5) (1344495,)


# Train

In [22]:
# PyTorch의 DataLoader에서 사용할 수 있도록 변환 
train_dataset_ffm = TensorDataset(torch.LongTensor(np.array(train_X)), torch.Tensor(np.array(train_y)))
test_dataset_ffm = TensorDataset(torch.LongTensor(np.array(test_X)), torch.Tensor(np.array(test_y)))
del df, train_X, train_y, test_X, test_y

In [13]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Train `model` for one pass over `dataloader`.

    Parameter
        dataloader: iterable of (X, y) batches exposing `.dataset` and `len()`
        model: nn.Module called as `model(X)`
        loss_fn: callable `loss_fn(pred, y)` returning a scalar tensor
        optimizer: optimizer over the parameters of `model`

    Return
        Mean of the per-batch losses (0 when the dataloader yields no batch).
    """
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    train_loss = 0

    # Move batches to wherever the model lives instead of relying on a
    # notebook-level `device` variable that may be undefined or out of date.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    # Make sure layers such as dropout/batch-norm are in training mode
    model.train()

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(model_device), y.to(model_device)
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)
        batch_loss = loss.item()
        train_loss += batch_loss

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Progress line every 100 batches
        if batch % 100 == 0:
            current = batch * len(X)
            print(f"loss: {batch_loss:>7f}  [{current:>5d}/{size:>5d}]")

    # max(..., 1) avoids a ZeroDivisionError on an empty dataloader
    train_loss /= max(num_batches, 1)

    return train_loss


def test_loop(dataloader, model, loss_fn, task):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, y_all, pred_all = 0, list(), list()

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item() / num_batches
            y_all.append(y)
            pred_all.append(pred)
    
    y_all = torch.cat(y_all)
    pred_all = torch.cat(pred_all)
    
    if task == 'reg':
        err = abs(pred_all - y_all).type(torch.float).mean().item()
        print(f"Test Error: \n  MAE: {(err):>8f} \n  Avg loss: {test_loss:>8f}")
    else:
        err = roc_auc_score(y_all, torch.sigmoid(pred_all)).item()
        print(f"Test Error: \n  AUC: {err:>8f} \n  Avg loss: {test_loss:>8f}")
    
    return err, test_loss

def train_and_test(train_dataloader, test_dataloader, model, loss_fn, optimizer, epochs, task):
    """Alternate one training pass and one evaluation pass for `epochs` epochs.

    Return
        (train_loss, test_err, test_loss): three lists with one entry per
        epoch, filled from the results of `train_loop` and `test_loop`.
    """
    train_loss, test_err, test_loss = [], [], []

    for epoch_idx in range(epochs):
        print(f"Epoch {epoch_idx+1}\n-------------------------------")
        epoch_train_loss = train_loop(train_dataloader, model, loss_fn, optimizer)
        train_loss.append(epoch_train_loss)

        # test_loop returns (metric, average loss)
        evaluation = test_loop(test_dataloader, model, loss_fn, task)
        test_err.append(evaluation[0])
        test_loss.append(evaluation[1])
        print("-------------------------------\n")
    print("Done!")

    return train_loss, test_err, test_loss

class FFMLayer(nn.Module):
    """Pairwise interaction term: every field owns one embedding table over the whole input space."""

    def __init__(self, field_dims, factor_dim):
        '''
        Parameter
            field_dims: List of field dimensions
                        Their sum is the dimension of the (sparse) input
                        Their count is the number of fields
            factor_dim: Factorization dimension
        '''
        super().__init__()
        self.num_fields = len(field_dims)
        self.input_dim = sum(field_dims)

        # One (input_dim x factor_dim) table per field
        tables = []
        for _ in range(self.num_fields):
            tables.append(nn.Embedding(self.input_dim, factor_dim))
        self.embedding = nn.ModuleList(tables)

    def forward(self, x):
        '''
        Parameter
            x: Long tensor of size "(batch_size, num_fields)"
               Each value is an index that already includes the dimensions of the preceding fields.
               for instance, [gender:male, age:20, is_student:True]
                             -> [1,0, 0,1,0,0,0,0, 0,1] in one-hot encoding
                             -> x = [0,3,9].
        Return
            y: Float tensor of size "(batch_size)"
        '''
        # field_views[f]: (batch_size, num_fields, factor_dim), all of x looked up in table f
        field_views = [table(x) for table in self.embedding]

        # For each pair f < g: the column-g entry from table f times the column-f entry from table g
        pair_terms = [
            field_views[f][:, g] * field_views[g][:, f]
            for f in range(self.num_fields)
            for g in range(f + 1, self.num_fields)
        ]
        stacked = torch.stack(pair_terms, dim=1)

        # Sum over the factor axis and over the field pairs
        return torch.sum(stacked, dim=(2,1))

class FieldAwareFM(nn.Module):
    """Field-aware factorization machine: a linear term over the multi-hot input plus the FFMLayer pairwise term."""

    def __init__(self, field_dims, factor_dim):
        '''
        Parameter
            field_dims: List of field dimensions
            factor_dim: Factorization dimension
        '''
        super().__init__()
        self.input_dim = sum(field_dims)
        # Start offset of every field inside the concatenated input space
        self.encoding_dims = np.concatenate([[0], np.cumsum(field_dims)[:-1]])
        self.linear = nn.Linear(self.input_dim, 1, bias=True)
        self.ffm = FFMLayer(field_dims, factor_dim)

    def _initialize_weights(self):
        '''
        Optional re-initialisation; it is not invoked by __init__, so the
        PyTorch default initialisation applies unless this is called explicitly.
        self.modules() also yields the nn.Embedding tables held by the FFMLayer,
        so they are covered by the nn.Embedding branch.
        '''
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Embedding):
                nn.init.xavier_uniform_(m.weight)

    def forward(self, x):
        '''
        Parameter
            x: Long tensor of size "(batch_size, num_fields)" holding per-field indices
                x_multihot: Multi-hot coding of x. size "(batch_size, self.input_dim)"

        Return
            y: Float tensor of size "(batch_size)"
        '''
        # Shift every per-field index by its field offset
        x = x + x.new_tensor(self.encoding_dims).unsqueeze(0)

        # Build the multi-hot matrix on the same device as the input so the
        # model does not depend on a notebook-level `device` variable.
        x_multihot = torch.zeros(x.size(0), self.input_dim, device=x.device).scatter_(1, x, 1.)
        # x_multihot.shape : (batch_size, input_dim)
        y = self.linear(x_multihot).squeeze(1) + self.ffm(x)
        return y

In [14]:
######## Hyperparameter ########

batch_size = 256           # mini-batch size of the train/test DataLoaders
data_shuffle = True        # passed as `shuffle` to both DataLoaders
task = 'reg'               # 'reg' -> MSELoss and MAE; any other value -> BCEWithLogitsLoss and AUC
factorization_dim = 8      # embedding size used by the FFM layer
epochs = 100
learning_rate = 0.001
gpu_idx = 0                # NOTE(review): not referenced anywhere else in the visible notebook

In [14]:
# torch.cuda.empty_cache() # if necessary
# Seed PyTorch (CPU and, when present, every GPU) with the same seed as random/NumPy
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

# Use the GPU when one is available, otherwise fall back to the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [25]:
# NOTE(review): the test loader is shuffled as well (same data_shuffle flag)
train_dataloader_ffm = DataLoader(train_dataset_ffm, batch_size=batch_size, shuffle=data_shuffle)
test_dataloader_ffm = DataLoader(test_dataset_ffm, batch_size=batch_size, shuffle=data_shuffle)

# One field per feature column; each entry is that column's number of categories
field_dims = col_len
model = FieldAwareFM(field_dims, factorization_dim).to(device)

# Squared error for the 'reg' task, logistic loss on raw scores otherwise
loss_fn = nn.MSELoss().to(device) if (task == 'reg') else nn.BCEWithLogitsLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.001, amsgrad=True)

In [None]:
# Run the full training schedule; each returned list has one entry per epoch
train_loss, test_err, test_loss = train_and_test(train_dataloader_ffm, test_dataloader_ffm, 
                                                 model, loss_fn, optimizer, epochs, task)

In [None]:
# Saves the whole model object (not only its state_dict); loading it back needs the same class definitions
torch.save(model, f"FFM_epoch{epochs}.pt")

## 학습 결과 시각화

In [None]:
fig, (loss_ax, err_ax) = plt.subplots(1,2, figsize=(12,4))

# test_loop reports MAE for the 'reg' task and AUC otherwise, so the axis label
# has to follow `task` instead of always claiming AUC.
err_name = 'MAE' if task == 'reg' else 'AUC'
epoch_axis = range(1,epochs+1)

loss_ax.plot(epoch_axis, train_loss, 'b', label='training loss')
loss_ax.plot(epoch_axis, test_loss, 'r', label='test loss')
err_ax.plot(epoch_axis, test_err, 'r', label='test err')

loss_ax.set_xticks(range(0, epochs+1, 10))
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Average Loss')
loss_ax.legend()

err_ax.set_xticks(range(0, epochs+1, 10))
err_ax.set_xlabel('Epoch')
err_ax.set_ylabel(err_name)
err_ax.legend()

plt.show()

# Inference

In [15]:
# Load the whole model object written by torch.save(model, ...) and switch it to eval mode
# NOTE(review): the model is not moved to `device` here (the .to(device) call is commented out)
model=torch.load(f'FFM_epoch{epochs}.pt')#.to(device)
model.eval()

FieldAwareFM(
  (linear): Linear(in_features=41496, out_features=1, bias=True)
  (ffm): FFMLayer(
    (embedding): ModuleList(
      (0): Embedding(41496, 8)
      (1): Embedding(41496, 8)
      (2): Embedding(41496, 8)
      (3): Embedding(41496, 8)
      (4): Embedding(41496, 8)
    )
  )
)

In [24]:
# Candidate (user, item) pairs; per the original note, already-watched movies and
# movies watched 1000 times or fewer have been removed from this file
inference_df=pd.read_csv("inf_df.csv")
print('inference_df.shape : ',inference_df.shape)
# Sorts in place by user, so the frame displayed afterwards is the sorted one
inference_df.sort_values(by="user",axis = 0,inplace = True)
print(inference_df.head(5))
print(inference_df.tail(5))

inference_df.shape :  (32761678, 2)
     user  item
0       0     1
611     0  1197
612     0  5298
613     0  1202
614     0  5302
           user  item
32761045  31359   605
32761046  31359   606
32761047  31359   608
32761035  31359   592
32761677  31359  4094


In [25]:
# NOTE: pickle.load can execute arbitrary code; only load pickle files from a trusted source
with open('user_dict.pickle', 'rb') as fr:
    user_cat_dict = pickle.load(fr)
    # load data
with open('item_dict.pickle', 'rb') as fr:
    item_cat_dict = pickle.load(fr)

# Inverted lookups: raw id -> category code
user_id_dict={v:k for k,v in user_cat_dict.items()} # {user_id: category, ..}
item_id_dict={v:k for k,v in item_cat_dict.items()} # {item_id: category,..}
print("users :", len(user_cat_dict)) #31360 # {category : user_id, ..}
print("items :", len(item_cat_dict)) #6807 # {category : item_id, ..}
print('user max category: ',inference_df.loc[:,'user'].max(),'||| user min category: ',inference_df.loc[:,'user'].min())
print('item max category: ',inference_df.loc[:,'item'].max(),'||| item min category: ',inference_df.loc[:,'item'].min()) # Why 6770? -> items watched 1000 times or fewer may have been removed

users : 31360
items : 6807
user max category:  31359 ||| user min category:  0
item max category:  6770 ||| item min category:  0


In [26]:
#genre
one_genre_df_inf=one_genre_df.copy() # one_genre_df:item_id 기준, one_genre_df_inf:카테고리 기준

print("(Before) item_id로 indexing되어 있음 ")
print(one_genre_df_inf.tail(5))
for i in range(len(one_genre_df_inf.item)):
    one_genre_df_inf.loc[i,'item']=item_id_dict[one_genre_df_inf.loc[i,'item']]
print("(After) category로 indexing되어 있음 ")
print(one_genre_df_inf.tail())

#writer
one_writer_df_inf=one_writer_df.copy()
for i in range(len(one_writer_df_inf.item)):
    one_writer_df_inf.loc[i,'item']=item_id_dict[one_writer_df_inf.loc[i,'item']]

#director
one_director_df_inf=one_director_df.copy()
for i in range(len(one_director_df_inf.item)):
    one_director_df_inf.loc[i,'item']=item_id_dict[one_director_df_inf.loc[i,'item']]

inference_df=pd.merge(inference_df, one_genre_df_inf,how='left', on='item')
print("inference+genre",inference_df.shape)
inference_df=pd.merge(inference_df, one_writer_df_inf,how='left', on='item')
print("inference+genre+writer",inference_df.shape)
inference_df=pd.merge(inference_df, one_director_df_inf,how='left', on='item')
print("inference+genre+writer+director",inference_df.shape)
inference_df.head()

(Before) item_id로 indexing되어 있음 
        item  genre
6802  118700      7
6803  118900      7
6804  118997      3
6805  119141      0
6806  119145      0
(After) category로 indexing되어 있음 
      item  genre
6802  6802      7
6803  6803      7
6804  6804      3
6805  6805      0
6806  6806      0
inference+genre (32761678, 3)
inference+genre+writer (32761678, 4)
inference+genre+writer+director (32761678, 5)


Unnamed: 0,user,item,genre,writer,director
0,0,1,1,1304.0,293
1,0,1197,1,88.0,88
2,0,5298,6,,1272
3,0,1202,0,1001.0,944
4,0,5302,0,191.0,307


## 결측치

In [27]:
# Count the missing values in every column of the merged inference frame
inference_df.isnull().sum()

user              0
item              0
genre             0
writer      1584673
director    1114195
dtype: int64

In [28]:
# Use the same "missing" code as the training frame: len(list), one past the last
# real code. len(list)-1 is the code of an actual writer/director, so using it
# would make missing values indistinguishable from that person.
inference_df['writer']=inference_df['writer'].fillna(len(writer_list))
inference_df['director']=inference_df['director'].fillna(len(director_list))
# NOTE(review): the training frame was additionally re-encoded column by column with
# astype('category').cat.codes. The writer/director values here are still the raw
# codes, so they must be translated with that same encoding before being fed to the
# model; otherwise indices can exceed the model's input dimension.
inference_df.isnull().sum()

user        0
item        0
genre       0
writer      0
director    0
dtype: int64

In [29]:
# cuda setting
use_cuda = torch.cuda.is_available()
#device = torch.device("cuda" if use_cuda else "cpu")
device = torch.device("cpu")
print(device)

batch_size = 1024

inference_dataset = TensorDataset(torch.LongTensor(np.array(inference_df)))
inference_dataloader = DataLoader(inference_dataset,
                                  batch_size=batch_size,
                                  #pin_memory=use_cuda,
                                  drop_last=False,
                                  shuffle=False,
                                  num_workers = 4,
                                  )
print('use_cuda : ',use_cuda)
print("dataset length :", len(inference_dataloader))

cpu
use_cuda :  True
dataset length : 31994


In [30]:
# Flat per-row results, filled batch by batch
user_list = list()
score_list = list()
item_list = list()

with torch.no_grad():
    cnt = 0
    for batch in tqdm(inference_dataloader):
        x = batch[0].to(device)  # x.shape : [1024, 5]
        output = model(x)  # [B], one score per row of x

        # Column 0 is the user code, column 1 the item code
        info = x.cpu()
        user_list.extend(info[:,0].tolist())
        item_list.extend(info[:,1].tolist())
        score_list.extend(output.cpu().tolist())


np_user_list = np.array(user_list)
np_item_list = np.array(item_list)
np_score_list = np.array(score_list)

  0%|          | 0/31994 [00:00<?, ?it/s]


RuntimeError: index 41552 is out of bounds for dimension 1 with size 41496

In [None]:
TOP_K = 10  # number of recommended items per user

users = list()
items = list()

# Group the candidates by user with one stable sort, so each user's rows form a
# contiguous slice; scanning the full array once per user is avoided.
order = np.argsort(np_user_list, kind='stable')
sorted_users = np_user_list[order]
sorted_items = np_item_list[order]
sorted_scores = np_score_list[order]
user_codes, starts = np.unique(sorted_users, return_index=True)
ends = np.append(starts[1:], len(sorted_users))

# Only users that have at least one candidate row are visited.
# user_cat_dict / item_cat_dict map category code -> raw id (see the cell that loads them);
# assumes their keys are plain ints -- TODO confirm against the pickles.
for user_code, start, end in tqdm(zip(user_codes, starts, ends), total=len(user_codes)):
    u_id = int(user_cat_dict[int(user_code)])

    item_score = sorted_scores[start:end]  # scores of this user's candidates
    item_ = sorted_items[start:end]        # item codes of this user's candidates

    # Guard against users with fewer than TOP_K candidates
    k = min(TOP_K, len(item_score))
    top_idx = np.argpartition(item_score, -k)[-k:]  # indices of the k highest scores (unordered)

    top_item = [int(item_cat_dict[int(code)]) for code in item_[top_idx]]  # item code -> item id

    users += [u_id] * len(top_item)
    items += top_item

In [None]:
# Two-column array: one (user id, item id) row per recommendation
result = np.vstack((users,items)).T
# `result` is a NumPy array, which has no .head(); slice to preview the first rows
result[:5]

In [None]:
# Write the recommendations as a two-column CSV without the index
# NOTE(review): the file name contains ':', which is not a valid file-name character on Windows
submit = pd.DataFrame(result, columns=['user','item'])
submit.to_csv(f"FFM_submission_Epoch:{epochs}.csv",index=False)