# Deep Factorization Machines with PyTorch


* References :
    - boostcamp ai tech, special mission (Recsys #4)
    - DeepFM: A Factorization-Machine based Neural Network for CTR Prediction (https://arxiv.org/pdf/1703.04247.pdf)  
    - Wide & Deep Learning for Recommender Systems (https://arxiv.org/pdf/1606.07792.pdf)
    - Factorization Machines (https://ieeexplore.ieee.org/stamp/stamp.jsp?arnumber=5694074)
    - https://d2l.ai/chapter_recommender-systems/deepfm.html
    - dataset : movielens modified w/o rating info.

# Modules

In [None]:
import csv
import numpy as np
import pandas as pd
from collections import Counter
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [None]:
# import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Data preprocessing
0. Dataset 다운로드  
<br/>
1. Rating df 생성  
rating 데이터(train_ratings.csv)를 불러와 [user, item, rating]의 컬럼으로 구성된 데이터 프레임을 생성합니다.   
<br/>
2. Genre df 생성   
genre 정보가 담긴 데이터(genres.tsv)를 불러와 genre이름을 id로 변경하고, [item, genre]의 컬럼으로 구성된 데이터 프레임을 생성합니다.    
<br/>
3. Negative instances 생성   
rating 데이터는 implicit feedback data(rating :0/1)로, positive instances로 구성되어 있습니다. 따라서 rating이 없는 item중 negative instances를 뽑아서 데이터에 추가하게 됩니다.   
<br/>
4. Join dfs   
rating df와 genre df를 join하여 [user, item, rating, genre]의 컬럼으로 구성된 데이터 프레임을 생성합니다.   
<br/>
5. zero-based index로 mapping   
Embedding을 위해서 user,item,genre를 zero-based index로 mapping합니다.
    - user : 0-31359
    - item : 0-6806
    - genre : 0-17  
<br/>
6. feature matrix X, label tensor y 생성   
[user, item, genre] 3개의 field로 구성된 feature matrix를 생성합니다.   
<br/>
7. data loader 생성

## 데이터 다운로드
이곳에 대회 사이트(AI Stages)에 있는 data의 URL을 입력해주세요. 
- 데이터 URL은 변경될 수 있습니다.
- 예) `!wget https://aistages-prod-server-public.s3.amazonaws.com/app/Competitions/000176/data/data.tar.gz`

In [None]:
# 1. Build the rating frame: one row per observed (user, item) interaction.
rating_data = "./data/train/train_ratings.csv"

# The timestamp is not used as a feature; every observed row is a positive
# (implicit feedback), so it gets the constant label 1.0.
raw_rating_df = pd.read_csv(rating_data).drop(columns=['time']).assign(rating=1.0)
print("Raw rating df")
print(raw_rating_df)

# Raw id sets; `items` is the candidate pool for negative sampling later on.
users = set(raw_rating_df.loc[:, 'user'])
items = set(raw_rating_df.loc[:, 'item'])

# 2. Build the genre frame.
genre_data = "./data/train/genres.tsv"

# An item may be listed with several genres; keep only its first row so that
# each item carries exactly one genre.
raw_genre_df = pd.read_csv(genre_data, sep='\t').drop_duplicates(subset=['item'])

# Replace genre names by integer ids. The ids follow set iteration order,
# so they are arbitrary and may differ between interpreter sessions.
genre_names = set(raw_genre_df['genre'])
genre_dict = dict(zip(genre_names, range(len(genre_names))))
raw_genre_df['genre'] = raw_genre_df['genre'].map(genre_dict)
print("Raw genre df - changed to id")
print(raw_genre_df)

In [None]:
# 3. Sample negative instances.
# The rating data holds positives only, so for every user we draw
# `num_negative` items the user never interacted with and label them 0.
print("Create Nagetive instances")
num_negative = 50
user_group_dfs = list(raw_rating_df.groupby('user')['item'])

# Collect the per-user frames and concatenate once at the end: calling
# pd.concat inside the loop copies the ever-growing frame on every iteration.
neg_frames = []
for u, u_items in tqdm(user_group_dfs):
    candidates = list(items - set(u_items))
    # Sampling without replacement raises ValueError when more samples than
    # candidates are requested, so cap the draw for very active users.
    n_draw = min(num_negative, len(candidates))
    if n_draw == 0:
        continue
    i_user_neg_item = np.random.choice(candidates, n_draw, replace=False)
    neg_frames.append(
        pd.DataFrame({'user': [u] * n_draw, 'item': i_user_neg_item, 'rating': [0] * n_draw})
    )

user_neg_dfs = pd.concat(neg_frames, axis=0, sort=False) if neg_frames else pd.DataFrame()

raw_rating_df = pd.concat([raw_rating_df, user_neg_dfs], axis=0, sort=False)

# 4. Join ratings with genres -> columns [user, item, rating, genre].
# The inner join drops any rating row whose item has no genre entry.
joined_rating_df = pd.merge(raw_rating_df, raw_genre_df, on='item', how='inner')


In [None]:
# 5. Map user and item ids onto zero-based contiguous indices.
users = sorted(set(joined_rating_df.loc[:, 'user']))
items = sorted(set(joined_rating_df.loc[:, 'item']))
genres = sorted(set(joined_rating_df.loc[:, 'genre']))

# For distinct non-negative ids, "largest id == count - 1" means the ids are
# already 0..n-1; otherwise re-index them by their rank in sorted order.
if max(users) != len(users) - 1:
    users_dict = {user_id: idx for idx, user_id in enumerate(users)}
    joined_rating_df['user'] = joined_rating_df['user'].map(users_dict)
    users = list(set(joined_rating_df.loc[:, 'user']))

if max(items) != len(items) - 1:
    items_dict = {item_id: idx for idx, item_id in enumerate(items)}
    joined_rating_df['item'] = joined_rating_df['item'].map(items_dict)
    items = list(set(joined_rating_df.loc[:, 'item']))

# NOTE(review): genre ids are not re-indexed here; this presumes every genre
# id survives the join so that they stay contiguous - confirm.

joined_rating_df = joined_rating_df.sort_values(by=['user']).reset_index(drop=True)

data = joined_rating_df

# Field cardinalities; they size the embedding tables and the field offsets.
n_data, n_user, n_item, n_genre = len(data), len(users), len(items), len(genres)

print("# of data : {}\n# of users : {}\n# of items : {}\n# of genres : {}".format(n_data, n_user, n_item, n_genre))

In [None]:
# 6. Build the per-field index columns of the feature matrix.
user_col, item_col, genre_col = (
    torch.tensor(data.loc[:, field]) for field in ('user', 'item', 'genre')
)

# All three fields share one embedding table, so each field is shifted into
# its own index range: users first, then items, then genres.
offsets = [0, n_user, n_user + n_item]
for col, offset in zip([user_col, item_col, genre_col], offsets):
    col.add_(offset)  # in-place, so the named tensors above are updated

In [None]:
# Feature matrix: one row per sample, one column per field (user, item, genre).
X = torch.stack([user_col, item_col, genre_col], dim=1)
# Labels: 1.0 for observed interactions, 0.0 for sampled negatives.
y = torch.tensor(data.loc[:, 'rating'].tolist())

#7. data loader 생성
class RatingDataset(Dataset):
    """Map-style dataset pairing each feature row with its label.

    Both tensors are cast to int64 on construction; sample ``i`` is the pair
    ``(input_tensor[i], target_tensor[i])``.
    """

    def __init__(self, input_tensor, target_tensor):
        self.input_tensor = input_tensor.to(torch.long)
        self.target_tensor = target_tensor.to(torch.long)

    def __len__(self):
        # The sample count is the leading dimension of the label tensor.
        return self.target_tensor.shape[0]

    def __getitem__(self, index):
        features = self.input_tensor[index]
        label = self.target_tensor[index]
        return features, label


dataset = RatingDataset(X, y)

# Random 90/10 train/test split; the remainder goes to the test set so the two
# sizes always add up to the full dataset.
train_ratio = 0.9
train_size = int(train_ratio * len(data))
test_size = len(data) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, lengths=[train_size, test_size]
)

# Only the training batches are reshuffled every epoch.
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)

   # Model architecture (DeepFM)
   DeepFM 모델은 1) FM component와  2) Deep component가 병렬적으로 결합되어 있습니다. 구조는 다음과 같습니다.
<img src='https://drive.google.com/uc?id=1vwcxUJQTIsg5QH9CuH5PcUEfExhToUHR'>  
각 구조는 다음과 같습니다.  
   **1. FM component**  
       FM component는 우리가 아는 2-way Factorization machines(degree=2)입니다. FM은 variables 간의 interaction을 다음과 같이 모델링 합니다.   
     **<center> equation (1) </center>**
   $$\hat{y}(x):=w_0 + \sum_{i=1}^{n}w_ix_i + \sum_{i=1}^{n}\sum_{j=i+1}^{n}\langle\mathbf{v}_i,\mathbf{v}_j\rangle x_ix_j$$   
   이때, 세번째 interaction term을 전개하여 다음과 같이 쓸 수 있습니다.(논문 참고)  
   구현 코드는 전개된 식을 바탕으로 합니다.   
     **<center> equation (2) </center>**
   $$\sum_{i=1}^{n}\sum_{j=i+1}^{n}\langle\mathbf{v}_i,\mathbf{v}_j\rangle x_ix_j = \frac{1}{2}\sum_{f=1}^{k}\left(\left(\sum_{i=1}^{n}v_{i,f}x_i\right)^2-\sum_{i=1}^{n}v_{i,f}^2x_i^2\right)$$   
           
   **2. Deep component**  
       Deep component는 MLP Layers로 구성되어 있습니다.   
       구현 코드는 hidden layer의 dimension이 30-20-10인 3 layer MLP 구조입니다.
  
   

# DeepFM

In [None]:
class DeepFM(nn.Module):
    """DeepFM: a 2-way factorization machine and an MLP sharing one embedding.

    The model output is ``sigmoid(fm(x) + mlp(x))``.

    Args:
        input_dims: Cardinality of each categorical field. Their sum is the
            number of rows of the shared tables, so inputs must be globally
            offset indices in ``[0, sum(input_dims))``.
        embedding_dim: Width of each field's latent vector.
        mlp_dims: Hidden-layer widths of the deep component. May be empty, in
            which case the deep component is a single linear layer.
        drop_rate: Dropout probability applied after every hidden layer.
    """

    def __init__(self, input_dims, embedding_dim, mlp_dims, drop_rate=0.1):
        super().__init__()
        total_input_dim = int(sum(input_dims))  # e.g. n_user + n_item + n_genre

        # FM component: global bias w_0 and first-order weights w_i.
        self.bias = nn.Parameter(torch.zeros((1,)))
        self.fc = nn.Embedding(total_input_dim, 1)

        # Latent vectors v_i, shared by the FM and the deep component.
        self.embedding = nn.Embedding(total_input_dim, embedding_dim)
        # Width of the flattened per-sample embedding fed to the MLP.
        self.embedding_dim = len(input_dims) * embedding_dim

        # Track the running input width instead of indexing mlp_dims[i - 1],
        # which also makes an empty mlp_dims valid.
        mlp_layers = []
        in_dim = self.embedding_dim
        for dim in mlp_dims:
            mlp_layers.append(nn.Linear(in_dim, dim))
            mlp_layers.append(nn.ReLU(True))
            mlp_layers.append(nn.Dropout(drop_rate))
            in_dim = dim
        mlp_layers.append(nn.Linear(in_dim, 1))
        self.mlp_layers = nn.Sequential(*mlp_layers)

    def fm(self, x, embed_x=None):
        """Return the FM logit of shape ``(batch_size, 1)``.

        Args:
            x: Integer index tensor of shape ``(batch_size, num_fields)``.
            embed_x: Optional precomputed ``self.embedding(x)``; looked up
                here when omitted.
        """
        if embed_x is None:
            embed_x = self.embedding(x)

        # w_0 + sum_i w_i x_i (x_i is 1 for the active index of each field).
        fm_y = self.bias + torch.sum(self.fc(x), dim=1)
        # Pairwise interactions via equation (2):
        # 0.5 * sum_f ((sum_i v_if)^2 - sum_i v_if^2).
        square_of_sum = torch.sum(embed_x, dim=1) ** 2
        sum_of_square = torch.sum(embed_x ** 2, dim=1)
        return fm_y + 0.5 * torch.sum(square_of_sum - sum_of_square, dim=1, keepdim=True)

    def mlp(self, x, embed_x=None):
        """Return the deep-component logit of shape ``(batch_size, 1)``.

        Args:
            x: Integer index tensor of shape ``(batch_size, num_fields)``.
            embed_x: Optional precomputed ``self.embedding(x)``; looked up
                here when omitted.
        """
        if embed_x is None:
            embed_x = self.embedding(x)

        # Concatenate the field embeddings of each sample into one vector.
        inputs = embed_x.view(-1, self.embedding_dim)
        return self.mlp_layers(inputs)

    def forward(self, x):
        """Return the predicted interaction probability, shape ``(batch_size,)``."""
        # Look the embeddings up once and share them between both components.
        embed_x = self.embedding(x)
        fm_y = self.fm(x, embed_x).squeeze(1)
        mlp_y = self.mlp(x, embed_x).squeeze(1)
        return torch.sigmoid(fm_y + mlp_y)


# Training

In [None]:
# Fall back to the CPU when no GPU is visible so the notebook still runs.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

input_dims = [n_user, n_item, n_genre]
embedding_dim = 10
model = DeepFM(input_dims, embedding_dim, mlp_dims=[30, 20, 10]).to(device)
bce_loss = nn.BCELoss()  # the model already outputs probabilities (sigmoid)
lr, num_epochs = 0.001, 100
optimizer = optim.Adam(model.parameters(), lr=lr)

# Nothing inside the loop switches to eval mode, so enabling training mode
# (dropout active) once is enough.
model.train()
for e in tqdm(range(num_epochs)):
    epoch_loss, n_seen = 0.0, 0
    for x, y in train_loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        output = model(x)
        loss = bce_loss(output, y.float())  # BCELoss requires float targets
        loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch mean is exact even when the last
        # batch is smaller.
        epoch_loss += loss.item() * y.size(0)
        n_seen += y.size(0)

    # One readable line per epoch instead of a tensor repr per batch;
    # tqdm.write keeps the progress bar intact.
    tqdm.write("epoch {:3d} | mean train BCE loss {:.4f}".format(e + 1, epoch_loss / max(n_seen, 1)))

# NOTE: this pickles the whole module, so loading "dfm.pt" later requires the
# DeepFM class to be defined.
torch.save(model, "dfm.pt")
        

# Generate Top-N List

- 1. Generate dataset (ux(i,g))
- 2. predict ratings for all joint (u, i)
- 3. rating index (unseen) [:10]
- 4. make submission file

In [None]:
# X , y 생성하기
# X : [user, {item, genre}], for all users, u and all {item, genre}, i_g 
##### y : [0] * len(user) * len{item, genre}  --> 형태만 맞추기. 사용되지 않음. 

In [None]:
# 1. Build the user list.
rating_df = pd.read_csv("data/train/train_ratings.csv")
rating_df.drop(['time'], axis=1, inplace=True)
# Sorted, because the position in this list becomes the user's zero-based
# index and the training section assigned indices in sorted-id order. With
# plain set order the trained embeddings would be looked up with the wrong
# index.
# NOTE(review): assumes the training join dropped no user - confirm.
users = sorted(set(rating_df.loc[:, 'user']))
# n_users = 31360, users.min = 11,  users.max = 138493

# 2. Build the item and genre lists.
genre_df = pd.read_csv("data/train/genres.tsv", sep='\t')
genre_df = genre_df.drop_duplicates(subset=['item'])  # one genre per item, as in training
# Sorted for the same reason as `users`.
# NOTE(review): assumes the genre file and the training data cover the same
# items - confirm.
items = sorted(set(genre_df.loc[:, 'item']))
genres = list(set(genre_df.loc[:, 'genre']))  # only its length is used later

In [None]:
# 3. Map user, item and genre onto zero-based indices.
# The position of an id in `users` / `items` is its index.
users_dict = {user_id: idx for idx, user_id in enumerate(users)}
rating_df['user'] = rating_df['user'].map(users_dict)
users_zero = list(set(rating_df.loc[:, 'user']))

items_dict = {item_id: idx for idx, item_id in enumerate(items)}
genre_df['item'] = genre_df['item'].map(items_dict)
items_zero = list(set(genre_df.loc[:, 'item']))

# Genre ids follow set iteration order, exactly as in the preprocessing
# section.
# NOTE(review): the two mappings presumably agree only within the same
# interpreter session - confirm before reusing a saved model.
genre_names = set(genre_df['genre'])
genre_dict = dict(zip(genre_names, range(len(genre_names))))
genre_df['genre'] = genre_df['genre'].map(genre_dict)
genres_zero = genre_df['genre'].tolist()  # one genre id per genre_df row

n_item, n_user, n_genre = len(items), len(users), len(genres)


In [None]:
# print(len(users_zero), len(items_zero), len(genres_zero))
# print(min(users_zero), min(items_zero), min(genres_zero))
# print(max(users_zero), max(items_zero), max(genres_zero))

In [None]:
ratings = pd.read_csv('data/train/train_ratings.csv')
ratings['rating'] = 1.0  # implicit feedback: every observed row is a positive

ratings_df = ratings.loc[:, ['user', 'item', 'rating']]

# Pivot into a dense user x item matrix (rows: raw user ids, columns: raw
# item ids), then turn the NaN of unobserved pairs into 0.
ratings_matrix = ratings_df.pivot_table(values='rating', index='user', columns='item').fillna(0)
ratings_matrix

In [None]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10, item_ids=None):
    """Return the ``top_n`` highest-scored item ids the user has not seen.

    Args:
        pred_df: 1-D scores, one per candidate item; anything with
            ``tolist()`` (e.g. a torch tensor) or a plain sequence.
        userId: Unused; kept for interface compatibility.
        unseen_list: Item ids that are eligible for recommendation.
        top_n: Maximum number of ids to return.
        item_ids: Sequence mapping score position -> item id. Defaults to the
            module-level ``items`` list.

    Returns:
        Item ids ordered by descending score. Shorter than ``top_n`` when
        fewer unseen items exist.
    """
    if item_ids is None:
        item_ids = items  # position k of the scores belongs to items[k]

    scores = pred_df.tolist() if hasattr(pred_df, "tolist") else list(pred_df)
    unseen = set(unseen_list)  # O(1) membership tests

    # Best score first; the stable sort keeps ties in their original order.
    ranked_positions = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    recomm_ids = []
    for pos in ranked_positions:
        if len(recomm_ids) >= top_n:
            break
        item_id = item_ids[pos]
        if item_id in unseen:
            recomm_ids.append(item_id)
    return recomm_ids

In [None]:
def get_unseen_movies(ratings_matrix, userId):
    """Return the item ids (column labels) the given user has not rated.

    Args:
        ratings_matrix: User x item DataFrame whose cells are > 0 for seen
            items.
        userId: Row label of the user in ``ratings_matrix``.

    Returns:
        Column labels, in column order, whose value in the user's row is not
        greater than 0.
    """
    # The user's row as a Series indexed by item id.
    user_rating = ratings_matrix.loc[userId, :]

    # A boolean mask replaces the per-item `not in list` test, which scanned
    # the whole seen-list for every column.
    unseen_mask = ~(user_rating > 0)
    return user_rating.index[unseen_mask.to_numpy()].tolist()

In [None]:
#7. data loader 생성

# offsets = [0, n_user, n_user+n_item]
# for col, offset in zip([user_col, item_col, genre_col], offsets):
#     col += offset

class RatingDataset(Dataset):
    """Map-style dataset pairing each feature row with its label.

    Both tensors are cast to int64 on construction; sample ``i`` is the pair
    ``(input_tensor[i], target_tensor[i])``.
    """

    def __init__(self, input_tensor, target_tensor):
        self.input_tensor = input_tensor.to(torch.long)
        self.target_tensor = target_tensor.to(torch.long)

    def __len__(self):
        # The sample count is the leading dimension of the label tensor.
        return self.target_tensor.shape[0]

    def __getitem__(self, index):
        features = self.input_tensor[index]
        label = self.target_tensor[index]
        return features, label


In [None]:
# Position k of both series must describe the same item: item index k and
# that item's genre id. `genres_zero` is in genre_df row order, which need not
# match the item-index order, so the genre is looked up per item instead.
items_sr = pd.Series(data=sorted(items_zero))
# genre_df holds exactly one row per item (duplicates were dropped), so 'item'
# is a unique lookup key.
genres_sr = pd.Series(data=genre_df.set_index('item')['genre'].loc[items_sr.to_numpy()].to_numpy())

In [None]:
mp_items = []
users_ = []
top_n = 10

# The model was trained on globally offset indices (users first, then items,
# then genres, all in one shared embedding table), so the same offsets have to
# be applied here; without them item and genre ids hit user embedding rows.
# NOTE(review): assumes n_user / n_item equal the values used to build the
# trained model - confirm.
offsets = [0, n_user, n_user + n_item]
item_col = torch.tensor(items_sr) + offsets[1]
genre_col = torch.tensor(genres_sr) + offsets[2]
# The (item, genre) part is the same for every user; build it once.
item_genre = torch.stack([item_col, genre_col], dim=1)

model.eval()  # disable dropout for scoring
with torch.no_grad():  # no autograd graph is needed at inference time
    for u in tqdm(users_zero):
        user_col = torch.full((item_genre.size(0), 1), u + offsets[0], dtype=item_genre.dtype)
        X = torch.cat([user_col, item_genre], dim=1)

        # Score every item for this user in chunks of 1024 rows.
        result = torch.cat([model(x.to(device)) for x in X.split(1024)], dim=0)

        # Items this user has not interacted with yet (raw item ids).
        unseen_list = get_unseen_movies(ratings_matrix, users[u])

        # Best-scored unseen items for this user.
        recomm_movies = recomm_movie_by_userid(result, users[u], unseen_list, top_n=top_n)
        mp_items.extend(recomm_movies)
        # Repeat the raw user id once per recommendation so both columns stay
        # aligned even if fewer than top_n items come back.
        users_.extend([users[u]] * len(recomm_movies))

test_df = pd.DataFrame(zip(users_, mp_items), columns=['user', 'item'])
test_df.to_csv("submission_DeepFM.csv", index=False)


In [None]:
# Write a second copy of the submission, ordered by ascending user id.
test_df = test_df.sort_values(by="user")
test_df.to_csv("submission_DeepFM_sorted.csv", index=False)