In [2]:
import pandas as pd
import numpy as np
import gc

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

### Read Data

Mapping dictionary has been generated when processing raw data

We just need to use the dictionary to transform validation dataset


In [4]:
def read_dict(file_path):
    '''read id mapping into dictionary
    '''
    d = {}
    with open(file_path) as f:
        next(f)
        for line in f:
           (key, val) = line.strip('\n').split(',')
           d[int(key)] = int(val)
    return d

In [5]:
user_dict = read_dict('./processed/user_dict.txt')    

In [6]:
movie_dict = read_dict('./processed/movie_dict.txt')  

In [7]:
num_user = len(user_dict)
num_movie = len(movie_dict)

print('{} users and {} movies'.format(num_user, num_movie))

400267 users and 17375 movies


In [8]:
dtypes = {'user_id': 'int32',
          'movie_id': 'int16',
          'rating': 'int8'}

cols = ['user_id', 'movie_id', 'rating']

In [9]:
train = pd.read_csv('processed/netflix_train_encoded.csv', usecols=cols, dtype=dtypes)

In [10]:
val = pd.read_csv('processed/netflix_val.csv', usecols=cols, dtype=dtypes)

In [11]:
val['user_id'] = val['user_id'].map(user_dict)     # map user_id to continious index
val['movie_id'] = val['movie_id'].map(movie_dict)  # map movie_id to continious index

In [12]:
# remove user and movie not seen in train
val = val.loc[val['user_id'].notnull() & val['movie_id'].notnull()]   

### Matrix Factorization Model

In [13]:
class Netflix_Dataset(Dataset):
    def __init__(self, df):
        self.users = df['user_id'].values
        self.movies = df['movie_id'].values
        self.ratings = df['rating'].values
        
    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, idx):
        return [self.users[idx], self.movies[idx], self.ratings[idx]] 

In [14]:
class MF(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # initlializing weights
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        
    def forward(self, u, v):
        u = self.user_emb(u)
        v = self.item_emb(v)
        return (u*v).sum(1)   

In [30]:
def train_epocs(model, train_dl, test_dl, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    model.train()
    for epoch in range(epochs):
        for i, (users, items, ratings) in enumerate(train_dl):
            #users = torch.LongTensor(users) #.cuda()
            #items = torch.LongTensor(movies) #.cuda()
            #ratings = torch.FloatTensor(ratings)  #.cuda()
            
            if unsqueeze:
                ratings = ratings.unsqueeze(1)
            y_hat = model.forward(users.long(), items.long())
            loss = F.mse_loss(y_hat, ratings.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print("[epoch {}] loss: {}".format(epoch+1, loss.item())) # used to be loss.data[0]
        
    print("[test] loss: {}".format(test_loss(model, test_dl, unsqueeze)))

In [31]:
def test_loss(model, test_dl, unsqueeze=False):
    model.eval()
    #users, items, ratings = test
    #users = torch.LongTensor(df_val.userId.values) # .cuda()
    #items = torch.LongTensor(df_val.movieId.values) #.cuda()
    #ratings = torch.FloatTensor(df_val.rating.values) #.cuda()
    
    total = 0
    sum_loss = 0
    
    for i, (users, items, ratings) in enumerate(test_dl):
        if unsqueeze:
            ratings = ratings.unsqueeze(1)
        
        batch_size = ratings.shape[0]
        y_hat = model(users.long(), items.long())
        
        batch_loss = F.mse_loss(y_hat, ratings.float()).item() * batch_size
        sum_loss += batch_loss
        total += batch_size
        
    return sum_loss/total

### Train MF Model

In [32]:
train_ds = Netflix_Dataset(train[:10000])
valid_ds = Netflix_Dataset(val[:10000])

In [33]:
batch_size = 100
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
valid_dl = DataLoader(valid_ds, batch_size=batch_size)

In [34]:
model = MF(num_user, num_movie, 100)

In [35]:
train_epocs(model, train_dl, valid_dl, epochs=1, lr=0.01)

[epoch 1] loss: 3.5486316680908203
[test] loss: 7.437562992572785
