In [1]:
import os
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle

import torch
import torch.nn as nn
import torch.nn.functional as F

%matplotlib inline

In [2]:
# !wget -nc http://files.grouplens.org/datasets/movielens/ml-20m.zip

In [3]:
# !unzip ml-20m.zip

In [4]:
# Directory holding the extracted MovieLens 20M CSV files.
DATA_DIR = 'ml-20m/'
# List the directory contents as a quick check that the files are present.
print(os.listdir(DATA_DIR))

['README.txt', 'links.csv', 'ratings.csv', 'tags.csv', 'movies.csv', 'genome-scores.csv', 'genome-tags.csv']


In [5]:
# Read the ratings table, report its dimensions, then preview the first rows.
ratings_csv = f'{DATA_DIR}ratings.csv'
ratings = pd.read_csv(ratings_csv)
print(ratings.shape)
ratings.head()

(20000263, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [6]:
# Recode the raw user ids as categorical and store their integer codes
# (starting at 0) in a separate column.
ratings['userId'] = ratings['userId'].astype('category')
ratings['userId_codes'] = ratings['userId'].cat.codes

In [7]:
# Recode the raw movie ids as categorical and store their integer codes
# (starting at 0) in a separate column.
ratings['movieId'] = ratings['movieId'].astype('category')
ratings['movieId_codes'] = ratings['movieId'].cat.codes

In [8]:
# Preview the frame again, now including the two *_codes columns.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,userId_codes,movieId_codes
0,1,2,3.5,1112486027,0,1
1,1,29,3.5,1112484676,0,28
2,1,32,3.5,1112484819,0,31
3,1,47,3.5,1112484727,0,46
4,1,50,3.5,1112484580,0,49


In [9]:
# Count the distinct encoded ids on each side of the ratings table.
n_unique_users = ratings['userId_codes'].nunique()
n_unique_movies = ratings['movieId_codes'].nunique()
print('Unique User ids : ',n_unique_users)
print('Unique Movie ids : ',n_unique_movies)

Unique User ids :  138493
Unique Movie ids :  26744


In [9]:
class Model(nn.Module):
    """Rating predictor built on learned user and movie embeddings.

    A user vector and a movie vector are looked up, concatenated, and passed
    through one ReLU hidden layer followed by a single linear output unit.
    """

    def __init__(self, n_users, n_movies, embed_dim, hidden_units=1024):
        """Create the two embedding tables and the two linear layers.

        Args:
            n_users: number of rows in the user embedding table.
            n_movies: number of rows in the movie embedding table.
            embed_dim: width of each embedding vector.
            hidden_units: width of the hidden layer.
        """
        super().__init__()
        # Keep the sizes available as attributes.
        self.N, self.D, self.M = n_users, embed_dim, n_movies
        self.n_hidden = hidden_units

        # One embedding vector per user and per movie.
        self.user_embed = nn.Embedding(n_users, embed_dim)
        self.movie_embed = nn.Embedding(n_movies, embed_dim)

        # The hidden layer consumes the concatenated pair of embeddings.
        self.fc1 = nn.Linear(2 * embed_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, 1)

    def forward(self, user, movie):
        """Return one prediction per (user, movie) pair.

        `user` and `movie` are index tensors; their embeddings are joined
        along dimension 1 before the dense layers are applied.
        """
        features = torch.cat([self.user_embed(user), self.movie_embed(movie)], dim=1)
        hidden = F.relu(self.fc1(features))
        return self.fc2(hidden)

In [10]:
## create dataset
# Pull the encoded ids and the ratings out of the frame as NumPy arrays.
user_ids_np = ratings.userId_codes.values
movie_ids_np = ratings.movieId_codes.values
ratings_np = ratings.rating.values

# Count distinct ids with the vectorised np.unique rather than len(set(...)),
# which would box every element of these arrays into a Python set first.
# N and M are later used as the embedding-table sizes.
N = np.unique(user_ids_np).size
M = np.unique(movie_ids_np).size
print('Unique users : ',N)
print('Unoque movies : ',M)

Unique users :  138493
Unoque movies :  26744


In [11]:
# randomly shuffle dataset
# A fixed seed makes the train/test partition identical on every
# Restart & Run All, so checkpoints saved by an earlier run are evaluated on
# the same held-out rows.
SPLIT_SEED = 42
user_ids_np,movie_ids_np,ratings_np = shuffle(
    user_ids_np,movie_ids_np,ratings_np,random_state = SPLIT_SEED
)

# convert numpy arrays into tensors (ids as int64 for embedding lookups)
user_ids_t = torch.from_numpy(user_ids_np).long()
movie_ids_t = torch.from_numpy(movie_ids_np).long()
ratings_t = torch.from_numpy(ratings_np)

# create data iterator: first 80% of the shuffled rows train, the rest test
N_train = int(0.8 * len(ratings_np))
train_dataset = torch.utils.data.TensorDataset(
    user_ids_t[:N_train],
    movie_ids_t[:N_train],
    ratings_t[:N_train]
)

test_dataset = torch.utils.data.TensorDataset(
    user_ids_t[N_train:],
    movie_ids_t[N_train:],
    ratings_t[N_train:]
)

train_loader = torch.utils.data.DataLoader(train_dataset,batch_size = 32,shuffle = True)
test_loader = torch.utils.data.DataLoader(test_dataset,batch_size = 16,shuffle = False)

In [12]:
### constants
# N (distinct users) and M (distinct movies) are computed in the
# "create dataset" cell above and are reused here unchanged.
# Width of each user/movie embedding vector.
D = 10
# Use the first CUDA device when one is available, otherwise the CPU.
DEVICE = 'cuda:0' if torch.cuda.is_available() else 'cpu'

print(DEVICE)

cuda:0


In [13]:
## build model
# Instantiate the network with the dataset sizes and place its parameters on
# the selected device, then show the layer summary.
model = Model(N,M,D).to(DEVICE)
print(model)

Model(
  (user_embed): Embedding(138493, 10)
  (movie_embed): Embedding(26744, 10)
  (fc1): Linear(in_features=20, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1, bias=True)
)


In [14]:
# Mean-squared-error loss between predicted and true ratings.
criterion = nn.MSELoss()
# Adam over all model parameters, using the optimizer's default hyperparameters.
optimizer = torch.optim.Adam(model.parameters())

In [15]:
# def train_model(model,optimizer,criterion,train_iter,test_iter,epochs,device = 'cpu'):
    
#     train_losses = np.zeros(epochs)
#     test_losses = np.zeros(epochs)
    
#     for epoch in range(epochs):
        
#         start_time = time.time()
        
#         # train batch loop
#         train_loss = []
#         for user,movie,rating in train_iter:
#             rating = rating.view(-1,1).float()
#             user,movie,rating = user.to(device),movie.to(device),rating.to(device)
            
#             optimizer.zero_grad()
#             out = model(user,movie)
# #             print('out shape : ',out.shape)
#             loss = criterion(out,rating)
#             loss.backward()
#             optimizer.step()
            
#             train_loss.append(loss.item())
        
#         tr_loss_ = np.mean(train_loss)
#         train_losses[epoch] = tr_loss_
        
#         # test batch loop
#         test_loss = []
#         for user,movie,rating in test_iter:
#             rating = rating.view(-1,1).float()
# #             print(rating.shape)
#             user,movie,rating = user.to(device),movie.to(device),rating.to(device)
            
#             out = model(user,movie)
# #             print(out)
#             loss = criterion(out,rating)
            
#             test_loss.append(loss.item())
            
#         te_loss_ = np.mean(test_loss)
#         test_losses[epoch] = te_loss_
        
#         time_taken = time.time() - start_time
#         torch.save(model.state_dict(),'tmp/reco_model_epoch_{}.pt'.format(epoch))
#         print(f'Epoch : {epoch}, train loss : {tr_loss_:.4f}, test loss : {te_loss_:.4f}, duration : {time_taken:.4f}')
        
#     return train_losses,test_losses

# train_loss,test_loss = train_model(model,optimizer,criterion,train_loader,test_loader,25,device = DEVICE)

In [16]:
def _iter_minibatches(users,movies,ratings,batch_size,device,order = None):
    """Yield (user, movie, rating) mini-batches moved to `device`.

    Rows are taken in storage order, or in the order given by the index
    tensor `order` when it is supplied. The final batch is shorter when the
    row count is not a multiple of `batch_size`. Ratings are reshaped to a
    float32 column of shape (batch, 1).
    """
    for start in range(0,len(ratings),batch_size):
        stop = start + batch_size
        rows = slice(start,stop) if order is None else order[start:stop]
        rating = ratings[rows].view(-1,1).float()
        yield users[rows].to(device),movies[rows].to(device),rating.to(device)


def train_model(model,optimizer,criterion,train_dataset,test_dataset,batch_size,epochs,device = 'cpu'):
    """Train `model` for `epochs` passes and evaluate it after each pass.

    Args:
        model: module called as ``model(user, movie)``.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as ``criterion(prediction, rating)``.
        train_dataset: dataset whose ``.tensors`` is a (users, movies, ratings) triple.
        test_dataset: dataset with the same layout, used for evaluation only.
        batch_size: number of rows per mini-batch.
        epochs: number of passes over the training data.
        device: device the mini-batches are moved to before the forward pass.

    Returns:
        ``(train_losses, test_losses)``: two NumPy arrays of length `epochs`
        holding the mean of the per-batch losses for each epoch.

    Side effects:
        Prints the batch counts and one progress line per epoch, and writes
        ``tmp/reco_model_epoch_{epoch}.pt`` after every epoch (the ``tmp``
        directory is created if missing). When at least one epoch runs, the
        model is left in eval mode.
    """

    train_losses = np.zeros(epochs)
    test_losses = np.zeros(epochs)

    # Ceil division: a trailing partial batch is still processed, so no rows
    # are silently skipped and a split smaller than batch_size is still used.
    n_train_batches = (len(train_dataset) + batch_size - 1)//batch_size
    n_test_batches = (len(test_dataset) + batch_size - 1)//batch_size
    print('no of training batches : ',n_train_batches)
    print('no of testing batches : ',n_test_batches)

    train_users,train_movies,train_ratings = train_dataset.tensors
    test_users,test_movies,test_ratings = test_dataset.tensors

    # Create the checkpoint directory up front so torch.save cannot fail
    # after a full epoch of work.
    os.makedirs('tmp',exist_ok = True)

    for epoch in range(epochs):

        start_time = time.time()

        # train batch loop: visit the rows in a fresh random order each epoch
        model.train()
        order = torch.randperm(len(train_ratings))
        train_loss = []
        for user,movie,rating in _iter_minibatches(train_users,train_movies,train_ratings,batch_size,device,order):
            optimizer.zero_grad()
            out = model(user,movie)
            loss = criterion(out,rating)
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())
        tr_loss_ = np.mean(train_loss)
        train_losses[epoch] = tr_loss_

        # test batch loop: every row is evaluated, so the order is irrelevant;
        # no_grad avoids building an autograd graph that is never used.
        model.eval()
        test_loss = []
        with torch.no_grad():
            for user,movie,rating in _iter_minibatches(test_users,test_movies,test_ratings,batch_size,device):
                out = model(user,movie)
                loss = criterion(out,rating)

                test_loss.append(loss.item())
        te_loss_ = np.mean(test_loss)
        test_losses[epoch] = te_loss_

        time_taken = time.time() - start_time
        torch.save(model.state_dict(),'tmp/reco_model_epoch_{}.pt'.format(epoch))
        print(f'Epoch : {epoch}, train loss : {tr_loss_:.4f}, test loss : {te_loss_:.4f}, duration : {time_taken:.4f}')

    return train_losses,test_losses

# train_loss,test_loss = train_model(model,optimizer,criterion,train_dataset,test_dataset,32,25,device = DEVICE)

In [26]:
# Unpack the underlying tensors of the train/test datasets so that
# individual rows can be sliced directly.
train_users,train_movies,train_ratings = train_dataset.tensors
test_users,test_movies,test_ratings = test_dataset.tensors
# Show the training tensor shapes.
print(train_users.shape,train_movies.shape,train_ratings.shape)

torch.Size([16000210]) torch.Size([16000210]) torch.Size([16000210])


In [28]:
## making predictions using trained model

# load model weights saved after the 25th epoch (epoch index 24)
model_pth = 'tmp/reco_model_epoch_24.pt'
# map_location lets a checkpoint written from a CUDA run load on a CPU-only host.
model.load_state_dict(torch.load(model_pth,map_location = DEVICE))
model.eval()

# using single observation
random_int = np.random.randint(len(test_ratings))
# Length-1 slices (not scalar indexing) keep the batch dimension that the
# model's concatenation along dim 1 requires.
user,movie,rating = test_users[random_int:(random_int+1)],test_movies[random_int:(random_int+1)],test_ratings[random_int:(random_int+1)]
user,movie,rating = user.to(DEVICE),movie.to(DEVICE),rating.to(DEVICE)
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    out = model(user,movie)
print(out.item())

3.1799793243408203
