# Collaborative Filtering - Matrix factorization
This script takes user and movie ratings as input to predict a user's rating of another movie, and thus recommend it.

In [17]:
#!pip install torchtext
#!pip install torch

In [18]:
#### Packages ####
import torch
import pandas as pd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
from scipy.sparse import rand as sprand
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
from torch.utils import data
from torchtext.data import Dataset, BucketIterator, Field, TabularDataset, Iterator
import math

%matplotlib inline
from sklearn.metrics import accuracy_score
import matplotlib
import matplotlib.pyplot as plt

In [19]:
#### Activating cuda for speeding up training ####
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device in use:", device)

Device in use: cuda


In [20]:
#### Loading dataset ####
# We are only interested in user ID, movie ID and ratings
ratings_df = pd.read_csv('ratings_small.csv', usecols = ['userId','movieId','rating'])

In [21]:
#### Creating new indices ####
def zero_indexing(column):
    uniq = column.unique()
    newindex = {o:i for i,o in enumerate(uniq)}
    # Subtracting 1 from the original ID's
    return newindex, np.array([newindex.get(x, -1) for x in column])

_,usercol = zero_indexing(ratings_df["userId"])
_,moviecol = zero_indexing(ratings_df["movieId"])

ratings_df["userId"] = usercol
ratings_df["movieId"] = moviecol

# Removing ID's which are less than 0
ratings_dataset = ratings_df[ratings_df["userId"] >= 0]
ratings_dataset = ratings_df[ratings_df["movieId"] >= 0]

print("Printing head of dataset with new IDs: ")
print(ratings_dataset.head())

Printing head of dataset with new IDs: 
   userId  movieId  rating
0       0        0     2.5
1       0        1     3.0
2       0        2     3.0
3       0        3     2.0
4       0        4     4.0


In [22]:
#### Splitting dataset into training, validation and test sets ####
n = len(ratings_dataset)
sizes = [0.7, 0.2, 0.1]
train_size = int(sizes[0]*n)
val_size = int(sizes[1]*n)
test_size = int(sizes[2]*n)

train_set = ratings_dataset[:train_size-1].copy() # till 70002-1 = 70001
val_set = ratings_dataset[train_size:-test_size-1].copy() # from 70002 to 100.004 - 20000
test_set = ratings_dataset[train_size+val_size:].copy() # from train+val size

print("Entire dataset: ", n)
print("Train size: ", train_set.shape[0])
print("Validation size: ", val_set.shape[0])
print("Test size: ", test_set.shape[0])
print("Checking if dimensions match: ", train_set.shape[0] + val_set.shape[0] + test_set.shape[0] == n )

Entire dataset:  100004
Train size:  70001
Validation size:  20001
Test size:  10002
Checking if dimensions match:  True


In [23]:
#### Creating a dataformatting to format the data ####
class dataformatting(Dataset):
    def __init__(self, users, movies, rating):
        self.movies = movies
        self.users = users
        self.rating = rating
        
    def __len__(self):
        return len(self.rating)

    def __getitem__(self, index):
        u = self.users[index]
        m = self.movies[index]
        r = self.rating[index]
        #obs = {'movieId':movieId,'userId':userId,'rating':rating}
        #obs = self.movieLens.drop('timestamp',axis=1)
        #obs = obs.iloc[index,:].as_matrix()
        return [u, m, r]

In [24]:
#### Formatting training, validation and training sets ####
u_train = torch.LongTensor(train_set.userId.values)
m_train = torch.LongTensor(train_set.movieId.values)
r_train = torch.FloatTensor(train_set.rating.values)

train_dataset = dataformatting(u_train, m_train, r_train)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=100, shuffle=True)

u_val = torch.LongTensor(val_set.userId.values)
m_val = torch.LongTensor(val_set.movieId.values)
r_val = torch.FloatTensor(val_set.rating.values)
val_dataset = dataformatting(u_val, m_val,r_val)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=100, shuffle=True)

u_test = torch.LongTensor(test_set.userId.values)
m_test = torch.LongTensor(test_set.movieId.values)
r_test = torch.FloatTensor(test_set.rating.values)
test_dataset = dataformatting(u_test, m_test, r_test)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=100, shuffle=True)

In [25]:
t1 = iter(train_loader)
next(t1)
t2 = iter(val_loader)
next(t2)
t3 = iter(test_loader)
next(t3)
#iter(train_batches.next())
#iter(val_batches.next())
#iter(test_batches.next())

[tensor([ 653,  647,  606,  615,  647,  627,  597,  626,  663,  623,
          623,  647,  597,  623,  614,  652,  653,  607,  623,  632,
          614,  598,  607,  663,  623,  623,  610,  623,  623,  640,
          627,  614,  666,  653,  604,  654,  604,  601,  663,  607,
          626,  611,  623,  627,  670,  614,  613,  597,  622,  647,
          636,  626,  607,  606,  608,  596,  622,  614,  657,  606,
          623,  597,  636,  615,  670,  647,  624,  606,  606,  663,
          663,  606,  653,  647,  654,  608,  663,  653,  645,  623,
          604,  653,  651,  620,  638,  598,  604,  623,  640,  611,
          608,  664,  659,  653,  613,  623,  614,  664,  623,  607]),
 tensor([ 3174,  7326,   151,  2073,  6230,   769,   481,  3901,  8684,
          1706,  2363,  1244,   243,   968,   181,  9043,   386,   334,
          2141,    81,   122,    29,  4212,  1509,   183,  4026,  4084,
          5932,  8783,  3063,   416,  1619,   685,  1412,  4792,   487,
           861,  194

In [26]:
num_user = len(ratings_dataset.userId.unique()) 
num_movie = len(ratings_dataset.movieId.unique())

class MatrixFactorization(nn.Module):
    def __init__(self, num_user, num_movie, emb_size=100):
        super(MatrixFactorization, self).__init__()
        self.user_embedding = nn.Embedding(num_user, emb_size)
        self.movie_embedding = nn.Embedding(num_movie, emb_size)
        
    def forward(self, u, m):
        U = self.user_embedding(u)
        M = self.movie_embedding(m)
        r_max = 5 # maximum rating
        r_min = 1 # minimum rating
        return F.sigmoid((U*M).sum(1))*(r_max - r_min) + r_min

In [27]:
model = MatrixFactorization(num_user, num_movie, 100)
model.parameters()
[p for p in model.parameters()]

[Parameter containing:
 tensor([[ 2.1312e+00, -1.7845e+00,  6.3465e-02,  ..., -5.0529e-02,
           1.6703e-01,  8.5937e-01],
         [-3.6144e-01, -1.3320e+00, -1.1073e-01,  ..., -3.0096e-01,
           6.7772e-04, -7.5342e-01],
         [-2.3945e+00,  5.5422e-01, -1.2960e+00,  ...,  1.1372e-01,
           8.2862e-01,  1.7779e+00],
         ...,
         [ 9.7439e-02,  2.3393e-01, -1.8835e-03,  ...,  1.0384e+00,
          -5.7337e-01,  2.3056e-01],
         [ 2.9964e-01,  5.8796e-01, -7.5203e-01,  ..., -4.5147e-01,
          -8.7082e-01,  8.8244e-01],
         [ 1.5405e-01,  1.2580e+00,  1.3282e+00,  ...,  4.2246e-03,
           9.2561e-01, -6.0783e-01]]), Parameter containing:
 tensor([[-0.9053, -0.3488,  0.8056,  ..., -0.4942, -0.2067,  0.0937],
         [-0.2199, -0.9943,  1.5239,  ..., -0.5914,  0.7237, -0.3912],
         [-0.3353,  0.7329,  2.1100,  ...,  1.9340,  0.3706, -0.3525],
         ...,
         [ 0.1909, -0.6452,  0.0886,  ...,  1.2922, -2.3757, -1.0737],
         [-

In [28]:
num_epochs = 10
lr = 0.01
wd = 1e-6

def training_loss(model, epochs=10, lr=0.01, wd=0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters()) # get all parameters which need grad
    optimizer = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    model.train() # into training mode
    for i in range(epochs):
        running_loss = 0.0
        
        for j, data in enumerate(train_loader):
            users, items, ratings = data
            users = Variable(users)
            items = Variable(items)
            ratings = Variable(ratings).float()
        
            y_hat = model(users,items)
            loss = F.mse_loss(y_hat, ratings) # crossentropy?
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print("Training loss for epoch",i+1,":", running_loss/j+1) # used to be loss.data[0]

In [29]:
def validation_loss(model):
    model.eval() # go to evaluation mode
    
    running_loss = 0.
    for j, data in enumerate(val_loader):
        users, items, ratings = data
        users = Variable(users)
        items = Variable(items)
        ratings = Variable(ratings).float()
            
        y_hat = model(users,items)
        loss_now = F.mse_loss(y_hat, ratings)
        running_loss+= loss_now.item()
        
    print("Validation loss:", running_loss/len(val_batches)) # j means this many iterations till end

In [30]:
#training_loss(model, epochs=10, lr=0.01, wd = 0.0)

In [31]:
#### Training loop ####
def train(model, train_loader, optimizer, criterion, epoch):
    model.train()
    for batch_idx in enumerate(train_loader):
        users, items, ratings = data
        users = Variable(users)
        items = Variable(items)
        ratings = Variable(ratings).float()
        
        optimizer.zero_grad()
        output = model(users, items)
        output = output.view(-1)

        loss = criterion(output.float(), ratings)
        loss.backward()
        optimizer.step()

def validate(model, val_loader, criterion, epoch):
    model.eval()
    outputlist = []
    val_loss = 0
    TP,FP,TN,FN = 0,0,0,0
    with torch.no_grad():
        for batch_idx in enumerate(val_loader):
            users, items, ratings = data
            users = Variable(users)
            items = Variable(items)
            ratings = Variable(ratings).float()
            
            output = model(users, items)
            output_flat = [0 if o < 0.5 else 1 for o in output.data]
            p = output_flat.count(1)
            TP += p
            FN += len(output_flat) - p
            outputlist += [output]
            val_loss += criterion(output.float(), ratings).item() # sum up batch loss

    #print(TP, FN)
    acc = (TP + TN)/(TP + TN + FP + FN)
    #sens = TP/(TP + FN)
    val_loss /= len(val_loader)
    
    
    print(f'Epoch {epoch}: Validation average loss: {val_loss:.2f} | Accuracy: {acc:.2f}')
    return acc, val_loss

def training(epochs, lr=0.001, wd = 1e-6): 
    model = MatrixFactorization(num_user, num_movie, 100)
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay = wd)
    
    accs = []
    losses = []
    
    for epoch in range(1, epochs + 1):
        train(model, train_loader, optimizer, criterion, epoch) # HER 
        acc, val_loss = validate(model, val_loader, criterion, epoch)# HER
        accs += [acc]
        losses += [val_loss]
    
    plt.plot(range(1,epochs+1), accs)
    plt.show()
    plt.plot(range(1,epochs+1), losses)
    plt.show()

In [32]:
training(epochs=5, lr=0.0001, wd=1e-6)

TypeError: 'module' object is not iterable