<a href="https://colab.research.google.com/github/bodamohannaik/Udemy_DL/blob/master/autoencoders/movie_recommendation_using_autoencoders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import torch

# Download Dataset

In [2]:
!curl -O http://files.grouplens.org/datasets/movielens/ml-100k.zip
!ls

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4808k  100 4808k    0     0   9.7M      0 --:--:-- --:--:-- --:--:--  9.7M
ml-100k.zip  sample_data


In [3]:
!curl -O http://files.grouplens.org/datasets/movielens/ml-1m.zip
!ls

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 5778k  100 5778k    0     0  15.6M      0 --:--:-- --:--:-- --:--:-- 15.6M
ml-100k.zip  ml-1m.zip	sample_data


In [4]:
!unzip -q ml-100k.zip
!ls

ml-100k  ml-100k.zip  ml-1m.zip  sample_data


In [5]:
!unzip -q ml-1m.zip
!ls

ml-100k  ml-100k.zip  ml-1m  ml-1m.zip	sample_data


# Load Dataset

In [6]:
# all movies from 1m dataset
movies = pd.read_csv('ml-1m/movies.dat', sep="::", engine='python', encoding ='latin-1', header = None)
movies

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
3878,3948,Meet the Parents (2000),Comedy
3879,3949,Requiem for a Dream (2000),Drama
3880,3950,Tigerland (2000),Drama
3881,3951,Two Family House (2000),Drama


In [7]:
# all users from 1m dataset
users = pd.read_csv('ml-1m/users.dat', sep="::", engine='python', encoding ='latin-1', header = None)
users

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
...,...,...,...,...,...
6035,6036,F,25,15,32603
6036,6037,F,45,1,76006
6037,6038,F,56,1,14706
6038,6039,F,45,0,01060


In [8]:
# all ratings from 1m dataset
ratings = pd.read_csv('ml-1m/ratings.dat', sep="::", engine='python', encoding ='latin-1', header = None)
ratings

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [9]:
# training set 
training_set = pd.read_csv('ml-100k/u1.base', header = None, sep = '\t', engine='python', encoding = 'latin-1')
training_set

Unnamed: 0,0,1,2,3
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712
...,...,...,...,...
79995,943,1067,2,875501756
79996,943,1074,4,888640250
79997,943,1188,3,888640250
79998,943,1228,3,888640275


In [10]:
# test set 
test_set = pd.read_csv('ml-100k/u1.test', header = None, sep = '\t', engine='python', encoding = 'latin-1')
test_set

Unnamed: 0,0,1,2,3
0,1,6,5,887431973
1,1,10,3,875693118
2,1,12,5,878542960
3,1,14,5,874965706
4,1,17,3,875073198
...,...,...,...,...
19995,458,648,4,886395899
19996,458,1101,4,886397931
19997,459,934,3,879563639
19998,460,10,3,882912371


# Prepare Data

In [11]:
# unique users in training and test
user_ids = list(set(training_set[0].unique().tolist()+test_set[0].unique().tolist()))
user_ids.sort()
# unique movies in training and test
movie_ids = list(set(training_set[1].unique().tolist()+test_set[1].unique().tolist()))
movie_ids.sort()
print(f"User ids : {len(user_ids)}, Movie ids:{len(movie_ids)}")

User ids : 943, Movie ids:1682


In [12]:
def convert(data):
  df = pd.DataFrame(index = user_ids, columns = movie_ids)
  for ind, record in data.iterrows():
    df.loc[record[0], record[1]] = record[2]
  df.fillna(value = 0, inplace =True)
  return df

In [13]:
training_set_converted = convert(data = training_set)
training_set_converted 


Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,5,3,4,3,3,0,4,1,5,0,...,0,0,0,0,0,0,0,0,0,0
2,4,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0,0,0,0,0,0,0,0,5,0,...,0,0,0,0,0,0,0,0,0,0
940,0,0,0,2,0,0,4,5,3,0,...,0,0,0,0,0,0,0,0,0,0
941,5,0,0,0,0,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
942,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
test_set_converted = convert(data = test_set)
test_set_converted 

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
1,0,0,0,0,0,5,0,0,0,3,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
940,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
941,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
942,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
X_train = torch.FloatTensor(training_set_converted.values)
X_train

tensor([[5., 3., 4.,  ..., 0., 0., 0.],
        [4., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [5., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 5., 0.,  ..., 0., 0., 0.]])

In [16]:
X_test = torch.FloatTensor(test_set_converted.values)
X_test

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [17]:
X_train.shape, X_test.shape

(torch.Size([943, 1682]), torch.Size([943, 1682]))

# AutoEncoder

In [23]:
class SAE(torch.nn.Module):
  def __init__(self):
    super(SAE, self).__init__()
    self.fc1 = torch.nn.Linear(in_features=len(movie_ids), out_features =20)
    self.fc2 = torch.nn.Linear(in_features=20, out_features =10)
    self.fc3 = torch.nn.Linear(in_features=10, out_features =20)
    self.fc4 = torch.nn.Linear(in_features=20, out_features=len(movie_ids))
    self.activation = torch.nn.Sigmoid()
  
  def forward(self, x):
    x = self.activation(self.fc1(x))
    x = self.activation(self.fc2(x))
    x = self.activation(self.fc3(x))
    x = self.fc4(x)
    return x

In [24]:
sae = SAE()
criterion = torch.nn.MSELoss()
criterion

MSELoss()

In [26]:
list(sae.parameters())

[Parameter containing:
 tensor([[-0.0229, -0.0002,  0.0109,  ..., -0.0203, -0.0084,  0.0120],
         [-0.0049,  0.0162,  0.0144,  ...,  0.0008, -0.0087, -0.0163],
         [-0.0113, -0.0052, -0.0226,  ..., -0.0015, -0.0049,  0.0122],
         ...,
         [ 0.0135, -0.0065, -0.0153,  ..., -0.0054, -0.0228, -0.0116],
         [-0.0193, -0.0008,  0.0006,  ..., -0.0208,  0.0166,  0.0102],
         [-0.0071, -0.0209,  0.0187,  ...,  0.0017,  0.0205,  0.0190]],
        requires_grad=True), Parameter containing:
 tensor([-0.0037, -0.0156,  0.0018, -0.0045, -0.0082,  0.0171,  0.0197, -0.0180,
         -0.0049,  0.0064,  0.0221, -0.0241, -0.0014, -0.0154,  0.0018, -0.0172,
         -0.0136, -0.0076, -0.0170,  0.0182], requires_grad=True), Parameter containing:
 tensor([[ 6.4335e-02, -1.7371e-01,  4.4766e-02,  1.4259e-01, -2.0679e-02,
           1.8920e-01,  7.2865e-02, -6.5304e-02, -2.6777e-02,  1.4591e-01,
           1.7153e-01, -1.9161e-02,  7.7246e-02, -1.0270e-01, -2.1337e-01,
         

In [27]:
optimizer = torch.optim.RMSprop(params = sae.parameters(), lr = 0.01, weight_decay=0.5)

# Training

In [33]:
torch.autograd.Variable(X_train[0]).unsqueeze(0)

tensor([[5., 3., 4.,  ..., 0., 0., 0.]])

In [39]:
nb_epochs = 200
for epoch in range(nb_epochs):
  train_loss = 0
  s = 0
  for id_user in range(len(user_ids)):
    input = torch.autograd.Variable(X_train[id_user]).unsqueeze(0)
    target = input.clone()
    # atleast one movie is rated
    if torch.sum(target.data>0) > 0:
      output = sae(input)
      target.require_grad = False
      # reset not rated movies
      output[target == 0] = 0
      # loss
      loss = criterion(output, target)
      loss.backward()
      optimizer.step()
      # train loss
      factor = len(movie_ids)/torch.sum(target.data>0)
      train_loss += np.sqrt(loss.data*factor)
      s += 1
  print(f"epoch: {epoch}, loss: {train_loss/s}")



epoch: 0, loss: 1.7659744024276733
epoch: 1, loss: 1.0966130495071411
epoch: 2, loss: 1.0534288883209229
epoch: 3, loss: 1.0384124517440796
epoch: 4, loss: 1.0308024883270264
epoch: 5, loss: 1.0268207788467407
epoch: 6, loss: 1.0237958431243896
epoch: 7, loss: 1.0221843719482422
epoch: 8, loss: 1.0207428932189941
epoch: 9, loss: 1.0196189880371094
epoch: 10, loss: 1.0188524723052979
epoch: 11, loss: 1.018357515335083
epoch: 12, loss: 1.017783522605896
epoch: 13, loss: 1.0173391103744507
epoch: 14, loss: 1.017177939414978
epoch: 15, loss: 1.0167347192764282
epoch: 16, loss: 1.0168002843856812
epoch: 17, loss: 1.01651132106781
epoch: 18, loss: 1.0164403915405273
epoch: 19, loss: 1.0160936117172241
epoch: 20, loss: 1.0161186456680298
epoch: 21, loss: 1.0160155296325684
epoch: 22, loss: 1.015859603881836
epoch: 23, loss: 1.015973448753357
epoch: 24, loss: 1.0156749486923218
epoch: 25, loss: 1.0156995058059692
epoch: 26, loss: 1.0152504444122314
epoch: 27, loss: 1.0152149200439453
epoch: 28

# Validation on Test Set

In [41]:

test_loss = 0
s = 0
for id_user in range(len(user_ids)):
  input = torch.autograd.Variable(X_train[id_user]).unsqueeze(0)
  target = torch.autograd.Variable(X_test[id_user]).unsqueeze(0)
  # atleast one movie is rated
  if torch.sum(target.data>0) > 0:
    output = sae(input)
    target.require_grad = False
    # reset input and not rated movies
    output[target == 0] = 0
    # loss
    loss = criterion(output, target)
    # train loss
    factor = len(movie_ids)/torch.sum(target.data>0)
    test_loss += np.sqrt(loss.data*factor)
    s += 1
print(f"test loss: {test_loss/s}")



test loss: 0.9489670395851135
