In [1]:
import numpy as np 
import pandas as pd 
import torch 
import torch.nn as nn
import torch.nn.parallel 
import torch.optim as optim 
import torch.utils.data
from torch.autograd import Variable

In [2]:
training_set = pd.read_csv('training_set.csv')

In [3]:
training_set.head()

Unnamed: 0,User,Movie,Rating,Timestamp
0,1,661,3,978302109
1,1,914,3,978301968
2,1,3408,4,978300275
3,1,2355,5,978824291
4,1,1287,5,978302039


In [4]:
test_set = pd.read_csv('test_set.csv')

In [5]:
test_set.head()

Unnamed: 0,User,Movie,Rating,Timestamp
0,1,1193,5,978300760
1,1,1197,3,978302268
2,1,2804,5,978300719
3,1,595,5,978824268
4,1,938,4,978301752


In [6]:
import seaborn as sns

In [7]:
movies = pd.read_csv('ml-1m/ml-1m/movies.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

In [8]:
users = pd.read_csv('ml-1m/ml-1m/users.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

In [9]:
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [10]:
ratings = pd.read_csv('ml-1m/ml-1m/ratings.dat', sep = '::', header = None, engine = 'python', encoding = 'latin-1')

In [11]:
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [12]:
movies[movies[0]==357]

Unnamed: 0,0,1,2
353,357,Four Weddings and a Funeral (1994),Comedy|Romance


In [13]:
training_set = np.array(training_set, dtype = 'int')

In [14]:
training_set

array([[        1,       661,         3, 978302109],
       [        1,       914,         3, 978301968],
       [        1,      3408,         4, 978300275],
       ...,
       [     6040,       562,         5, 956704746],
       [     6040,      1096,         4, 956715648],
       [     6040,      1097,         4, 956715569]])

In [15]:
test_set = np.array(test_set, dtype = 'int')

In [16]:
test_set

array([[        1,      1193,         5, 978300760],
       [        1,      1197,         3, 978302268],
       [        1,      2804,         5, 978300719],
       ...,
       [     6040,       527,         5, 956704219],
       [     6040,      2003,         1, 956716294],
       [     6040,       535,         4, 964828734]])

In [17]:
nb_users = int(max(max(training_set[:,0]),max(test_set[:,0])))

In [18]:
nb_users

6040

In [19]:
nb_movies = int(max(max(training_set[:,1]),max(test_set[:,1])))

In [20]:
nb_movies

3952

In [21]:
id_movies = training_set[:,1][training_set[:,0]==1]
id_movies

array([ 661,  914, 3408, 2355, 1287,  594,  919, 2918, 1035, 2791, 2687,
       2018, 3105,  720, 1270,  527, 2340,   48, 1097, 1721, 1545,  745,
       2294, 3186, 1566,  588, 1907, 1836, 1022,  150,    1, 1961, 2692,
       1028, 1029, 1207, 2028, 3114,  608, 1246])

In [22]:
def convert(data):
    new_data = []
    for id_users in range(1,nb_users+1):
        id_movies = data[:,1][data[:,0]==id_users]
        id_ratings = data[:,2][data[:,0]==id_users]
        ratings = np.zeros(nb_movies)
        ratings[id_movies-1] = id_ratings
        new_data.append(list(ratings))
    return new_data


In [23]:
training_set = convert(training_set)

In [24]:
test_set = convert(test_set)

In [25]:
len(training_set[0])

3952

In [26]:
len(training_set)

6040

In [27]:
 training_set = torch.FloatTensor(training_set)

In [28]:
 test_set = torch.FloatTensor(test_set)

In [29]:
training_set

tensor([[5., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [3., 0., 0.,  ..., 0., 0., 0.]])

In [30]:
test_set

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [31]:
class SAE(nn.Module):
    def __init__(self, ):
        super(SAE,self).__init__()
        self.fc1 = nn.Linear(nb_movies,20)
        self.fc2 = nn.Linear(20,10)
        self.fc3 = nn.Linear(10,20)
        self.fc4 = nn.Linear(20,nb_movies)
        self.activation = nn.Sigmoid()
    def forward(self,x):
        x=self.activation(self.fc1(x))
        x=self.activation(self.fc2(x))
        x=self.activation(self.fc3(x))
        x=self.fc4(x)
        return x
    
sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr = 0.01, weight_decay = 0.5)
    
        

In [32]:
nb_epochs = 200

In [33]:
for epoch in range(1,nb_epochs+1):
    train_loss = 0
    s = 0.
    for id_user in range(nb_users):
        input = Variable(training_set[id_user]).unsqueeze(0)
        target = input.clone()
        if torch.sum(target.data > 0) > 0:
            output = sae(input)
            target.require_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = nb_movies/float(torch.sum(target.data > 0 )+ 1e-10)
            loss.backward() #decided the direction of update
            train_loss += np.sqrt(loss.item()*mean_corrector)
            s += 1. 
            optimizer.step()  #decides the intensity of update
    print('epoch : '+str(epoch)+'loss: '+str(train_loss/s))
            
    
    

epoch : 1loss: 1.3478086477607272
epoch : 2loss: 1.0099722382425138
epoch : 3loss: 0.989965574088563
epoch : 4loss: 0.9832631817745073
epoch : 5loss: 0.9801505619759615
epoch : 6loss: 0.9783678935924842
epoch : 7loss: 0.9773657703478474
epoch : 8loss: 0.9765357378747919
epoch : 9loss: 0.9759378646957357
epoch : 10loss: 0.9754782408866515
epoch : 11loss: 0.9753469568286042
epoch : 12loss: 0.9748521926096132
epoch : 13loss: 0.9747134685509039
epoch : 14loss: 0.9746140776221671
epoch : 15loss: 0.9744869989291467
epoch : 16loss: 0.9742890532216576
epoch : 17loss: 0.9742294160331261
epoch : 18loss: 0.9740309084293539
epoch : 19loss: 0.974026320116927
epoch : 20loss: 0.9738597710773592
epoch : 21loss: 0.9737657226018885
epoch : 22loss: 0.9737042983773742
epoch : 23loss: 0.9735370076313347
epoch : 24loss: 0.9732751950371644
epoch : 25loss: 0.9727709354932378
epoch : 26loss: 0.9721188872530531
epoch : 27loss: 0.9713210615727462
epoch : 28loss: 0.9704315196316684
epoch : 29loss: 0.9695356633606

In [34]:
training_set[0].shape

torch.Size([3952])

In [35]:
(Variable(training_set[0]).unsqueeze(0)).shape

torch.Size([1, 3952])

In [36]:
torch.sum(training_set[0].data > 0 )

tensor(40)

In [38]:
training_set.shape

torch.Size([6040, 3952])

In [39]:
test_set.shape

torch.Size([6040, 3952])

In [51]:
test_loss = 0
s = 0.
for id_user in range(nb_users):
    input = Variable(training_set[id_user]).unsqueeze(0)
    target = Variable(test_set[id_user])
    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        target.require_grad = False
        output[target.view(1, -1) == 0] = 0
        loss = criterion(output, target)
        mean_corrector = nb_movies/float(torch.sum(target.data > 0 )+ 1e-10)
        test_loss += np.sqrt(loss.item()*mean_corrector)
        s += 1. 
          
print('test loss: '+str(test_loss/s))
            

  return F.mse_loss(input, target, reduction=self.reduction)


test loss: 0.912290756977653


In [43]:
t = Variable(test_set[id_user])

In [47]:
len(t)

3952

In [54]:
test_set

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])