In [112]:
import pandas as pd
import numpy as np
import math
import sys
from scipy.sparse.linalg import svds

In [85]:
# Only the (user, movie, rating) triple is read; any other CSV column is skipped.
cols = ['userId', 'movieId', 'rating']

# NOTE(review): absolute, machine-specific path - the notebook cannot run on
# another machine without editing this line.
ratings = pd.read_csv(
    r'C:\Users\Korisnik\Desktop\Work\DataSets\ratings_small.csv',
    usecols=cols,
)
print(ratings)

        userId  movieId  rating
0            1       31     2.5
1            1     1029     3.0
2            1     1061     3.0
3            1     1129     2.0
4            1     1172     4.0
...        ...      ...     ...
99999      671     6268     2.5
100000     671     6269     4.0
100001     671     6365     4.0
100002     671     6385     2.5
100003     671     6565     3.5

[100004 rows x 3 columns]


In [86]:
# NumPy form of the ratings frame: one row per rating, columns in the order
# (userId, movieId, rating). The integer id columns share one array with the
# float rating column, so every entry comes out as a float.
ratings_set = ratings.to_numpy()
ratings_set

array([[1.000e+00, 3.100e+01, 2.500e+00],
       [1.000e+00, 1.029e+03, 3.000e+00],
       [1.000e+00, 1.061e+03, 3.000e+00],
       ...,
       [6.710e+02, 6.365e+03, 4.000e+00],
       [6.710e+02, 6.385e+03, 2.500e+00],
       [6.710e+02, 6.565e+03, 3.500e+00]])

In [87]:
# Users x movies matrix of raw ratings; (user, movie) pairs without a rating are NaN.
ratings_table = ratings.pivot_table(index='userId', columns='movieId', values='rating')

# Per-user mean rating, taken over the movies that user actually rated
# (DataFrame.mean skips NaN by default).
averages = ratings_table.mean(axis=1)

# Mean-centre every row. `sub(..., axis=0)` aligns `averages` on the userId index
# and subtracts it in one vectorised pass, replacing the previous row-by-row
# `apply` + lambda which produced the same numbers far more slowly.
# Unrated cells are then filled with 0, i.e. they become indistinguishable from
# "rated exactly at the user's average".
normalized_ratings_table = ratings_table.sub(averages, axis=0).fillna(0)

In [88]:
# Text dump of the mean-centred user x movie table (pandas truncates the printed frame).
print(normalized_ratings_table)

movieId    1       2       3       4       5         6       7       8       \
userId                                                                        
1        0.000000     0.0    0.00     0.0     0.0  0.000000     0.0     0.0   
2        0.000000     0.0    0.00     0.0     0.0  0.000000     0.0     0.0   
3        0.000000     0.0    0.00     0.0     0.0  0.000000     0.0     0.0   
4        0.000000     0.0    0.00     0.0     0.0  0.000000     0.0     0.0   
5        0.000000     0.0    0.09     0.0     0.0  0.000000     0.0     0.0   
...           ...     ...     ...     ...     ...       ...     ...     ...   
667      0.000000     0.0    0.00     0.0     0.0  0.352941     0.0     0.0   
668      0.000000     0.0    0.00     0.0     0.0  0.000000     0.0     0.0   
669      0.000000     0.0    0.00     0.0     0.0  0.000000     0.0     0.0   
670      0.193548     0.0    0.00     0.0     0.0  0.000000     0.0     0.0   
671      1.082609     0.0    0.00     0.0     0.0  0

In [89]:
# NumPy array form of the mean-centred table (rows: users, columns: movies).
# SGD() reads only R.shape from it, to size the factor matrices.
R = normalized_ratings_table.values

In [90]:
# Map every raw movieId to a dense, 1-based column position.
# The keys come from `ratings_table` (whose columns are never relabelled) rather
# than from `normalized_ratings_table`, so re-running this cell rebuilds the same
# mapping instead of an identity map over the already-renamed columns.
# The number of movies is derived from the data instead of being hard-coded.
keys = ratings_table.columns
values = np.arange(len(keys)) + 1

new_id = dict(zip(keys, values))

# Relabel the columns of the mean-centred table with the dense ids.
normalized_ratings_table.columns = np.arange(len(keys)) + 1

In [91]:
# Same table again, now with the columns relabelled as consecutive integers starting at 1.
print(normalized_ratings_table)

            1     2     3     4     5         6     7     8     9     \
userId                                                                 
1       0.000000   0.0  0.00   0.0   0.0  0.000000   0.0   0.0   0.0   
2       0.000000   0.0  0.00   0.0   0.0  0.000000   0.0   0.0   0.0   
3       0.000000   0.0  0.00   0.0   0.0  0.000000   0.0   0.0   0.0   
4       0.000000   0.0  0.00   0.0   0.0  0.000000   0.0   0.0   0.0   
5       0.000000   0.0  0.09   0.0   0.0  0.000000   0.0   0.0   0.0   
...          ...   ...   ...   ...   ...       ...   ...   ...   ...   
667     0.000000   0.0  0.00   0.0   0.0  0.352941   0.0   0.0   0.0   
668     0.000000   0.0  0.00   0.0   0.0  0.000000   0.0   0.0   0.0   
669     0.000000   0.0  0.00   0.0   0.0  0.000000   0.0   0.0   0.0   
670     0.193548   0.0  0.00   0.0   0.0  0.000000   0.0   0.0   0.0   
671     1.082609   0.0  0.00   0.0   0.0  0.000000   0.0   0.0   0.0   

            10    ...  9057  9058  9059  9060  9061  9062  9063

In [92]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the (userId, movieId, rating) rows for evaluation.
# The fixed random_state makes the split reproducible across runs.
train, test = train_test_split(
    ratings_set,
    test_size=0.2,
    random_state=42,
)

In [76]:
def SGD(alpha=0.01, epochs=10, l_factors=50, reg=0.0,
        train_data=None, n_users=None, n_items=None, id_map=None, seed=None):
    """Learn user and item latent factors from explicit ratings with plain SGD.

    Each observed rating r(u, i) is approximated by dot(p[u], q[i]); for every
    training row the squared error is reduced by one gradient step on both
    vectors.

    Parameters
    ----------
    alpha : float
        Learning rate.
    epochs : int
        Number of full passes over the training rows.
    l_factors : int
        Number of latent factors per user / item.
    reg : float
        L2 regularisation strength; 0.0 (default) disables it.
    train_data : iterable of (user_id, item_id, rating), optional
        Training rows with 1-based user ids. Defaults to the notebook global `train`.
    n_users, n_items : int, optional
        Number of rows of the user / item factor matrices. Default to the
        shape of the notebook global `R`.
    id_map : mapping, optional
        Maps a raw item id to its dense 1-based index. Defaults to the
        notebook global `new_id`.
    seed : int, optional
        Seed for the factor initialisation. When None the global NumPy random
        state is used, exactly as before.

    Returns
    -------
    (p, q) : tuple of ndarray
        `p` has shape (n_users, l_factors), `q` has shape (n_items, l_factors).

    Notes
    -----
    With the default arguments the targets are whatever ratings `train`
    holds; the mean-centred matrix `R` is consulted only for its shape.
    """
    # Fall back to the notebook-level globals so a bare `SGD()` call keeps working.
    if train_data is None:
        train_data = train
    if id_map is None:
        id_map = new_id
    if n_users is None:
        n_users = R.shape[0]
    if n_items is None:
        n_items = R.shape[1]

    # `np.random` and `RandomState` expose the same `.normal` API.
    rng = np.random if seed is None else np.random.RandomState(seed)
    p = rng.normal(0, .1, (n_users, l_factors))
    q = rng.normal(0, .1, (n_items, l_factors))

    for _ in range(epochs):
        for user, item, rating in train_data:
            # Ids arrive as floats (they share an array with the ratings);
            # convert once to 0-based integer row indices.
            u = int(user) - 1
            i = int(id_map[item]) - 1

            # Snapshot the user vector: both updates must be computed from the
            # values the error was measured with. Previously q[i] was updated
            # with the already-modified p[u].
            p_u = p[u].copy()
            error = rating - np.dot(p_u, q[i])
            p[u] += alpha * (error * q[i] - reg * p_u)
            q[i] += alpha * (error * p_u - reg * q[i])
    return p, q
                

In [77]:
# Train the factor matrices: P holds one row per user, Q one row per movie column.
P, Q = SGD()

In [79]:
def pred(u, i):
    """Return the predicted rating of user `u` for item `i`.

    Both arguments are 1-based: `u` selects row `u - 1` of the global `P`,
    and `i` selects row `i - 1` of the global `Q`. The prediction is the dot
    product of those two factor vectors.
    """
    user_factors = P[u - 1]
    item_factors = Q[i - 1]
    return np.dot(user_factors, item_factors)

In [111]:
# Spot check: predicted rating of user 2 for the movie stored in row 0 of Q.
# The second argument is the dense 1-based index (a value of new_id), not a raw movieId.
pred(2,1)

1.8108495739044483

In [113]:
# One predicted rating per held-out row, in the same order as `test`.
# Column 0 of a row is the userId; column 1 is the raw movieId, which is
# translated to its dense index through `new_id` before calling pred().
prediction = [
    pred(row[0].astype(np.int32), new_id[row[1]].astype(np.int32))
    for row in test
]

In [108]:
from sklearn.metrics import mean_squared_error

# Compare the held-out ratings (third column of `test`) with the predictions.
mse = mean_squared_error(test[:, 2], prediction)
# RMSE is in the same units as the ratings themselves.
rmse = np.sqrt(mse)

print(mse)
print(rmse)

0.8467282508146903
0.9201783798887531
