In [1]:
# Import required packages and set some global parametars

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats.stats import pearsonr
from scipy.spatial import distance

BETA = 10 # Discount Factor in Discounted Pearoson Corellation Coefficent

In [2]:
# Import MovieLens dataset into pandas DataFrame

data = pd.read_csv(r'C:/Users/Korisnik/ratings.csv', usecols = ['userId', 'movieId', 'rating'])
data

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [3]:
# Split data into Training Data and Test Data

train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [4]:
print('Training Data has {} rows, and Test Data has {} rows'.format(train_data.shape[0], test_data.shape[0]))

Training Data has 80668 rows, and Test Data has 20168 rows


In [5]:
# Make ratings tables from Training Data 

ratings = train_data.pivot_table(index = 'userId', columns = 'movieId', values = 'rating')
ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [6]:
print('Ratings matrix has {} rows and {} coulmns'.format(ratings.shape[0], ratings.shape[1]))

Ratings matrix has 610 rows and 8983 coulmns


In [7]:
# Now it would be useful to normalize the ratings around the average of each user (row)
# First we need to make averages array

average_ratings = ratings.mean(axis = 1)
average_ratings

userId
1      4.331606
2      3.920000
3      2.580645
4      3.464706
5      3.657895
         ...   
606    3.657002
607    3.744828
608    3.117820
609    3.290323
610    3.683429
Length: 610, dtype: float64

In [8]:
# Normalizing the ratings matrix

normalized_ratings = ratings.apply(lambda x: x - average_ratings[x.name], axis = 1)
normalized_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.331606,,-0.331606,,,-0.331606,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157002,,,,,,-1.157002,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,-0.617820,-1.11782,-1.117820,,,,,,,0.882180,...,,,,,,,,,,
609,-0.290323,,,,,,,,,0.709677,...,,,,,,,,,,


In [9]:
# Make a numpy ratings matrix for 'easier' use

R = np.zeros((610,193609))

tmp = normalized_ratings.unstack().reset_index(name='rating')

tmp = tmp[tmp['rating'].notna()]

for index,row in tmp.iterrows():
    R[row['userId'].astype(np.int)-1][row['movieId'].astype(np.int)-1] = row['rating'] 
R 

array([[-0.33160622,  0.        , -0.33160622, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.61782032, -1.11782032, -1.11782032, ...,  0.        ,
         0.        ,  0.        ],
       [-0.29032258,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.31657088,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [10]:
def intersection(u,v):
    _u, _v = [], []
    for i in range(len(u)):
        if u[i] is not 0 and v[i] is not [0]:
            _u.append(u[i])
            _v.append(v[i])
    return _u, _v

In [17]:
def find_similar_users(u):
    sims = []
    for v in range(610):
        if v is not u:
            a, b= intersection(R[u], R[v])
            sims.append((v,pearsonr(a, b)[0]))
    sims = sorted(sims, key=lambda x: x[1])
    sims.reverse()
    return sims[:5]

In [18]:
def find_similar_users_who_have_rated_specific_item(u, j):
    sims = []
    for v in range(610):
        if v is not u and R[v][j] != 0:
            a, b = intersection(R[u], R[v])
            sims.append((v,pearsonr(a, b)[0]))
    sims = sorted(sims, key=lambda x: x[1])
    sims.reverse()
    return sims[:5]

In [19]:
find_similar_users_who_have_rated_specific_item(0, 1)

[(607, 0.10013897792785642),
 (476, 0.0933426769597332),
 (134, 0.08845671917332959),
 (413, 0.07861598882260362),
 (589, 0.0765963137923511)]

In [20]:
averages = []
for row in average_ratings:
    averages.append(row)

In [21]:
def predict_rating(u, j):
    r_uj = averages[u]
    num, den = 0, 0
    for v, sim in find_similar_users_who_have_rated_specific_item(u, j):
        num += R[v][j] * sim
        den += abs(sim)
    try:
        return r_uj + (num / den)
    except ZeroDivisionError:
        return -10

In [22]:
predict_rating(0, 0)

4.2336385433501516

In [None]:
for index, row in test_data.iterrows():
    if row['userId'] <= 75:
        print(predict_rating(row['userId'].astype(np.int) - 1, row['movieId'].astype(np.int) - 1), row['rating'])

4.000764750477544 4.0
3.648421053013417 4.0
3.0970932202934236 4.0
4.143272862291049 3.5
3.1104468271382926 2.5
3.950223538751615 5.0
2.6578031092544077 2.0
3.051953645090971 3.0
3.3926903509499216 3.5
4.047993489803462 5.0
2.06699496746785 4.0
3.36793903154595 3.5
4.229843062479959 4.0
3.5314598600705467 3.5
4.010333801788997 2.0
4.34674507875032 4.5
2.8790717003198845 4.0
2.3133264147617316 3.0
4.849178444899719 4.5
3.2406399273448967 3.0
2.3351973893707596 0.5
4.833927891551752 4.0
4.478828828828829 1.5
5.101646504434103 5.0
-10 3.0
