In [2]:
# Import required packages 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats.stats import pearsonr
from scipy.spatial import distance
import math

In [3]:
# Load the ratings file, keeping only the user, movie and rating columns.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook anywhere else.
data = pd.read_csv(
    r'C:/Users/Korisnik/ratings.csv',
    usecols=['userId', 'movieId', 'rating'],
)
data

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [4]:
# Hold out 20% of the rating rows as Test Data; the remaining 80% is Training Data.
# The fixed random_state makes the split reproducible between runs.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Report how many rating rows ended up on each side of the split.
print(
    'Training Data has {} rows, and Test Data has {} rows'.format(
        len(train_data), len(test_data)
    )
)

Training Data has 80668 rows, and Test Data has 20168 rows


In [7]:
# Make ratings tables from Training Data and non-normalized numpy ratings matrix

# Dense items x users array of raw training ratings, addressed as
# r[movieId - 1][userId - 1]; cells without a training rating stay 0.
# The shape is hard-coded: 193609 is the largest movieId and 610 the largest
# userId visible in the outputs of this notebook.
# NOTE(review): at float64 this array takes roughly 0.9 GB of memory.
r = np.zeros((193609,610))

# One vectorized assignment instead of a Python-level iterrows() loop.
# (The loop also relied on `np.int`, an alias removed in NumPy 1.24.)
# If a (movie, user) pair occurred more than once, the last row would win,
# exactly as it did in the loop.
movie_idx = train_data['movieId'].to_numpy(dtype=np.int64) - 1
user_idx = train_data['userId'].to_numpy(dtype=np.int64) - 1
r[movie_idx, user_idx] = train_data['rating'].to_numpy()

# User x movie table of the same ratings; unrated cells are NaN here (not 0).
ratings = train_data.pivot_table(index = 'userId', columns = 'movieId', values = 'rating')
ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,


In [8]:
# Report the dimensions of the user x movie pivot table.
print('Ratings matrix has {} rows and {} coulmns'.format(*ratings.shape))

Ratings matrix has 610 rows and 8983 coulmns


In [9]:
# Mean rating of every movie (one value per column of `ratings`).
# NaN cells, i.e. users who did not rate the movie, are skipped by mean().
coulmn_wise_average = ratings.mean(axis='index')
coulmn_wise_average

movieId
1         3.893678
2         3.373626
3         3.162500
4         2.250000
5         2.955882
            ...   
193581    4.000000
193583    3.500000
193585    3.500000
193587    3.500000
193609    4.000000
Length: 8983, dtype: float64

In [10]:
# Mean-centre every movie column: subtract the movie's average rating from each
# of its ratings. Subtracting a Series from a DataFrame aligns the Series index
# (movieId) with the DataFrame columns, so each column gets its own mean removed.
# Unrated cells stay NaN.
normalized_ratings = ratings - coulmn_wise_average
normalized_ratings

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.106322,,0.8375,,,0.031646,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.393678,,,,,,-0.627907,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,-1.393678,-1.373626,-1.1625,,,,,,,0.571429,...,,,,,,,,,,
609,-0.893678,,,,,,,,,0.571429,...,,,,,,,,,,


In [11]:
# Make normalized numpy ratings matrix

# Long format of the mean-centred table: one row per (movieId, userId) cell.
tmp = normalized_ratings.unstack().reset_index(name='rating')

# Keep only the cells that actually hold a rating.
tmp = tmp[tmp['rating'].notna()]

# Dense items x users array addressed as R[movieId - 1][userId - 1].
# It is built directly in its final orientation (no transpose afterwards) and
# filled with one vectorized assignment instead of an iterrows() loop, which
# also relied on `np.int`, an alias removed in NumPy 1.24.
# NOTE: a rating that equals its movie's mean is stored as 0.0, the same value
# that marks "no rating" in this array.
R = np.zeros((193609,610))
R[tmp['movieId'].to_numpy(dtype=np.int64) - 1,
  tmp['userId'].to_numpy(dtype=np.int64) - 1] = tmp['rating'].to_numpy()
R

array([[ 0.10632184,  0.        ,  0.        , ..., -1.39367816,
        -0.89367816,  1.10632184],
       [ 0.        ,  0.        ,  0.        , ..., -1.37362637,
         0.        ,  0.        ],
       [ 0.8375    ,  0.        ,  0.        , ..., -1.1625    ,
         0.        ,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [12]:
def intersection(u, v, limit=10000):
    """Return the entries of ``u`` and ``v`` at positions where both are non-zero.

    Zero is treated as "no value", so only positions that carry a value in
    both sequences are kept. The two returned lists are aligned element by
    element and have the same length.

    Parameters
    ----------
    u, v : sliceable sequences of numbers (lists, 1-D numpy arrays, ...)
        If their lengths differ, only the common prefix is compared.
    limit : int or None, default 10000
        Only the first ``limit`` positions are inspected; ``None`` inspects
        every position.

    Returns
    -------
    tuple of (list, list)
        The kept values of ``u`` and the kept values of ``v``.
    """
    _u, _v = [], []
    for a, b in zip(u[:limit], v[:limit]):
        # Compare against the scalar 0. Comparing against the list [0] is
        # always True for plain Python numbers, so zeros of `v` were kept.
        if a != 0 and b != 0:
            _u.append(a)
            _v.append(b)
    return _u, _v

In [13]:
# Sanity check: pair up the entries of the first two rows of R as selected by intersection().
print(intersection(R[0],R[1]))

([-0.39367816091954033, 0.10632183908045967, -0.39367816091954033, -0.8936781609195403, -1.3936781609195403, 0.10632183908045967, 0.10632183908045967, -1.8936781609195403, 0.10632183908045967, 0.6063218390804597, 1.1063218390804597, 0.10632183908045967, 0.10632183908045967, 0.10632183908045967, -0.39367816091954033, -0.39367816091954033, 0.10632183908045967, 0.10632183908045967, 0.6063218390804597, -1.8936781609195403, 1.1063218390804597, 0.10632183908045967, -0.39367816091954033, -0.39367816091954033, 0.10632183908045967, 1.1063218390804597, 1.1063218390804597, 1.1063218390804597, -0.8936781609195403, 1.1063218390804597, -0.39367816091954033, 0.10632183908045967, -1.3936781609195403, 0.10632183908045967, 0.10632183908045967, 0.10632183908045967, 0.10632183908045967, 0.10632183908045967, 0.6063218390804597, 0.10632183908045967, 0.10632183908045967, 1.1063218390804597, 0.10632183908045967, 1.1063218390804597, 0.10632183908045967, -0.8936781609195403, 0.10632183908045967, -1.393678160919

In [14]:
def find_similar_items(i):
    """Return up to five ``(item_index, similarity)`` pairs most similar to item ``i``.

    Similarity is the cosine between row ``i`` and row ``k`` of the global
    matrix ``R``, computed only over the positions kept by ``intersection``.
    Only the first 10000 row indices of ``R`` are considered as candidates.

    Candidates with no overlap, or with a NaN similarity, are skipped.
    Results are ordered by descending similarity; ties come out in descending
    index order (stable ascending sort, then reversed).

    Note: when two items overlap in a single position the cosine is always
    +1 or -1, whatever the values are.
    """
    sims = []
    for k in range(min(10000, len(R))):
        # Value comparison. `is not` tests object identity, which is unreliable
        # for large ints and for numpy integers, and let the item match itself.
        if k == i:
            continue
        a, b = intersection(R[i],R[k])
        if len(a) == 0:
            continue
        sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
        if math.isnan(sim):
            continue
        sims.append((k, sim))

    sims = sorted(sims, key=lambda x: x[1])
    sims.reverse()
    return sims[:5]

In [15]:
def find_similar_items_who_have_been_rated_by_specific_user(i, u):
    """Return up to five ``(item_index, similarity)`` neighbours of item ``i`` for user ``u``.

    Works like ``find_similar_items`` but only considers candidate items ``k``
    whose entry ``R[k][u]`` is non-zero, i.e. items for which user ``u`` has a
    stored normalized rating. Only the first 10000 row indices of the global
    matrix ``R`` are considered.

    Candidates with no overlap, or with a NaN similarity, are skipped.
    Results are ordered by descending similarity; ties come out in descending
    index order (stable ascending sort, then reversed).
    """
    sims = []
    for k in range(min(10000, len(R))):
        # Value comparison. `is not` tests object identity, which is unreliable
        # for large ints and for numpy integers.
        if k == i or R[k][u] == 0:
            continue
        a, b = intersection(R[i],R[k])
        # Same guard as find_similar_items: without it an empty overlap
        # computes 0/0, which emits RuntimeWarnings before being dropped as NaN.
        if len(a) == 0:
            continue
        sim = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
        if math.isnan(sim):
            continue
        sims.append((k, sim))

    sims = sorted(sims, key=lambda x: x[1])
    sims.reverse()
    return sims[:5]

In [16]:
# Every one of the top-5 similarities below comes out as ~1.0.
# NOTE(review): possibly caused by items that overlap in only one or two raters - TODO confirm.
find_similar_items(0) # something is wrong here

[(5631, 1.0000000000000002),
 (5101, 1.0000000000000002),
 (5003, 1.0000000000000002),
 (4753, 1.0000000000000002),
 (4586, 1.0000000000000002)]

In [17]:
# Top-5 neighbours of item index 1, restricted to items with a non-zero entry in R for user index 10.
find_similar_items_who_have_been_rated_by_specific_user(1, 10)

[(2026, 1.0),
 (1517, 1.0),
 (510, 1.0),
 (1839, 0.9977480548864894),
 (375, 0.7217765419704331)]

In [18]:
def predict_rating(i, u):
    """Predict the rating of item index ``i`` by user index ``u``.

    The prediction is a similarity-weighted average of the user's raw ratings
    ``r[k][u]`` over the neighbours ``k`` returned by
    ``find_similar_items_who_have_been_rated_by_specific_user(i, u)``::

        sum(sim_k * r[k][u]) / sum(|sim_k|)

    Negative similarities enter the numerator with their sign, so the result
    is not guaranteed to lie within the rating scale.

    Returns
    -------
    float or int
        The predicted rating, or the sentinel ``-10`` when there are no
        neighbours or the total weight is zero.
    """
    num, den = 0, 0
    for k, sim in find_similar_items_who_have_been_rated_by_specific_user(i, u):
        num += sim*r[k][u]
        den += abs(sim)
    # Explicit check instead of catching ZeroDivisionError: once a similarity
    # has been added, `den` is a numpy float, and dividing by a numpy 0.0 gives
    # nan/inf with a warning rather than raising.
    if den == 0:
        return -10
    return num/den

In [21]:
# Predicted rating for item index 0 and user index 0 (movieId 1 / userId 1 under the -1 index offset used above).
predict_rating(0,0)

4.1697699809957705

In [20]:
# Print "actual predicted" for every held-out rating.
# Ids are converted with the builtin int(); the `np.int` alias was removed in
# NumPy 1.24. The -1 maps movieId / userId to the 0-based matrix indices.
# A printed -10 is the "no prediction" sentinel of predict_rating.
# NOTE: each prediction scans up to 10000 candidate items, so the full loop is slow.
for index, row in test_data.iterrows():
    print(row['rating'], predict_rating(int(row['movieId']) - 1, int(row['userId']) - 1))

  


4.5 3.0
3.0 3.2
3.0 2.4000000000000004
4.0 3.4
4.0 3.296526395507075
4.0 3.1
3.5 3.936994429636083
4.5 1.6412988980129566
0.5 2.9
3.5 3.83140969383773
4.5 3.2382695694262895
4.0 2.6
5.0 3.6
5.0 4.5
4.0 3.5803363811420397
3.5 1.9
4.5 -10
3.0 4.6
3.5 2.8032185451640967
2.5 2.4
3.5 3.4
2.0 -10
5.0 3.2
4.0 4.999999999999999
4.0 3.0
3.0 3.6
2.0 3.8
1.5 3.0358172947519626
3.0 3.35077616520823
4.0 -10
3.0 3.3
4.5 -10
1.5 2.926129271438813
4.0 3.623581926569529
3.5 2.9
4.0 4.1012898415024726
2.0 3.1
4.0 2.7
3.5 3.1
4.0 -10
3.5 2.8
3.0 3.3070075255414406
5.0 2.9950690825541453
3.0 3.8
4.5 3.5977348701835203
3.0 3.4
4.0 3.6
4.0 3.4
4.0 4.198169359499094
4.0 3.2276240078746987
4.5 4.609203206789963
3.0 4.195312927131537
5.0 2.8212266900984706
1.5 1.7223376093174967
3.5 2.9
4.0 3.8953263700553613
3.0 -10
4.0 4.07596382424372
4.0 3.2520595776599794
2.5 2.8
4.5 4.314425199638523
0.5 3.7358682769058555
3.0 3.6
3.0 3.1334956418752364
2.0 -10
3.0 3.9689828690110973
2.5 -10
3.0 2.9
3.0 4.2
3.5 3.3
4.0 3

KeyboardInterrupt: 