In [24]:
import numpy as np
import pandas as pd
import random
from sklearn import preprocessing
from numpy.linalg import matrix_power

Run the following notebook if you are using Google Colab

In [None]:
df = pd.DataFrame()
path  = "/content/drive/MyDrive/u.data"
df = pd.read_csv(path, sep="\t", header=None, names=['user_id','movie_id','rating','timestamp'])
df.head()

Run the following notebook if you are running the notebook locally

In [26]:
df = pd.DataFrame()
path  = "u.data"
df = pd.read_csv(path, sep="\t", header=None, names=['user_id','movie_id','rating','timestamp'])
df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [27]:
#preprocessing
random.seed(42)
np.random.seed(42)

df = df[['user_id','movie_id','rating']]

user_label_encoder  = preprocessing.LabelEncoder()
movie_label_encoder = preprocessing.LabelEncoder()
df.user_id  =  user_label_encoder.fit_transform(df.user_id.values)
df.movie_id =  movie_label_encoder.fit_transform(df.movie_id.values)
num_users =  len(user_label_encoder.classes_)
num_movies = len(movie_label_encoder.classes_)

#taking the random test ratings


random_sample_ratings = pd.DataFrame()

# random_sample_ratings = df.sample(frac=0.1,random_state=1) 

for i in range (0,num_users):
    random_sample_ratings = random_sample_ratings.append(df[df["user_id"] == i].sample(frac=0.1,random_state=1))

test_ratings = pd.DataFrame()
dismissed_ratings = pd.DataFrame()

for i in range(0,len(random_sample_ratings)):
    if(random_sample_ratings.iloc[i]['rating']==5):
        test_ratings = test_ratings.append(random_sample_ratings.iloc[i])
    else:
        dismissed_ratings =  dismissed_ratings.append(random_sample_ratings.iloc[i])

#returning the non-five dropped ratings
train_ratings = df.drop(test_ratings.index)

In [28]:
# num of user and movies
print(num_users)
print(num_movies)

943
1682


In [29]:
train_ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,195,241,3
1,185,301,3
2,21,376,1
3,243,50,2
4,165,345,1


In [30]:
test_ratings.head()
# print(len(test_ratings))

Unnamed: 0,movie_id,rating,user_id
62069,190.0,5.0,0.0
47860,86.0,5.0,0.0
4280,81.0,5.0,0.0
1382,252.0,5.0,0.0
2328,63.0,5.0,0.0


In [31]:
#constructing the users_items matrix
users_items = np.zeros((num_users, num_movies))*1j
for i in range(0,num_users):
    user_ratings = train_ratings[train_ratings["user_id"] == i]
    for j in range(0,len(user_ratings)):
        if(user_ratings.iloc[j]['rating']>=3):
            users_items[i,int(user_ratings.iloc[j]['movie_id'])] = 1j
        else:
            users_items[i,int(user_ratings.iloc[j]['movie_id'])] = -1j

In [32]:
#constructing the adjacency matrix
items_users = -(1*users_items).T
users_users = np.zeros((num_users,num_users))
items_items = np.zeros((num_movies,num_movies))

part1 = np.hstack((users_users,users_items))
part2 = np.hstack((items_users,items_items))

adjacency_matrix = np.vstack((part1,part2))

In [33]:
print(adjacency_matrix.shape)

(2625, 2625)


In [34]:
print(adjacency_matrix)

[[ 0.+0.j  0.+0.j  0.+0.j ...  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j ...  0.+0.j  0.+0.j  0.+0.j]
 [ 0.+0.j  0.+0.j  0.+0.j ...  0.+0.j  0.+0.j  0.+0.j]
 ...
 [-0.-0.j -0.-0.j -0.-0.j ...  0.+0.j  0.+0.j  0.+0.j]
 [-0.-0.j -0.-0.j -0.-0.j ...  0.+0.j  0.+0.j  0.+0.j]
 [-0.-0.j -0.-0.j -0.-0.j ...  0.+0.j  0.+0.j  0.+0.j]]


In [35]:
def calc_matrix_power(p):
    CORLP_length =  matrix_power(adjacency_matrix,p)
    print("CORLP Matrix")
    print(CORLP_length)
    return CORLP_length

In [36]:
# Finding out the non-rated items by any of the users
unrated_movie_indexes = np.where(users_items==0)
arr = np.array(unrated_movie_indexes)
# print(arr.shape)
# print(arr)
unrated_movie_indexes = {}
for A, B in zip(arr[0], arr[1]):
    if A in unrated_movie_indexes:
        unrated_movie_indexes[A].append(B)
    else:
        unrated_movie_indexes[A] = [B]        

In [41]:
# rankning the predictions
def get_top_N_per_user(N,CORLP):
    predictions_matrix = np.zeros((num_users,num_movies))*1j
    predictions_matrix = CORLP[0:num_users,num_users:]
    predictions = np.zeros((num_users,N))
    movies = list(range(0, num_movies,1))
    recommendations_sorted = (-predictions_matrix).argsort(axis=1)
    for i in range (0,num_users):
        mask   = np.isin(recommendations_sorted[i],unrated_movie_indexes[i])
        # excluding from the recommendation list the previously rated ones by the user
        user_recommendations = recommendations_sorted[i][mask]
        top_N = user_recommendations[0:N]
        predictions[i] = np.array(top_N)
    return predictions

In [38]:
def calc_hits_rate(predictions):
    hits = 0 
    for i in range (0,num_users):
        user_ratings = test_ratings[test_ratings["user_id"] == int(i)]
        for j in range(0,predictions.shape[1]):
            for k in range(0,len(user_ratings)):
                if predictions[i,j] == user_ratings.iloc[k]['movie_id']:
                    hits+= 1
                    break
  # print(hits)            
    return hits         

In [39]:
def Evaluate_Model(top_n_length, path_length):
    CORLP_length = calc_matrix_power(path_length)
    preds = get_top_N_per_user(top_n_length,CORLP_length)
    hits = calc_hits_rate(preds)
    HitsRate = hits/len(test_ratings.index)
    print(HitsRate)
    print("Hits Rate is "+str(HitsRate*100)+" %")   

In [40]:
path_length = input("Enter the path length:")
n = input("Enter the length of top-N items:")
Evaluate_Model(int(n), int(path_length))

Enter the path length:3
Enter the length of top-N items:100
CORLP Matrix
[[0.  +0.j 0.  +0.j 0.  +0.j ... 0.  -6.j 0. +72.j 0.+117.j]
 [0.  +0.j 0.  +0.j 0.  +0.j ... 0. -11.j 0.  +8.j 0.  +9.j]
 [0.  +0.j 0.  +0.j 0.  +0.j ... 0.  +5.j 0.  -2.j 0.  +3.j]
 ...
 [0.  +6.j 0. +11.j 0.  -5.j ... 0.  +0.j 0.  +0.j 0.  +0.j]
 [0. -72.j 0.  -8.j 0.  +2.j ... 0.  +0.j 0.  +0.j 0.  +0.j]
 [0.-117.j 0.  -9.j 0.  -3.j ... 0.  +0.j 0.  +0.j 0.  +0.j]]
0.6529356060606061
Hits Rate is 65.29356060606061 %
