###Import Library

In [2]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [3]:
import numpy as np
import pandas as pd
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

###Read dataset

In [4]:
#get dataset rating
header = ['user_id', 'item_id', 'rating', 'timestamp']
drivePrefix = '/content/gdrive/My Drive/1301174046_MovieRecommender/'
driveSuffix = 'ml-latest-small/ratings.csv'
df_rating = pd.read_csv(drivePrefix + driveSuffix, sep=',', names =header, skiprows=1)

In [5]:
#get dataset movies.csv
header = ['movie_id', 'title', 'genre']
drivePrefix = '/content/gdrive/My Drive/1301174046_MovieRecommender/'
driveSuffix = 'ml-latest-small/movies.csv'
df_movies = pd.read_csv(drivePrefix + driveSuffix, sep=',', names =header, skiprows=1)

In [6]:
#calculate n users and n items
n_users = df_rating.user_id.unique().shape[0]
n_items = df_rating.item_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | number of movies = ' + str(n_items))
df_rating.head()

Number of users = 610 | number of movies = 9724


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


###Split dataset

80% training data, 20% testing data

In [8]:
#split data 20% testing data, 80% training data
def split_data_ml100k(data, num_users, num_items, test_ratio):
    mask = [True if x == 1 else False for x in np.random.uniform(0, 1, (len(data))) < 1 - test_ratio]
    neg_mask = [not x for x in mask]
    train_data, test_data = data[mask], data[neg_mask]
    return train_data, test_data

In [9]:
train, test = split_data_ml100k(df_rating, n_users, n_items, 0.25)

In [10]:
train.head()

Unnamed: 0,user_id,item_id,rating,timestamp
1,1,3,4.0,964981247
3,1,47,5.0,964983815
4,1,50,5.0,964982931
5,1,70,3.0,964982400
7,1,110,4.0,964982176


In [11]:
test.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,4.0,964982703
2,1,6,4.0,964982224
6,1,101,5.0,964980868
12,1,223,3.0,964980985
19,1,349,4.0,964982563


####Masking dataset

In [12]:
#masking for train data
df_train = df_rating
for i in range(test.shape[0]):
  df_train.loc[(df_train.user_id == test.iloc[[i]].user_id.values[0]) & (df_train.item_id == test.iloc[[i]].item_id.values[0]), 'rating'] = 0

In [13]:
#data training
df_train.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,0.0,964982703
1,1,3,4.0,964981247
2,1,6,0.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [14]:
#make rating 0
df_test = df_rating
for col in df_test.rating:
  df_test['rating'].values[:] = 0

In [15]:
#masking for test data
for i in range(test.shape[0]):
  df_test.loc[(df_test.user_id == test.iloc[[i]].user_id.values[0]) & (df_test.item_id == test.iloc[[i]].item_id.values[0]), 'rating'] = test.iloc[[i]].rating.values[0]

In [16]:
#data testing
df_test.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,0.0,964981247
2,1,6,4.0,964982224
3,1,47,0.0,964983815
4,1,50,0.0,964982931


##Collaborative Filtering

In [17]:
#matrix factorixation algorithm
def matrix_factorization(R, P, Q, K, steps=100, alpha=0.002, beta=0.001):
    Q = Q.T
    print(Q)
    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P,Q)
        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2))

        if e < 0.001:
            break

    return P, Q.T    

###Training MF

In [18]:
#make matrix 
mf_train = df_train.pivot(index='user_id', columns='item_id', values='rating')
mf_train = mf_train.fillna(0)
mf_train

item_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,...,185135,185435,185473,185585,186587,187031,187541,187593,187595,187717,188189,188301,188675,188751,188797,188833,189043,189111,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,3.5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
#collaborative filtering process
mf_train = mf_train.to_numpy()

N = len(mf_train)
M = len(mf_train[0])

K = 8

P = np.random.rand(N,K)
Q = np.random.rand(M,K)

user_latent_features, item_latent_features = matrix_factorization(mf_train, P, Q, K)

[[0.12623313 0.5227832  0.815139   ... 0.27076889 0.48031582 0.89629991]
 [0.62746804 0.01773422 0.31714695 ... 0.98505212 0.64630026 0.62852657]
 [0.0486334  0.38765423 0.73252585 ... 0.25515525 0.68625237 0.89192412]
 ...
 [0.95745543 0.83138814 0.72024172 ... 0.8630422  0.18215912 0.05156805]
 [0.38372975 0.93572584 0.7272011  ... 0.26474296 0.03376313 0.48400288]
 [0.97616907 0.58133905 0.75060101 ... 0.98889472 0.3619107  0.3101925 ]]


In [20]:
print("The original matrix")
print(mf_train)

The original matrix
[[4.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [2.5 0.  2.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]]


In [21]:
train_pred = np.dot(user_latent_features, item_latent_features.T)
print("The approximation matrix by MF")
print(train_pred)

The approximation matrix by MF
[[4.51997978 3.6963219  3.80975357 ... 3.66658363 3.13684095 3.1292911 ]
 [3.06763861 2.71641636 2.78400434 ... 2.77183061 2.60127862 2.71656056]
 [3.02610436 2.89437969 3.31361374 ... 2.49213929 0.64093953 0.80798962]
 ...
 [3.51321076 2.73026106 2.52093797 ... 3.16351879 2.94716418 1.40300106]
 [3.88777304 3.26554549 2.76995792 ... 3.09549522 3.16142843 2.69622261]
 [4.42050837 3.40588897 3.25033215 ... 3.88054773 3.41085831 3.15600351]]


####Training Evaluation 

In [22]:
#calculate mse and rmse for training set
pred = train_pred
testset = mf_train
total = 0
jum = 0
for i in range(len(testset)):
    for j in range(len(testset[i])):
        if testset[i][j] != 0:
            total += (pred[i][j] - testset[i][j])**2
            jum += 1

mse = total / jum
print('MSE Training = ', mse)
rmse = mse**0.5
print('RMSE Training = ', rmse)

MSE Training =  0.23930869420923537
RMSE Training =  0.48919187872371245


###Testing MF

In [23]:
#make matrif for testing
mf_test = df_test.pivot(index='user_id', columns='item_id', values='rating')
mf_test = mf_test.fillna(0)
mf_test

item_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,...,185135,185435,185473,185585,186587,187031,187541,187593,187595,187717,188189,188301,188675,188751,188797,188833,189043,189111,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,4.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,3.5,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
#testing process

mf_test = mf_test.to_numpy()

N = len(mf_test)
M = len(mf_test[0])

K = 8

P = np.random.rand(N,K)
Q = np.random.rand(M,K)

user_latent_features_test, item_latent_features_test = matrix_factorization(mf_test, P, Q, K)

[[0.52429246 0.35029206 0.65846154 ... 0.07055864 0.47853742 0.70856985]
 [0.26108083 0.23215987 0.74087813 ... 0.15331503 0.26477823 0.69635732]
 [0.13186614 0.5426448  0.0326571  ... 0.51918827 0.99845265 0.5729967 ]
 ...
 [0.29971808 0.96258311 0.07843226 ... 0.26498529 0.68439633 0.99031743]
 [0.01488883 0.32520217 0.19703618 ... 0.99925873 0.166315   0.7026312 ]
 [0.93657915 0.6261376  0.38566501 ... 0.28254539 0.2239735  0.72624826]]


In [25]:
print("The original matrix")
print(mf_test)

The original matrix
[[4.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [2.5 0.  2.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]]


In [26]:
test_pred = np.dot(user_latent_features_test, item_latent_features_test.T)
print("The approximation matrix by MF")
print(test_pred)

The approximation matrix by MF
[[4.32828428 4.19727585 4.05490504 ... 2.17699042 3.15552978 4.61931263]
 [3.67536584 2.76824249 3.00824384 ... 1.7870378  2.46920008 3.67914768]
 [3.11911715 2.17647622 3.26646038 ... 0.1647738  1.21850843 2.54778463]
 ...
 [3.35084861 3.14164863 2.89005062 ... 2.45454679 2.12240652 3.12687056]
 [3.85930322 2.91036621 3.08975846 ... 2.20450708 2.97376103 3.89680721]
 [3.5825077  3.51290046 3.25400298 ... 2.49960346 2.64562708 4.18281421]]


####Testing Evaluation

In [27]:
#calculate testing mse and rmse
pred = test_pred
testset = mf_test
total = 0
jum = 0
for i in range(len(testset)):
    for j in range(len(testset[i])):
        if testset[i][j] != 0:
            total += (pred[i][j] - testset[i][j])**2
            jum += 1

mse = total / jum
print('MSE = ', mse)
rmse = mse**0.5
print('RMSE = ', rmse)

MSE =  0.2399420217315455
RMSE =  0.4898387711600068


##Recommendation process

Top 20 movies for each user

In [28]:
movie_rec = []
for i in range(len(test_pred)):
  movie_rec.append([np.argsort(-1*test_pred[i])[:20]])

In [29]:
movierec = df_train.pivot(index='user_id', columns='item_id', values='rating')
movierec = movierec.fillna(0)

In [30]:
df_movierec = pd.DataFrame(columns=['user','movie_recommendation'])
for i in range(len(movie_rec)):
  rec = []
  for j in range(len(movie_rec[i])):
    rec.append(movierec.columns[movie_rec[i][j]])  
  movie = []
  for k in range(len(rec[0])):
    movie.append(df_movies.loc[df_movies.movie_id == rec[0][k]].title.values[0])
  df_movierec.loc[i,'user'] = i
  df_movierec.loc[i,'movie_recommendation'] = movie

Movie recommendation for user 546th

In [31]:
df_movierec.loc[546].movie_recommendation

['The Lair of the White Worm (1988)',
 'I Origins (2014)',
 'Stuart Little 3: Call of the Wild (2005)',
 'Blob, The (1958)',
 'History of Future Folk, The (2012)',
 'Philadelphia Story, The (1940)',
 'Yojimbo (1961)',
 'Unicorn City (2012)',
 'Jetée, La (1962)',
 'Shutter Island (2010)',
 'Guys and Dolls (1955)',
 'Jump In! (2007)',
 'Alias Betty (Betty Fisher et autres histoires) (2001)',
 'Lamerica (1994)',
 'Patton (1970)',
 'Dune (2000)',
 'Fresh (1994)',
 'Gallipoli (1981)',
 'Conversations with Other Women (2005)',
 '84 Charing Cross Road (1987)']

##Content-Based Filtering

In [32]:
#dataset genre music
header = ['movie_id', 'title', 'genre']
drivePrefix = '/content/gdrive/My Drive/1301174046_MovieRecommender/'
driveSuffix = 'ml-latest-small/movies.csv'
df_movies = pd.read_csv(drivePrefix + driveSuffix, sep=',', names =header, skiprows=1)

In [33]:
df_movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [34]:
df_movies.shape

(9742, 3)

In [35]:
n_title = df_movies.title.unique().shape[0]
print('Number of movies = ', n_title)

Number of movies =  9737


###Pre-processing dataset

In [38]:
#praproses genre menjadi kata-kata dan huruf kecil
df_movies['related'] = df_movies['genre'].map(lambda x: x.lower().split('|')) 
df_movies.head()

Unnamed: 0,movie_id,title,genre,related
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"[adventure, animation, children, comedy, fantasy]"
1,2,Jumanji,Adventure|Children|Fantasy,"[adventure, children, fantasy]"
2,3,Grumpier Old Men,Comedy|Romance,"[comedy, romance]"
3,4,Waiting to Exhale,Comedy|Drama|Romance,"[comedy, drama, romance]"
4,5,Father of the Bride Part II,Comedy,[comedy]


In [39]:
#Menghapus tahun pada judul film kemudian menambahkan tahun ke kolom related
for i in range(df_movies.shape[0]):
  year = re.search(r' \((\d{4})\)', df_movies['title'][i])
  if year:
    year = re.sub(r'([()])','', year.group(0))
    df_movies['title'][i] = re.sub(r'\((\d{4})\)', '', df_movies['title'][i])
    df_movies['related'][i].append(year+'s')
  df_movies['related'][i] = ','.join(df_movies['related'][i])
df_movies.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,movie_id,title,genre,related
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"adventure,animation,children,comedy,fantasy"
1,2,Jumanji,Adventure|Children|Fantasy,"adventure,children,fantasy"
2,3,Grumpier Old Men,Comedy|Romance,"comedy,romance"
3,4,Waiting to Exhale,Comedy|Drama|Romance,"comedy,drama,romance"
4,5,Father of the Bride Part II,Comedy,comedy


In [40]:
#mengekstrak nama sebagai tag pada related
df_movies['titles'] = df_movies['title'].map(lambda x: x.lower().split(' '))
df_movies['titles'] = df_movies['titles'].map(lambda x: ','.join(set(x)))

In [41]:
df_movies['related'] = df_movies['related'] + df_movies['titles']
df_movies.drop(['titles', 'genre'], axis = 1) 

Unnamed: 0,movie_id,title,related
0,1,Toy Story,"adventure,animation,children,comedy,fantasy,st..."
1,2,Jumanji,"adventure,children,fantasy,jumanji"
2,3,Grumpier Old Men,"comedy,romance,old,grumpier,men"
3,4,Waiting to Exhale,"comedy,drama,romance,waiting,exhale,to"
4,5,Father of the Bride Part II,"comedy,of,father,ii,part,the,bride"
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"action,animation,comedy,fantasy,atlantic,black..."
9738,193583,No Game No Life: Zero,"animation,comedy,fantasy,game,no,life:,zero"
9739,193585,Flint,"drama,flint"
9740,193587,Bungo Stray Dogs: Dead Apple,"action,animationbungo,,apple,stray,dogs:,dead"


In [42]:
header = ['user_id', 'movie_idt', 'tags', 'timestamp']
drivePrefix = '/content/gdrive/My Drive/1301174046_MovieRecommender/'
driveSuffix = 'ml-latest-small/tags.csv'
df_tags = pd.read_csv(drivePrefix + driveSuffix, sep=',', names =header, skiprows=1)

In [43]:
#menggrupkan berdasarkan film kemudian tags yang ada pada film tersebut digabungkan
df_tags = df_tags.groupby(['movie_idt']).agg(lambda x: ','.join(set(x))).reset_index()
df_tags

Unnamed: 0,movie_idt,tags
0,1,"pixar,fun"
1,2,"game,fantasy,magic board game,Robin Williams"
2,3,"old,moldy"
3,5,"remake,pregnancy"
4,7,remake
...,...,...
1567,183611,"Rachel McAdams,Comedy,funny"
1568,184471,"video game adaptation,Alicia Vikander,adventure"
1569,187593,"Josh Brolin,Ryan Reynolds,sarcasm"
1570,187595,"star wars,Emilia Clarke"


In [44]:
#praproses tag
df_tags['tags'] = df_tags['tags'].map(lambda x: x.lower().split(' '))
df_tags['tags'] = df_tags['tags'].map(lambda x: ','.join(set(x)))
df_tags.head()

Unnamed: 0,movie_idt,tags
0,1,"pixar,fun"
1,2,"williams,game,fantasy,magic,board,game,robin"
2,3,"old,moldy"
3,5,"remake,pregnancy"
4,7,remake


###Combine data to one column

In [47]:
#menggabungkan genre dan tags untuk setiap film
# df_cbf = pd.merge(df_tags, df_movies, on='movie_id')
df_cbf = pd.concat([df_tags, df_movies], axis=1, sort=False)
df_cbff = df_cbf
df_cbf = df_cbf.replace(np.nan, '', regex=True)
df_cbf['related'] = df_cbf['related'] +','+ df_cbf['tags']
df_cbf

Unnamed: 0,movie_idt,tags,movie_id,title,genre,related,titles
0,1,"pixar,fun",1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"adventure,animation,children,comedy,fantasy,st...",",story,toy"
1,2,"williams,game,fantasy,magic,board,game,robin",2,Jumanji,Adventure|Children|Fantasy,"adventure,children,fantasy,jumanji,williams,ga...",",jumanji"
2,3,"old,moldy",3,Grumpier Old Men,Comedy|Romance,"comedy,romance,old,grumpier,men,old,moldy",",old,grumpier,men"
3,5,"remake,pregnancy",4,Waiting to Exhale,Comedy|Drama|Romance,"comedy,drama,romance,waiting,exhale,to,remake,...",",waiting,exhale,to"
4,7,remake,5,Father of the Bride Part II,Comedy,"comedy,of,father,ii,part,the,bride,remake",",of,father,ii,part,the,bride"
...,...,...,...,...,...,...,...
9737,,,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,"action,animation,comedy,fantasy,atlantic,black...",",atlantic,black,butler:,of,book,the"
9738,,,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,"animation,comedy,fantasy,game,no,life:,zero,",",game,no,life:,zero"
9739,,,193585,Flint,Drama,"drama,flint,",",flint"
9740,,,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,"action,animationbungo,,apple,stray,dogs:,dead,","bungo,,apple,stray,dogs:,dead"


###TF-IDF

In [49]:
#tf-idf untuk kolom related
tf = TfidfVectorizer(analyzer='word')
tfidf_matrix = tf.fit_transform(df_cbf['related'])


###Cosine Similarity

In [50]:
#cosine similarity antara film-film
cos_sim_cbf = cosine_similarity(tfidf_matrix, tfidf_matrix)
cos_sim_cbf

array([[1.        , 0.11665132, 0.01133238, ..., 0.        , 0.        ,
        0.01340453],
       [0.11665132, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01133238, 0.        , 1.        , ..., 0.        , 0.        ,
        0.01085177],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.01340453, 0.        , 0.01085177, ..., 0.        , 0.        ,
        1.        ]])

###Recommendation process

In [51]:
# fungsi membuat rekomendasi berdasarkan judul film. Fungsi ini akan mencari film yang memiliki kemiripan dengan judul film dan mengembalikan film recommended
def recommendations(title, cosine_sim = cos_sim_cbf):
    
    recommended_movies = []
    # idx untuk mendapatkan index film yang sama dengan judul
    idx = indices[indices == title].index[0]

    # cosine similarity terurut secara descanding
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # index film secara terurut
    top_rec = list(score_series.iloc[1:].index)

    # jdul film yang direkomendasikan
    for i in top_rec:
        recommended_movies.append(list(df_cbf.index)[i])
        
    return recommended_movies

In [52]:
df_cbf.set_index('title', inplace = True)
df_cbf.head()

Unnamed: 0_level_0,movie_idt,tags,movie_id,genre,related,titles
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Toy Story,1,"pixar,fun",1,Adventure|Animation|Children|Comedy|Fantasy,"adventure,animation,children,comedy,fantasy,st...",",story,toy"
Jumanji,2,"williams,game,fantasy,magic,board,game,robin",2,Adventure|Children|Fantasy,"adventure,children,fantasy,jumanji,williams,ga...",",jumanji"
Grumpier Old Men,3,"old,moldy",3,Comedy|Romance,"comedy,romance,old,grumpier,men,old,moldy",",old,grumpier,men"
Waiting to Exhale,5,"remake,pregnancy",4,Comedy|Drama|Romance,"comedy,drama,romance,waiting,exhale,to,remake,...",",waiting,exhale,to"
Father of the Bride Part II,7,remake,5,Comedy,"comedy,of,father,ii,part,the,bride,remake",",of,father,ii,part,the,bride"


In [54]:
#mendapatkan index setiap film
indices = pd.Series(df_cbf.index)
indices

0                                Toy Story 
1                                  Jumanji 
2                         Grumpier Old Men 
3                        Waiting to Exhale 
4              Father of the Bride Part II 
                       ...                 
9737    Black Butler: Book of the Atlantic 
9738                 No Game No Life: Zero 
9739                                 Flint 
9740          Bungo Stray Dogs: Dead Apple 
9741          Andrew Dice Clay: Dice Rules 
Name: title, Length: 9742, dtype: object

Movie recommendation based on movie-similarity

In [55]:
#CBF dengan inputan judul film
sim_movies = recommendations('Toy Story ')

In [56]:
#20 list film yang mirip dengan judul masukkan
sim_movies[:20]

['Toy Story 2 ',
 'Toy Story 3 ',
 'Toy, The ',
 'Fun ',
 'Toy Soldiers ',
 'NeverEnding Story, The ',
 'Christmas Story, A ',
 'Wild, The ',
 'In Search of the Castaways ',
 'Home ',
 'Turbo ',
 "We're Back! A Dinosaur's Story ",
 'Shrek ',
 'Moana ',
 'Goonies, The ',
 'Enchanted ',
 "Kid's Story ",
 'NeverEnding Story III, The ',
 'Valiant ',
 'Robots ']

##Hybrid process

In [57]:
df_movieforuser = pd.DataFrame(columns=['user','movie_recommendation'])
for userid in range(len(train_pred)):
  count = 0
  movrec_u = []
  train_pred_c = train_pred.reshape(9724,610)
  for i in range(0,len(sim_movies)):
    idx = indices[indices == sim_movies[i]].index[0]
    mov_id = df_cbff.loc[idx].movie_id
    col_rate = np.where(movierec.columns == mov_id)[0][0]
    rate_user_u = train_pred_c[col_rate][userid]
    if(rate_user_u > 3):
      count+=1
      movrec_u.append(mov_id)
    if(count >= 20):
      break
  mvr = []
  for i in range(len(movrec_u)):
    mvr.append(df_movies.loc[df_movies.movie_id == movrec_u[i]].title.values[0])
  df_movieforuser.loc[userid,'user'] = userid
  df_movieforuser.loc[userid, 'movie_recommendation'] = mvr

In [58]:
df_movieforuser['movie_recommendation'][214]

['Toy, The ',
 'Fun ',
 'Christmas Story, A ',
 'Wild, The ',
 'In Search of the Castaways ',
 'Home ',
 'Turbo ',
 'Shrek ',
 'Goonies, The ',
 'Enchanted ',
 'NeverEnding Story III, The ',
 'Robots ',
 'Inside Out ',
 'Labyrinth ',
 'Nelly & Monsieur Arnaud ',
 'Rio ',
 'Tale of Despereaux, The ',
 'Epic ',
 'Watership Down ',
 'Gnomeo & Juliet ']

In [59]:
df_movieforuser['movie_recommendation'][23]

['Toy Story 2 ',
 'Toy, The ',
 'Fun ',
 'Toy Soldiers ',
 'Wild, The ',
 'In Search of the Castaways ',
 'Turbo ',
 "We're Back! A Dinosaur's Story ",
 'Shrek ',
 'Goonies, The ',
 'Enchanted ',
 'Fun with Dick and Jane ',
 'A Story of Children and Film ',
 'Nelly & Monsieur Arnaud ',
 'Madagascar ',
 'Rio 2 ',
 'Rio ',
 'NeverEnding Story II: The Next Chapter, The ',
 'G.I. Joe: The Movie ',
 'Shrek the Third ']

###Recommendation result

In [60]:
df_movieforuser

Unnamed: 0,user,movie_recommendation
0,0,"[Toy, The , Fun , Toy Soldiers , NeverEnding S..."
1,1,"[Toy, The , Fun , Toy Soldiers , Wild, The , I..."
2,2,"[Toy, The , Christmas Story, A , Wild, The , I..."
3,3,"[Toy, The , Toy Soldiers , NeverEnding Story, ..."
4,4,"[NeverEnding Story, The , Wild, The , Home , T..."
...,...,...
605,605,"[Toy Story 3 , Toy, The , Fun , Wild, The , Ho..."
606,606,"[Toy Story 2 , Toy Story 3 , Toy, The , NeverE..."
607,607,"[Toy Story 2 , Toy, The , Fun , NeverEnding St..."
608,608,"[Fun , Toy Soldiers , Christmas Story, A , Wil..."


In [61]:
df_movieforuser.to_csv(drivePrefix + driveSuffix + 'movie_recommendation.csv')