# Recommender System

### Import Library

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

### Dataset

In [2]:
#Dataset rating
df_rating = pd.read_csv('ratings.csv', sep=',', names =['user_id', 'item_id', 'rating', 'timestamp'], skiprows=1)

In [3]:
#Dataset movies.csv
df_movies = pd.read_csv('movies.csv', sep=',', names =['movie_id', 'title', 'genre'], skiprows=1)

In [4]:
df_rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user_id    100836 non-null  int64  
 1   item_id    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [5]:
df_rating.describe()

Unnamed: 0,user_id,item_id,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [6]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  9742 non-null   int64 
 1   title     9742 non-null   object
 2   genre     9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [7]:
df_movies.describe()

Unnamed: 0,movie_id
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [8]:
#Calculasi n users and n items
n_users = df_rating.user_id.unique().shape[0]
n_items = df_rating.item_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | number of movies = ' + str(n_items))
df_rating.head()

Number of users = 610 | number of movies = 9724


Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Splitting dataset

75% training, 25% testing

In [9]:
#split data 20% testing data, 80% training data
def splitting_data(data, num_users, num_items, test_ratio):
    mask = [True if x == 1 else False for x in np.random.uniform(0, 1, (len(data))) < 1 - test_ratio]
    neg_mask = [not x for x in mask]
    train_data, test_data = data[mask], data[neg_mask]
    return train_data, test_data

In [10]:
train, test = splitting_data(df_rating, n_users, n_items, 0.25)

In [11]:
train

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,4.0,964982703
2,1,6,4.0,964982224
3,1,47,5.0,964983815
6,1,101,5.0,964980868
7,1,110,4.0,964982176
...,...,...,...,...
100830,610,166528,4.0,1493879365
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100834,610,168252,5.0,1493846352


In [12]:
test

Unnamed: 0,user_id,item_id,rating,timestamp
1,1,3,4.0,964981247
4,1,50,5.0,964982931
5,1,70,3.0,964982400
11,1,216,5.0,964981208
18,1,333,5.0,964981179
...,...,...,...,...
100811,610,156371,5.0,1479542831
100823,610,160836,3.0,1493844794
100825,610,161634,4.0,1493848362
100828,610,163981,3.5,1493850155


#### Preprocessing Dataset

In [13]:
#Untuk train data
df_train = df_rating
for i in range(test.shape[0]):
    df_train.loc[(df_train.user_id == test.iloc[[i]].user_id.values[0]) & (df_train.item_id == test.iloc[[i]].item_id.values[0]), 'rating'] = 0

In [14]:
#data training
df_train

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,0.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,0.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,0.0,1494273047
100834,610,168252,5.0,1493846352


In [15]:
#make rating 0
df_test = df_rating
for col in df_test.rating:
    df_test['rating'].values[:] = 0

In [16]:
#Untuk test data
for i in range(test.shape[0]):
    df_test.loc[(df_test.user_id == test.iloc[[i]].user_id.values[0]) & (df_test.item_id == test.iloc[[i]].item_id.values[0]), 'rating'] = test.iloc[[i]].rating.values[0]

In [17]:
#data testing
df_test

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,0.0,964982703
1,1,3,4.0,964981247
2,1,6,0.0,964982224
3,1,47,0.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,0.0,1493848402
100832,610,168248,0.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,0.0,1493846352


## Collaborative Filtering

In [18]:
#matrix factorixation algorithm
def matrix_factorization(R, P, Q, K, steps=100, alpha=0.002, beta=0.001):
    Q = Q.T
    print(Q)
    for step in range(steps):
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    eij = R[i][j] - np.dot(P[i,:],Q[:,j])
                    for k in range(K):
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])
        eR = np.dot(P,Q)
        e = 0
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:
                    e = e + pow(R[i][j] - np.dot(P[i,:],Q[:,j]), 2)
                    for k in range(K):
                        e = e + (beta/2) * ( pow(P[i][k],2) + pow(Q[k][j],2))

        if e < 0.001:
            break

    return P, Q.T    

### Training matrix factorization

In [19]:
#make matrix 
mf_train = df_train.pivot(index='user_id', columns='item_id', values='rating')
mf_train = mf_train.fillna(0)
mf_train

item_id,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
#Collaborative Filtering Process
mf_train = mf_train.to_numpy()

N = len(mf_train)
M = len(mf_train[0])

K = 8

P = np.random.rand(N,K)
Q = np.random.rand(M,K)

user_latent_features, item_latent_features = matrix_factorization(mf_train, P, Q, K)

[[0.35889478 0.66696192 0.75327697 ... 0.73270676 0.11395664 0.80372143]
 [0.95427722 0.16290717 0.2079624  ... 0.87598752 0.73024935 0.31622526]
 [0.79409133 0.94336163 0.48478252 ... 0.32094039 0.11878283 0.89152642]
 ...
 [0.68329351 0.60347736 0.98937903 ... 0.8140287  0.59149665 0.63521877]
 [0.34463113 0.11673897 0.86424074 ... 0.03857218 0.64552943 0.53543975]
 [0.05118551 0.15960022 0.038406   ... 0.53521261 0.3687317  0.42730549]]


In [21]:
np.random.rand(N,K)

array([[0.45202719, 0.85840227, 0.77152111, ..., 0.38702184, 0.05780065,
        0.67401093],
       [0.20553972, 0.99915744, 0.89807191, ..., 0.50758118, 0.08655571,
        0.65603388],
       [0.42895326, 0.17474527, 0.37318991, ..., 0.49005146, 0.87274775,
        0.97683398],
       ...,
       [0.63395921, 0.50034623, 0.06252867, ..., 0.66990522, 0.74595445,
        0.85370079],
       [0.39078376, 0.37120991, 0.4621919 , ..., 0.60488255, 0.27983003,
        0.77300005],
       [0.14966479, 0.96420032, 0.636661  , ..., 0.48662616, 0.64336251,
        0.87319578]])

In [22]:
print("The original matrix")
print(mf_train)

The original matrix
[[0. 0. 4. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [23]:
train_pred = np.dot(user_latent_features, item_latent_features.T)
print("The approximation matrix by MF")
print(train_pred)

The approximation matrix by MF
[[3.6909236  4.22610708 3.68859077 ... 3.61188587 3.48848852 5.4894669 ]
 [3.87741589 4.11736274 3.68243955 ... 3.95638689 2.82950053 4.64405908]
 [1.124114   0.26182868 0.32030705 ... 0.57518647 1.52198136 1.59650592]
 ...
 [4.79476189 3.12105864 3.53590014 ... 4.10368121 2.70674269 4.16598532]
 [3.44383024 3.19608387 3.24694322 ... 3.15052347 2.94915372 4.23977677]
 [3.72282987 3.30623698 2.83174739 ... 3.97040906 3.31443953 4.46165707]]


#### Evaluation Training 

In [24]:
#calculate mse and rmse for training set
pred = train_pred
testset = mf_train
total = 0
jum = 0
for i in range(len(testset)):
    for j in range(len(testset[i])):
        if testset[i][j] != 0:
            total += (pred[i][j] - testset[i][j])**2
            jum += 1

mse = total / jum
print('MSE Training = ', mse)
rmse = mse**0.5
print('RMSE Training = ', rmse)

MSE Training =  0.23169804703792132
RMSE Training =  0.48135023323763054


### Testing matrix factorization

In [25]:
#make matrif for testing
mf_test = df_test.pivot(index='user_id', columns='item_id', values='rating')
mf_test = mf_test.fillna(0)
mf_test

item_id,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
#testing process

mf_test = mf_test.to_numpy()

N = len(mf_test)
M = len(mf_test[0])

K = 8

P = np.random.rand(N,K)
Q = np.random.rand(M,K)

user_latent_features_test, item_latent_features_test = matrix_factorization(mf_test, P, Q, K)

[[0.37038152 0.25632865 0.88879887 ... 0.27256962 0.68557625 0.22606959]
 [0.3093492  0.37564227 0.43903874 ... 0.802824   0.54364207 0.8477265 ]
 [0.88347188 0.32094673 0.59110776 ... 0.97474077 0.89272168 0.55337252]
 ...
 [0.92933813 0.39897907 0.70457732 ... 0.16397282 0.99876656 0.19309977]
 [0.65284909 0.44707773 0.80182049 ... 0.30989777 0.25461236 0.47582392]
 [0.19289551 0.37993602 0.83253222 ... 0.30335562 0.14536375 0.03086733]]


In [27]:
print("The original matrix")
print(mf_test)

The original matrix
[[0. 0. 4. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [28]:
test_pred = np.dot(user_latent_features_test, item_latent_features_test.T)
print("The approximation matrix by MF")
print(test_pred)

The approximation matrix by MF
[[5.06033324 4.05434563 3.54881167 ... 3.11120947 4.7774277  4.04913376]
 [3.70197693 3.52470482 2.86325858 ... 3.32310799 3.95072724 3.60799597]
 [1.10958001 1.69845971 1.05086454 ... 1.22029943 1.24558349 1.91970737]
 ...
 [3.96333345 3.43403725 3.5516935  ... 3.52134694 3.98469599 4.17477893]
 [3.80334887 2.98115214 3.02176641 ... 2.46373397 3.45820382 3.2711042 ]
 [4.37699768 3.80945943 3.02282807 ... 2.95858915 3.94809295 3.62723881]]


#### Testing Evaluation

In [29]:
#Menghitung testing MSE and RMSE
pred = test_pred
testset = mf_test
total = 0
jum = 0
for i in range(len(testset)):
    for j in range(len(testset[i])):
        if testset[i][j] != 0:
            total += (pred[i][j] - testset[i][j])**2
            jum += 1

mse = total / jum
print('MSE = ', mse)
rmse = mse**0.5
print('RMSE = ', rmse)

MSE =  0.24384516475555384
RMSE =  0.49380680914255715


## Recommendation process

Top 20 movies for each user

In [30]:
movie_rec = []
for i in range(len(test_pred)):
    movie_rec.append([np.argsort(-1*test_pred[i])[:20]])

In [31]:
movierec = df_train.pivot(index='user_id', columns='item_id', values='rating')
movierec = movierec.fillna(0)

In [32]:
df_movierec = pd.DataFrame(columns=['user','movie_recommendation'])
for i in range(len(movie_rec)):
    rec = []
    for j in range(len(movie_rec[i])):
        rec.append(movierec.columns[movie_rec[i][j]])  
    movie = []
    for k in range(len(rec[0])):
        movie.append(df_movies.loc[df_movies.movie_id == rec[0][k]].title.values[0])
    df_movierec.loc[i,'user'] = i
    df_movierec.loc[i,'movie_recommendation'] = movie

Movie recommendation for user 546th

In [33]:
df_movierec.loc[546].movie_recommendation

['Mr. Skeffington (1944)',
 'Hustler, The (1961)',
 'Galaxy of Terror (Quest) (1981)',
 "Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)",
 'Towering Inferno, The (1974)',
 'Neon Genesis Evangelion: The End of Evangelion (Shin seiki Evangelion Gekijô-ban: Air/Magokoro wo, kimi ni) (1997)',
 '8 ½ Women (a.k.a. 8 1/2 Women) (a.k.a. Eight and a Half Women) (1999)',
 'The Idolmaker (1980)',
 'Master of the Flying Guillotine (Du bi quan wang da po xue di zi) (1975)',
 'Producers, The (1968)',
 'Woman in the Dunes (Suna no onna) (1964)',
 'Four Lions (2010)',
 'Holiday Inn (1942)',
 'Yojimbo (1961)',
 'Keanu (2016)',
 'Children of Heaven, The (Bacheha-Ye Aseman) (1997)',
 'On the Beach (1959)',
 'Ten, The (2007)',
 'World of Glory (1991)',
 'Mrs. Brown (a.k.a. Her Majesty, Mrs. Brown) (1997)']

In [34]:
df_movierec

Unnamed: 0,user,movie_recommendation
0,0,"[Galaxy of Terror (Quest) (1981), Bad Boy Bubb..."
1,1,"[Galaxy of Terror (Quest) (1981), Towering Inf..."
2,2,"[Galaxy of Terror (Quest) (1981), Master of th..."
3,3,[Raiders of the Lost Ark (Indiana Jones and th...
4,4,"[Galaxy of Terror (Quest) (1981), Towering Inf..."
...,...,...
605,605,"[Towering Inferno, The (1974), Mr. Skeffington..."
606,606,"[Priest (1994), Galaxy of Terror (Quest) (1981..."
607,607,"[You Can Count on Me (2000), Purge: Anarchy, T..."
608,608,"[Galaxy of Terror (Quest) (1981), Mr. Skeffing..."


## Content-Based Filtering

In [36]:
#dataset genre music
header = ['movie_id', 'title', 'genre']
df_movies = pd.read_csv('movies.csv', sep=',', names =header, skiprows=1)

In [37]:
df_movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [38]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   movie_id  9742 non-null   int64 
 1   title     9742 non-null   object
 2   genre     9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [39]:
df_movies.describe()

Unnamed: 0,movie_id
count,9742.0
mean,42200.353623
std,52160.494854
min,1.0
25%,3248.25
50%,7300.0
75%,76232.0
max,193609.0


In [40]:
df_movies.shape

(9742, 3)

In [41]:
n_title = df_movies.title.unique().shape[0]
print('Number of movies = ', n_title)

Number of movies =  9737


### Pre-processing dataset

In [42]:
#praproses genre menjadi kata-kata dan huruf kecil
df_movies['related'] = df_movies['genre'].map(lambda x: x.lower().split('|')) 
df_movies.head()

Unnamed: 0,movie_id,title,genre,related
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[adventure, animation, children, comedy, fantasy]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[adventure, children, fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,"[comedy, romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[comedy, drama, romance]"
4,5,Father of the Bride Part II (1995),Comedy,[comedy]


In [43]:
#Menghapus tahun pada judul film kemudian menambahkan tahun ke kolom related
for i in range(df_movies.shape[0]):
    year = re.search(r' \((\d{4})\)', df_movies['title'][i])
    if year:
        year = re.sub(r'([()])','', year.group(0))
        df_movies['title'][i] = re.sub(r'\((\d{4})\)', '', df_movies['title'][i])
        df_movies['related'][i].append(year+'s')
    df_movies['related'][i] = ','.join(df_movies['related'][i])
df_movies.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies['title'][i] = re.sub(r'\((\d{4})\)', '', df_movies['title'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_movies['related'][i] = ','.join(df_movies['related'][i])


Unnamed: 0,movie_id,title,genre,related
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"adventure,animation,children,comedy,fantasy, 1..."
1,2,Jumanji,Adventure|Children|Fantasy,"adventure,children,fantasy, 1995s"
2,3,Grumpier Old Men,Comedy|Romance,"comedy,romance, 1995s"
3,4,Waiting to Exhale,Comedy|Drama|Romance,"comedy,drama,romance, 1995s"
4,5,Father of the Bride Part II,Comedy,"comedy, 1995s"


In [44]:
#mengekstrak nama sebagai tag pada related
df_movies['titles'] = df_movies['title'].map(lambda x: x.lower().split(' '))
df_movies['titles'] = df_movies['titles'].map(lambda x: ','.join(set(x)))

In [45]:
df_movies['related'] = df_movies['related'] + df_movies['titles']
df_movies.drop(['titles', 'genre'], axis = 1) 

Unnamed: 0,movie_id,title,related
0,1,Toy Story,"adventure,animation,children,comedy,fantasy, 1..."
1,2,Jumanji,"adventure,children,fantasy, 1995sjumanji,"
2,3,Grumpier Old Men,"comedy,romance, 1995s,grumpier,men,old"
3,4,Waiting to Exhale,"comedy,drama,romance, 1995swaiting,,to,exhale"
4,5,Father of the Bride Part II,"comedy, 1995s,ii,part,father,the,bride,of"
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic,"action,animation,comedy,fantasy, 2017s,book,at..."
9738,193583,No Game No Life: Zero,"animation,comedy,fantasy, 2017s,life:,game,no,..."
9739,193585,Flint,"drama, 2017s,flint"
9740,193587,Bungo Stray Dogs: Dead Apple,"action,animation, 2018sbungo,,apple,dead,dogs:..."


In [48]:
header = ['user_id', 'movie_idt', 'tags', 'timestamp']
df_tags = pd.read_csv('tags.csv', sep=',', names =header, skiprows=1)

In [49]:
#Menggrupkan berdasarkan film kemudian tags yang ada pada film tersebut digabungkan
df_tags = df_tags.groupby(['movie_idt']).agg(lambda x: ','.join(set(x))).reset_index()
df_tags

  results[key] = self.aggregate(func)


Unnamed: 0,movie_idt,tags
0,1,"fun,pixar"
1,2,"fantasy,magic board game,game,Robin Williams"
2,3,"moldy,old"
3,5,"pregnancy,remake"
4,7,remake
...,...,...
1567,183611,"Rachel McAdams,funny,Comedy"
1568,184471,"adventure,Alicia Vikander,video game adaptation"
1569,187593,"Ryan Reynolds,Josh Brolin,sarcasm"
1570,187595,"star wars,Emilia Clarke"


In [50]:
#praprosessing tag
df_tags['tags'] = df_tags['tags'].map(lambda x: x.lower().split(' '))
df_tags['tags'] = df_tags['tags'].map(lambda x: ','.join(set(x)))
df_tags.head()

Unnamed: 0,movie_idt,tags
0,1,"fun,pixar"
1,2,"board,williams,fantasy,magic,game,game,robin"
2,3,"moldy,old"
3,5,"pregnancy,remake"
4,7,remake


### Menggabungkan data menjadi satu kolom

In [51]:
#menggabungkan genre dan tags untuk setiap film
df_cbf = pd.concat([df_tags, df_movies], axis=1, sort=False)
df_cbff = df_cbf
df_cbf = df_cbf.replace(np.nan, '', regex=True)
df_cbf['related'] = df_cbf['related'] +','+ df_cbf['tags']
df_cbf

Unnamed: 0,movie_idt,tags,movie_id,title,genre,related,titles
0,1.0,"fun,pixar",1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,"adventure,animation,children,comedy,fantasy, 1...","toy,,story"
1,2.0,"board,williams,fantasy,magic,game,game,robin",2,Jumanji,Adventure|Children|Fantasy,"adventure,children,fantasy, 1995sjumanji,,boar...","jumanji,"
2,3.0,"moldy,old",3,Grumpier Old Men,Comedy|Romance,"comedy,romance, 1995s,grumpier,men,old,moldy,old",",grumpier,men,old"
3,5.0,"pregnancy,remake",4,Waiting to Exhale,Comedy|Drama|Romance,"comedy,drama,romance, 1995swaiting,,to,exhale,...","waiting,,to,exhale"
4,7.0,remake,5,Father of the Bride Part II,Comedy,"comedy, 1995s,ii,part,father,the,bride,of,remake",",ii,part,father,the,bride,of"
...,...,...,...,...,...,...,...
9737,,,193581,Black Butler: Book of the Atlantic,Action|Animation|Comedy|Fantasy,"action,animation,comedy,fantasy, 2017s,book,at...",",book,atlantic,the,butler:,of,black"
9738,,,193583,No Game No Life: Zero,Animation|Comedy|Fantasy,"animation,comedy,fantasy, 2017s,life:,game,no,...",",life:,game,no,zero"
9739,,,193585,Flint,Drama,"drama, 2017s,flint,",",flint"
9740,,,193587,Bungo Stray Dogs: Dead Apple,Action|Animation,"action,animation, 2018sbungo,,apple,dead,dogs:...","bungo,,apple,dead,dogs:,stray"


### TF-IDF

In [52]:
df_cbf['related']

0       adventure,animation,children,comedy,fantasy, 1...
1       adventure,children,fantasy, 1995sjumanji,,boar...
2        comedy,romance, 1995s,grumpier,men,old,moldy,old
3       comedy,drama,romance, 1995swaiting,,to,exhale,...
4        comedy, 1995s,ii,part,father,the,bride,of,remake
                              ...                        
9737    action,animation,comedy,fantasy, 2017s,book,at...
9738    animation,comedy,fantasy, 2017s,life:,game,no,...
9739                                  drama, 2017s,flint,
9740    action,animation, 2018sbungo,,apple,dead,dogs:...
9741               comedy, 1991s,rules,clay:,dice,andrew,
Name: related, Length: 9742, dtype: object

<img src = 'https://cdn-media-1.freecodecamp.org/images/1*q3qYevXqQOjJf6Pwdlx8Mw.png'>

In [53]:
#tf-idf untuk kolom related
tf = TfidfVectorizer(analyzer='word')
tfidf_matrix = tf.fit_transform(df_cbf['related'])

In [54]:
tfidf_matrix

<9742x11311 sparse matrix of type '<class 'numpy.float64'>'
	with 66380 stored elements in Compressed Sparse Row format>

### Cosine Similarity

In [55]:
#cosine similarity antara film-film
cos_sim_cbf = cosine_similarity(tfidf_matrix, tfidf_matrix)
cos_sim_cbf

array([[1.        , 0.10892624, 0.00991964, ..., 0.        , 0.04238822,
        0.011425  ],
       [0.10892624, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00991964, 0.        , 1.        , ..., 0.        , 0.        ,
        0.00934358],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.04238822, 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.011425  , 0.        , 0.00934358, ..., 0.        , 0.        ,
        1.        ]])

### Recommendation process

In [56]:
# fungsi membuat rekomendasi berdasarkan judul film. Fungsi ini akan mencari film yang memiliki kemiripan dengan judul film dan mengembalikan film recommended
def recommendations(title, cosine_sim = cos_sim_cbf):
    
    recommended_movies = []
    # idx untuk mendapatkan index film yang sama dengan judul
    idx = indices[indices == title].index[0]

    # cosine similarity terurut secara descending
    score_series = pd.Series(cosine_sim[idx]).sort_values(ascending = False)

    # index film secara terurut
    top_rec = list(score_series.iloc[1:].index)

    # judul film yang direkomendasikan
    for i in top_rec:
        recommended_movies.append(list(df_cbf.index)[i])
        
    return recommended_movies

In [57]:
df_cbf.set_index('title', inplace = True)
df_cbf.head()

Unnamed: 0_level_0,movie_idt,tags,movie_id,genre,related,titles
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Toy Story,1.0,"fun,pixar",1,Adventure|Animation|Children|Comedy|Fantasy,"adventure,animation,children,comedy,fantasy, 1...","toy,,story"
Jumanji,2.0,"board,williams,fantasy,magic,game,game,robin",2,Adventure|Children|Fantasy,"adventure,children,fantasy, 1995sjumanji,,boar...","jumanji,"
Grumpier Old Men,3.0,"moldy,old",3,Comedy|Romance,"comedy,romance, 1995s,grumpier,men,old,moldy,old",",grumpier,men,old"
Waiting to Exhale,5.0,"pregnancy,remake",4,Comedy|Drama|Romance,"comedy,drama,romance, 1995swaiting,,to,exhale,...","waiting,,to,exhale"
Father of the Bride Part II,7.0,remake,5,Comedy,"comedy, 1995s,ii,part,father,the,bride,of,remake",",ii,part,father,the,bride,of"


In [58]:
#mendapatkan index setiap film
indices = pd.Series(df_cbf.index)
indices

0                                Toy Story 
1                                  Jumanji 
2                         Grumpier Old Men 
3                        Waiting to Exhale 
4              Father of the Bride Part II 
                       ...                 
9737    Black Butler: Book of the Atlantic 
9738                 No Game No Life: Zero 
9739                                 Flint 
9740          Bungo Stray Dogs: Dead Apple 
9741          Andrew Dice Clay: Dice Rules 
Name: title, Length: 9742, dtype: object

#### Movie recommendation based on movie-similarity (Content Based Filtering)

In [59]:
#Collaborative filtering dengan inputan judul film
sim_movies = recommendations('Toy Story ')

In [60]:
#20 list film yang mirip dengan judul masukkan
sim_movies[:20]

['Toy Story 2 ',
 'Fun ',
 'Toy Story 3 ',
 'In Search of the Castaways ',
 'Wild, The ',
 "We're Back! A Dinosaur's Story ",
 'Home ',
 'Shrek ',
 'Turbo ',
 'NeverEnding Story III, The ',
 'Antz ',
 'Moana ',
 'Fun with Dick and Jane ',
 'Enchanted ',
 'Nelly & Monsieur Arnaud ',
 'Robots ',
 'Valiant ',
 'Inside Out ',
 'Goonies, The ',
 'A Story of Children and Film ']

## Hybrid process

In [61]:
df_movieforuser = pd.DataFrame(columns=['user','movie_recommendation'])
for userid in range(len(train_pred)):
    count = 0
    movrec_u = []
    train_pred_c = train_pred.reshape(9724,610)
    for i in range(0,len(sim_movies)):
        idx = indices[indices == sim_movies[i]].index[0]
        mov_id = df_cbff.loc[idx].movie_id
        col_rate = np.where(movierec.columns == mov_id)[0][0]
        rate_user_u = train_pred_c[col_rate][userid]
        if(rate_user_u > 3):
            count+=1
            movrec_u.append(mov_id)
        if(count >= 20):
            break
            
    mvr = []
    for i in range(len(movrec_u)):
        mvr.append(df_movies.loc[df_movies.movie_id == movrec_u[i]].title.values[0])
    df_movieforuser.loc[userid,'user'] = userid
    df_movieforuser.loc[userid, 'movie_recommendation'] = mvr

In [62]:
df_movieforuser['movie_recommendation'][214]

['Fun ',
 'In Search of the Castaways ',
 'Home ',
 'Shrek ',
 'Turbo ',
 'NeverEnding Story III, The ',
 'Antz ',
 'Moana ',
 'Enchanted ',
 'Nelly & Monsieur Arnaud ',
 'Robots ',
 'Inside Out ',
 'Goonies, The ',
 'A Story of Children and Film ',
 'The Good Dinosaur ',
 'Up ',
 'Monsters, Inc. ',
 'Madagascar ',
 'Rio 2 ',
 'Rio ']

In [63]:
df_movieforuser['movie_recommendation'][23]

['Fun ',
 'In Search of the Castaways ',
 "We're Back! A Dinosaur's Story ",
 'Home ',
 'Shrek ',
 'Antz ',
 'Moana ',
 'Fun with Dick and Jane ',
 'Enchanted ',
 'Valiant ',
 'Inside Out ',
 'Goonies, The ',
 'A Story of Children and Film ',
 'NeverEnding Story II: The Next Chapter, The ',
 'Labyrinth ',
 'Up ',
 'Monsters, Inc. ',
 'G.I. Joe: The Movie ',
 'Rio ',
 'FairyTale: A True Story ']

### Recommendation result

In [64]:
df_movieforuser

Unnamed: 0,user,movie_recommendation
0,0,"[Toy Story 2 , Fun , In Search of the Castaway..."
1,1,"[In Search of the Castaways , Wild, The , We'r..."
2,2,"[Toy Story 2 , Fun , In Search of the Castaway..."
3,3,"[Toy Story 2 , Fun , In Search of the Castaway..."
4,4,"[Toy Story 2 , In Search of the Castaways , Wi..."
...,...,...
605,605,"[Toy Story 2 , Fun , In Search of the Castaway..."
606,606,"[Toy Story 2 , Fun , In Search of the Castaway..."
607,607,"[Toy Story 2 , Fun , In Search of the Castaway..."
608,608,"[Toy Story 2 , In Search of the Castaways , Sh..."
