In [196]:
import pandas as pd
import numpy as np
import re
from scipy import sparse
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import KMeans, AgglomerativeClustering,DBSCAN
from scipy.cluster import hierarchy
from sklearn.metrics import silhouette_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

### Loading data

In [158]:
movies = pd.read_csv('./data/movies.csv')
ratings = pd.read_csv('./data/ratings.csv')

#Drop unnecessary columns
# ratings.drop('timestamp', axis=1, inplace=True)
# movies.drop('genres', axis=1, inplace=True)

## Creating new columns

In [159]:
#creating list with unique genres
genres = list(set('|'.join(list(movies["genres"].unique())).split('|')))
genres.remove('(no genres listed)')

#Creating dummy columns for each genre
for genre in genres:
    movies[genre] = movies['genres'].map(lambda val: 1 if genre in val else 0)

In [160]:
#Creating colum with film year
movies['year'] = movies['title'].map(lambda val: int(re.search('\(([0-9]{4})\)',val).group(1)) 
                                     if re.search('\(([0-9]{4})\)',val)!= None 
                                     else 0)   
# Film Decade
for decade in range(1930,2020,10):
    movies['decade_'+str(decade)] = np.where((movies['year'] < decade+10) & (movies['year'] >= decade) ,1,0)
#     print('column created','decade_' + str(decade))
    
movies['decade_none'] = np.where(movies['year'] == 0 ,1,0)
movies['decade_other'] = np.where((movies['year'] != 0) & (movies['year'] <1930) ,1,0)

In [161]:
for col in movies.columns:
    if col[0:6] =='decade':
        print(col+':',movies[movies[col] ==1].shape[0])

decade_1930: 136
decade_1940: 197
decade_1950: 279
decade_1960: 401
decade_1970: 500
decade_1980: 1177
decade_1990: 2212
decade_2000: 2849
decade_2010: 1931
decade_none: 13
decade_other: 47


# Dropping columns

In [162]:
#Droping genres
movies.drop('genres', axis=1,inplace= True)  
ratings.drop('timestamp', axis=1,inplace= True) 

## Joining Data Frames

In [163]:
df = pd.merge(ratings, movies, on='movieId')
print(df.shape)
df.head()

(100836, 35)


Unnamed: 0,userId,movieId,rating,title,IMAX,Animation,Thriller,Western,Film-Noir,Documentary,...,decade_1940,decade_1950,decade_1960,decade_1970,decade_1980,decade_1990,decade_2000,decade_2010,decade_none,decade_other
0,1,1,4.0,Toy Story (1995),0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,5,1,4.0,Toy Story (1995),0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,7,1,4.5,Toy Story (1995),0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,15,1,2.5,Toy Story (1995),0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,17,1,4.5,Toy Story (1995),0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


## Building item-based collaborative recommender

In [164]:
pivot = pd.pivot_table(df, index='title', columns=['userId'], values='rating')
pivot.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,


In [165]:
sparse_pivot = sparse.csr_matrix(pivot.fillna(0))
print("size", sparse_pivot.shape[0])
print(sparse_pivot[0:5])

size 9719
  (0, 609)	4.0
  (1, 331)	4.0
  (2, 331)	3.5
  (2, 376)	3.5
  (3, 344)	5.0
  (4, 112)	3.0
  (4, 344)	5.0


In [166]:
recommender = pairwise_distances(sparse_pivot, metric='cosine')
recommender_df = pd.DataFrame(recommender, columns=pivot.index, index=pivot.index)
recommender_df.head(2)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.858347,1.0,...,1.0,0.657945,0.456695,0.292893,1.0,1.0,0.860569,0.672673,1.0,1.0
'Hellboy': The Seeds of Creation (2004),1.0,0.0,0.292893,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [167]:
# results = pd.DataFrame({'title':cosine_top_scores.index.values,
#               'cosine_score':cosine_top_scores.values}
#             )
# results = pd.merge(results, movies, on='title')
# # movies[movies.columns.difference(['movieId'])]
# results.head(2)

In [168]:
# print('Simple cosine similarity Top 5')
# print(results[['title','cosine_score']][0:5])

In [169]:
# movies.columns.difference(['movieId','title'])

## Using KNN to get neighboors

In [170]:
film_name = 'True Lies (1994)'

In [171]:
top_cos_sim = pd.DataFrame(recommender_df[film_name].sort_values()[1:51])
top_cos_sim.reset_index(level=0, inplace=True)
top_cos_sim.columns = ['title','cosine_sim']
top_cos_sim.head()

Unnamed: 0,title,cosine_sim
0,Batman (1989),0.303245
1,"Fugitive, The (1993)",0.316269
2,Speed (1994),0.32718
3,Stargate (1994),0.339251
4,Die Hard: With a Vengeance (1995),0.341742


In [210]:
#Joining DF
# movies.loc[movies['title'] == film_name, movies.columns]
results = pd.merge(top_cos_sim, movies, on='title')

In [183]:
ss = StandardScaler()
X_train = results[results.columns.difference(['movieId','title', 'year'])]
X_train_sc = ss.fit_transform(X_train)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [None]:
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)

In [197]:
model_dic = {"kmeans": {"init_seed": [10],
                  "inits"  :  ["k-means++","random"],
                  "clusters_list": range(2,20),
                  "tol": 0.0001},
             "hierarchy": {"linkage_method":['complete','ward', 'single',
                           'centroid','median','weighted'],"t": range(4,15),
                          "criterion": "maxclust"},
             
           "Agglomerative": {"linkage_method":['complete','ward', 'single',
                             'centroid','median','weighted'],'clusters_list':range(4,15),
                             "affinity": ['euclidean', 'l1', 'l2', 'manhattan',
                             'cosine', 'precomputed'],"clusters_list": range(4,10)},
             
            "dbscan": {"epslons":np.linspace(0.5,20,20),
                       "min_samples": [1,3,5,7,9,11,13,15,17,19,21,23,25,27,31]            
            }}
df_dic = {'df_scaled_values': X_train_sc}
model_results = grid_clusters(df_dic,model_dic)



In [198]:
# Top kmeans silhouette score
display(model_results[(model_results.model == 'Agglomerative')]\
.sort_values(by=['silhouette'],ascending = False).head(4))

display(model_results[(model_results.model == 'kmeans')]\
.sort_values(by=['silhouette'],ascending = False).head(4))

display(model_results[(model_results.model == 'hierarchy')]\
.sort_values(by=['silhouette'],ascending = False).head(4))

display(model_results[(model_results.model == 'dbscan')]\
.sort_values(by=['silhouette'],ascending = False).head(4))

Unnamed: 0,Data_frame,model,inertia,silhouette,Numb_clusters,Cluster_counts,model_params
126,df_scaled_values,Agglomerative,0.0,0.222618,8,"[19, 18, 3, 3, 3, 2, 1, 1]","Afin:euclidean, Link:, complete"
238,df_scaled_values,Agglomerative,0.0,0.222618,8,"[19, 18, 3, 3, 3, 2, 1, 1]","Afin:cosine, Link:, centroid"
128,df_scaled_values,Agglomerative,0.0,0.222618,8,"[19, 18, 3, 3, 3, 2, 1, 1]","Afin:l2, Link:, complete"
129,df_scaled_values,Agglomerative,0.0,0.222618,8,"[19, 18, 3, 3, 3, 2, 1, 1]","Afin:manhattan, Link:, complete"


Unnamed: 0,Data_frame,model,inertia,silhouette,Numb_clusters,Cluster_counts,model_params
1,df_scaled_values,kmeans,738.238672,0.335297,3,"[45, 4, 1]",n_init_Seed: 10 n_init_met: k-means++
0,df_scaled_values,kmeans,851.596557,0.324359,2,"[46, 4]",n_init_Seed: 10 n_init_met: k-means++
18,df_scaled_values,kmeans,851.596557,0.324359,2,"[46, 4]",n_init_Seed: 10 n_init_met: random
15,df_scaled_values,kmeans,175.441758,0.233633,17,"[8, 8, 5, 4, 4, 3, 3, 3, 2, 2, 2, 1, 1, 1, 1, ...",n_init_Seed: 10 n_init_met: k-means++


Unnamed: 0,Data_frame,model,inertia,silhouette,Numb_clusters,Cluster_counts,model_params
69,df_scaled_values,hierarchy,0.0,0.345771,4,"[45, 3, 1, 1]",centroid
58,df_scaled_values,hierarchy,0.0,0.345771,4,"[45, 3, 1, 1]",single
91,df_scaled_values,hierarchy,0.0,0.345771,4,"[45, 3, 1, 1]",weighted
80,df_scaled_values,hierarchy,0.0,0.345771,4,"[45, 3, 1, 1]",median


Unnamed: 0,Data_frame,model,inertia,silhouette,Numb_clusters,Cluster_counts,model_params
419,df_scaled_values,dbscan,0.0,0.490033,2,"[49, 1]","Epslon:9.74, min_samp: 31"
385,df_scaled_values,dbscan,0.0,0.490033,2,"[49, 1]","Epslon:7.68, min_samp: 21"
395,df_scaled_values,dbscan,0.0,0.490033,2,"[49, 1]","Epslon:8.71, min_samp: 11"
393,df_scaled_values,dbscan,0.0,0.490033,2,"[49, 1]","Epslon:8.71, min_samp: 7"


In [229]:
#DBSCAN RESULTS ANALIZING
display(model_results[(model_results.model == 'dbscan') &
               (model_results.Numb_clusters >= 3 )]\
.sort_values(by=['silhouette'],ascending = False))

Unnamed: 0,Data_frame,model,inertia,silhouette,Numb_clusters,Cluster_counts,model_params
345,df_scaled_values,dbscan,0.0,0.332978,3,"[48, 1, 1]","Epslon:5.63, min_samp: 1"
360,df_scaled_values,dbscan,0.0,0.332978,3,"[48, 1, 1]","Epslon:6.66, min_samp: 1"
322,df_scaled_values,dbscan,0.0,0.174536,34,"[6, 4, 3, 3, 3, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, ...","Epslon:2.55, min_samp: 1"
331,df_scaled_values,dbscan,0.0,0.157889,8,"[42, 2, 1, 1, 1, 1, 1, 1]","Epslon:4.61, min_samp: 1"
326,df_scaled_values,dbscan,0.0,0.136107,3,"[33, 14, 3]","Epslon:3.58, min_samp: 3"
320,df_scaled_values,dbscan,0.0,0.128829,44,"[3, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","Epslon:1.53, min_samp: 1"
325,df_scaled_values,dbscan,0.0,0.085899,15,"[33, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]","Epslon:3.58, min_samp: 1"
329,df_scaled_values,dbscan,0.0,0.057663,3,"[21, 15, 14]","Epslon:3.58, min_samp: 9"
318,df_scaled_values,dbscan,0.0,0.044548,48,"[3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","Epslon:0.5, min_samp: 1"
330,df_scaled_values,dbscan,0.0,0.004806,3,"[27, 12, 11]","Epslon:3.58, min_samp: 11"


In [230]:
## DBSCAN Parameters Chosen:
display(model_results.loc[[345]])
# saving parameters in instantiated model 
dbsc_1 = DBSCAN(eps = 5.63, min_samples = 1).fit(X_train_sc).labels_
results['dbsc_1'] = dbsc_1

Unnamed: 0,Data_frame,model,inertia,silhouette,Numb_clusters,Cluster_counts,model_params
345,df_scaled_values,dbscan,0.0,0.332978,3,"[48, 1, 1]","Epslon:5.63, min_samp: 1"


In [235]:
display(results['dbsc_1'].value_counts())

1    48
2     1
0     1
Name: dbsc_1, dtype: int64

In [237]:
#Films 
display(results[results['dbsc_1']==0])
display(results[results['dbsc_1']==2])
# display(results[results['dbsc_1']==3])

Unnamed: 0,title,cosine_sim,movieId,IMAX,Animation,Thriller,Western,Film-Noir,Documentary,Action,...,decade_1950,decade_1960,decade_1970,decade_1980,decade_1990,decade_2000,decade_2010,decade_none,decade_other,dbsc_1
0,Batman (1989),0.303245,592,0,0,1,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0


Unnamed: 0,title,cosine_sim,movieId,IMAX,Animation,Thriller,Western,Film-Noir,Documentary,Action,...,decade_1950,decade_1960,decade_1970,decade_1980,decade_1990,decade_2000,decade_2010,decade_none,decade_other,dbsc_1
11,Dances with Wolves (1990),0.391378,590,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,2


## KNN

In [245]:
 pd.DataFrame(recommender_df[film_name].sort_values()[0:1])

Unnamed: 0_level_0,True Lies (1994)
title,Unnamed: 1_level_1
True Lies (1994),0.0


In [239]:
from sklearn.neighbors import KNeighborsClassifier
y_train = results['dbsc_1']
knn = KNeighborsClassifier()
knn.fit(X_train_sc,y_train)
print(knn.predict([[1.1]]))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [13]:
film_name = 'True Lies (1994)'
print(movies.loc[movies['title']==film_name, 'title'])
print(pd.DataFrame({"title", recommender_df[film_name].head().index}
                   recommender_df[film_name].sort_values()[1:51].values,columns=['cosine_similarity']))

337    True Lies (1994)
Name: title, dtype: object
    cosine_similarity
0            0.303245
1            0.316269
2            0.327180
3            0.339251
4            0.341742
5            0.345414
6            0.357256
7            0.361425
8            0.372662
9            0.380683
10           0.388836
11           0.391378
12           0.397836
13           0.398235
14           0.400094
15           0.400950
16           0.402724
17           0.411024
18           0.416341
19           0.419639
20           0.423825
21           0.447750
22           0.449832
23           0.451686
24           0.454128
25           0.459386
26           0.461931
27           0.462669
28           0.465299
29           0.467112
30           0.470883
31           0.474974
32           0.477674
33           0.484754
34           0.485527
35           0.490025
36           0.491132
37           0.495456
38           0.499583
39           0.503650
40           0.511486
41           0.514127
42 

In [99]:
search = 'Die Hard'

for title in movies.loc[movies['title'].str.contains(search), 'title']:
    print(title)
    print('Average rating', pivot.loc[title, :].mean())
    print('Number of ratings', pivot.T[title].count())
    print('')
    print('10 closest movies')
    print(recommender_df[title].sort_values()[1:11])
    print('')
    print('*******************************************************************************************')
    print('')

Die Hard: With a Vengeance (1995)
Average rating 3.5555555555555554
Number of ratings 144

10 closest movies
title
True Lies (1994)                     0.341742
Speed (1994)                         0.365913
Cliffhanger (1993)                   0.377932
Ace Ventura: Pet Detective (1994)    0.381457
GoldenEye (1995)                     0.384206
Clear and Present Danger (1994)      0.399911
Fugitive, The (1993)                 0.408297
Batman (1989)                        0.413284
Outbreak (1995)                      0.423053
Batman Forever (1995)                0.423242
Name: Die Hard: With a Vengeance (1995), dtype: float64

*******************************************************************************************

Die Hard (1988)
Average rating 3.8620689655172415
Number of ratings 145

10 closest movies
title
Indiana Jones and the Last Crusade (1989)                                         0.336844
Terminator, The (1984)                                                            0.356

In [184]:
def grid_clusters(df_dic, par_dic):
    result_dic = {"Data_frame" :[],
                  "model": [],
                  "inertia": [],
                  "silhouette": [],
                  "Numb_clusters": [],
                  "Cluster_counts": [],
                  "model_params": []
                  }
    # loop dataFrames
    for df_name, df_data in df_dic.items():
        X = df_data
        # loop models
        for n_model in par_dic.keys():
            if n_model == "kmeans":
                # loop centroid seeds
                for n_init_seed in par_dic['kmeans']['init_seed']:
                    # loop Method for initialization
                    for n_init_method in par_dic['kmeans']['inits']:
                            # loop number of clusters
                            for n_cluster in par_dic['kmeans']['clusters_list']:
                                kmeans = KMeans(n_clusters=n_cluster,
                                                init=n_init_method,
                                                n_init=n_init_seed,
                                                tol=par_dic['kmeans']['tol'],
                                                random_state=42)
                                kmeans.fit(X)
                                #Saving results in dic
                                result_dic['Data_frame'].append(df_name)
                                result_dic['model'].append(n_model)
                                result_dic['inertia'].append(kmeans.inertia_)
                                result_dic['silhouette'].append(silhouette_score(X, kmeans.labels_))
                                result_dic['Numb_clusters'].append(n_cluster)
                                result_dic['Cluster_counts'].append(list(pd.Series(kmeans.labels_).value_counts()))
                                result_dic['model_params'].append("n_init_Seed: "
                                                                  + str(n_init_seed) +
                                                                  " n_init_met: " + 
                                                                  str(n_init_method)) 
                                
            if n_model == "hierarchy":
                # loop linkage_method
                for n_link_met in par_dic['hierarchy']['linkage_method']:
                    for n_clusters in par_dic['hierarchy']['t']:
                        z = hierarchy.linkage(X, method=n_link_met)
                        cls = hierarchy.fcluster(z, n_clusters, "maxclust")
                        # Saving results in dic
                        result_dic['Data_frame'].append(df_name)
                        result_dic['model'].append(n_model)
                        result_dic['inertia'].append(0)
                        result_dic['silhouette'].append(silhouette_score(X, cls))
                        result_dic['Numb_clusters'].append(n_clusters)
                        result_dic['Cluster_counts'].append(list(pd.Series(cls).value_counts()))
                        result_dic['model_params'].append(n_link_met)
                        
            if n_model == "Agglomerative":
                # loop linkage criterion
                for n_link_met in par_dic['Agglomerative']['linkage_method']:
                    #loop number of clusters
                    for n_clusters in par_dic['Agglomerative']['clusters_list']:
                        # loop method to compute linkage
                        for n_affinity in par_dic['Agglomerative']['affinity']:
                            ac = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage='ward')
                            ac.fit(X)
                            # Saving results in dic
                            result_dic['Data_frame'].append(df_name)
                            result_dic['model'].append(n_model)
                            result_dic['inertia'].append(0)
                            result_dic['silhouette'].append(silhouette_score(X, ac.labels_))
                            result_dic['Numb_clusters'].append(n_clusters)
                            result_dic['Cluster_counts'].append(list(pd.Series(ac.labels_).value_counts()))
                            result_dic['model_params'].append("Afin:" + n_affinity + ", Link:, " + n_link_met)
            
            if n_model == "dbscan":
                for e in par_dic['dbscan']['epslons']:
                    for samp in par_dic['dbscan']['min_samples']:
                        dbsc = DBSCAN(eps = e,
                                      min_samples = samp,
                                      metric='euclidean',
                                      leaf_size=30
                                      ).fit(X)
                        n_clusters = len(list(pd.Series(dbsc.labels_).value_counts()))
                        if n_clusters > 1 :      
                            result_dic['Data_frame'].append(df_name)
                            result_dic['model'].append(n_model)
                            result_dic['inertia'].append(0)
                            result_dic['silhouette'].append(silhouette_score(X, dbsc.labels_))
                            result_dic['Numb_clusters'].append(n_clusters)
                            result_dic['Cluster_counts'].append(list(pd.Series(dbsc.labels_).value_counts()))
                            result_dic['model_params'].append("Epslon:" + str(round(e,2)) + ", min_samp: " + str(samp))                           
    return pd.DataFrame(result_dic)