In [10]:
import pandas as pd
from mlxtend.preprocessing import MeanCenterer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import time
start_time = time.time()

data_path = {
    'ratings': 'ratings.txt',
    'movies': 'netflix_titles.csv'
}

def load_data(data: str) -> pd.DataFrame:
    df = pd.read_csv(data_path[data], \
                        sep=',', \
                        engine='python')

    return df

In [2]:

def movie_data_treatment(df_movies: pd.DataFrame) -> pd.DataFrame:
    # crea columna de id
    df_movies['idMovie'] = [i for i in range(1, len(df_movies) + 1)]
    # coge las primeras 3952 para tener las mismas que en el otro dataset
    movies_id = [i for i in range(0, 3952)]
    df_movies = df_movies.iloc[movies_id]
    # selecciona columnas necesarias
    movie_keep_cols = ['title', 'idMovie', 'type', 'listed_in']
    df_movies = df_movies[movie_keep_cols]
    
    return df_movies


In [3]:

def create_rating_matrix(df_movies: pd.DataFrame, df_ratings: pd.DataFrame) -> pd.DataFrame:
    df = pd.merge(df_ratings, df_movies, on='idMovie', how='left')
    # crea matriz de valoraciones
    ratings_matrix = df.pivot( index = "idUser", columns = "idMovie", values = "rating")
    # 0 si no se ha valorado la pelicula
    ratings_matrix.fillna( 0, inplace = True ) 
    # elimina posibles duplicados
    ratings_matrix = ratings_matrix.loc[:, ~ratings_matrix.columns.duplicated()]
    
    return ratings_matrix


In [4]:


def group_movies_Kmeans(df_rating: pd.DataFrame) -> pd.DataFrame:
    # traspone df para aplicar kmeans en los objetos
    df_transpose = df_rating.transpose()
    film_index = df_transpose.index
    user_index = df_transpose.columns
    # centra en la media la matriz
    df_centered = MeanCenterer().fit(df_transpose.to_numpy())
    df_centered = df_centered.transform(df_transpose.to_numpy())
    df_centered = pd.DataFrame(df_centered)

    # resetea indices matriz
    df_centered.columns = user_index.astype(str)
    df_centered.index = film_index

    # encuentra numero optimo de agrupaciones
    kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
    }
    sse = []
    for k in range(1, 11):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(df_centered)
        sse.append(kmeans.inertia_)
   
    kl = KneeLocator(
        range(1, 11), sse, curve="convex", direction="decreasing")
    kl.elbow 

    # aplica kmeans
    kmeans = KMeans(n_clusters=kl.elbow, **kmeans_kwargs)
    kmeans.fit(df_centered)

    # adds groups to rating matrix
    df_centered['Group'] = kmeans.labels_

    return df_centered


In [5]:

def get_user_not_rated_movies(idUser: int, df_ratings: pd.DataFrame, \
                                           df_movies: pd.DataFrame) -> pd.DataFrame:
    df_rated = df_ratings[ df_ratings ['idUser'] == idUser][['idMovie', 'rating']]
    # peliculas no valoradas
    not_rated = df_movies[~df_movies['idMovie'].isin(df_rated['idMovie'])]
    # peliculas que tienen al menos una valoracion ( si no se se pueden comparar )
    rated_movies = not_rated[not_rated['idMovie'].isin(df_ratings['idMovie'])] 
    df_not_rated = pd.DataFrame(rated_movies['idMovie'], columns=['idMovie'])
    df_not_rated = df_not_rated.reset_index(drop=True)

    return df_not_rated


In [6]:

def get_movie_similarity_scores(idMovie: int, df_rating_matrix: pd.DataFrame) -> pd.DataFrame:
    # movie group
    movie_group = df_rating_matrix[df_rating_matrix.index == idMovie]['Group']
    # movies of same group
    df_same_group = df_rating_matrix[df_rating_matrix['Group'] == movie_group.values[0]]
    # similatity matrix between target movie and the rest of same group
    cosineSim_matrix = cosine_similarity( df_same_group, df_same_group)
    cosineSim_matrix = pd.DataFrame( cosineSim_matrix, \
                                    index = df_same_group.index, \
                                    columns = df_same_group.index) 
    # target movie similarity scores
    similarity_scores = cosineSim_matrix[idMovie]
    #create dataframe with data
    index = similarity_scores.index
    values = similarity_scores.values
    df_similarity = pd.DataFrame({'idMovie': index, 'similarity score': values})
    #sort by similarity score
    df_similarity = df_similarity.sort_values(by='similarity score', \
                                            axis=0, ascending=False)
    df_similarity = df_similarity.reset_index(drop=True)
    # drop target movie sim. score with itself 
    df_similarity = df_similarity[1:]
    
    return df_similarity



In [7]:


def get_predicted_rating(df_similarity: pd.DataFrame, \
                         df_ratings: pd.DataFrame) -> float:
    # coge las peliculas valoradas y su puntuacion
    df_similarity_rated = pd.merge(df_similarity, df_ratings, how='inner', on='idMovie')
    df_top_10 = df_similarity_rated[:10] 
    
    #predice la valoracion
    nominator = 0
    denominator = 0
    ratings = df_top_10['rating'].to_list()
    simmilarity_score = df_top_10['similarity score'].to_list()

    for i in range(len(ratings)):
        if simmilarity_score[i] >= 0:
            nominator += ratings[i]*simmilarity_score[i]
            denominator += simmilarity_score[i]
    try:
        ratingPrediction = nominator / denominator
    except ZeroDivisionError: #division por 0
        ratingPrediction = 0
    return ratingPrediction


In [8]:


def get_recommendation(df_predicted_ratings: pd.DataFrame, \
                                df_movies: pd.DataFrame) -> pd.DataFrame:
    df_sorted = df_predicted_ratings.sort_values(by='predictedRating', axis=0, ascending=False)
    df_top_ratings = df_sorted[0:5] # recomienda cinco peliculas
    
    df_titles = pd.merge(df_top_ratings, df_movies, how='left', on='idMovie')
    df_titles = df_titles[['idMovie', 'predictedRating', 'title', 'listed_in']]
    
    return df_titles


In [11]:

idUser = 2

ratings = load_data('ratings')
ratings


Unnamed: 0,idUser,idMovie,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [12]:

movies = load_data('movies')
movies

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [13]:

movies = movie_data_treatment(movies)
movies

Unnamed: 0,title,idMovie,type,listed_in
0,Dick Johnson Is Dead,1,Movie,Documentaries
1,Blood & Water,2,TV Show,"International TV Shows, TV Dramas, TV Mysteries"
2,Ganglands,3,TV Show,"Crime TV Shows, International TV Shows, TV Act..."
3,Jailbirds New Orleans,4,TV Show,"Docuseries, Reality TV"
4,Kota Factory,5,TV Show,"International TV Shows, Romantic TV Shows, TV ..."
...,...,...,...,...
3947,Ricardo Quevedo: Los amargados somos más,3948,Movie,Stand-Up Comedy
3948,Suzzanna: Buried Alive,3949,Movie,"Horror Movies, International Movies"
3949,Tango,3950,TV Show,"International TV Shows, Romantic TV Shows, TV ..."
3950,Ek Ladki Ko Dekha Toh Aisa Laga,3951,Movie,"Comedies, Dramas, International Movies"


In [14]:

rating_matrix = create_rating_matrix(movies, ratings)
rating_matrix

idMovie,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
idUser,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:

grouped_rating_matrix = group_movies_Kmeans(rating_matrix)
grouped_rating_matrix

found 0 physical cores < 1
  File "C:\Users\acasa\AppData\Roaming\Python\Python311\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


idUser,1,2,3,4,5,6,7,8,9,10,...,6032,6033,6034,6035,6036,6037,6038,6039,6040,Group
idMovie,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.940097,-0.12925,-0.053697,-0.023745,-0.168106,3.925256,-0.036158,3.85429,4.893146,4.554776,...,3.883972,-0.062331,-0.023206,3.802752,-0.791419,-0.202644,-0.020507,-0.12871,2.670804,0
2,-0.059903,-0.12925,-0.053697,-0.023745,-0.168106,-0.074744,-0.036158,-0.14571,-0.106854,4.554776,...,-0.116028,-0.062331,-0.023206,-0.197248,-0.791419,-0.202644,-0.020507,-0.12871,-0.329196,1
3,-0.059903,-0.12925,-0.053697,-0.023745,-0.168106,-0.074744,-0.036158,-0.14571,-0.106854,-0.445224,...,-0.116028,-0.062331,-0.023206,0.802752,-0.791419,-0.202644,-0.020507,-0.12871,-0.329196,1
4,-0.059903,-0.12925,-0.053697,-0.023745,-0.168106,-0.074744,-0.036158,2.85429,-0.106854,-0.445224,...,-0.116028,-0.062331,-0.023206,1.802752,1.208581,-0.202644,-0.020507,-0.12871,-0.329196,2
5,-0.059903,-0.12925,-0.053697,-0.023745,-0.168106,-0.074744,-0.036158,-0.14571,-0.106854,-0.445224,...,-0.116028,-0.062331,-0.023206,0.802752,-0.791419,-0.202644,-0.020507,-0.12871,-0.329196,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,-0.059903,-0.12925,-0.053697,-0.023745,-0.168106,-0.074744,-0.036158,-0.14571,2.893146,3.554776,...,-0.116028,-0.062331,-0.023206,-0.197248,-0.791419,-0.202644,-0.020507,-0.12871,-0.329196,1
3949,-0.059903,-0.12925,-0.053697,-0.023745,-0.168106,-0.074744,-0.036158,-0.14571,-0.106854,-0.445224,...,-0.116028,-0.062331,-0.023206,-0.197248,-0.791419,-0.202644,-0.020507,-0.12871,-0.329196,2
3950,-0.059903,-0.12925,-0.053697,-0.023745,-0.168106,-0.074744,-0.036158,-0.14571,-0.106854,-0.445224,...,-0.116028,-0.062331,-0.023206,-0.197248,-0.791419,-0.202644,-0.020507,-0.12871,-0.329196,2
3951,-0.059903,-0.12925,-0.053697,-0.023745,-0.168106,-0.074744,-0.036158,-0.14571,-0.106854,-0.445224,...,-0.116028,-0.062331,-0.023206,-0.197248,-0.791419,-0.202644,-0.020507,-0.12871,-0.329196,2


In [16]:

df_not_rated = get_user_not_rated_movies(idUser, ratings, movies)
df_not_rated


Unnamed: 0,idMovie
0,1
1,2
2,3
3,4
4,5
...,...
3572,3948
3573,3949
3574,3950
3575,3951


In [17]:

df_predicted_ratings = pd.DataFrame()

for idMovie in df_not_rated['idMovie']:
    print(idMovie)
    similarity_scores = get_movie_similarity_scores(idMovie, grouped_rating_matrix)
    predicted_rating = get_predicted_rating(similarity_scores, ratings)
    new_prediction = {'idMovie': idMovie, 'predictedRating': predicted_rating}
    df_predicted_ratings = pd.concat([df_predicted_ratings, pd.DataFrame([new_prediction])], ignore_index=True)


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
92
93
94
96
97
98
99
100
101
102
103
104
105
106
107
108
111
112
113
114
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
164
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
222
223
224
225
226
227
228
229
230
231
232
233
234
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
286
287
288
289
290
291



KeyboardInterrupt



In [None]:

    
df_predicted_ratings

recommendation = get_recommendation(df_predicted_ratings, movies)


In [None]:
recommendation