In [None]:
%pip install lightfm pandas
%pip install --upgrade pip

In [179]:
import pandas as pd

links = pd.read_csv('5_GybridRS/ml-latest-small/links.csv')
movies = pd.read_csv('5_GybridRS/ml-latest-small/movies.csv')
ratings = pd.read_csv('5_GybridRS/ml-latest-small/ratings.csv')
tags = pd.read_csv('5_GybridRS/ml-latest-small/tags.csv')


In [180]:
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId').reset_index(drop=True)
movies_with_ratings.dropna(inplace=True)
movies_with_ratings.tail()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
100849,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184.0,4.0,1537109000.0
100850,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184.0,3.5,1537110000.0
100851,193585,Flint (2017),Drama,184.0,3.5,1537110000.0
100852,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184.0,3.5,1537110000.0
100853,193609,Andrew Dice Clay: Dice Rules (1991),Comedy,331.0,4.0,1537158000.0


In [181]:
interactions = movies_with_ratings.pivot_table(index='userId', columns='title', values='rating')
interactions.fillna(0, inplace=True)
interactions.tail()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
606.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0,0.0
609.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
610.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,...,0.0,4.0,3.5,3.0,0.0,0.0,2.0,1.5,0.0,0.0


In [182]:
from scipy.sparse import csr_matrix

interactions_matrix = csr_matrix(interactions)
interactions_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 100832 stored elements and shape (610, 9719)>

In [183]:
from lightfm import LightFM

model = LightFM(loss='warp', no_components=30, random_state=42)
model.fit(interactions_matrix, epochs=10)

<lightfm.lightfm.LightFM at 0x7f269054f650>

In [184]:
%pip install numpy

[0mNote: you may need to restart the kernel to use updated packages.


In [185]:
import numpy as np

user_id = 1
scores = model.predict(user_id, np.arange(interactions.shape[1]))
scores.shape

(9719,)

In [186]:
scores = pd.Series(scores, index=interactions.columns)
scores.sort_values(ascending=False).head()

title
Inception (2010)                 2.599602
Dark Knight, The (2008)          2.318889
Dark Knight Rises, The (2012)    2.032946
Django Unchained (2012)          2.018382
Inglourious Basterds (2009)      1.978894
dtype: float32

In [187]:
known_items = interactions.loc[user_id][interactions.loc[user_id] != 0].index
unknown_items = list(set(interactions.columns) - set(known_items))
scores[unknown_items].sort_values(ascending=False)[:10]

title
Inception (2010)                  2.599602
Dark Knight, The (2008)           2.318889
Dark Knight Rises, The (2012)     2.032946
Django Unchained (2012)           2.018382
Inglourious Basterds (2009)       1.978894
Shutter Island (2010)             1.963411
Interstellar (2014)               1.919477
Up (2009)                         1.807924
Hangover, The (2009)              1.789987
Guardians of the Galaxy (2014)    1.711558
dtype: float32

In [188]:
tags['tag'] = tags['tag'].astype(str)
tags_groupped_by_movie = tags.groupby('movieId')['tag'].apply(lambda tags: ' '.join(tags)).reset_index()
movies_with_tags = movies.merge(tags_groupped_by_movie, on='movieId', how='left')

grouped_rating = ratings.groupby('movieId')['rating']
average_ratings_median = grouped_rating.median().reset_index()

movies_total = movies_with_tags.merge(average_ratings_median, on='movieId', how='left')
movies_total.rename(columns={'rating': 'median_rating'}, inplace=True)
movies_total['genres'] = movies_total['genres'].str.replace('|', ' ').replace('-', ' ').replace('  ', ' ')
movies_total['genres_tag'] = movies_total['genres'] + ' ' + movies_total['tag'].fillna('')
movies_total = movies_total[['movieId', 'title', 'genres_tag', 'median_rating']]
movies_total.dropna(subset=['median_rating'], inplace=True)
movies_total.count()

movieId          9724
title            9724
genres_tag       9724
median_rating    9724
dtype: int64

In [189]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(movies_total['genres_tag'])
X_train_tfidf

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, movies_total['median_rating'], test_size=0.2, random_state=42)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((7779, 1746), (7779,), (1945, 1746), (1945,))

In [190]:
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error

model = Ridge(alpha=1.0, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
root_mean_squared_error(y_test, y_pred)

0.8175520810068921

In [199]:
des_movies = scores[unknown_items].sort_values(ascending=False)[:10]
des_movies

title
Inception (2010)                  2.599602
Dark Knight, The (2008)           2.318889
Dark Knight Rises, The (2012)     2.032946
Django Unchained (2012)           2.018382
Inglourious Basterds (2009)       1.978894
Shutter Island (2010)             1.963411
Interstellar (2014)               1.919477
Up (2009)                         1.807924
Hangover, The (2009)              1.789987
Guardians of the Galaxy (2014)    1.711558
dtype: float32

In [200]:
movie = movies_total[movies_total['title'].isin(des_movies.index)].copy()
movie['score'] = movie['title'].map(des_movies) # ладно я не понял как это работает
movie.sort_values('score', ascending=False)

Unnamed: 0,movieId,title,genres_tag,median_rating,score
7372,79132,Inception (2010),Action Crime Drama Mystery Sci-Fi Thriller IMA...,4.0,2.599602
6710,58559,"Dark Knight, The (2008)",Action Crime Drama IMAX psychology superhero d...,4.5,2.318889
7768,91529,"Dark Knight Rises, The (2012)",Action Adventure Crime IMAX Anne Hathaway Chri...,4.0,2.032946
8063,99114,Django Unchained (2012),Action Drama Western action Christoph Waltz fu...,4.0,2.018382
7010,68157,Inglourious Basterds (2009),Action Drama War black comedy Brad Pitt Christ...,4.0,1.978894
7258,74458,Shutter Island (2010),Drama Mystery Thriller insanity Leonardo DiCap...,4.5,1.963411
8376,109487,Interstellar (2014),Sci-Fi IMAX black hole sci-fi time-travel Chri...,4.0,1.919477
7039,68954,Up (2009),Adventure Animation Children Drama adventure B...,4.0,1.807924
7043,69122,"Hangover, The (2009)",Comedy Crime casino comedy funny hotel Las Vegas,4.0,1.789987
8475,112852,Guardians of the Galaxy (2014),Action Adventure Sci-Fi funny Great Visuals hu...,4.0,1.711558


In [201]:
movies_sp_matrix = tfidf.transform(movie['genres_tag'])
m_pred = model.predict(movies_sp_matrix)
movie['pred_rating'] = m_pred
movie.sort_values('pred_rating', ascending=False)

Unnamed: 0,movieId,title,genres_tag,median_rating,score,pred_rating
7372,79132,Inception (2010),Action Crime Drama Mystery Sci-Fi Thriller IMA...,4.0,2.599602,4.254891
6710,58559,"Dark Knight, The (2008)",Action Crime Drama IMAX psychology superhero d...,4.5,2.318889,4.239658
7258,74458,Shutter Island (2010),Drama Mystery Thriller insanity Leonardo DiCap...,4.5,1.963411,4.076827
7010,68157,Inglourious Basterds (2009),Action Drama War black comedy Brad Pitt Christ...,4.0,1.978894,3.884677
7039,68954,Up (2009),Adventure Animation Children Drama adventure B...,4.0,1.807924,3.832478
8063,99114,Django Unchained (2012),Action Drama Western action Christoph Waltz fu...,4.0,2.018382,3.822921
8376,109487,Interstellar (2014),Sci-Fi IMAX black hole sci-fi time-travel Chri...,4.0,1.919477,3.806945
7768,91529,"Dark Knight Rises, The (2012)",Action Adventure Crime IMAX Anne Hathaway Chri...,4.0,2.032946,3.802619
8475,112852,Guardians of the Galaxy (2014),Action Adventure Sci-Fi funny Great Visuals hu...,4.0,1.711558,3.728198
7043,69122,"Hangover, The (2009)",Comedy Crime casino comedy funny hotel Las Vegas,4.0,1.789987,3.673155


In [202]:
movie['total_rating'] = movie.apply(
    lambda row: (row['median_rating'] + row['score'] + row['pred_rating']) / 3, axis=1)
movie.sort_values('total_rating', ascending=False)[:5]

Unnamed: 0,movieId,title,genres_tag,median_rating,score,pred_rating,total_rating
6710,58559,"Dark Knight, The (2008)",Action Crime Drama IMAX psychology superhero d...,4.5,2.318889,4.239658,3.686183
7372,79132,Inception (2010),Action Crime Drama Mystery Sci-Fi Thriller IMA...,4.0,2.599602,4.254891,3.618164
7258,74458,Shutter Island (2010),Drama Mystery Thriller insanity Leonardo DiCap...,4.5,1.963411,4.076827,3.513413
7010,68157,Inglourious Basterds (2009),Action Drama War black comedy Brad Pitt Christ...,4.0,1.978894,3.884677,3.287857
8063,99114,Django Unchained (2012),Action Drama Western action Christoph Waltz fu...,4.0,2.018382,3.822921,3.280434


<center>end</center>