# Домашнее задание по теме «Рекомендации на основе содержания»

Задание
1. Использовать dataset [MovieLens](https://grouplens.org/datasets/movielens/latest/)
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
    - TF-IDF на тегах и жанрах
    - Средние оценки (+ median, variance, etc.) пользователя и фильма
3. Оценить RMSE на тестовой выборке

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

%matplotlib inline


In [2]:
links_df = pd.read_csv('data/links.csv')
movies_df = pd.read_csv('data/movies.csv')
ratings_df = pd.read_csv('data/ratings.csv')
tags_df = pd.read_csv('data/tags.csv')


In [3]:
tags_df.head()


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [4]:
movies_df.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))


In [6]:
movies_with_tags_df = movies_df.join(
    tags_df.set_index('movieId'), on='movieId')
movies_with_tags_df.head()


Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [7]:
tag_strings = []
movies = []

for movie, group in tqdm(movies_with_tags_df.groupby('title')):
    tag_strings.append(
        ' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies.append(movie)


  0%|          | 0/9737 [00:00<?, ?it/s]

In [8]:
movies[:5]

["'71 (2014)",
 "'Hellboy': The Seeds of Creation (2004)",
 "'Round Midnight (1986)",
 "'Salem's Lot (2004)",
 "'Til There Was You (1997)"]

In [9]:
movies_with_tags_df = pd.DataFrame(list(zip(movies, tag_strings)), columns=['title', 'tag'])
movies_with_genres_tags_df = movies_df.join(movies_with_tags_df.set_index('title'), on='title')
movies_with_genres_tags_df['genres'] = movies_with_genres_tags_df['genres'].apply(change_string)
movies_with_genres_tags_df.head()


Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magicboardgame RobinWilliams game
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old
3,4,Waiting to Exhale (1995),Comedy Drama Romance,
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake


In [10]:
# Average and median movie ratings
mean_ratings = ratings_df.groupby('movieId').mean().drop(
    ['userId', 'timestamp'], axis=1)
mean_ratings.rename(columns={'rating': 'mean_ratings'}, inplace=True)

median_ratings = ratings_df.groupby('movieId').median().drop(
    ['userId', 'timestamp'], axis=1)
median_ratings.rename(columns={'rating': 'med_ratings'}, inplace=True)

variance_ratings = ratings_df.groupby('movieId').var().drop(
    ['userId', 'timestamp'], axis=1)
variance_ratings.rename(columns={'rating': 'var_ratings'}, inplace=True)

# adding a column with the average and median movie ratings
movies_with_genres_tags_ratings = movies_with_genres_tags_df.join(
    mean_ratings, on='movieId')
movies_with_genres_tags_ratings = movies_with_genres_tags_ratings.join(
    median_ratings, on='movieId')
movies_with_genres_tags_ratings = movies_with_genres_tags_ratings.join(
    variance_ratings, on='movieId')

movies_with_genres_tags_ratings.head()


Unnamed: 0,movieId,title,genres,tag,mean_ratings,med_ratings,var_ratings
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun,3.92093,4.0,0.69699
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magicboardgame RobinWilliams game,3.431818,3.5,0.777419
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old,3.259615,3.0,1.112651
3,4,Waiting to Exhale (1995),Comedy Drama Romance,,2.357143,3.0,0.72619
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake,3.071429,3.0,0.822917


### Genre recommendation

In [11]:
# genres list
genres_list = movies_with_genres_tags_ratings.genres.to_list()

# train model
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(genres_list)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

neigh = KNeighborsRegressor(n_neighbors=7, n_jobs=-1, metric='euclidean')
neigh.fit(X_train_counts, X_train_tfidf)


KNeighborsRegressor(metric='euclidean', n_jobs=-1, n_neighbors=7)

In [12]:
# test
test = change_string('Adventure|Comedy|Fantasy|Crime')

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

predicted_movies = neigh.kneighbors(X_tfidf2, return_distance=True)
predicted_movies


(array([[0.88433496, 0.88433496, 0.88433496, 0.88433496, 0.88433496,
         0.88433496, 0.88433496]]),
 array([[6957, 4137, 4152, 3638, 6955, 4076, 6912]], dtype=int64))

In [13]:
# list movies in descending order of rating
movies_with_genres_tags_ratings.iloc[predicted_movies[1][0]].sort_values(by=['mean_ratings'], ascending=False)


Unnamed: 0,movieId,title,genres,tag,mean_ratings,med_ratings,var_ratings
3638,4993,"Lord of the Rings: The Fellowship of the Ring,...",Adventure Fantasy,fantasy highfantasy Magic mythology tolkien wi...,4.106061,4.5,0.874481
4137,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure Fantasy,Myth Tolkein,4.021277,4.0,0.796336
4076,5816,Harry Potter and the Chamber of Secrets (2002),Adventure Fantasy,Magic Wizards,3.598039,3.5,0.85663
6957,65685,Inkheart (2008),Adventure Fantasy,,3.5,3.5,0.25
6912,64167,Dinotopia (2002),Adventure Fantasy,,3.333333,3.5,0.083333
4152,5974,"Thief of Bagdad, The (1940)",Adventure Fantasy,,3.0,3.0,
6955,65651,Fire and Ice (2008),Adventure Fantasy,,2.0,2.0,


### Tag recommendation

In [14]:
# number of unique tags
movies_with_genres_tags_ratings.tag.unique().shape


(1040,)

In [15]:
movies_with_genres_tags_ratings.dropna(inplace=True)


In [16]:
# tags list
tag_strings = movies_with_genres_tags_ratings.tag.to_list()

# train model
count_vect_tags = CountVectorizer()
X_train_counts_tags = count_vect_tags.fit_transform(tag_strings)

tfidf_tags_transformer = TfidfTransformer()
X_train_tags_tfidf = tfidf_tags_transformer.fit_transform(X_train_counts_tags)

neig_reg_tags = KNeighborsRegressor(
    n_neighbors=10, n_jobs=-1, metric='manhattan')
neig_reg_tags.fit(X_train_counts_tags, X_train_tags_tfidf)


KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [17]:
tag_strings[1]


'fantasy magicboardgame RobinWilliams game'

In [18]:
# test
test_tags = 'fantasy magicboardgame RobinWilliams game'

predict_tags = count_vect_tags.transform([test_tags])
X_tfidf2_tags = tfidf_tags_transformer.transform(predict_tags)

res_tags = neig_reg_tags.kneighbors(X_tfidf2_tags, return_distance=True)
res_tags


(array([[1.99701857, 2.00298143, 2.99701857, 2.99701857, 2.99701857,
         2.99701857, 2.99701857, 2.99701857, 2.99701857, 2.99701857]]),
 array([[2067,    1, 4090, 4092, 4088, 4089, 4085, 4093, 4091, 4094]],
       dtype=int64))

In [19]:
# list movies in descending order of rating
movies_with_genres_tags_ratings.iloc[res_tags[1][0]].sort_values(by=['mean_ratings'], ascending=False)


Unnamed: 0,movieId,title,genres,tag,mean_ratings,med_ratings,var_ratings
5335,8874,Shaun of the Dead (2004),Comedy Horror,zombies,4.006494,4.0,0.865089
5347,8914,Primer (2004),Drama SciFi,timetravel,3.794118,4.0,0.658088
2382,3160,Magnolia (1999),Drama,L.A.,3.711538,4.0,0.983786
5343,8908,Ladder 49 (2004),Action Drama Thriller,,3.666667,3.5,0.266667
5344,8910,I Heart Huckabees (2004),Comedy,,3.452381,3.5,0.997619
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magicboardgame RobinWilliams game,3.431818,3.5,0.777419
5349,8916,Shall We Dance? (2004),Comedy Romance,,3.0,3.0,0.25
5341,8906,Cannibal Holocaust (1980),Horror,,2.666667,2.5,0.583333
5342,8907,Shark Tale (2004),Animation Children Comedy,,2.346154,2.5,0.932692
5346,8912,Taxi (2004),Action Comedy,,1.75,1.75,0.125


### RMSE

In [20]:
# remove movies without rating
print(movies_with_genres_tags_ratings.mean_ratings.isna().any())
movies_with_genres_tags_ratings3 = movies_with_genres_tags_ratings.dropna()

False


In [21]:
# split to train, test
X_train, X_test, y_train, y_test = train_test_split(movies_with_genres_tags_ratings3.drop(['mean_ratings'], axis=1),
                                                    movies_with_genres_tags_ratings3.mean_ratings, test_size=0.33,
                                                    random_state=42)
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)


In [22]:
# train

train_genres_strings = train_df.genres.to_list()
count_vect_tags = CountVectorizer()
X_train_count4 = count_vect_tags.fit_transform(train_genres_strings)

tfidf_transformer = TfidfTransformer()
X_train_tfidf4 = tfidf_transformer.fit_transform(X_train_count4)

neig4 = KNeighborsRegressor(n_neighbors=10, n_jobs=-1, metric='manhattan')
neig4.fit(X_train_count4, train_df.mean_ratings)


KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [23]:
# predict test
test_genres_strings = test_df.genres.to_list()

X_test_count4 = count_vect_tags.transform(test_genres_strings)
X_test_tfidf4 = tfidf_transformer.transform(X_test_count4)

predicted = neig4.predict(X_test_count4)


In [24]:
# RMSE
mean_squared_error(test_df.mean_ratings, predicted)


0.4383887660873539