In [147]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error as rmse

## Import datasets

In [3]:
links = pd.read_csv('dataset\\links.csv')
movies = pd.read_csv('dataset\\movies.csv')
ratings = pd.read_csv('dataset\\ratings.csv')
tags = pd.read_csv('dataset\\tags.csv')

## Preprocess genres

In [78]:
df_movie_genre = movies.groupby('movieId').agg({'genres': lambda x: x.str.lower().str.replace(' ', '', regex=False).str.replace('-', '', regex=False).str.replace('|', ' ', regex=False)}).reset_index()
df_movie_genre.head()

Unnamed: 0,movieId,genres
0,1,adventure animation children comedy fantasy
1,2,adventure children fantasy
2,3,comedy romance
3,4,comedy drama romance
4,5,comedy


## Preprocess tags

In [82]:
df_movie_tag = tags.dropna().groupby('movieId').agg({'tag': lambda x: ' '.join(set(x.str.lower()))}).reset_index()
df_movie_tag.head()

Unnamed: 0,movieId,tag
0,1,pixar fun
1,2,game robin williams magic board game fantasy
2,3,old moldy
3,5,remake pregnancy
4,7,remake


## Preprocess ratings

In [154]:
movie_ratings = ratings.groupby(['movieId'], as_index=False).agg(rating_mean=('rating', 'mean'), rating_median=('rating', 'median'), rating_std=('rating', np.std)).reset_index(drop=True)
movie_ratings.head()

Unnamed: 0,movieId,rating_mean,rating_median,rating_std
0,1,3.92093,4.0,0.834859
1,2,3.431818,3.5,0.881713
2,3,3.259615,3.0,1.054823
3,4,2.357143,3.0,0.852168
4,5,3.071429,3.0,0.907148


## Merge into one df

In [155]:
df_movie_genre_tag_mean = df_movie_genre.merge(df_movie_tag, on='movieId')
df_movie_genre_tag_mean = df_movie_genre_tag_mean.merge(movie_ratings, on='movieId')
df_movie_genre_tag_mean.head()

Unnamed: 0,movieId,genres,tag,rating_mean,rating_median,rating_std
0,1,adventure animation children comedy fantasy,pixar fun,3.92093,4.0,0.834859
1,2,adventure children fantasy,game robin williams magic board game fantasy,3.431818,3.5,0.881713
2,3,comedy romance,old moldy,3.259615,3.0,1.054823
3,5,comedy,remake pregnancy,3.071429,3.0,0.907148
4,7,comedy romance,remake,3.185185,3.0,0.977561


## Modelling

In [122]:
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

In [160]:
df_movie_genre_tag_mean['tf_idf_genre'] = tfidf_vectorizer.fit_transform(df_movie_genre_tag_mean['genres']).todense()
df_movie_genre_tag_mean['tf_idf_tag'] = tfidf_vectorizer.fit_transform(df_movie_genre_tag_mean['tag']).todense()
X = df_movie_genre_tag_mean[['tf_idf_genre', 'tf_idf_tag']]
y = df_movie_genre_tag_mean['rating_mean']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model = KNeighborsRegressor(n_neighbors=10, n_jobs=-1, metric='manhattan')
model.fit(X_train, y_train)

In [161]:
y_pred = model.predict(X_test)
rmse(y_test, y_pred)

0.2811624608534696