In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tqdm import tqdm_notebook

In [2]:
movies = pd.read_csv('movies.csv')
tags = pd.read_csv('tags.csv')
ratings = pd.read_csv('ratings.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
movies_new = movies.copy()
def new_str(s):
    return ' '.join(s.lower().replace(' ', '').replace('-', '').split('|'))

In [7]:
movies_new['new_genres'] = movies_new['genres'].apply(new_str)
movies_new.drop(columns=['genres'], inplace=True)

In [8]:
movies_new.head()

Unnamed: 0,movieId,title,new_genres
0,1,Toy Story (1995),adventure animation children comedy fantasy
1,2,Jumanji (1995),adventure children fantasy
2,3,Grumpier Old Men (1995),comedy romance
3,4,Waiting to Exhale (1995),comedy drama romance
4,5,Father of the Bride Part II (1995),comedy


In [9]:
tags.drop(columns=['userId','timestamp']).groupby('movieId').head()[tags['movieId']==1]

Unnamed: 0,movieId,tag
629,1,pixar
981,1,pixar
2886,1,fun


In [10]:
group_tags = tags.groupby('movieId').tag.agg(lambda column: ' '.join(column).lower())

In [11]:
new_tags = pd.DataFrame()

In [12]:
new_tags['movieId'] = group_tags.index

In [13]:
new_tags['new_tag'] = np.array(group_tags)

In [14]:
new_tags.head()

Unnamed: 0,movieId,new_tag
0,1,pixar pixar fun
1,2,fantasy magic board game robin williams game
2,3,moldy old
3,5,pregnancy remake
4,7,remake


In [15]:
df = pd.merge(movies_new, new_tags, on ='movieId', how='left').fillna("")

In [16]:
df.head()

Unnamed: 0,movieId,title,new_genres,new_tag
0,1,Toy Story (1995),adventure animation children comedy fantasy,pixar pixar fun
1,2,Jumanji (1995),adventure children fantasy,fantasy magic board game robin williams game
2,3,Grumpier Old Men (1995),comedy romance,moldy old
3,4,Waiting to Exhale (1995),comedy drama romance,
4,5,Father of the Bride Part II (1995),comedy,pregnancy remake


In [17]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [18]:
new_ratings = ratings.groupby('movieId')

In [19]:
new_ratings = new_ratings.mean()[['rating']]

In [20]:
new_ratings.rename(columns={'rating': 'mean_rating'}, inplace=True)
new_ratings = new_ratings.reset_index().head()

In [21]:
df = pd.merge(df, new_ratings, on='movieId', how='left').fillna(0)
df.head()

Unnamed: 0,movieId,title,new_genres,new_tag,mean_rating
0,1,Toy Story (1995),adventure animation children comedy fantasy,pixar pixar fun,3.92093
1,2,Jumanji (1995),adventure children fantasy,fantasy magic board game robin williams game,3.431818
2,3,Grumpier Old Men (1995),comedy romance,moldy old,3.259615
3,4,Waiting to Exhale (1995),comedy drama romance,,2.357143
4,5,Father of the Bride Part II (1995),comedy,pregnancy remake,3.071429


In [22]:
y = df[['mean_rating']]
df.drop(columns=['mean_rating'], inplace=True)

In [23]:
y.shape

(9742, 1)

Делим данные на тестовые и учебные
=====

In [24]:
x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.2)

TF-IDF
=====

In [51]:
c_tags = CountVectorizer()
c_genres = CountVectorizer()

In [52]:
t_tags = TfidfTransformer()
t_genres = TfidfTransformer()

In [53]:
tags_train_c = c_tags.fit_transform(x_train.new_tag)
tags_train_t = t_tags.fit_transform(tags_train_c)

genres_train_c = c_genres.fit_transform(x_train.new_genres)
genres_train_t = t_genres.fit_transform(genres_train_c)

In [54]:
tags_test_c = c_tags.transform(x_test.new_tag)
tags_test_t = t_tags.transform(tags_test_c)

genres_test_c = c_genres.transform(x_test.new_genres)
genres_test_t = t_genres.transform(genres_test_c)

Модель Регрессии - Тэги
====

In [55]:
lr_t = LinearRegression()
lr_t.fit(tags_train_t, y_train)
y_pred_t = lr_t.predict(tags_test_t)
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred_t)))

RMSE:  0.06456813540751602


Модель Регрессии - Жанры
====

In [56]:
lr_g = LinearRegression()
lr_g.fit(genres_train_t, y_train)
y_pred_g = lr_g.predict(genres_test_t)
print('RMSE: ', np.sqrt(mean_squared_error(y_test, y_pred_g)))

RMSE:  0.004676967787386213
