In [616]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LinearRegression

%matplotlib inline

    Построить рекомендации (регрессия, предсказываем оценку) на фичах:

    TF-IDF на тегах и жанрах
    Средние оценки (+ median, variance, etc.) пользователя и фильма

    Оценить RMSE на тестовой выборке


In [617]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [618]:
ratings = ratings[['movieId', 'rating']]
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 2 columns):
movieId    100836 non-null int64
rating     100836 non-null float64
dtypes: float64(1), int64(1)
memory usage: 1.5 MB


In [619]:
len(ratings['movieId'].unique())

9724

In [620]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')
len(movies_with_tags['movieId'].unique())

9742

In [621]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [622]:
movies_with_tags[movies_with_tags.title == 'Gintama: The Movie (2010)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
9732,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,anime,1537099000.0
9732,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,comedy,1537099000.0
9732,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,gintama,1537099000.0
9732,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,remaster,1537099000.0


In [623]:
ratings.head()

Unnamed: 0,movieId,rating
0,1,4.0
1,3,4.0
2,6,4.0
3,47,5.0
4,50,5.0


In [624]:
ratings[ratings.movieId == int('2')]

Unnamed: 0,movieId,rating
560,2,4.0
1026,2,4.0
1773,2,3.0
2275,2,3.0
2977,2,3.0
...,...,...
95102,2,4.0
95965,2,4.0
97044,2,5.0
97144,2,3.5


In [625]:
movies_with_ratings = movies_with_tags.join(ratings.set_index(['movieId']), on='movieId').dropna()
movies_with_ratings.tail()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,rating
9710,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,star wars,1528935000.0,5.0
9732,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,anime,1537099000.0,3.5
9732,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,comedy,1537099000.0,3.5
9732,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,gintama,1537099000.0,3.5
9732,193565,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi,184.0,remaster,1537099000.0,3.5


In [626]:
movies_with_ratings.title.unique()

array(['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)',
       ..., 'Deadpool 2 (2018)', 'Solo: A Star Wars Story (2018)',
       'Gintama: The Movie (2010)'], dtype=object)

In [627]:
movies_with_ratings[movies_with_ratings.title == 'Solo: A Star Wars Story (2018)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,rating
9710,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,Emilia Clarke,1528935000.0,4.0
9710,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,Emilia Clarke,1528935000.0,4.0
9710,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,Emilia Clarke,1528935000.0,3.5
9710,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,Emilia Clarke,1528935000.0,3.0
9710,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,Emilia Clarke,1528935000.0,5.0
9710,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,star wars,1528935000.0,4.0
9710,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,star wars,1528935000.0,4.0
9710,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,star wars,1528935000.0,3.5
9710,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,star wars,1528935000.0,3.0
9710,187595,Solo: A Star Wars Story (2018),Action|Adventure|Children|Sci-Fi,62.0,star wars,1528935000.0,5.0


In [628]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [629]:
vectorizer = TfidfVectorizer()

In [630]:
movies_with_ratings['genres'] = movies_with_ratings['genres'].apply(change_string)

In [631]:
movies_with_ratings[movies_with_ratings.title == 'Solo: A Star Wars Story (2018)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,rating
9710,187595,Solo: A Star Wars Story (2018),Action Adventure Children SciFi,62.0,Emilia Clarke,1528935000.0,4.0
9710,187595,Solo: A Star Wars Story (2018),Action Adventure Children SciFi,62.0,Emilia Clarke,1528935000.0,4.0
9710,187595,Solo: A Star Wars Story (2018),Action Adventure Children SciFi,62.0,Emilia Clarke,1528935000.0,3.5
9710,187595,Solo: A Star Wars Story (2018),Action Adventure Children SciFi,62.0,Emilia Clarke,1528935000.0,3.0
9710,187595,Solo: A Star Wars Story (2018),Action Adventure Children SciFi,62.0,Emilia Clarke,1528935000.0,5.0
9710,187595,Solo: A Star Wars Story (2018),Action Adventure Children SciFi,62.0,star wars,1528935000.0,4.0
9710,187595,Solo: A Star Wars Story (2018),Action Adventure Children SciFi,62.0,star wars,1528935000.0,4.0
9710,187595,Solo: A Star Wars Story (2018),Action Adventure Children SciFi,62.0,star wars,1528935000.0,3.5
9710,187595,Solo: A Star Wars Story (2018),Action Adventure Children SciFi,62.0,star wars,1528935000.0,3.0
9710,187595,Solo: A Star Wars Story (2018),Action Adventure Children SciFi,62.0,star wars,1528935000.0,5.0


In [632]:
movies_with_ratings['genres'] = vectorizer.fit_transform(movies_with_ratings['genres'].apply(change_string))

In [633]:
movies_with_ratings[movies_with_ratings.title == 'Solo: A Star Wars Story (2018)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,rating
9710,187595,Solo: A Star Wars Story (2018),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",62.0,Emilia Clarke,1528935000.0,4.0
9710,187595,Solo: A Star Wars Story (2018),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",62.0,Emilia Clarke,1528935000.0,4.0
9710,187595,Solo: A Star Wars Story (2018),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",62.0,Emilia Clarke,1528935000.0,3.5
9710,187595,Solo: A Star Wars Story (2018),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",62.0,Emilia Clarke,1528935000.0,3.0
9710,187595,Solo: A Star Wars Story (2018),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",62.0,Emilia Clarke,1528935000.0,5.0
9710,187595,Solo: A Star Wars Story (2018),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",62.0,star wars,1528935000.0,4.0
9710,187595,Solo: A Star Wars Story (2018),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",62.0,star wars,1528935000.0,4.0
9710,187595,Solo: A Star Wars Story (2018),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",62.0,star wars,1528935000.0,3.5
9710,187595,Solo: A Star Wars Story (2018),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",62.0,star wars,1528935000.0,3.0
9710,187595,Solo: A Star Wars Story (2018),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",62.0,star wars,1528935000.0,5.0


In [634]:
movies_with_ratings.genres.value_counts()

  (0, 114)\t1.0\n  (1, 114)\t1.0\n  (2, 114)\t1.0\n  (3, 114)\t1.0\n  (4, 114)\t1.0\n  (5, 114)\t1.0\n  (6, 114)\t1.0\n  (7, 114)\t1.0\n  (8, 114)\t1.0\n  (9, 114)\t1.0\n  (10, 114)\t1.0\n  (11, 114)\t1.0\n  (12, 114)\t1.0\n  (13, 114)\t1.0\n  (14, 114)\t1.0\n  (15, 114)\t1.0\n  (16, 114)\t1.0\n  (17, 114)\t1.0\n  (18, 114)\t1.0\n  (19, 114)\t1.0\n  (20, 114)\t1.0\n  (21, 114)\t1.0\n  (22, 114)\t1.0\n  (23, 114)\t1.0\n  (24, 114)\t1.0\n  :\t:\n  (233188, 64)\t1.0\n  (233189, 64)\t1.0\n  (233190, 64)\t1.0\n  (233191, 64)\t1.0\n  (233192, 64)\t1.0\n  (233193, 64)\t1.0\n  (233194, 64)\t1.0\n  (233195, 64)\t1.0\n  (233196, 64)\t1.0\n  (233197, 64)\t1.0\n  (233198, 64)\t1.0\n  (233199, 11)\t1.0\n  (233200, 11)\t1.0\n  (233201, 11)\t1.0\n  (233202, 11)\t1.0\n  (233203, 11)\t1.0\n  (233204, 11)\t1.0\n  (233205, 11)\t1.0\n  (233206, 11)\t1.0\n  (233207, 11)\t1.0\n  (233208, 11)\t1.0\n  (233209, 47)\t1.0\n  (233210, 47)\t1.0\n  (233211, 47)\t1.0\n  (233212, 47)\t1.0    233213
Name: genres, dtyp

In [635]:
def change_tag(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split(' '))

In [636]:
movies_with_ratings['tag'] = movies_with_ratings['tag'].apply(change_tag)

In [637]:
movies_with_ratings[movies_with_ratings.title == 'Gintama: The Movie (2010)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,rating
9732,193565,Gintama: The Movie (2010),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",184.0,anime,1537099000.0,3.5
9732,193565,Gintama: The Movie (2010),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",184.0,comedy,1537099000.0,3.5
9732,193565,Gintama: The Movie (2010),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",184.0,gintama,1537099000.0,3.5
9732,193565,Gintama: The Movie (2010),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",184.0,remaster,1537099000.0,3.5


In [638]:
movies_with_ratings['tag'] = vectorizer.fit_transform(movies_with_ratings['tag'].apply(change_tag))

In [639]:
movies_with_ratings[movies_with_ratings.title == 'Gintama: The Movie (2010)']

Unnamed: 0,movieId,title,genres,userId,tag,timestamp,rating
9732,193565,Gintama: The Movie (2010),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",184.0,"(0, 1011)\t1.0\n (1, 1011)\t1.0\n (2, 1011...",1537099000.0,3.5
9732,193565,Gintama: The Movie (2010),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",184.0,"(0, 1011)\t1.0\n (1, 1011)\t1.0\n (2, 1011...",1537099000.0,3.5
9732,193565,Gintama: The Movie (2010),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",184.0,"(0, 1011)\t1.0\n (1, 1011)\t1.0\n (2, 1011...",1537099000.0,3.5
9732,193565,Gintama: The Movie (2010),"(0, 114)\t1.0\n (1, 114)\t1.0\n (2, 114)\t...",184.0,"(0, 1011)\t1.0\n (1, 1011)\t1.0\n (2, 1011...",1537099000.0,3.5


In [640]:
movies_with_ratings.tag.value_counts()

  (0, 1011)\t1.0\n  (1, 1011)\t1.0\n  (2, 1011)\t1.0\n  (3, 1011)\t1.0\n  (4, 1011)\t1.0\n  (5, 1011)\t1.0\n  (6, 1011)\t1.0\n  (7, 1011)\t1.0\n  (8, 1011)\t1.0\n  (9, 1011)\t1.0\n  (10, 1011)\t1.0\n  (11, 1011)\t1.0\n  (12, 1011)\t1.0\n  (13, 1011)\t1.0\n  (14, 1011)\t1.0\n  (15, 1011)\t1.0\n  (16, 1011)\t1.0\n  (17, 1011)\t1.0\n  (18, 1011)\t1.0\n  (19, 1011)\t1.0\n  (20, 1011)\t1.0\n  (21, 1011)\t1.0\n  (22, 1011)\t1.0\n  (23, 1011)\t1.0\n  (24, 1011)\t1.0\n  :\t:\n  (233188, 1141)\t1.0\n  (233189, 1141)\t1.0\n  (233190, 1141)\t1.0\n  (233191, 1141)\t1.0\n  (233192, 1141)\t1.0\n  (233193, 1141)\t1.0\n  (233194, 1141)\t1.0\n  (233195, 1141)\t1.0\n  (233196, 1141)\t1.0\n  (233197, 1141)\t1.0\n  (233198, 1141)\t1.0\n  (233199, 435)\t1.0\n  (233200, 435)\t1.0\n  (233201, 435)\t1.0\n  (233202, 435)\t1.0\n  (233203, 435)\t1.0\n  (233204, 1256)\t1.0\n  (233205, 1256)\t1.0\n  (233206, 1256)\t1.0\n  (233207, 1256)\t1.0\n  (233208, 1256)\t1.0\n  (233209, 75)\t1.0\n  (233210, 300)\t1.0\n  (233

In [None]:
count_vect = CountVectorizer()
X_genres = count_vect.fit_transform(movie_genres)

In [None]:
tfidf_transformer = TfidfTransformer()
X_tfidf_genres = tfidf_transformer.fit_transform(X_genres)

In [None]:
X_tfidf_genres.toarray()

In [None]:
movies_with_ratings.tag.unique()

In [None]:
tag_strings = []
movies = []

for movie, group in tqdm_notebook(movies_with_ratings.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies.append(movie)

In [None]:
tag_strings[:10]

In [None]:
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(tag_strings)

In [None]:
tfidf_transformer = TfidfTransformer()
X_tfidf_tags = tfidf_transformer.fit_transform(X_counts)

In [None]:
X = [X_tfidf_genres, X_tfidf_tags]

In [None]:
X.shape

In [None]:
model.fit(X_train_tfidf_genres)

In [None]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [None]:
res

In [None]:
movies.iloc[res[1][0]]

In [None]:
movies.head()

In [None]:
tags.head()

In [None]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [None]:
movies_with_tags.head()

In [None]:
movies_with_tags[movies_with_tags.title == 'Toy Story (1995)']

In [None]:
movies_with_tags.tag.unique()

In [None]:
movies_with_tags.dropna(inplace=True)

In [None]:
movies_with_tags.title.unique().shape

In [None]:
tag_strings = []
movies = []

for movie, group in tqdm_notebook(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies.append(movie)

In [None]:
tag_strings[:5]

In [None]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tag_strings)

In [None]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [None]:
neigh = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh.fit(X_train_tfidf)

In [None]:
for i in range(len(movies)):
    if 'Magnolia (1999)' == movies[i]:
        print(i)

In [None]:
tag_strings[822]

In [None]:
test = change_string('pixar pixar fun')

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [None]:
res

In [None]:
for i in res[1][0]:
    print(movies[i])