In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

from tqdm import tqdm_notebook 

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
links = pd.read_csv('ml-latest-small/links.csv')
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')
tags = pd.read_csv('ml-latest-small/tags.csv')

In [3]:
#Построить рекомендации (регрессия, предсказываем оценку) на фичах:
#1) TF-IDF на тегах и жанрах

In [4]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [5]:
movies.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [6]:
tags.columns

Index(['userId', 'movieId', 'tag', 'timestamp'], dtype='object')

In [7]:
cols_tags=['userId', 'movieId', 'tag']
cols_ratings = ['userId', 'movieId', 'rating']

In [8]:
#remove the timestamp, as it is not required
tags=tags[cols_tags]
tags

Unnamed: 0,userId,movieId,tag
0,2,60756,funny
1,2,60756,Highly quotable
2,2,60756,will ferrell
3,2,89774,Boxing story
4,2,89774,MMA
...,...,...,...
3678,606,7382,for katie
3679,606,7936,austere
3680,610,3265,gun fu
3681,610,3265,heroic bloodshed


In [9]:
#remove the timestamp, as it is not requiered
ratings=ratings[cols_ratings]
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [10]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [11]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [12]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [13]:
movie_genres

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller',
 'Comedy Drama Romance',
 'Comedy Horror',
 'Adventure Animation Children',
 'Drama',
 'Action Adventure Romance',
 'Crime Drama',
 'Drama Romance',
 'Comedy',
 'Comedy',
 'Action Comedy Crime Drama Thriller',
 'Comedy Crime Thriller',
 'Crime Drama Horror Mystery Thriller',
 'Action Crime Thriller',
 'Drama SciFi',
 'Drama Romance',
 'Drama',
 'Children Drama',
 'Drama Romance',
 'Adventure Drama Fantasy Mystery SciFi',
 'Crime Drama',
 'Drama',
 'Mystery SciFi Thriller',
 'Children Drama',
 'Crime Drama',
 'Children Comedy',
 'Comedy Romance',
 'Drama',
 'Drama War',
 'Action Crime Drama',
 'Drama',
 'Action Adventure Fantasy',
 'Comedy Drama Thriller',
 'Drama Romance',
 'Mystery Thriller',
 'Animation Children Drama Musical Romance',
 'Drama R

In [14]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movie_genres)

In [15]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [16]:
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=7)

In [17]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [18]:
res

(array([[0.42079615, 0.53300564, 0.54288608, 0.54288608, 0.54288608,
         0.54288608, 0.54288608]]),
 array([[6774, 9096, 5636, 6723, 3376, 7496, 9717]]))

In [19]:
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
6774,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
9096,143559,L.A. Slasher (2015),Comedy|Crime|Fantasy
5636,27368,Asterix & Obelix: Mission Cleopatra (Astérix &...,Adventure|Comedy|Fantasy
6723,58972,Nim's Island (2008),Adventure|Comedy|Fantasy
3376,4591,Erik the Viking (1989),Adventure|Comedy|Fantasy
7496,82854,Gulliver's Travels (2010),Adventure|Comedy|Fantasy
9717,188833,The Man Who Killed Don Quixote (2018),Adventure|Comedy|Fantasy


In [20]:
#Построить рекомендации (регрессия, предсказываем оценку) на фичах:
#1) TF-IDF на тегах и жанрах
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [21]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game


In [22]:
movies_with_tags[movies_with_tags.title == 'Toy Story (1995)']

Unnamed: 0,movieId,title,genres,userId,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun


In [23]:
movies_with_tags.tag.unique()

array(['pixar', 'fun', 'fantasy', ..., 'star wars', 'gintama', 'remaster'],
      dtype=object)

In [24]:
movies_with_tags.tag.unique().shape

(1590,)

In [25]:
movies_with_tags.dropna(inplace=True)

In [26]:
movies_with_tags.title.unique().shape

(1572,)

In [27]:
tag_strings = []
movies = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies.append(movie)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1572.0), HTML(value='')))




In [28]:
tag_strings[:5]

['artistic Funny humorous inspiring intelligent quirky romance ZooeyDeschanel',
 'lawyers',
 'creepy suspense',
 'Shakespearesortof',
 'dogs remake']

In [29]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(tag_strings)

In [30]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [31]:
neigh = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [32]:
for i in range(len(movies)):
    if 'Magnolia (1999)' == movies[i]:
        print(i)

822


In [33]:
tag_strings[822]

'L.A.'

In [34]:
c = change_string('pixar pixar fun')

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [35]:
res

(array([[1.5807704 , 1.99391256, 1.99391256, 2.07184591, 2.07184591,
         2.07184591, 2.07184591, 2.07184591, 2.07184591, 2.07184591]]),
 array([[1254,  822,  661, 1013,  242,  558,   36,  124,  213, 1042]]))

In [36]:
for i in res[1][0]:
    print(movies[i])

Sintel (2010)
Magnolia (1999)
In a Lonely Place (1950)
Panic Room (2002)
Chalet Girl (2011)
Grifters, The (1990)
Airheads (1994)
Beat the Devil (1953)
Butch Cassidy and the Sundance Kid (1969)
Phone Booth (2002)


In [37]:
#Построить рекомендации (регрессия, предсказываем оценку) на фичах:
#2) Средние оценки (+ median, variance, etc.) пользователя и фильма

In [38]:
movies = pd.read_csv('ml-latest-small/movies.csv')
movies_with_ratings = movies.join(ratings.set_index('movieId'), on='movieId')

In [39]:
movies_with_ratings.head()

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5


In [40]:
movies_with_ratings[movies_with_ratings.title == 'Toy Story (1995)']

Unnamed: 0,movieId,title,genres,userId,rating
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5
...,...,...,...,...,...
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,606.0,2.5
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,607.0,4.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,608.0,2.5
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,609.0,3.0


In [41]:
movies_with_ratings.rating.unique()

array([4. , 4.5, 2.5, 3.5, 3. , 5. , 0.5, 2. , 1.5, 1. , nan])

In [42]:
movies_with_ratings.rating.unique().shape

(11,)

In [43]:
movies_with_ratings.dropna(inplace=True)

In [44]:
num_title=movies_with_ratings.userId.unique().shape[0]
num_title

610

In [45]:
#movies_with_ratings[movies_with_ratings.title == 'Toy Story (1995)'].rating.mean()

In [46]:
movie_vector={}

for movie, group in tqdm_notebook(movies_with_ratings.groupby('title')):
    movie_vector[movie] = np.zeros(num_title)
    
    for i in range(len(group.userId.values)):
        u=group.userId.values[i]
        r=group.rating.values[i]
        movie_vector[movie][int(u-1)] = r
    

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movie, group in tqdm_notebook(movies_with_ratings.groupby('title')):


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9719.0), HTML(value='')))




In [47]:
len(movie_vector)

9719

In [57]:
movies_with_ratings['mean_rate'] = 0

In [58]:
for name in movies_with_ratings.title:
    movies_with_ratings['mean_rate'][movies_with_ratings.title == name] = movie_vector[name].mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_with_ratings['mean_rate'][movies_with_ratings.title == name] = movie_vector[name].mean()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_with_ratings['mean_rate'][movies_with_ratings.title == name] = movie_vector[name].mean()


In [59]:
movies_with_ratings

Unnamed: 0,movieId,title,genres,userId,rating,mean_rate
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,1.381967
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,1.381967
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1.381967
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1.381967
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1.381967
...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184.0,4.0,0.006557
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184.0,3.5,0.005738
9739,193585,Flint (2017),Drama,184.0,3.5,0.005738
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184.0,3.5,0.005738


In [60]:
movies_with_ratings_title = [change_string(g) for g in movies_with_ratings.title.values]

In [61]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movies_with_ratings_title)

In [62]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [63]:
neigh = NearestNeighbors(n_neighbors=7, n_jobs=-1, metric='euclidean') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=7)

In [64]:
test = 'Black Butler: Book of the Atlantic (2017)'

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [65]:
movies_with_ratings.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres,userId,rating,mean_rate
9469,168366,Beauty and the Beast (2017),Fantasy|Romance,212.0,4.0,0.011475
9469,168366,Beauty and the Beast (2017),Fantasy|Romance,596.0,3.0,0.011475
9584,175303,It (2017),Horror,125.0,3.0,0.04918
9584,175303,It (2017),Horror,153.0,0.5,0.04918
9584,175303,It (2017),Horror,279.0,5.0,0.04918
9584,175303,It (2017),Horror,212.0,4.0,0.04918
9584,175303,It (2017),Horror,305.0,3.0,0.04918


In [81]:
mean_rate_strings = []
movies = []

for movie, group in tqdm(movies_with_ratings.groupby('title')):
    mean_rate_strings.append(' '.join([str(s) for s in movies_with_ratings.mean_rate]))
    movies.append(movie)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=9719.0), HTML(value='')))




In [83]:
#mean_rate_strings[:5]

['1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967213114754 1.381967

In [117]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(mean_rate_strings)

In [118]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [119]:
neigh = NearestNeighbors(n_neighbors=10, n_jobs=-1, metric='manhattan') 
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [120]:
test = 'Black Butler: Book of the Atlantic (2017)'

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = neigh.kneighbors(X_tfidf2, return_distance=True)

In [121]:
for i in res[1][0]:
    print(movies[i])

Particle Fever (2013)
Partisan (2015)
Parent Trap, The (1998)
Parental Guidance (2012)
Parenthood (1989)
Paris Is Burning (1990)
Paris, I Love You (Paris, je t'aime) (2006)
Paris, Texas (1984)
Parasyte: Part 2 (2015)
Party Girl (1995)


In [167]:
from sklearn.model_selection import train_test_split

In [152]:
from sklearn.linear_model import LinearRegression
movies = pd.read_csv('ml-latest-small/movies.csv')

In [153]:
lr = LinearRegression()

In [154]:
movies_modified = movies.copy()

In [155]:
movies_modified['list_genres'] = [change_string(g) for g in movies.genres.values]

In [156]:
movies_modified.head()

Unnamed: 0,movieId,title,genres,list_genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,Comedy


In [158]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_transformer = TfidfVectorizer()

In [159]:
tfidf_transformer.fit(movies_modified['list_genres'])

TfidfVectorizer()

In [161]:
with_ratings = movies_modified.merge(ratings)

In [164]:
for_model_df = with_ratings[with_ratings['userId']==100]

In [168]:
X,y = tfidf_transformer.transform(for_model_df['list_genres']), for_model_df['rating']

In [169]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [170]:
lr.fit(X_train, y_train)

LinearRegression()

In [175]:
y_pred = lr.predict(X_test)

In [176]:
y_test

759      4.0
53221    4.0
11146    4.0
12294    4.0
25471    4.0
72254    4.5
37390    4.5
56631    4.0
68060    5.0
51488    4.0
66344    3.5
39657    3.5
16517    3.0
6487     4.0
37180    3.5
57002    4.5
6380     1.0
43322    4.0
16812    4.5
69968    4.5
966      4.5
54855    2.0
21290    4.5
47137    3.5
31761    4.5
28980    4.5
41182    3.5
15367    4.0
28598    4.0
65179    4.0
Name: rating, dtype: float64

In [174]:
from sklearn.metrics import mean_squared_error

In [177]:
mean_squared_error(y_test, y_pred)

0.5641081339685357