In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [58]:
tags = pd.read_csv(r"D:\ML\Рекомендации 2\ml-latest-small\tags.csv")
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [72]:
tags.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   userId     3683 non-null   int64 
 1   movieId    3683 non-null   int64 
 2   tag        3683 non-null   object
 3   timestamp  3683 non-null   int64 
dtypes: int64(3), object(1)
memory usage: 115.2+ KB


In [59]:
genres = pd.read_csv(r"D:\ML\Рекомендации 2\ml-latest-small\movies.csv")
genres.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [60]:
ratings = pd.read_csv(r"D:\ML\Рекомендации 2\ml-latest-small\ratings.csv")
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# функция для разделения жанров на слова
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [6]:
movie_genres = genres.genres.apply(change_string)

In [7]:
movie_genres

0       Adventure Animation Children Comedy Fantasy
1                        Adventure Children Fantasy
2                                    Comedy Romance
3                              Comedy Drama Romance
4                                            Comedy
                           ...                     
9737                Action Animation Comedy Fantasy
9738                       Animation Comedy Fantasy
9739                                          Drama
9740                               Action Animation
9741                                         Comedy
Name: genres, Length: 9742, dtype: object

In [61]:
rating_genres = ratings.merge(genres, on = 'movieId', how = 'left')

In [62]:
rating_genres_tags = rating_genres.merge(tags, on = 'movieId', how = 'left')

In [63]:
rating_genres_tags.head()

Unnamed: 0,userId_x,movieId,rating,timestamp_x,title,genres,userId_y,tag,timestamp_y
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
1,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
2,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
3,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,289.0,moldy,1143425000.0
4,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance,289.0,old,1143425000.0


In [64]:
rating_genres_tags['genres'] = rating_genres_tags.genres.apply(change_string)

In [65]:
rating_genres_tags.head()

Unnamed: 0,userId_x,movieId,rating,timestamp_x,title,genres,userId_y,tag,timestamp_y
0,1,1,4.0,964982703,Toy Story (1995),Adventure Animation Children Comedy Fantasy,336.0,pixar,1139046000.0
1,1,1,4.0,964982703,Toy Story (1995),Adventure Animation Children Comedy Fantasy,474.0,pixar,1137207000.0
2,1,1,4.0,964982703,Toy Story (1995),Adventure Animation Children Comedy Fantasy,567.0,fun,1525286000.0
3,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy Romance,289.0,moldy,1143425000.0
4,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy Romance,289.0,old,1143425000.0


In [66]:
pivot = pd.pivot_table(rating_genres_tags, values = ['rating'], index = 'movieId', aggfunc = [np.mean, np.median, np.var]).reset_index()

In [67]:
pivot

Unnamed: 0_level_0,movieId,mean,median,var
Unnamed: 0_level_1,Unnamed: 1_level_1,rating,rating,rating
0,1,3.920930,4.0,0.694825
1,2,3.431818,3.5,0.772106
2,3,3.259615,3.0,1.101848
3,4,2.357143,3.0,0.726190
4,5,3.071429,3.0,0.814433
...,...,...,...,...
9719,193581,4.000000,4.0,
9720,193583,3.500000,3.5,
9721,193585,3.500000,3.5,
9722,193587,3.500000,3.5,


In [68]:
rating_genres_tags_mean = rating_genres_tags.merge(pivot, on = 'movieId', how = 'left')
rating_genres_tags_mean.drop(['userId_x','timestamp_x', 'userId_y', 'userId_y', 'timestamp_y'], axis = 1).head()

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


Unnamed: 0,movieId,rating,title,genres,tag,"(mean, rating)","(median, rating)","(var, rating)"
0,1,4.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,3.92093,4.0,0.694825
1,1,4.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar,3.92093,4.0,0.694825
2,1,4.0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,fun,3.92093,4.0,0.694825
3,3,4.0,Grumpier Old Men (1995),Comedy Romance,moldy,3.259615,3.0,1.101848
4,3,4.0,Grumpier Old Men (1995),Comedy Romance,old,3.259615,3.0,1.101848


In [43]:
ratings = rating_genres_tags_mean.rating.astype(int).to_list()

In [44]:
ratings

[4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 3,
 5,
 5,
 5,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 3,
 3,
 3,
 3,
 3,
 3,
 5,
 4,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,
 3,


In [16]:
# Обучаем на жанрах
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(rating_genres_tags_mean.genres)

In [17]:
X_train_counts

<285762x20 sparse matrix of type '<class 'numpy.int64'>'
	with 905153 stored elements in Compressed Sparse Row format>

In [18]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [19]:
X_train_tfidf

<285762x20 sparse matrix of type '<class 'numpy.float64'>'
	with 905153 stored elements in Compressed Sparse Row format>

In [21]:
from sklearn.ensemble import GradientBoostingClassifier

In [45]:
clf = GradientBoostingClassifier(n_estimators=20, learning_rate=1.0, max_depth=5, random_state=2021).fit(X_train_tfidf, ratings)

In [46]:
test = change_string("Adventure|Comedy|Fantasy|Crime")

predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)

res = clf.predict(X_tfidf2)

In [47]:
res

array([4])

In [50]:
# удалим пустые теги
tags = rating_genres_tags_mean.tag.dropna().to_list()

In [73]:
len(tags)

3683

In [92]:
rating_tag = rating_genres_tags_mean['rating'][~rating_genres_tags_mean['tag'].isnull()]

In [93]:
rating_tag = rating_tag.astype(int).to_list()

In [53]:
# Обучаем на тегах
X_train_counts_tag = count_vect.fit_transform(tags)

In [54]:
X_train_tfidf_tag = tfidf_transformer.fit_transform(X_train_counts_tag)

In [101]:
clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=1.0, max_depth=5, random_state=2021).fit(X_train_tfidf_tag, rating_tag)

In [156]:
mean = rating_genres_tags_mean.iloc[:,[2,9,10,11]]

In [157]:
mean

Unnamed: 0,rating,"(mean, rating)","(median, rating)","(var, rating)"
0,4.0,3.920930,4.0,0.694825
1,4.0,3.920930,4.0,0.694825
2,4.0,3.920930,4.0,0.694825
3,4.0,3.259615,3.0,1.101848
4,4.0,3.259615,3.0,1.101848
...,...,...,...,...
285757,5.0,4.280000,4.5,0.404839
285758,5.0,4.280000,4.5,0.404839
285759,5.0,4.280000,4.5,0.404839
285760,5.0,4.280000,4.5,0.404839


In [158]:
mean = mean.dropna()

In [159]:
mean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 282353 entries, 0 to 285761
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   rating            282353 non-null  float64
 1   (mean, rating)    282353 non-null  float64
 2   (median, rating)  282353 non-null  float64
 3   (var, rating)     282353 non-null  float64
dtypes: float64(4)
memory usage: 10.8 MB


In [161]:
# Обучаем на средних значениях
clf3 = GradientBoostingClassifier(n_estimators=20, learning_rate=1.0, max_depth=5, random_state=2021).fit(mean, mean['rating'].astype(int))