#### Загрузка необходимых данных и библиотек в рабочую среду

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
movies = pd.read_csv('movies.csv', usecols = ['movieId', 'genres'])
ratings = pd.read_csv('ratings.csv', usecols = ['userId', 'movieId', 'rating'])
tags = pd.read_csv('tags.csv', usecols = ['movieId', 'tag'])

#### TF-IDF для жанров

In [4]:
def change_string(s):
    return s.replace(' ', '').replace('-', '')

In [5]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [6]:
tfidf_genres = TfidfVectorizer()
genres_tfidf = tfidf_genres.fit_transform(movie_genres)

In [7]:
genres_tfidf_df = pd.DataFrame(genres_tfidf.toarray(), columns=tfidf_genres.get_feature_names_out())

In [8]:
genres_tfidf_df['movieId'] = movies['movieId']

In [9]:
movies_with_genres_tfidf = pd.merge(movies, genres_tfidf_df, on='movieId')

In [10]:
movies_with_genres_tfidf = movies_with_genres_tfidf.drop('genres', axis=1)

In [11]:
movies_with_genres_tfidf.head(3)

Unnamed: 0,movieId,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0


#### TF-IDF для тегов

In [12]:
movies_with_tags = tags.copy()

In [13]:
movies_with_tags.head()

Unnamed: 0,movieId,tag
0,60756,funny
1,60756,Highly quotable
2,60756,will ferrell
3,89774,Boxing story
4,89774,MMA


In [14]:
movies_with_tags.dropna(inplace=True)

In [15]:
tag_strings = []
movie_id = []

for movie, group in movies_with_tags.groupby('movieId'):
    tag_strings.append(' '.join([change_string(s) for s in group.tag.values]))
    movie_id.append(group['movieId'].iloc[0])

In [17]:
tfidf_tag = TfidfVectorizer()
tags_tfidf = tfidf_tag.fit_transform(tag_strings)

In [18]:
tags_tfidf_df = pd.DataFrame(tags_tfidf.toarray(), columns=tfidf_tag.get_feature_names_out()).fillna(0)

In [19]:
tags_tfidf_df['movieId'] = movie_id

In [20]:
tags_tfidf_df.head()

Unnamed: 0,06oscarnominatedbestmovieanimation,1900s,1920s,1950s,1960s,1970s,1980s,1990s,2001like,2danimation,...,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel,movieId
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7


In [21]:
movies_with_genres_and_tags_tfidf = pd.merge(movies_with_genres_tfidf,
                                             tags_tfidf_df,
                                             on='movieId',
                                             how='left',
                                             suffixes=('_genre', '_tag')).fillna(0)

In [22]:
movies_with_genres_and_tags_tfidf.head()

Unnamed: 0,movieId,action_genre,adventure_genre,animation_genre,children_genre,comedy_genre,crime_genre,documentary_genre,drama_genre,fantasy_genre,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Создание фичей для пользователей и фильмов

In [23]:
user_stats = ratings.groupby('userId')['rating'].agg(['mean', 'median', 'var']).reset_index()
user_stats.columns = ['userId', 'user_mean_rating', 'user_median_rating', 'user_var_rating']
movie_stats = ratings.groupby('movieId')['rating'].agg(['mean', 'median', 'var']).reset_index()
movie_stats.columns = ['movieId', 'movie_mean_rating', 'movie_median_rating', 'movie_var_rating']

In [24]:
user_stats.head(3)

Unnamed: 0,userId,user_mean_rating,user_median_rating,user_var_rating
0,1,4.366379,5.0,0.640077
1,2,3.948276,4.0,0.649015
2,3,2.435897,0.5,4.370783


In [25]:
movie_stats.head(3)

Unnamed: 0,movieId,movie_mean_rating,movie_median_rating,movie_var_rating
0,1,3.92093,4.0,0.69699
1,2,3.431818,3.5,0.777419
2,3,3.259615,3.0,1.112651


In [26]:
ratings_with_users_stats = pd.merge(ratings, user_stats, on='userId', how='left').fillna(0)

In [27]:
ratings_res = pd.merge(ratings_with_users_stats, movie_stats, on='movieId', how='left').fillna(0)

In [28]:
ratings_res.head(3)

Unnamed: 0,userId,movieId,rating,user_mean_rating,user_median_rating,user_var_rating,movie_mean_rating,movie_median_rating,movie_var_rating
0,1,1,4.0,4.366379,5.0,0.640077,3.92093,4.0,0.69699
1,1,3,4.0,4.366379,5.0,0.640077,3.259615,3.0,1.112651
2,1,6,4.0,4.366379,5.0,0.640077,3.946078,4.0,0.667856


In [29]:
data_res = pd.merge(ratings_res, movies_with_genres_and_tags_tfidf, on='movieId', how='left').fillna(0)

In [30]:
data_res.head(3)

Unnamed: 0,userId,movieId,rating,user_mean_rating,user_median_rating,user_var_rating,movie_mean_rating,movie_median_rating,movie_var_rating,action_genre,...,worldwari,worldwarii,writing,wrongfulimprisonment,wry,youngermen,zither,zoekazan,zombies,zooeydeschanel
0,1,1,4.0,4.366379,5.0,0.640077,3.92093,4.0,0.69699,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,4.0,4.366379,5.0,0.640077,3.259615,3.0,1.112651,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,6,4.0,4.366379,5.0,0.640077,3.946078,4.0,0.667856,0.549328,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Построение модели линейной регрессии

In [32]:
# Разделим выборку на обучающее и тестовое подмножество
X = data_res.drop(['rating'], axis=1)
y = data_res['rating']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
# Обучим модель регрессии
model = LinearRegression()
model.fit(X_train, y_train)

In [34]:
# Получим предсказания на тестовом подмножестве
y_pred = model.predict(X_test)

In [35]:
# Оценим метрику RMSE
rmse = mean_squared_error(y_test, y_pred, squared=False)
print('RMSE:', rmse)

RMSE: 0.8200567495247204


In [36]:
# Визуально оценим различие реальных значений и предсказанных
results = pd.DataFrame({'Real': y_test, 'Predicted': y_pred})
print(results)

       Real  Predicted
67037   4.5   3.319743
42175   3.0   3.419318
93850   3.0   2.657420
6187    4.0   3.613460
12229   4.0   3.525612
...     ...        ...
57416   2.0   2.612735
67290   3.5   3.852147
33423   4.5   3.815284
98552   3.0   3.416220
87803   3.0   3.250939

[20168 rows x 2 columns]
