In [1]:
import os
import pandas as pd
import numpy as np

### 1. Загружу данные в таблицы и избавлюсь от ненужных столбцов

In [2]:
prefix = 'ml-latest-small'
movies = pd.read_csv(os.path.join(prefix, 'movies.csv')).drop('title', axis=1)
ratings = pd.read_csv(os.path.join(prefix, 'ratings.csv')).drop('timestamp', axis=1)
tags = pd.read_csv(os.path.join(prefix, 'tags.csv')).drop(['userId', 'timestamp'], axis=1)

### 2. Преобразую информацию о жанрах и тегах в TF-IDF - векторное представление

In [3]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [4]:
genres_strings = movies.genres.apply(lambda x: ' '.join(x.replace(' ', '').replace('-', '').split('|')))

count_vect = CountVectorizer()
genres_counts = count_vect.fit_transform(genres_strings)

tfidf_transformer = TfidfTransformer()
genres_tfidf = tfidf_transformer.fit_transform(genres_counts)

genres_df = pd.DataFrame(genres_tfidf.todense(), columns=sorted(count_vect.vocabulary_))
genres_df.head()

Unnamed: 0,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,filmnoir,horror,imax,musical,mystery,nogenreslisted,romance,scifi,thriller,war,western
0,0.0,0.410433,0.531527,0.496423,0.266469,0.0,0.0,0.0,0.48123,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.510466,0.0,0.617414,0.0,0.0,0.0,0.0,0.598519,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.586999,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.809588,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.523605,0.0,0.0,0.452029,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.722155,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
tag_strings = []

for _, group in movies.join(tags.set_index('movieId'), on='movieId').groupby('movieId'):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))

count_vect = CountVectorizer()
tags_counts = count_vect.fit_transform(tag_strings)

tfidf_transformer = TfidfTransformer()
tags_tfidf = tfidf_transformer.fit_transform(tags_counts)

tags_df = pd.DataFrame(tags_tfidf.todense(), columns=sorted(count_vect.vocabulary_))
tags_df.head()

Unnamed: 0,1940,80,abigailbreslin,action,activist,adamsandler,adaptedfrom,addiction,adventure,afi,...,whimsical,whodoneit,whytheterroristshateus,willfarell,witty,workneedisaymore,worldwarii,wrongfulimprisonment,zombies,zooeydeschanel
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 3. Добавлю к таблице с фильмами признаки, получившиеся на основе жанров и тегов

In [6]:
movies_with_genres_tags = pd.concat([movies.drop('genres', axis=1), genres_df, tags_df], axis=1)
movies_with_genres_tags.head()

Unnamed: 0,movieId,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,whimsical,whodoneit,whytheterroristshateus,willfarell,witty,workneedisaymore,worldwarii,wrongfulimprisonment,zombies,zooeydeschanel
0,1,0.0,0.410433,0.531527,0.496423,0.266469,0.0,0.0,0.0,0.48123,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.510466,0.0,0.617414,0.0,0.0,0.0,0.0,0.598519,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.586999,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.523605,0.0,0.0,0.452029,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 4. Посчитаю статистики оценок для каждого пользователя и добавлю эти данные к таблице с фильмами

In [7]:
ratings_with_stats = ratings.join(ratings.groupby('userId').rating.agg(['mean', 'median', 'var']), on='userId')
ratings_with_stats.head()

Unnamed: 0,userId,movieId,rating,mean,median,var
0,1,31,2.5,2.55,2.5,0.786842
1,1,1029,3.0,2.55,2.5,0.786842
2,1,1061,3.0,2.55,2.5,0.786842
3,1,1129,2.0,2.55,2.5,0.786842
4,1,1172,4.0,2.55,2.5,0.786842


In [8]:
movies_with_ratings = movies_with_genres_tags.join(ratings_with_stats.set_index('movieId'), on='movieId')
movies_with_ratings.head()

Unnamed: 0,movieId,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,workneedisaymore,worldwarii,wrongfulimprisonment,zombies,zooeydeschanel,userId,rating,mean,median,var
0,1,0.0,0.410433,0.531527,0.496423,0.266469,0.0,0.0,0.0,0.48123,...,0.0,0.0,0.0,0.0,0.0,7.0,3.0,3.465909,3.0,0.872388
0,1,0.0,0.410433,0.531527,0.496423,0.266469,0.0,0.0,0.0,0.48123,...,0.0,0.0,0.0,0.0,0.0,9.0,4.0,3.755556,4.0,0.779798
0,1,0.0,0.410433,0.531527,0.496423,0.266469,0.0,0.0,0.0,0.48123,...,0.0,0.0,0.0,0.0,0.0,13.0,5.0,3.745283,4.0,0.429064
0,1,0.0,0.410433,0.531527,0.496423,0.266469,0.0,0.0,0.0,0.48123,...,0.0,0.0,0.0,0.0,0.0,15.0,2.0,2.621765,3.0,1.585223
0,1,0.0,0.410433,0.531527,0.496423,0.266469,0.0,0.0,0.0,0.48123,...,0.0,0.0,0.0,0.0,0.0,19.0,3.0,3.534279,3.0,0.865528


### 5. Посчитаю статистики оценок для каждого фильма, но сначала избавлюсь от фильмов без рейтинга

In [9]:
movies_with_ratings = movies_with_ratings[movies_with_ratings['rating'].notna()]

In [10]:
movies_with_stats = movies_with_ratings.join(
    movies_with_ratings.groupby('movieId').rating.agg(['mean', 'median', 'var']),
    on='movieId', lsuffix='_user', rsuffix='_movie')
movies_with_stats.head()

Unnamed: 0,movieId,action,adventure,animation,children,comedy,crime,documentary,drama,fantasy,...,zombies,zooeydeschanel,userId,rating,mean_user,median_user,var_user,mean_movie,median_movie,var_movie
0,1,0.0,0.410433,0.531527,0.496423,0.266469,0.0,0.0,0.0,0.48123,...,0.0,0.0,7.0,3.0,3.465909,3.0,0.872388,3.87247,4.0,0.919646
0,1,0.0,0.410433,0.531527,0.496423,0.266469,0.0,0.0,0.0,0.48123,...,0.0,0.0,9.0,4.0,3.755556,4.0,0.779798,3.87247,4.0,0.919646
0,1,0.0,0.410433,0.531527,0.496423,0.266469,0.0,0.0,0.0,0.48123,...,0.0,0.0,13.0,5.0,3.745283,4.0,0.429064,3.87247,4.0,0.919646
0,1,0.0,0.410433,0.531527,0.496423,0.266469,0.0,0.0,0.0,0.48123,...,0.0,0.0,15.0,2.0,2.621765,3.0,1.585223,3.87247,4.0,0.919646
0,1,0.0,0.410433,0.531527,0.496423,0.266469,0.0,0.0,0.0,0.48123,...,0.0,0.0,19.0,3.0,3.534279,3.0,0.865528,3.87247,4.0,0.919646


### 6. Пустые значения встречаются в столбце с дисперсией оценок фильмов. Происходит это у фильмов, для которых есть всего одна оценка. Я заменю пустые значения нулем

In [11]:
movies_with_stats.isna().any()[lambda x: x]

var_movie    True
dtype: bool

In [12]:
movies_with_stats.fillna(0, inplace=True)

### 7. Избавлюсь в данных для моделирования от идентификаторов фильмов и пользователей, и поделю данные на тренировочные и тестовые

In [13]:
from sklearn.model_selection import train_test_split

X = movies_with_stats.drop(['movieId', 'userId', 'rating'], axis=1).reset_index(drop=True)
y = movies_with_stats.rating.reset_index(drop=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### 8. Обучу модель линейной регрессии с добавлением L2 регуляризации для борьбы с переобучением

In [14]:
from sklearn.linear_model import Ridge

reg = Ridge(alpha=200, positive=True)
reg.fit(X_train, y_train)

Ridge(alpha=200, positive=True)

In [15]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error

y_pred = reg.predict(X_test)
print('root_mean_squared_error:', round(mean_squared_error(y_test, y_pred, squared=False), 2))
print('mean_absolute_error:', round(mean_absolute_error(y_test, y_pred), 2))
print('mean_absolute_percentage_error:', round(mean_absolute_percentage_error(y_test, y_pred), 2))

root_mean_squared_error: 0.83
mean_absolute_error: 0.63
mean_absolute_percentage_error: 0.27


### 9. Вывод
На мой взгляд результат получился весьма неплохой для простейшей линейной модели, обученной на небольшом объеме данных