<a href="https://colab.research.google.com/github/aleks-haksly/Simulative/blob/main/ML/recomendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://a.teleboss.ru/play/c851b57a-ccd5-408c-9484-e3d00c79b46e

https://www.kaggle.com/datasets/parasharmanas/movie-recommendation-system/data?select=movies.csv

# Загрузка датасета с kaggle

In [1]:
!curl -L -o movie-recommendation-system.zip https://www.kaggle.com/api/v1/datasets/download/parasharmanas/movie-recommendation-system
!unzip -o /content/movie-recommendation-system.zip
!rm *.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  164M  100  164M    0     0  68.8M      0  0:00:02  0:00:02 --:--:--  104M
Archive:  /content/movie-recommendation-system.zip
  inflating: movies.csv              
  inflating: ratings.csv             


In [2]:
import numpy as np
import pandas as pd
import os
import warnings
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

#warnings.filterwarnings("ignore")
pd.options.display.max_columns = 100

In [3]:
ratings_full_df = pd.read_csv("/content/ratings.csv", nrows=200_000)
movies_df = pd.read_csv("/content/movies.csv")

In [4]:
ratings_full_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828


In [5]:
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [6]:
X_ = ratings_full_df.drop("rating", axis=1)
y_ = ratings_full_df['rating']

# Подготовка pipline для обработки
## Часть 1 - Пишем cамый примитивный Estimator

Он будет использоваться как baseline для дальнейших экспериментов

In [7]:
class NaivePredictor(ClassifierMixin, BaseEstimator):

    def __init__(self, a=0.5, avg=np.median):
        self.rating_movie_mean_ = None  # Для хранения средней оценки фильма
        self.rating_user_mean_ = None  # Для хранения средней оценки которую ставит пользователь
        self.a = a  # Параметр, который будем менять при подборе по сетке GridSearchCV для определения долей влияния средней оценки фильма и среднй оценки пользователя
        self.avg = avg # Параметр, который будем менять при подборе по сетке GridSearchCV для выбора способа оценки среднего выборки
        self.classes_ = [0] # Технический момент, никак не используется, но атрибут должен существовать для корректной работы GridSearchCV

    def fit(self, X, y):
        concatenated_df = pd.concat([X, y], axis=1)
        self.rating_movie_mean_ = concatenated_df.groupby('movieId')['rating'].apply(self.avg) # Средняя оценка фильма
        self.rating_user_mean_ = concatenated_df.groupby('userId')['rating'].apply(self.avg) # Средняя оценка, которую ставит пользователь
        self.global_avg = y.pipe(self.avg) # Средняя всех оценок в тренировочном датасете

        return self

    def predict(self, X):
        """
        Предсказывает средний рейтинг фильма как сумму долей средней оценки фильма и среднй оценки, которую обычно ставит пользователь
        Если `movieId` нет в обучающей выборке, возвращает средний рейтинг по всем фильмам.
        """
        if self.rating_movie_mean_ is None:
            raise ValueError("Model is not fitted yet. Call `fit` before `predict`.")

        predictions = (X['movieId'].map(self.rating_movie_mean_) * self.a +
                       X['userId'].map(self.rating_user_mean_) * (1 - self.a))\
                       .fillna(self.global_avg) # Заполняем пропущенные значения средним рейтингом всех фильмов

        return predictions

In [8]:
# Базовый пайплайн, без трансформеров датасета
pipe = Pipeline(
    [
        ("NaivePredictor", NaivePredictor())
    ]
)

In [95]:
# Для кросс валидации будем использовать KFold
splitter = KFold(
    n_splits=2, # Разбиваем на 5 частей и поочередно используем каждую часть как test
    shuffle=True,
    random_state=42
)

In [10]:
# Для подбора по сетке будем пробовать разные коэффициенты a и разные оценки среднего
param_grid = {
    "NaivePredictor__a": np.arange(start=0.1, stop=1.1, step=0.1),
    "NaivePredictor__avg": [np.mean, np.median]
}

In [11]:
# Перебираем все комбинации из переметров из param_grid
%%time
search_naive = GridSearchCV(pipe,
                      param_grid,
                      cv=splitter,
                      scoring='neg_mean_absolute_error', # В качестве метрики для выбора лучшей модели возьмем MAE
                      verbose=1,
                      return_train_score=True,
                      error_score="raise")

search_naive.fit(X_, y_)
print(f"Best parameter (CV score={search_naive.best_score_:.5f}):")
print(search_naive.best_params_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameter (CV score=-0.69944):
{'NaivePredictor__a': 0.4, 'NaivePredictor__avg': <function median at 0x7d4a87768db0>}
CPU times: user 2min 13s, sys: 3.14 s, total: 2min 16s
Wall time: 2min 19s


Best parameter (CV score=-0.69944):
{'NaivePredictor__a': 0.4, 'NaivePredictor__avg': <function median at 0x7bffe353caf0>}

In [12]:
def evaluate_model(y_true, y_predicted):
  '''
  Функция для расчета прочих оценочных метрик модели
  '''
  result = pd.DataFrame.from_dict({
          "MAE": f'{mean_absolute_error(y_true, y_predicted):.3f}',
          "MSE": f'{mean_squared_error(y_true, y_predicted):.3f}',
          "MAPE": f'{mean_absolute_percentage_error(y_true, y_predicted):.2%}',
          "R2_score": f'{r2_score(y_true, y_predicted):.3f}'
  },
  orient='index', columns=['value'])
  return result

In [13]:
# Воспользуемся базовым train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_,
    y_,
    test_size=0.2, # Доля теста к тотал данным
    random_state=42  # Мешать ли данные или делить по индексам
)

In [14]:
evaluate_model(y_test, search_naive.predict(X_test))

Unnamed: 0,value
MAE,0.666
MSE,0.792
MAPE,30.93%
R2_score,0.273


In [106]:
mlb_genres = MultiLabelBinarizer()
mlb_genres.fit(movies_df.genres.apply(lambda x: x.split('|')))

In [258]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        movies_data=movies_df,
        avg=np.mean,
        a=0

    ):

        self.mlb = mlb_genres
        self.movies_data = movies_data
        self.avg = avg
        self.a = a

    def fit(self, X, y):


        X_train_copy = X.copy()
        X_train_copy = pd.concat([X_train_copy, y], axis = 1)
        X_train_copy = X_train_copy.merge(movies_df[['movieId', 'genres']], on='movieId', how='left')

        self.rating_user_mean = X_train_copy.groupby("userId")\
                                               .rating.apply(self.avg)\
                                               .rename("rating_user_mean")

        self.rating_movie_mean = X_train_copy.groupby("movieId", as_index=False)\
                                                .rating.apply(self.avg)\
                                                .rename({"rating": "rating_movie_mean"}, axis=1)

        self.mlb_encoded_genres = pd.concat([
            X_train_copy.userId,
            pd.DataFrame(self.mlb.transform(X_train_copy.genres.apply(lambda x: x.split('|'))), \
                                          columns=self.mlb.classes_, index = X_train_copy.index).mul(X_train_copy.rating, axis=0)
        ], axis=1)

        self.mean_genres_by_users = self.mlb_encoded_genres.groupby("userId").agg(lambda x: self.avg([_ for _ in x if _ > 0] or 0)).apply(lambda row: row.where(row != 0, self.rating_user_mean[row.name] * self.a), axis=1)
        self.mean_rating = y.pipe(self.avg)
        return self

    def transform(self, X:pd.DataFrame):

        X_copy= X.copy()
        X_copy = X_copy.merge(self.movies_data[['movieId', 'genres']], on='movieId', how='left')

        datetime_ = pd.to_datetime(X_copy.timestamp, unit='s')

        X_copy['month'] = datetime_.dt.month
        X_copy['day_of_week'] = datetime_.dt.day_of_week
        X_copy['time'] = pd.cut(datetime_.dt.hour,
                                include_lowest=True,
                                bins=[0, 5, 12, 17, 21, 24],
                                right=False,
                                ordered=False,
                                labels=['night', 'morining', 'afternoon', 'evening', 'night'])



        X_copy = pd.concat([X_copy, pd.DataFrame(self.mlb.transform( X_copy.genres.apply(lambda x: x.split('|'))), \
                                         columns=self.mlb.classes_)], axis=1)

        X_copy = X_copy.merge(self.mean_genres_by_users, how='left', left_on='userId', right_index=True)


        X_copy.drop(columns=['timestamp', 'genres', 'userId',  'movieId'], inplace=True) #'rating_user_mean', 'rating_movie_mean',
        return X_copy.set_index(X.index)#.fillna(self.mean_rating)
        """
        return self.mlb_encoded_genres2

    def get_feature_names_out(self, input_features=None):
        return self.mlb.classes_

    def inverse_transform(self):
        pass
        """

In [135]:
onehot_columns = ['time'] # список колонок, которые будем кодировать c OneHotEncoder

In [241]:
col_transformer = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'), onehot_columns),
    ],
    remainder='passthrough',          # Оставляем необрабатываемые колонки как есть, не удаляем их
    verbose_feature_names_out=False   # Оставляем оригинальные названия колонок
).set_output(transform='pandas')      # Трансформер будет возвращать pandas


In [108]:
research_pipe = Pipeline(
    [
        ("Custom_transformer", CustomTransformer(movies_data=movies_df, avg=np.mean)),
        ("OneHotEncoder", col_transformer)
    ]
)

In [20]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [83]:
df = research_pipe.fit(X_, y_).transform(X_)

In [31]:
vif_df = pd.DataFrame()
vif_df['feature'] = df.columns
vif_df['VIF'] = [vif(df.values, i) for i in range(len(df.columns))]
vif_df.query("VIF >= 10")

Unnamed: 0,feature,VIF
4,rating_movie_mean,60.804564
5,year,71.274536


In [260]:
research_final_pipe = Pipeline(
    [
        ("Custom_transformer", CustomTransformer(movies_data=movies_df, avg=np.mean)),
        ("OneHotEncoder", col_transformer),
        ("LinearRegression", LinearRegression())
    ]
)


In [228]:
X_test

Unnamed: 0,userId,movieId,timestamp
119737,861,21,949776476
72272,552,6796,1096938889
158154,1128,7373,1107415253
65426,526,9,874928861
30074,228,588,1294667103
...,...,...,...
4174,23,2470,943135264
91537,695,2986,941479467
156449,1115,1230,985128049
184376,1307,4963,1568142006


In [255]:
lasso_pipe = Pipeline(
    [
        ("Custom_transformer", CustomTransformer(movies_data=movies_df, avg=np.mean)),
        ("OneHotEncoder", col_transformer),
        ("simple_model", Lasso())
    ]
)

ridge_pipe = Pipeline(
    [
        ("Custom_transformer", CustomTransformer(movies_data=movies_df, avg=np.mean)),
        ("OneHotEncoder", col_transformer),
        ("simple_model", Ridge())
    ]
)

In [256]:
param_grid = {
    "simple_model__alpha": np.logspace(-1, 3, 10),
    "simple_model__max_iter": [100, 1000],
    "Custom_transformer__avg": [np.mean, np.median],
    "Custom_transformer__a": list(np.arange(0.1, 1.1, 0.1))
}

In [257]:
%%time
for model in [lasso_pipe, ridge_pipe]:

    search = GridSearchCV(model,
                          param_grid,
                          cv=splitter,
                          scoring='neg_mean_absolute_error', # В качестве метрики для выбора лучшей модели возьмем MAE
                          #n_jobs=-1,
                          verbose=3,
                          return_train_score=True,
                          error_score="raise")

    search.fit(X_, y_)

    print(f"Best parameter (CV score={search.best_score_:.5f}):")
    print(search.best_params_)

Fitting 2 folds for each of 400 candidates, totalling 800 fits
[CV 1/2] END Custom_transformer__a=0.1, Custom_transformer__avg=<function mean at 0x7d4a8a328f40>, simple_model__alpha=0.1, simple_model__max_iter=100;, score=(train=-0.757, test=-0.763) total time=   5.0s
[CV 2/2] END Custom_transformer__a=0.1, Custom_transformer__avg=<function mean at 0x7d4a8a328f40>, simple_model__alpha=0.1, simple_model__max_iter=100;, score=(train=-0.757, test=-0.762) total time=   8.6s
[CV 1/2] END Custom_transformer__a=0.1, Custom_transformer__avg=<function mean at 0x7d4a8a328f40>, simple_model__alpha=0.1, simple_model__max_iter=1000;, score=(train=-0.757, test=-0.763) total time=   4.9s
[CV 2/2] END Custom_transformer__a=0.1, Custom_transformer__avg=<function mean at 0x7d4a8a328f40>, simple_model__alpha=0.1, simple_model__max_iter=1000;, score=(train=-0.757, test=-0.762) total time=   7.0s
[CV 1/2] END Custom_transformer__a=0.1, Custom_transformer__avg=<function mean at 0x7d4a8a328f40>, simple_model

KeyboardInterrupt: 

In [261]:
evaluate_model(y_test, research_final_pipe.fit(X_train, y_train).predict(X_test))

Unnamed: 0,value
MAE,0.718
MSE,0.855
MAPE,31.47%
R2_score,0.214


In [None]:
search.best_estimator_.named_steps.simple_model.coef_

array([ 1.04059101e-02, -2.49558826e-03,  4.40923125e-02,  3.32142263e-06,
        2.91875474e-07,  7.95015684e-01,  8.42661449e-01, -6.23495386e-03,
        2.38087477e-04,  4.83806125e-03,  4.24583253e-02,  8.46547380e-03,
        5.09682247e-03,  1.65008812e-02,  1.22474934e-03,  1.33240856e-02,
        1.22784120e-02,  5.88045315e-02,  1.85343304e-02,  8.13458326e-03,
        2.26325030e-02,  3.19501603e-02,  3.82566361e-03,  1.24249082e-02,
        8.71857186e-03,  6.19804279e-03,  8.36666108e-03,  7.40376974e-03,
        1.32564675e-02,  3.00967059e-02])

In [None]:
evaluate_model(y_test, final_pipe.fit(X_train, y_train).predict(X_test))

In [None]:
from sklearn.model_selection import KFold

splitter = KFold(
    n_splits=2,
    shuffle=True,
    random_state=42
)

In [None]:
from sklearn.model_selection import cross_validate

cv_result = cross_validate(pipe,
                           X_, y_,
                           scoring='neg_mean_absolute_error',
                           cv=splitter, return_train_score=True)

cv_result

In [None]:
param_grid = {}
search = GridSearchCV(final_pipe, param_grid,
                      cv=splitter, scoring='neg_mean_absolute_error')

In [None]:
search.fit(X_, y_)
print(f"Best parameter (CV score={search.best_score_:.5f}):")
print(search.best_params_)

In [None]:
search.best_score_

In [None]:
list(splitter.split(X_, y_))

In [None]:
movies_df.genres.isna().sum()

In [None]:
CustomTransformer2(movies_data=movies_df).fit(X_train, y_train).transform(X_test)

In [None]:
 CustomTransformer2(movies_data=movies_df).fit(X_train, y_train).transform(X_test).isna().sum().loc[lambda x: x > 0]