<a href="https://colab.research.google.com/github/aleks-haksly/Simulative/blob/main/ML/recomendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://a.teleboss.ru/play/c851b57a-ccd5-408c-9484-e3d00c79b46e

https://www.kaggle.com/datasets/parasharmanas/movie-recommendation-system/data?select=movies.csv

# Загрузка датасета с Kaggle

In [1]:
!curl -L -o movie-recommendation-system.zip https://www.kaggle.com/api/v1/datasets/download/parasharmanas/movie-recommendation-system
!unzip -o /content/movie-recommendation-system.zip
!rm *.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  164M  100  164M    0     0  54.6M      0  0:00:03  0:00:03 --:--:-- 88.8M
Archive:  /content/movie-recommendation-system.zip
  inflating: movies.csv              
  inflating: ratings.csv             


In [2]:
import numpy as np
import pandas as pd
import os
import warnings

# Notebook-wide settings: let pandas render up to 100 columns and silence all warnings.
pd.set_option("display.max_columns", 100)
warnings.simplefilter("ignore")

In [3]:
# Read the whole movie catalogue, but only the first 200,000 rows of the ratings
# file (nrows caps how much of ratings.csv is loaded).
movies_df = pd.read_csv("/content/movies.csv")
ratings_full_df = pd.read_csv("/content/ratings.csv", nrows=200_000)

In [4]:
# Preview the first three rating rows.
ratings_full_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828


In [5]:
# Preview the first three catalogue rows.
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


# Подготовка pipeline для обработки данных

In [6]:
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin, TransformerMixin, clone
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder

In [7]:
onehot_columns = ['time'] # columns that will be encoded with OneHotEncoder

In [8]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    """Build model features from (userId, movieId, timestamp) rating rows.

    ``fit`` learns, from the training rows only:

    * the mean rating per user and per movie,
    * the genre vocabulary of ``movies_data``,
    * per user, the mean of ``genre indicator * rating`` for every genre
      (rows where the genre is absent contribute 0 to that mean).

    ``transform`` joins these statistics onto the incoming rows, adds calendar
    features derived from ``date_col`` plus 0/1 genre indicators, and returns a
    DataFrame that carries the index of the input.

    Parameters
    ----------
    movies_data : pandas.DataFrame
        Movie catalogue with ``movieId`` and ``genres`` columns; ``genres`` is a
        ``'|'``-separated string.
    date_col : str, default='timestamp'
        Column holding the rating time in seconds since the epoch.
    """

    # Hour-of-day buckets; [0, 5) and [21, 24) both map to 'night'.
    # The 'morining' spelling is kept on purpose: one-hot column names produced
    # downstream are derived from these labels.
    _HOUR_BINS = [0, 5, 12, 17, 21, 24]
    _HOUR_LABELS = ['night', 'morining', 'afternoon', 'evening', 'night']

    def __init__(
        self,
        movies_data,
        date_col='timestamp'

    ):
        self.date_col = date_col
        self.mlb = MultiLabelBinarizer()
        self.movies_data = movies_data

    @staticmethod
    def _split_genres(genres):
        """Turn '|'-separated genre strings into lists; non-strings (e.g. NaN) become []."""
        return genres.apply(lambda g: g.split('|') if isinstance(g, str) else [])

    def fit(self, X, y):
        """Learn rating statistics and the genre vocabulary from the training rows.

        Parameters
        ----------
        X : pandas.DataFrame
            Training rows with ``userId`` and ``movieId`` columns.
        y : array-like
            Ratings, one per row of ``X`` in the same order.

        Returns
        -------
        self
        """
        train = X.copy()
        # Attach the target by position so nothing depends on index labels or on
        # the name of ``y``; the merge below resets the index anyway.
        train['rating'] = np.asarray(y)
        train = train.merge(self.movies_data[['movieId', 'genres']], on='movieId', how='left')

        self.mean_rating = float(train['rating'].mean())
        self.rating_user_mean = (
            train.groupby('userId', as_index=False)['rating'].mean()
                 .rename(columns={'rating': 'rating_user_mean'})
        )
        self.rating_movie_mean = (
            train.groupby('movieId', as_index=False)['rating'].mean()
                 .rename(columns={'rating': 'rating_movie_mean'})
        )

        self.mlb.fit(self._split_genres(self.movies_data['genres']))

        # Per-user mean of (genre indicator * rating). This is computed here, on the
        # training rows, instead of inside transform(): the previous implementation
        # multiplied the rows being transformed by the stored training ratings,
        # which paired unrelated rows whenever the indexes differed.
        indicators = self.mlb.transform(self._split_genres(train['genres']))
        weighted = pd.DataFrame(
            indicators * train['rating'].to_numpy()[:, None],
            columns=self.mlb.classes_,
        )
        self.user_genre_mean = (
            weighted.groupby(train['userId'].to_numpy()).mean()
                    .add_suffix('_gen_mean')
                    .rename_axis('userId')
                    .reset_index()
        )
        # Fallback for users that were not present in the training rows.
        self.genre_mean_fallback = self.user_genre_mean.drop(columns='userId').mean()

        # Kept for backward compatibility; transform() no longer reads it.
        self.ratings = y.copy()

        return self

    def transform(self, X:pd.DataFrame):
        """Return the feature frame for ``X``, indexed like ``X``.

        Users or movies unseen during ``fit`` receive the global training mean
        rating; unseen users receive the across-user average for the
        ``*_gen_mean`` columns. Movies missing from ``movies_data`` get all-zero
        genre indicators.
        """
        features = (
            X.merge(self.rating_user_mean, how='left', on='userId')
             .merge(self.rating_movie_mean, how='left', on='movieId')
             .merge(self.movies_data[['movieId', 'genres']], on='movieId', how='left')
        )
        # Fill only the numeric statistics; filling the whole frame would also
        # put a number into `genres` and break the split below.
        mean_cols = ['rating_user_mean', 'rating_movie_mean']
        features[mean_cols] = features[mean_cols].fillna(self.mean_rating)

        stamps = pd.to_datetime(features[self.date_col], unit='s')
        features['year'] = stamps.dt.year
        features['month'] = stamps.dt.month
        features['day_of_week'] = stamps.dt.day_of_week
        features['time'] = pd.cut(stamps.dt.hour,
                                  include_lowest=True,
                                  bins=self._HOUR_BINS,
                                  right=False,
                                  ordered=False,
                                  labels=self._HOUR_LABELS)

        encoded = pd.DataFrame(
            self.mlb.transform(self._split_genres(features['genres'])),
            columns=self.mlb.classes_,
            index=features.index,
        )
        features = pd.concat([features, encoded], axis=1)

        features = features.merge(self.user_genre_mean, how='left', on='userId')
        gen_cols = list(self.genre_mean_fallback.index)
        features[gen_cols] = features[gen_cols].fillna(self.genre_mean_fallback)

        features = features.drop(columns=[self.date_col, 'genres'])
        features.index = X.index
        return features

    def get_feature_names_out(self, input_features=None):
        """Return the genre labels learned by the binarizer.

        Note: this is only the genre vocabulary, not the full list of columns
        produced by ``transform``.
        """
        return self.mlb.classes_

    def inverse_transform(self):
        """Not implemented; returns None."""
        pass

In [9]:
# One-hot encode the columns listed in onehot_columns and pass every other column
# through untouched. Original column names are kept (no transformer-name prefix)
# and the transformer is configured to return pandas objects.
col_transformer = ColumnTransformer(
    transformers=[(
        'OneHotEncoder',
        OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
        onehot_columns,
    )],
    remainder='passthrough',
    verbose_feature_names_out=False,
)
col_transformer = col_transformer.set_output(transform='pandas')


In [10]:
class NaivePredictor(RegressorMixin, BaseEstimator):
    """Baseline regressor that predicts each movie's mean training rating.

    Movies that were not seen during ``fit`` receive the global mean rating.
    The class derives from ``RegressorMixin`` because the target is a
    continuous rating, so the inherited ``score`` is a regression metric.
    """

    def __init__(self):
        self.rating_movie_mean_ = None  # per-movie mean rating, set by fit()

    def fit(self, X, y):
        """Store the mean rating per ``movieId`` and the global mean.

        Parameters
        ----------
        X : pandas.DataFrame
            Training rows with a ``movieId`` column.
        y : pandas.Series
            Ratings sharing the index of ``X``.

        Returns
        -------
        self
        """
        # Group the target directly by the movie id; this does not rely on the
        # target Series being named 'rating'.
        self.rating_movie_mean_ = y.groupby(X['movieId']).mean()
        self.global_mean = y.mean()
        return self

    def predict(self, X):
        """Predict the mean training rating of each row's movie.

        Returns a Series indexed like ``X``; movies absent from the training
        data get the global mean rating.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if self.rating_movie_mean_ is None:
            raise ValueError("Model is not fitted yet. Call `fit` before `predict`.")

        # Unknown movies map to NaN and are replaced by the global mean.
        predictions = X['movieId'].map(self.rating_movie_mean_).fillna(self.global_mean)
        return predictions

In [11]:
class NaivePredictor2(RegressorMixin, BaseEstimator):
    """Baseline regressor blending per-movie and per-user mean ratings.

    prediction = a * movie_mean + (1 - a) * user_mean

    Parameters
    ----------
    a : float, default=0.5
        Weight of the movie mean; the user mean gets ``1 - a``.
    """

    def __init__(self, a=0.5):
        self.rating_movie_mean_ = None  # per-movie mean rating, set by fit()
        self.rating_user_mean_ = None   # per-user mean rating, set by fit()
        self.a = a

    def fit(self, X, y):
        """Store per-movie means, per-user means and the global mean rating.

        Parameters
        ----------
        X : pandas.DataFrame
            Training rows with ``movieId`` and ``userId`` columns.
        y : pandas.Series
            Ratings sharing the index of ``X``.

        Returns
        -------
        self
        """
        self.rating_movie_mean_ = y.groupby(X['movieId']).mean()
        self.rating_user_mean_ = y.groupby(X['userId']).mean()
        self.global_mean = y.mean()
        return self

    def predict(self, X):
        """Predict the weighted blend of movie mean and user mean for each row.

        A movie or user that was not seen during ``fit`` contributes the global
        mean rating for its own component only, so the information available
        for the other component is still used.

        Raises
        ------
        ValueError
            If called before ``fit``.
        """
        if self.rating_movie_mean_ is None:
            raise ValueError("Model is not fitted yet. Call `fit` before `predict`.")

        # Fill each component separately: previously a single unknown id turned the
        # whole sum into NaN and the known component was thrown away.
        movie_part = X['movieId'].map(self.rating_movie_mean_).fillna(self.global_mean)
        user_part = X['userId'].map(self.rating_user_mean_).fillna(self.global_mean)
        predictions = movie_part * self.a + user_part * (1 - self.a)
        return predictions

In [12]:
# Feature-engineering pipeline without a final estimator: custom features followed
# by one-hot encoding.
# clone() gives this pipeline its own unfitted copy of col_transformer. Pipeline.fit
# fits its steps in place, so passing the shared instance would let a fit of any
# pipeline that uses col_transformer change the encoder seen by this one.
pipe = Pipeline(
    [
        ("Custom_transformer", CustomTransformer(movies_data=movies_df)),
        ("OneHotEncoder", clone(col_transformer)),
    ]
)

In [13]:
class CustomTransformer2(BaseEstimator, TransformerMixin):
    """Build model features from (userId, movieId, timestamp) rating rows.

    ``fit`` learns, from the training rows only:

    * the mean rating per user and per movie, and the global mean rating,
    * the genre vocabulary of ``movies_data``,
    * per user, the mean rating given to movies of each genre; genres the user
      never rated are filled with that user's average over the genres they did
      rate.

    ``transform`` joins these statistics onto the incoming rows, adds calendar
    features derived from the ``timestamp`` column, and adds one column per
    genre holding the user's mean rating for that genre when the movie has the
    genre and 0 otherwise. The result carries the index of the input.

    Parameters
    ----------
    movies_data : pandas.DataFrame
        Movie catalogue with ``movieId`` and ``genres`` columns; ``genres`` is a
        ``'|'``-separated string.
    """

    # Hour-of-day buckets; [0, 5) and [21, 24) both map to 'night'.
    # The 'morining' spelling is kept on purpose: one-hot column names produced
    # downstream are derived from these labels.
    _HOUR_BINS = [0, 5, 12, 17, 21, 24]
    _HOUR_LABELS = ['night', 'morining', 'afternoon', 'evening', 'night']

    def __init__(
        self,
        movies_data,

    ):

        self.mlb = MultiLabelBinarizer()
        self.movies_data = movies_data

    @staticmethod
    def _split_genres(genres):
        """Turn '|'-separated genre strings into lists; non-strings (e.g. NaN) become []."""
        return genres.apply(lambda g: g.split('|') if isinstance(g, str) else [])

    def fit(self, X, y):
        """Learn rating statistics and the genre vocabulary from the training rows.

        Parameters
        ----------
        X : pandas.DataFrame
            Training rows with ``userId`` and ``movieId`` columns.
        y : array-like
            Ratings, one per row of ``X`` in the same order.

        Returns
        -------
        self
        """
        train = X.copy()
        # Attach the target by position. The merge below resets the index, so any
        # later label-based arithmetic against the original ``y`` would pair
        # unrelated rows on a shuffled or subset split.
        train['rating'] = np.asarray(y)
        train = train.merge(self.movies_data[['movieId', 'genres']], on='movieId', how='left')

        self.rating_user_mean = (
            train.groupby('userId', as_index=False)['rating'].mean()
                 .rename(columns={'rating': 'rating_user_mean'})
        )
        self.rating_movie_mean = (
            train.groupby('movieId', as_index=False)['rating'].mean()
                 .rename(columns={'rating': 'rating_movie_mean'})
        )

        self.mlb.fit(self._split_genres(self.movies_data['genres']))

        encoded = pd.DataFrame(
            self.mlb.transform(self._split_genres(train['genres'])),
            columns=self.mlb.classes_,
            index=train.index,
        )
        # Rating where the movie has the genre, NaN elsewhere, so the per-user
        # mean only averages over movies that actually carry the genre.
        rated = encoded.mul(train['rating'], axis=0)
        per_user = rated.where(rated > 0).groupby(train['userId']).mean()

        # Genres a user never rated fall back to that user's mean over rated genres.
        self.user_genres_rating = per_user.T.fillna(per_user.mean(axis=1)).T
        # Fallback for users unseen in training: the across-user average per genre.
        self.genre_rating_fallback = self.user_genres_rating.mean()
        self.mean_rating = float(train['rating'].mean())
        return self

    def transform(self, X:pd.DataFrame):
        """Return the feature frame for ``X``, indexed like ``X``.

        Users or movies unseen during ``fit`` receive the global training mean
        rating. For unseen users the per-genre rating comes from the training
        averages, so a row's features do not depend on which other rows are in
        the batch. Movies missing from ``movies_data`` get all-zero genre columns.
        """
        features = (
            X.merge(self.rating_user_mean, how='left', on='userId')
             .merge(self.rating_movie_mean, how='left', on='movieId')
             .merge(self.movies_data[['movieId', 'genres']], on='movieId', how='left')
        )
        # Fill only the numeric statistics; filling the whole frame would also
        # put a number into `genres` and break the split below.
        mean_cols = ['rating_user_mean', 'rating_movie_mean']
        features[mean_cols] = features[mean_cols].fillna(self.mean_rating)

        stamps = pd.to_datetime(features['timestamp'], unit='s')
        features['year'] = stamps.dt.year
        features['month'] = stamps.dt.month
        features['day_of_week'] = stamps.dt.day_of_week
        features['time'] = pd.cut(stamps.dt.hour,
                                  include_lowest=True,
                                  bins=self._HOUR_BINS,
                                  right=False,
                                  ordered=False,
                                  labels=self._HOUR_LABELS)

        encoded = pd.DataFrame(
            self.mlb.transform(self._split_genres(features['genres'])),
            columns=self.mlb.classes_,
            index=features.index,
        )

        # One row of per-genre ratings for every input row, looked up by user id.
        user_rates = self.user_genres_rating.reindex(features['userId'].to_numpy())
        user_rates.index = features.index
        user_rates = user_rates.fillna(self.genre_rating_fallback).fillna(self.mean_rating)

        # Keep the user's genre rating only for the genres this movie has.
        features = pd.concat([features, user_rates * encoded], axis=1)

        features = features.drop(columns=['timestamp', 'genres'])
        features.index = X.index
        return features

    def get_feature_names_out(self, input_features=None):
        """Return the genre labels learned by the binarizer.

        Note: this is only the genre vocabulary, not the full list of columns
        produced by ``transform``.
        """
        return self.mlb.classes_

    def inverse_transform(self):
        """Not implemented; returns None."""
        pass

In [14]:
# End-to-end model: custom features -> one-hot encoding -> Ridge regression with
# default settings.
# clone() gives this pipeline its own unfitted copy of col_transformer. Pipeline.fit
# fits its steps in place, so the shared instance would otherwise be refitted by
# every pipeline that references it.
final_pipe = Pipeline(
    [
        ("Custom_transformer", CustomTransformer2(movies_data=movies_df)),
        ("OneHotEncoder", clone(col_transformer)),
        ("linear_regression", Ridge())
    ]
)


In [15]:
# Separate the target column from the feature columns.
y_ = ratings_full_df['rating']
X_ = ratings_full_df.drop(columns="rating")

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score
from sklearn.model_selection import GridSearchCV

In [17]:
# Hold out 20% of the rows as the test set. random_state seeds the shuffle so the
# same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X_, y_, test_size=0.2, random_state=42)

In [18]:
def evaluate_model(y_true, y_predicted):
    """Summarise regression quality as a one-column table.

    Returns a DataFrame with rows ``MAE``, ``MSE``, ``MAPE`` and ``R2_score``
    (in that order) and a single ``value`` column, each computed from
    ``y_true`` and ``y_predicted``.
    """
    scorers = (
        ("MAE", mean_absolute_error),
        ("MSE", mean_squared_error),
        ("MAPE", mean_absolute_percentage_error),
        ("R2_score", r2_score),
    )
    metric_names = [name for name, _ in scorers]
    metric_values = [scorer(y_true, y_predicted) for _, scorer in scorers]
    return pd.DataFrame(metric_values, index=metric_names, columns=['value'])

In [19]:
# Two-fold cross-validation with shuffling; the fixed seed keeps fold membership reproducible.
splitter = KFold(n_splits=2, shuffle=True, random_state=100)

In [20]:
def _make_model_pipe(estimator):
    """Build the feature pipeline with ``estimator`` as the final ``simple_model`` step.

    Each pipeline receives its own clone of col_transformer: Pipeline.fit fits
    steps in place, so sharing one encoder instance would couple the pipelines.
    """
    return Pipeline(
        [
            ("Custom_transformer", CustomTransformer2(movies_data=movies_df)),
            ("OneHotEncoder", clone(col_transformer)),
            ("simple_model", estimator),
        ]
    )


# Two candidate models that differ only in the final estimator.
lasso_pipe = _make_model_pipe(Lasso())
ridge_pipe = _make_model_pipe(Ridge())

In [21]:
# Search space for the `simple_model` step: 100 evenly spaced alpha values from
# 0.1 to 1000, each tried with two iteration caps (200 candidates in total).
# NOTE(review): Ridge with its default solver may ignore max_iter — confirm in the
# scikit-learn docs before paying for the doubled grid.
param_grid = dict(
    simple_model__alpha=np.linspace(0.1, 1000, 100),
    simple_model__max_iter=[100, 1000],
)

In [None]:
%%time
for model in [lasso_pipe, ridge_pipe]:
    ### Передадим в GridSearchCV
    search = GridSearchCV(model, param_grid,
                          cv=splitter, scoring='neg_mean_absolute_error', n_jobs=-1, verbose=10)

    search.fit(X_, y_)

    print(f"Best parameter (CV score={search.best_score_:.5f}):")
    print(search.best_params_)

Fitting 2 folds for each of 200 candidates, totalling 400 fits


In [None]:
# Fit the default Ridge pipeline on the training split and score it on the hold-out split.
test_predictions = final_pipe.fit(X_train, y_train).predict(X_test)
evaluate_model(y_test, test_predictions)

In [None]:
from sklearn.model_selection import KFold

# Rebind `splitter` to a two-fold shuffled KFold seeded with 42; cells below use this one.
splitter = KFold(n_splits=2, shuffle=True, random_state=42)

In [None]:
from sklearn.model_selection import cross_validate

# `pipe` ends with a transformer, so it has no predict() and cannot be scored with
# an error metric. Append a Ridge regressor so the cross-validation measures an
# actual model built on these features. cross_validate clones the estimator, so the
# steps borrowed from `pipe` are not fitted in place.
cv_model = Pipeline([*pipe.steps, ("simple_model", Ridge())])

cv_result = cross_validate(cv_model,
                           X_, y_,
                           scoring='neg_mean_absolute_error',
                           cv=splitter, return_train_score=True)

cv_result

In [None]:
# With an empty grid there is a single candidate, so GridSearchCV simply
# cross-validates final_pipe as configured.
param_grid = {}
search = GridSearchCV(
    estimator=final_pipe,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=splitter,
)

In [None]:
# Run the search on the full data set and report the best score and parameters.
search.fit(X_, y_)
print("Best parameter (CV score={:.5f}):".format(search.best_score_))
print(search.best_params_)

In [None]:
# Best mean cross-validated score of the search (negated MAE, per the scoring it was built with).
search.best_score_

In [None]:
# Materialise the (train indices, test indices) pairs yielded by the splitter for inspection.
list(splitter.split(X_, y_))

In [None]:
# Count catalogue rows whose genres value is missing.
movies_df.genres.isna().sum()

In [None]:
# Inspect the engineered features for the hold-out rows, using a transformer fitted on the training split.
CustomTransformer2(movies_data=movies_df).fit(X_train, y_train).transform(X_test)

In [None]:
 # Sanity check: list the feature columns that still contain NaN after transforming
 # the hold-out rows, together with their NaN counts.
 transformed_test = CustomTransformer2(movies_data=movies_df).fit(X_train, y_train).transform(X_test)
 nan_per_column = transformed_test.isna().sum()
 nan_per_column[nan_per_column > 0]