<a href="https://colab.research.google.com/github/aleks-haksly/Simulative/blob/main/ML/recomendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://a.teleboss.ru/play/c851b57a-ccd5-408c-9484-e3d00c79b46e

https://www.kaggle.com/datasets/parasharmanas/movie-recommendation-system/data?select=movies.csv

# Загрузка датасета с kaggle

In [1]:
!curl -L -o movie-recommendation-system.zip https://www.kaggle.com/api/v1/datasets/download/parasharmanas/movie-recommendation-system
!unzip -o /content/movie-recommendation-system.zip
!rm *.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  164M  100  164M    0     0  54.8M      0  0:00:03  0:00:03 --:--:-- 75.9M
Archive:  /content/movie-recommendation-system.zip
  inflating: movies.csv              
  inflating: ratings.csv             


In [76]:
import numpy as np
import pandas as pd
import os
import warnings

warnings.filterwarnings("ignore")
pd.options.display.max_columns = 100

In [77]:
ratings_full_df = pd.read_csv("/content/ratings.csv", nrows=200_000)
movies_df = pd.read_csv("/content/movies.csv")

In [78]:
ratings_full_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828


In [79]:
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


# Подготовка pipline для обработки

In [100]:
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

In [101]:
onehot_columns = ['time'] # список колонок, которые будем кодировать c OneHotEncoder

In [102]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        movies_data,
        date_col='timestamp'

    ):
        self.date_col = date_col
        self.mlb = MultiLabelBinarizer()
        self.movies_data = movies_data

    def fit(self, X, y):

        concatenated_df = pd.concat([X, y], axis = 1)
        self.rating_user_mean = concatenated_df.groupby("userId", as_index=False)\
                                               .rating.mean()\
                                               .rename({"rating": "rating_user_mean"}, axis=1)

        self.rating_movie_mean = concatenated_df.groupby("movieId", as_index=False)\
                                                .rating.mean()\
                                                .rename({"rating": "rating_movie_mean"}, axis=1)

        self.mlb.fit(self.movies_data.genres.apply(lambda x: x.split('|')))
        self.ratings = y.copy()

        return self

    def transform(self, X:pd.DataFrame):

        X_copy= X.copy()
        X_copy = X_copy.merge(self.rating_user_mean, how='left', on='userId') \
                       .merge(self.rating_movie_mean, how='left', on='movieId') \
                       .merge(movies_df[['movieId', 'genres']], on='movieId', how='left')

        datetime_ = pd.to_datetime(X_copy.timestamp, unit='s')

        X_copy['year'] = datetime_.dt.year
        X_copy['month'] = datetime_.dt.month
        X_copy['day_of_week'] = datetime_.dt.day_of_week
        X_copy['time'] = pd.cut(datetime_.dt.hour,
                                include_lowest=True,
                                bins=[0, 5, 12, 17, 21, 24],
                                right=False,
                                ordered=False,
                                labels=['night', 'morining', 'afternoon', 'evening', 'night'])

        X_copy['genres'] = X_copy.genres.apply(lambda x: x.split('|'))

        mlb_encoded_genres = pd.DataFrame(self.mlb.transform(X_copy.genres), \
                                         columns=self.mlb.classes_)

        X_copy = pd.concat([X_copy, mlb_encoded_genres], axis=1)


        X_copy = X_copy.merge(pd.concat([mlb_encoded_genres.mul(self.ratings, axis=0),\
                                X_copy.userId], axis = 1)
                               .groupby("userId", as_index = False).mean(), \
                     how='left',
                     on='userId', \
                     suffixes = ('', '_gen_mean'))


        X_copy.drop(columns=['timestamp', 'genres', ], inplace=True)
        return X_copy

    def get_feature_names_out(self, input_features=None):
        return self.mlb.classes_

    def inverse_transform(self):
        pass

In [103]:
col_transformer = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'), onehot_columns),
    ],
    remainder='passthrough',          # Оставляем необрабатываемые колонки как есть, не удаляем их
    verbose_feature_names_out=False   # Оставляем оригинальные названия колонок
).set_output(transform='pandas')      # Трансформер будет возвращать pandas


In [104]:
class NaivePredictor(ClassifierMixin, BaseEstimator):

    def __init__(self):
        self.rating_movie_mean_ = None  # Для хранения средней оценки фильма

    def fit(self, X, y):
        concatenated_df = pd.concat([X, y], axis=1)
        self.rating_movie_mean_ = concatenated_df.groupby('movieId')['rating'].mean()
        return self

    def predict(self, X_):
        """
        Предсказывает средний рейтинг фильма для каждого примера.
        Если `movieId` нет в обучающей выборке, возвращает средний рейтинг по всем фильмам.
        """
        if self.rating_movie_mean_ is None:
            raise ValueError("Model is not fitted yet. Call `fit` before `predict`.")

        # Заполняем пропущенные значения средним рейтингом всех фильмов
        global_mean = self.rating_movie_mean_.mean()
        predictions = X_['movieId'].map(self.rating_movie_mean_).fillna(global_mean)
        return predictions

In [105]:
pipe = Pipeline(
    [
        ("Custom_transformer", CustomTransformer(movies_data=movies_df)),
        ("OneHotEncoder", col_transformer),
        ("NaivePredictor", NaivePredictor())
    ]
)

In [106]:
X_ = ratings_full_df.drop("rating", axis=1)
y_ = ratings_full_df['rating']

In [107]:
sample = pipe.fit(X_, y_).predict(X_)

In [108]:
sample

Unnamed: 0,movieId
0,4.182081
1,4.100000
2,4.185185
3,4.000000
4,4.082474
...,...
199995,4.125592
199996,3.909091
199997,3.837209
199998,3.867769


In [109]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score

In [110]:
X_train, X_test, y_train, y_test = train_test_split(
    X_,
    y_,
    test_size=0.2, # <--- Доля теста к тотал данным
    random_state=42  # <--- Мешать ли данные или делить по индексам
)

In [111]:
def evalueate_model(y_predicted, y_true):
  result = pd.DataFrame.from_dict({
          "MAE": mean_absolute_error(y_predicted, y_true),
          "MSE": mean_squared_error(y_predicted, y_true),
          "MAPE": mean_absolute_percentage_error(y_predicted, y_true),
          "R2_score": r2_score(y_true, y_predicted )
  },
  orient='index', columns=['value'])
  return result

In [112]:
evalueate_model(pipe.fit(X_train, y_train).predict(X_test), y_test)

Unnamed: 0,value
MAE,0.858863
MSE,1.164943
MAPE,0.244633
R2_score,-0.070116


In [113]:
pipe.fit(X_train, y_train).predict(X_train)

Unnamed: 0,movieId
0,3.784615
1,3.529851
2,3.585526
3,3.577491
4,3.579602
...,...
159995,3.714286
159996,3.675926
159997,3.579051
159998,3.536145


In [114]:
y_train.sort_index()

Unnamed: 0,rating
0,5.0
1,3.5
2,5.0
3,5.0
5,4.0
...,...
199994,2.5
199995,4.0
199996,4.0
199998,4.0
