<a href="https://colab.research.google.com/github/aleks-haksly/Simulative/blob/main/ML/recomendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://a.teleboss.ru/play/c851b57a-ccd5-408c-9484-e3d00c79b46e

https://www.kaggle.com/datasets/parasharmanas/movie-recommendation-system/data?select=movies.csv

In [23]:
import numpy as np
import pandas as pd
import os
import warnings

warnings.filterwarnings("ignore")
pd.options.display.max_columns = 100

# Загрузка датасета с kaggle

In [None]:
!curl -L -o movie-recommendation-system.zip https://www.kaggle.com/api/v1/datasets/download/parasharmanas/movie-recommendation-system
!unzip -o /content/movie-recommendation-system.zip
!rm *.zip

In [9]:
ratings_full_df = pd.read_csv("/content/ratings.csv", nrows=200_000)
movies_df = pd.read_csv("/content/movies.csv")

In [10]:
ratings_full_df.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828


In [11]:
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


# Подготовка pipline для обработки

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

In [None]:
onehot_columns = ['time'] # список колонок, которые будем кодировать c OneHotEncoder

In [1]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    def __init__(
        self,
        date_col='timestamp'
    ):
        self.date_col = date_col
        self.mlb = MultiLabelBinarizer()
        self.movies_data = movies_df

    def fit(self, X, y):

        concatenated_df = pd.concat([X, y], axis = 1)
        self.rating_user_mean = concatenated_df.groupby("userId", as_index=False)\
                                               .rating.mean()\
                                               .rename({"rating": "rating_user_mean"}, axis=1)

        self.rating_movie_mean = concatenated_df.groupby("movieId", as_index=False)\
                                                .rating.mean()\
                                                .rename({"rating": "rating_movie_mean"}, axis=1)

        self.mlb.fit(self.movies_data.genres.apply(lambda x: x.split('|')))
        self.ratings = y.copy()

        return self

    def transform(self, X:pd.DataFrame):

        X_copy= X.copy()
        X_copy = X_copy.merge(self.rating_user_mean, how='left', on='userId') \
                       .merge(self.rating_movie_mean, how='left', on='movieId') \
                       .merge(movies_df[['movieId', 'genres']], on='movieId', how='left')

        datetime_ = pd.to_datetime(X_copy.timestamp, unit='s')

        X_copy['year'] = datetime_.dt.year
        X_copy['month'] = datetime_.dt.month
        X_copy['day_of_week'] = datetime_.dt.day_of_week
        X_copy['time'] = pd.cut(datetime_.dt.hour,
                                include_lowest=True,
                                bins=[0, 5, 12, 17, 21, 24],
                                right=False,
                                ordered=False,
                                labels=['night', 'morining', 'afternoon', 'evening', 'night'])

        X_copy['genres'] = X_copy.genres.apply(lambda x: x.split('|'))

        mlb_encoded_genres = pd.DataFrame(self.mlb.transform(X_copy.genres), \
                                         columns=self.mlb.classes_)

        X_copy = pd.concat([X_copy, mlb_encoded_genres], axis=1)

        X_copy = X_copy.merge(pd.concat([mlb_encoded_genres.mul(self.ratings),\
                                X_copy.userId], axis = 1
                               .groupby("userId", as_index = False).mean()), \
                     how='left',
                     on='userId', \
                     suffixes = ('', '_gen_mean'))

        X_copy.drop(columns=['timestamp', 'genres', ], inplace=True)
        return X_copy

    def get_feature_names_out(self, input_features=None):
        return self.mlb.classes_

    def inverse_transform(self):
        pass

NameError: name 'BaseEstimator' is not defined

In [52]:
col_transformer = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'), onehot_columns),
    ],
    remainder='passthrough',          # Оставляем необрабатываемые колонки как есть, не удаляем их
    verbose_feature_names_out=False   # Оставляем оригинальные названия колонок
).set_output(transform='pandas')      # Трансформер будет возвращать pandas


In [53]:
pipe = Pipeline(
    [
        ("Custom_transformer", CustomTransformer()),
        ("OneHotEncoder", col_transformer),
    ]
)

In [54]:
X_ = ratings_full_df.drop("rating", axis=1)
y_ = ratings_full_df['rating']

In [None]:
sample = pipe.fit_transform(X_, y_)

In [36]:
sample

Unnamed: 0,time_evening,time_morining,time_night,userId,movieId,rating_user_mean,rating_movie_mean,year,day_of_week,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.0,0.0,0.0,1,296,3.814286,4.182081,2006,2,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0
1,0.0,0.0,0.0,1,306,3.814286,4.100000,2006,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,1,307,3.814286,4.185185,2006,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,1,665,3.814286,4.000000,2006,2,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
4,0.0,0.0,0.0,1,899,3.814286,4.082474,2006,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0.0,0.0,0.0,1409,48516,3.090018,4.125592,2010,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0
199996,1.0,0.0,0.0,1409,48696,3.090018,3.909091,2010,5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
199997,1.0,0.0,0.0,1409,48738,3.090018,3.837209,2010,5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
199998,0.0,0.0,0.0,1409,48774,3.090018,3.867769,2010,5,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0


In [39]:
sample

Unnamed: 0,time_evening,time_morining,time_night,userId,movieId,rating_user_mean,rating_movie_mean,year,day_of_week,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.0,0.0,0.0,1,296,3.814286,4.182081,2006,2,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0
1,0.0,0.0,0.0,1,306,3.814286,4.100000,2006,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,1,307,3.814286,4.185185,2006,2,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,1,665,3.814286,4.000000,2006,2,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0
4,0.0,0.0,0.0,1,899,3.814286,4.082474,2006,2,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0.0,0.0,0.0,1409,48516,3.090018,4.125592,2010,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0
199996,1.0,0.0,0.0,1409,48696,3.090018,3.909091,2010,5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
199997,1.0,0.0,0.0,1409,48738,3.090018,3.837209,2010,5,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
199998,0.0,0.0,0.0,1409,48774,3.090018,3.867769,2010,5,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0
