<a href="https://colab.research.google.com/github/aleks-haksly/Simulative/blob/main/ML/recomendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://a.teleboss.ru/play/c851b57a-ccd5-408c-9484-e3d00c79b46e

Dataset (Kaggle, "Movie Recommendation System"): https://www.kaggle.com/datasets/parasharmanas/movie-recommendation-system/data?select=movies.csv

In [31]:
import numpy as np
import pandas as pd
import os
import warnings

In [38]:
# Extract the uploaded dataset archive into the working directory
# (-o overwrites files left over from a previous extraction without prompting),
# then delete every *.zip file found in the working directory.
# NOTE(review): if the working directory is /content, the archive itself is removed,
# so this cell cannot be re-run without uploading archive.zip again — confirm that is intended.
!unzip -o /content/archive.zip
!rm *.zip

Archive:  /content/archive.zip
  inflating: movies.csv              
  inflating: ratings.csv             


In [2]:
warnings.filterwarnings("ignore")

In [42]:
# Where the extracted CSV files live and how many rating rows to read.
# Only the first N_RATINGS_ROWS rows of ratings.csv are loaded; set it to None to read the whole file.
DATA_DIR = "/content"
N_RATINGS_ROWS = 200_000

ratings_full_df = pd.read_csv(os.path.join(DATA_DIR, "ratings.csv"), nrows=N_RATINGS_ROWS)
movies_df = pd.read_csv(os.path.join(DATA_DIR, "movies.csv"))

In [43]:
# Preview the loaded ratings frame (columns: userId, movieId, rating, timestamp).
ratings_full_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
199995,1409,48516,4.0,1287843475
199996,1409,48696,4.0,1287854818
199997,1409,48738,3.5,1287854109
199998,1409,48774,4.0,1287849077


In [None]:
# Feature engineering: custom transformer and preprocessing pipeline

In [165]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

In [230]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    """Build model features from a ratings frame joined with movie genres.

    The input frame must contain ``userId``, ``movieId``, ``genres``
    (a ``|``-separated string) and the column named by ``date_col``.

    ``fit`` learns, from the data it is given:

    * the mean rating of every ``userId`` and of every ``movieId``;
    * the genre vocabulary.

    ``transform`` adds ``rating_user_mean``, ``rating_movie_mean``, ``year``,
    ``day_of_week``, ``time`` (part of the day) and one 0/1 column per genre
    seen in ``fit``; it drops ``date_col`` and ``genres``.

    Parameters
    ----------
    date_col : str, default='timestamp'
        Name of the column holding the rating time in seconds since the
        Unix epoch.
    """

    # Label used for rows whose genres are missing; it is the same placeholder
    # the dataset itself uses for movies without genre information.
    _NO_GENRES = '(no genres listed)'

    # Hour-of-day buckets, left-closed: [0, 5) night, [5, 12) morning,
    # [12, 17) afternoon, [17, 21) evening, [21, 24) night.
    _HOUR_BINS = [0, 5, 12, 17, 21, 24]
    # The 'morining' spelling is kept on purpose: downstream one-hot column
    # names (e.g. ``time_morining``) are derived from these labels.
    _HOUR_LABELS = ['night', 'morining', 'afternoon', 'evening', 'night']

    # Columns appended by ``transform`` before the genre indicator columns.
    _ADDED_COLUMNS = ['rating_user_mean', 'rating_movie_mean', 'year', 'day_of_week', 'time']

    def __init__(
        self,
        date_col='timestamp'
    ):
        self.date_col = date_col
        self.mlb = MultiLabelBinarizer()

    @classmethod
    def _split_genres(cls, genres):
        """Turn a Series of ``'A|B|C'`` strings into a Series of genre lists."""
        return genres.fillna(cls._NO_GENRES).apply(lambda value: value.split('|'))

    def fit(self, X, y):
        """Learn per-user / per-movie mean ratings and the genre vocabulary.

        Parameters
        ----------
        X : pd.DataFrame
            Frame with at least ``userId``, ``movieId`` and ``genres``.
        y : array-like of shape (n_samples,)
            Ratings, matched to the rows of ``X`` by position. Any name (or
            no name at all) is accepted.

        Returns
        -------
        self
        """
        # Pair the keys with the target positionally so that plain arrays and
        # Series not named "rating" work as well.
        ratings = pd.DataFrame({
            'userId': X['userId'].to_numpy(),
            'movieId': X['movieId'].to_numpy(),
            'rating': np.asarray(y).ravel(),
        })

        # NOTE(review): each mean includes the row's own rating, so on the
        # training rows these features carry target information.
        self.rating_user_mean = (
            ratings.groupby('userId', as_index=False)['rating'].mean()
                   .rename(columns={'rating': 'rating_user_mean'})
        )
        self.rating_movie_mean = (
            ratings.groupby('movieId', as_index=False)['rating'].mean()
                   .rename(columns={'rating': 'rating_movie_mean'})
        )

        # Only the vocabulary is learned here; the per-row genres are always
        # taken from the frame passed to ``transform``.
        self.mlb.fit(self._split_genres(X['genres']))

        # Remembered so that ``get_feature_names_out`` can describe the output.
        self.feature_names_in_ = np.asarray(X.columns, dtype=object)
        return self

    def transform(self, X:pd.DataFrame):
        """Return the engineered feature frame for ``X``.

        ``X`` itself is not modified and its index is preserved. Rows whose
        ``userId`` / ``movieId`` were not seen in ``fit`` get ``NaN`` in the
        corresponding mean column (left join). Genres that were not seen in
        ``fit`` produce no indicator column.
        """
        # Both lookup tables have unique keys (they come from a groupby), so
        # the left merges keep the row count and order of ``X``.
        features = (
            X.merge(self.rating_user_mean, how='left', on='userId')
             .merge(self.rating_movie_mean, how='left', on='movieId')
        )
        features.index = X.index  # merge resets the index; restore the caller's

        rated_at = pd.to_datetime(features[self.date_col], unit='s')
        features['year'] = rated_at.dt.year
        features['day_of_week'] = rated_at.dt.day_of_week
        features['time'] = pd.cut(rated_at.dt.hour,
                                  include_lowest=True,
                                  bins=self._HOUR_BINS,
                                  right=False,
                                  ordered=False,
                                  labels=self._HOUR_LABELS)

        # Encode the genres of *this* frame with the vocabulary learned in fit.
        genre_flags = pd.DataFrame(
            self.mlb.transform(self._split_genres(features['genres'])),
            columns=self.mlb.classes_,
            index=features.index,
        )

        return pd.concat(
            [features.drop(columns=[self.date_col, 'genres']), genre_flags],
            axis=1,
        )

    def get_feature_names_out(self, input_features=None):
        """Return the column names produced by ``transform``, in order.

        Parameters
        ----------
        input_features : array-like of str, optional
            Input column names; defaults to the columns seen in ``fit``.
        """
        if input_features is None:
            input_features = self.feature_names_in_
        dropped = {self.date_col, 'genres'}
        kept = [name for name in input_features if name not in dropped]
        return np.asarray(kept + self._ADDED_COLUMNS + list(self.mlb.classes_), dtype=object)

    def inverse_transform(self):
        """Placeholder: the transformation is not invertible; returns ``None``."""
        return None

In [231]:
# Columns that will be encoded with OneHotEncoder.
onehot_columns = ["time"]

In [232]:
# One-hot encode the columns listed in `onehot_columns`; pass every other column through.
col_transformer = ColumnTransformer(
    transformers=[
        (
            'OneHotEncoder',
            OneHotEncoder(sparse_output=False, drop='first', handle_unknown='ignore'),
            onehot_columns,
        ),
    ],
    # Keep the columns that are not encoded instead of dropping them.
    remainder='passthrough',
    # Keep the original column names (no transformer-name prefix).
    verbose_feature_names_out=False,
)
# Make the transformer return pandas DataFrames; set_output returns the same object.
col_transformer = col_transformer.set_output(transform='pandas')


In [233]:
# Two-stage preprocessing: build the features first, then one-hot encode the
# categorical ones.
pipe = Pipeline(
    steps=[
        ("Custom_transformer", CustomTransformer()),
        ("OneHotEncoder", col_transformer),
    ],
)

In [234]:
# Target: the rating column.
y_ = ratings_full_df['rating']

# Features: every other ratings column plus the genres string of the rated movie.
X_ = (
    ratings_full_df
    .drop(columns="rating")
    .merge(movies_df[['movieId', 'genres']], how='left', on='movieId')
)

In [235]:
# Fit both pipeline steps on (X_, y_) and keep the transformed feature frame.
sample = pipe.fit_transform(X_, y_)


In [236]:
# Inspect the engineered features returned by the pipeline.
sample

Unnamed: 0,time_evening,time_morining,time_night,userId,movieId,rating_user_mean,rating_movie_mean,year,day_of_week,(no genres listed),...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0.0,0.0,0.0,1,296,3.814286,4.182081,2006,2,0,...,0,0,0,0,0,0,0,1,0,0
1,0.0,0.0,0.0,1,306,3.814286,4.100000,2006,2,0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,1,307,3.814286,4.185185,2006,2,0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,1,665,3.814286,4.000000,2006,2,0,...,0,0,0,0,0,0,0,0,1,0
4,0.0,0.0,0.0,1,899,3.814286,4.082474,2006,2,0,...,0,0,0,1,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,0.0,0.0,0.0,1409,48516,3.090018,4.125592,2010,5,0,...,0,0,0,0,0,0,0,1,0,0
199996,1.0,0.0,0.0,1409,48696,3.090018,3.909091,2010,5,0,...,0,0,0,0,0,1,0,0,0,0
199997,1.0,0.0,0.0,1409,48738,3.090018,3.837209,2010,5,0,...,0,0,0,0,0,0,0,1,0,0
199998,0.0,0.0,0.0,1409,48774,3.090018,3.867769,2010,5,0,...,0,0,0,0,0,0,1,1,0,0
