<a href="https://colab.research.google.com/github/aleks-haksly/Simulative/blob/main/ML/recomendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://a.teleboss.ru/play/c851b57a-ccd5-408c-9484-e3d00c79b46e

Data source (Kaggle, "Movie Recommendation System" dataset; `movies.csv` and `ratings.csv`): https://www.kaggle.com/datasets/parasharmanas/movie-recommendation-system/data?select=movies.csv

In [31]:
import numpy as np
import pandas as pd
import os
import warnings

In [38]:
# Unpack the dataset archive, overwriting any previously extracted files (-o).
!unzip -o /content/archive.zip
# NOTE(review): this deletes every *.zip in the current working directory. If that
# directory is /content, the archive itself is removed and re-running this cell will
# fail until the archive is uploaded again — confirm this is intended.
!rm *.zip

Archive:  /content/archive.zip
  inflating: movies.csv              
  inflating: ratings.csv             


In [2]:
# Silence every Python warning from here on.
# NOTE(review): this also hides deprecation/future warnings from pandas and
# scikit-learn; consider a narrower filter (specific category or module).
warnings.filterwarnings("ignore")

In [42]:
# Load the movie catalogue in full; ratings are capped at the first 200,000 rows
# of the file to keep the experiment small.
movies_df = pd.read_csv("/content/movies.csv")
ratings_full_df = pd.read_csv("/content/ratings.csv", nrows=200_000)

In [43]:
# Display the loaded ratings (columns: userId, movieId, rating, timestamp).
ratings_full_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
199995,1409,48516,4.0,1287843475
199996,1409,48696,4.0,1287854818
199997,1409,48738,3.5,1287854109
199998,1409,48774,4.0,1287849077


In [None]:
# Building the feature-engineering transformer and the preprocessing pipeline

In [98]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer

In [159]:
class CustomTransformer(BaseEstimator, TransformerMixin):
    """Enrich a ratings frame with mean-rating and calendar features.

    ``fit`` learns the mean rating per ``userId`` and per ``movieId`` from the
    target. ``transform`` attaches those means to every row and derives
    ``year``, ``day_of_week`` and a coarse part-of-day label ``time`` from the
    Unix-seconds column named by ``date_col``.

    Parameters
    ----------
    date_col : str, default='timestamp'
        Name of the column holding Unix timestamps expressed in seconds.

    Attributes
    ----------
    rating_user_mean : pandas.DataFrame
        Columns ``userId`` and ``rating_user_mean``; set by ``fit``.
    rating_movie_mean : pandas.DataFrame
        Columns ``movieId`` and ``rating_movie_mean``; set by ``fit``.

    Notes
    -----
    The means are computed from the target passed to ``fit``, so for the rows
    used in fitting each mean includes that row's own rating.
    NOTE(review): evaluate downstream models on rows that ``fit`` never saw.
    """

    # Left-closed hour bins and their labels. 'night' appears twice on purpose
    # so that 21:00-23:59 and 00:00-04:59 end up in the same category.
    _HOUR_BINS = [0, 5, 12, 17, 21, 24]
    _HOUR_LABELS = ['night', 'morning', 'afternoon', 'evening', 'night']

    def __init__(
        self,
        date_col='timestamp'
    ):
        self.date_col = date_col

    def fit(self, X, y):
        """Learn the per-user and per-movie mean rating.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``userId`` and ``movieId``.
        y : array-like of shape (n_samples,)
            Ratings, positionally aligned with the rows of ``X``. A Series
            (any name, any index), a NumPy array or a list are all accepted.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``y`` is None or its length differs from ``len(X)``.
        """
        if y is None:
            raise ValueError("CustomTransformer.fit requires the ratings as `y`.")

        # Pair ids and ratings by position so that neither the index nor the
        # name of `y` influences the result.
        ratings = pd.DataFrame(
            {
                "userId": X["userId"].to_numpy(),
                "movieId": X["movieId"].to_numpy(),
                "rating": np.asarray(y).ravel(),
            }
        )

        self.rating_user_mean = (
            ratings.groupby("userId", as_index=False)["rating"]
            .mean()
            .rename(columns={"rating": "rating_user_mean"})
        )
        self.rating_movie_mean = (
            ratings.groupby("movieId", as_index=False)["rating"]
            .mean()
            .rename(columns={"rating": "rating_movie_mean"})
        )

        return self

    def transform(self, X:pd.DataFrame):
        """Return a copy of ``X`` with the engineered feature columns appended.

        Added columns, in order: ``rating_user_mean``, ``rating_movie_mean``,
        ``year``, ``day_of_week`` (Monday=0) and ``time`` (categorical part of
        day). Users or movies that were absent during ``fit`` get NaN means.
        The index of ``X`` is preserved.
        """
        X_copy = X.copy()

        # Series.map keeps the caller's index; DataFrame.merge would replace it
        # with 0..n-1 and break alignment with the target after a split/shuffle.
        user_mean = self.rating_user_mean.set_index("userId")["rating_user_mean"]
        movie_mean = self.rating_movie_mean.set_index("movieId")["rating_movie_mean"]
        X_copy["rating_user_mean"] = X_copy["userId"].map(user_mean)
        X_copy["rating_movie_mean"] = X_copy["movieId"].map(movie_mean)

        # Honour the configured column name instead of a hard-coded `timestamp`.
        # NOTE(review): the hour is taken from the raw epoch value without any
        # timezone conversion — confirm that is the intended clock.
        datetime_ = pd.to_datetime(X_copy[self.date_col], unit='s')

        X_copy['year'] = datetime_.dt.year
        X_copy['day_of_week'] = datetime_.dt.day_of_week
        # ordered=False is what allows the repeated 'night' label.
        X_copy['time'] = pd.cut(datetime_.dt.hour,
                                include_lowest=True,
                                bins=self._HOUR_BINS,
                                right=False,
                                ordered=False,
                                labels=self._HOUR_LABELS)

        return X_copy

    def inverse_transform(self):
        """Placeholder: no inverse is implemented; always returns None."""
        pass

In [160]:
onehot_columns = ['time'] # columns to be encoded with OneHotEncoder

In [161]:
# One-hot encode the columns listed in `onehot_columns`; all other columns are
# forwarded unchanged.
col_transformer = ColumnTransformer(
    transformers=[
        (
            'OneHotEncoder',
            OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False),
            onehot_columns,
        ),
    ],
    verbose_feature_names_out=False,  # keep the original column names
    remainder='passthrough',          # keep unprocessed columns instead of dropping them
)
# Make the transformer return pandas DataFrames (set_output returns the same object).
col_transformer = col_transformer.set_output(transform='pandas')


In [162]:
# Preprocessing pipeline: the custom feature engineering runs first, then the
# column transformer defined above.
pipe = Pipeline(
    steps=[
        ("Custom_transformer", CustomTransformer()),
        ("OneHotEncoder", col_transformer),
    ],
)

In [163]:
# Target is the rating column; every remaining column is a feature.
y_ = ratings_full_df["rating"]
X_ = ratings_full_df.drop(columns="rating")

In [164]:
# Fit the whole pipeline on the loaded ratings and show the resulting feature frame.
sample = pipe.fit_transform(X_, y_)
sample

Unnamed: 0,time_evening,time_morining,time_night,userId,movieId,timestamp,rating_user_mean,rating_movie_mean,year,day_of_week
0,0.0,0.0,0.0,1,296,1147880044,3.814286,4.182081,2006,2
1,0.0,0.0,0.0,1,306,1147868817,3.814286,4.100000,2006,2
2,0.0,0.0,0.0,1,307,1147868828,3.814286,4.185185,2006,2
3,0.0,0.0,0.0,1,665,1147878820,3.814286,4.000000,2006,2
4,0.0,0.0,0.0,1,899,1147868510,3.814286,4.082474,2006,2
...,...,...,...,...,...,...,...,...,...,...
199995,0.0,0.0,0.0,1409,48516,1287843475,3.090018,4.125592,2010,5
199996,1.0,0.0,0.0,1409,48696,1287854818,3.090018,3.909091,2010,5
199997,1.0,0.0,0.0,1409,48738,1287854109,3.090018,3.837209,2010,5
199998,0.0,0.0,0.0,1409,48774,1287849077,3.090018,3.867769,2010,5


In [135]:
# Stand-alone transformer instance for inspecting its output outside the pipeline.
ct = CustomTransformer()

In [136]:
# Fit and apply the transformer on its own (no one-hot encoding step).
# NOTE(review): the saved output below shows `rating_x` / `rating_y` columns, which the
# current class does not produce (it names them rating_user_mean / rating_movie_mean);
# re-run the cell to refresh it.
ct.fit_transform(X_, y_)

Unnamed: 0,userId,movieId,timestamp,rating_x,rating_y,year,day_of_week,time
0,1,296,1147880044,3.814286,4.182081,2006,2,afternoon
1,1,306,1147868817,3.814286,4.100000,2006,2,afternoon
2,1,307,1147868828,3.814286,4.185185,2006,2,afternoon
3,1,665,1147878820,3.814286,4.000000,2006,2,afternoon
4,1,899,1147868510,3.814286,4.082474,2006,2,afternoon
...,...,...,...,...,...,...,...,...
199995,1409,48516,1287843475,3.090018,4.125592,2010,5,afternoon
199996,1409,48696,1287854818,3.090018,3.909091,2010,5,evening
199997,1409,48738,1287854109,3.090018,3.837209,2010,5,evening
199998,1409,48774,1287849077,3.090018,3.867769,2010,5,afternoon


In [145]:
# Inspect the per-movie mean ratings learned by `ct.fit`.
# NOTE(review): the saved output below has a column named `rating`, whereas fit names
# it `rating_movie_mean`; the output is stale — re-run the cell.
ct.rating_movie_mean

Unnamed: 0,movieId,rating
0,1,3.960499
1,2,3.384393
2,3,3.267544
3,4,2.937500
4,5,3.272277
...,...,...
12904,206523,2.000000
12905,206805,3.000000
12906,206861,2.500000
12907,207309,3.250000


In [149]:
# Build the features+target frame explicitly so this cell runs on a fresh kernel
# instead of relying on a `concatenated_df` left over from an earlier session.
concatenated_df = pd.concat([X_, y_], axis=1)
# Mean rating per user, one row per userId.
concatenated_df.groupby("userId", as_index=False).rating.mean().rename({"rating": "rating_user_mean"}, axis=1)

Unnamed: 0,userId,rating_user_mean
0,1,3.814286
1,2,3.630435
2,3,3.697409
3,4,3.378099
4,5,3.752475
...,...,...
1404,1405,3.678369
1405,1406,4.409091
1406,1407,3.710938
1407,1408,4.476190
