# 0. Configuration

In [None]:
# links to shared data MovieLens
# source on kaggle: https://www.kaggle.com/code/quangnhatbui/movie-recommender/data
RATINGS_SMALL_URL = 'https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link'
MOVIES_METADATA_URL = 'https://drive.google.com/file/d/19g6-apYbZb5D-wRj4L7aYKhxS-fDM4Fb/view?usp=share_link'

# 1. Modules and functions

In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
# just to make it available to download w/o SSL verification
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

import random
import numpy as np
import pandas as pd
import datetime as dt
from itertools import permutations

import torch
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')


RANDOM_STATE = 42

## 1.1. Helper functions to avoid copy paste

In [None]:
def read_csv_from_gdrive(url):
    """
    gets csv data from a given url (taken from file -> share -> copy link)
    :url: example https://drive.google.com/file/d/1BlZfCLLs5A13tbNSJZ1GPkHLWQOnPlE4/view?usp=share_link
    """
    file_id = url.split('/')[-2]
    file_path = 'https://drive.google.com/uc?export=download&id=' + file_id
    data = pd.read_csv(file_path)

    return data

# 2. Main

## 2.1. RankNet

In [None]:
class RankNet(torch.nn.Module):
    def __init__(self, input_features_len, hidden_dim = 10):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.model = torch.nn.Sequential(
            torch.nn.Linear(input_features_len, self.hidden_dim),
            torch.nn.ReLU(),
            torch.nn.Linear(self.hidden_dim, 1),
        )
        
        self.out_activation = torch.nn.Sigmoid()

    def forward(self, input_1, input_2):
        logits_1 = self.predict(input_1)
        logits_2 = self.predict(input_2)
        
        logits_diff = logits_1 - logits_2
        out = self.out_activation(logits_diff)

        return out
    
    def predict(self, inp):
        logits = self.model(inp)
        return logits

In [None]:
model = RankNet(input_features_len = 8)
model

RankNet(
  (model): Sequential(
    (0): Linear(in_features=8, out_features=10, bias=True)
    (1): ReLU()
    (2): Linear(in_features=10, out_features=1, bias=True)
  )
  (out_activation): Sigmoid()
)

In [None]:
input_1, input_2 = torch.rand(4, 8), torch.rand(4, 8)
input_2

tensor([[0.8710, 0.3286, 0.5940, 0.5358, 0.8869, 0.0105, 0.5112, 0.6997],
        [0.7578, 0.3873, 0.2264, 0.7957, 0.3264, 0.0030, 0.3798, 0.7992],
        [0.5542, 0.2757, 0.8852, 0.7678, 0.4480, 0.9354, 0.4538, 0.9743],
        [0.7285, 0.8647, 0.3815, 0.8428, 0.0471, 0.1055, 0.6811, 0.5060]])

In [None]:
preds = torch.sort(model(input_1, input_2), descending = True, dim = 0)
preds[0]

tensor([[0.5202],
        [0.5140],
        [0.5106],
        [0.5039]], grad_fn=<SortBackward0>)

## 2.2. ListNet

In [None]:
movies_to_rank = {'The Godfather', 'Avatar', 'Ozark'}
permutations_list = list(permutations(movies_to_rank))

for i in permutations_list:
    print(i)

('The Godfather', 'Ozark', 'Avatar')
('The Godfather', 'Avatar', 'Ozark')
('Ozark', 'The Godfather', 'Avatar')
('Ozark', 'Avatar', 'The Godfather')
('Avatar', 'The Godfather', 'Ozark')
('Avatar', 'Ozark', 'The Godfather')


In [None]:
movies_to_rank

{'Avatar', 'Ozark', 'The Godfather'}

In [None]:
pi = random.choice(permutations_list)
print(pi)

('Avatar', 'Ozark', 'The Godfather')


In [None]:
np.random.seed(RANDOM_STATE)
scores_dict = {x: np.random.randn(1)[0] for x in movies_to_rank}  
print(scores_dict)

# unpack pi and assign movies to scores
score_movie_pos_1, score_movie_pos_2, score_movie_pos_3 = scores_dict[pi[0]], scores_dict[pi[1]], scores_dict[pi[2]]


{'The Godfather': 0.4967141530112327, 'Ozark': -0.13826430117118466, 'Avatar': 0.6476885381006925}


In [None]:
first_term = np.exp(score_movie_pos_1) / (np.exp(score_movie_pos_1) + np.exp(score_movie_pos_2)\
                                         + np.exp(score_movie_pos_3))

second_term = np.exp(score_movie_pos_2) / (np.exp(score_movie_pos_2) + np.exp(score_movie_pos_3))

third_term = np.exp(score_movie_pos_3) / np.exp(score_movie_pos_3)

print(f'First term is: {first_term}')
print(f'Second term is: {second_term}')
print(f'Third term is: {third_term}')

First term is: 0.4318619033836114
Second term is: 0.3463825470936087
Third term is: 1.0


$P_{s}(<The Godfather, Avatar, Ozark>) = \prod^3_{j = 1} \frac {\phi(s_{\pi(j)})} {\sum^3_{k = j} \phi(s_{\pi(k)})}$ which is equal to

In [None]:
permutation_proba = first_term * second_term * third_term

print(f'Permutation probability is: {permutation_proba}')


Permutation probability is: 0.14958942608670928


## 2.3. CatBoost Ranker

### 2.3.1. Load Data

`interactions` dataset shows list of movies that users watched, along with given ratings:

In [None]:
# interactions data
interactions = read_csv_from_gdrive(RATINGS_SMALL_URL)
interactions.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


`movies_metadata` dataset shows the list of movies existing on OKKO platform:

In [None]:
# information about films etc
movies_metadata = read_csv_from_gdrive(MOVIES_METADATA_URL)
movies_metadata.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [None]:
movies_metadata['id'] = movies_metadata['id'].astype(str)
interactions['movieId'] = interactions['movieId'].astype(str)

In [None]:
movies_metadata.rename(columns = {'id': 'movieId'}, inplace = True)

In [None]:
# leave only those films that intersect with each other
interactions_filtered = interactions.loc[interactions['movieId'].isin(movies_metadata['movieId'])]
print(interactions.shape, interactions_filtered.shape)

(100004, 4) (44989, 4)


### 2.3.2 Data Preparation

In [None]:
TEST_SIZE = .25

In [None]:
# convert timestamp to date
interactions_filtered['dttm'] = interactions_filtered['timestamp']\
                                .apply(lambda x: pd.to_datetime(dt.datetime.fromtimestamp(x).strftime('%Y-%m-%d')))

Let's calculate some basic features, but keep in mind that our data of historical ratings depends on time.
We need to avoid data leak -- use future values in past data

In [None]:
ITEM_FEATURES_LIST = ['revenue', 'budget', 'runtime']

In [None]:
# calculate avg ratings by users and items daily
daily_users_feature = interactions_filtered.groupby(['userId', 'dttm']).agg({'rating': 'mean',
                                              'movieId': 'count'})\
                                  .reset_index().sort_values(['userId', 'dttm'])\
                                  .rename(columns = {'rating': 'user_mean_rating',
                                                     'movieId': 'user_watch_count'})


daily_users_feature['dttm'] = daily_users_feature['dttm'].apply(lambda x: x + dt.timedelta(days = 1))
daily_users_feature.loc[daily_users_feature['userId'] == 3]

Unnamed: 0,userId,dttm,user_mean_rating,user_watch_count
2,3,2011-03-01,3.54,25


In [None]:
daily_users_feature.loc[daily_users_feature['userId'] == 671]

Unnamed: 0,userId,dttm,user_mean_rating,user_watch_count
3224,671,2003-09-15,3.78125,16
3225,671,2003-09-23,4.428571,7
3226,671,2003-10-01,4.230769,13
3227,671,2003-10-03,4.0,4
3228,671,2003-10-04,4.0,6
3229,671,2003-10-23,4.0,1


In [None]:
cumulative_total_cnt = daily_users_feature.set_index('dttm').groupby(['userId'])['user_watch_count']\
                        .rolling(window = 3, min_periods = 1).sum()\
                        .reset_index()[['userId', 'dttm', 'user_watch_count']]\
                        .rename(columns = {'user_watch_count': 'user_total_watch_count_last_3_days'})

In [None]:
cumulative_total_cnt

Unnamed: 0,userId,dttm,user_total_watch_count_last_3_days
0,383,1995-01-10,1.0
1,409,1996-03-31,36.0
2,224,1996-03-31,64.0
3,224,1996-04-07,65.0
4,511,1996-04-13,24.0
...,...,...,...
3225,73,2016-10-11,3.0
3226,251,2016-10-15,13.0
3227,251,2016-10-16,37.0
3228,251,2016-10-17,40.0


In [None]:
# merge item features
main_df = pd.merge(
    interactions_filtered, movies_metadata[['movieId']+ ITEM_FEATURES_LIST],
    how = 'left', on = 'movieId'
                   ).drop_duplicates().reset_index(drop = True)
assert main_df.shape[0] == interactions_filtered.shape[0]

In [None]:
main_df = main_df.sort_values('dttm').reset_index(drop = True)
daily_users_feature = daily_users_feature.sort_values('dttm').reset_index(drop = True)
cumulative_total_cnt = cumulative_total_cnt.sort_values('dttm').reset_index(drop = True)

In [None]:
# merge user features with watch count
main_df = pd.merge_asof(
    main_df, daily_users_feature,
    on = 'dttm', by = 'userId',
    direction = 'backward',
    allow_exact_matches = True
    )
assert main_df.shape[0] == interactions_filtered.shape[0]

In [None]:
main_df = pd.merge_asof(
    main_df, cumulative_total_cnt,
    on = 'dttm', by = 'userId',
    direction = 'backward',
    allow_exact_matches = True
    )
assert main_df.shape[0] == interactions_filtered.shape[0]

In [None]:
cumulative_total_cnt.loc[cumulative_total_cnt['userId'] == 671]

Unnamed: 0,userId,dttm,user_total_watch_count_last_3_days
1120,671,2003-09-15,16.0
1124,671,2003-09-23,23.0
1127,671,2003-10-01,36.0
1130,671,2003-10-03,24.0
1132,671,2003-10-04,23.0
1136,671,2003-10-23,11.0


In [None]:
# tmp  = main_df.loc[main_df['userId'] == 671][['userId', 'dttm']]
# pd.merge_asof(
#     tmp.sort_values('dttm'), cumulative_total_cnt.sort_values('dttm'),
#     on = 'dttm', by = 'userId', direction = 'backward',
#     allow_exact_matches = True).sort_values('dttm')

In [None]:
# anyway we left some NaN
main_df.isnull().sum() / len(main_df) 

userId                                0.000000
movieId                               0.000000
rating                                0.000000
timestamp                             0.000000
dttm                                  0.000000
revenue                               0.000000
budget                                0.000000
runtime                               0.000089
user_mean_rating                      0.624797
user_watch_count                      0.624797
user_total_watch_count_last_3_days    0.624797
dtype: float64

In [None]:
FINAL_FEATURES_LIST = ['revenue', 'budget', 'runtime', 'user_mean_rating',
                       'user_watch_count', 'user_total_watch_count_last_3_days']

In [None]:
ID_COLS = ['userId', 'movieId']

In [None]:
TARGET = 'rating'

In [None]:
X = main_df[ID_COLS + FINAL_FEATURES_LIST]
y = main_df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size = TEST_SIZE,
    random_state = RANDOM_STATE)

print(f'Shape of train set X, y: {X_train.shape}, {len(y_train)}')
print(f'Shape of train set X, y: {X_test.shape}, {len(y_test)}')

Shape of train set X, y: (33741, 8), 33741
Shape of train set X, y: (11248, 8), 11248


### 2.3.3. Train Model

In [None]:
# init model
model = CatBoostRegressor(
    loss_function = 'MAE',
    iterations = 2000,
    learning_rate = 0.1,
    depth = 6,
    verbose = False
)

In [None]:
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds = 20 # to avoid overfitting,
)

<catboost.core.CatBoostRegressor at 0x7f7d29328730>

In [None]:
model.predict(X_test)

array([3.10498881, 3.71613197, 3.73355612, ..., 3.83442885, 3.68219329,
       4.07859838])

In [None]:
model.best_score_

{'learn': {'MAE': 0.6742540194627192},
 'validation': {'MAE': 0.7266061571016419}}

In [None]:
model

<catboost.core.CatBoostRegressor at 0x7f7d29328730>

# TODO
- Add baseline comparison from the model (well, we discussed what is baseline for MAE metric -- now, you have to define, how you are going to calculate it)

In [None]:
main_df

Unnamed: 0,userId,movieId,rating,timestamp,dttm,revenue,budget,runtime,user_mean_rating,user_watch_count,user_total_watch_count_last_3_days
0,383,21,3.0,789652009,1995-01-09,0.0,0,95.0,,,
1,224,427,3.0,828214011,1996-03-30,0.0,0,110.0,,,
2,224,335,4.0,828214012,1996-03-30,5321508.0,5000000,175.0,,,
3,224,337,4.0,828214012,1996-03-30,11576431.0,0,94.0,,,
4,224,339,4.0,828214012,1996-03-30,2015810.0,3500000,129.0,,,
...,...,...,...,...,...,...,...,...,...,...,...
44984,251,551,5.0,1476551142,2016-10-15,84563118.0,5000000,117.0,4.461538,13.0,13.0
44985,251,44191,3.0,1476623070,2016-10-16,0.0,945000,92.0,4.583333,24.0,37.0
44986,251,81847,4.5,1476623282,2016-10-16,0.0,0,108.0,4.583333,24.0,37.0
44987,251,1265,4.0,1476622980,2016-10-16,137587063.0,60000000,96.0,4.583333,24.0,37.0


In [96]:
#according to baseline median
baseline_mae = main_df.groupby('userId')['user_mean_rating'].median().median()
baseline_mae

3.7142857142857144

In [100]:
#take validation score out of json format and calculate relative value, compare with model result
MAE_v = model.best_score_['validation']['MAE']
MAE_vs= MAE_v / baseline_mae
print(MAE_v)
print(MAE_vs)

0.7266061571016419
0.1956247346042882
