In [None]:
from split import leave_one_out_split, remove_last_item, get_last_item
from make_features import make_features_before, make_features_after
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.metrics import roc_auc_score

In [None]:
# Load the two test datasets from parquet files given by relative path.
# NOTE(review): only `smm` is used in the cells visible below; confirm `zvuk` is needed.
zvuk = pd.read_parquet('test_zvuk.parquet')
smm = pd.read_parquet('test_smm.parquet')

In [None]:
# Candidate rankings: one row per (user_id, item_id) with a `prediction` score.
# The first CSV column is used as the row index.
# NOTE(review): the file name suggests these scores come from a SASRec model — confirm.
ranks = pd.read_csv('SASRec_ranks-2.csv', index_col=0)
ranks.head()

Unnamed: 0,user_id,item_id,prediction
0,1000010,35107,10.983168
1,1000010,2259,10.869387
2,1000010,7748,10.736988
3,1000010,112389,10.6442
4,1000010,114081,10.595426


In [None]:
# Run the pre-split feature step on `smm`, then split it with validation_size=0.
# The middle element of the returned triple is discarded.
# NOTE(review): presumably that element is the (empty) validation part — confirm in split.py.
train_smm, _, test_smm = leave_one_out_split(make_features_before(smm), validation_size=0)

In [None]:
def add_target(train: pd.DataFrame, test: pd.DataFrame) -> pd.DataFrame:
    """Label each candidate row with whether it is a held-out item of its user.

    Args:
        train: Candidate rows; must contain ``user_id`` and ``item_id`` columns.
            Any index is accepted and is preserved in the result.
        test: Frame passed to ``get_last_item`` to obtain the positive
            (user_id, item_id) pairs.

    Returns:
        A new frame equal to ``train`` plus an integer ``target`` column that
        is 1 where the row's (user_id, item_id) pair is among the pairs
        returned by ``get_last_item(test)`` and 0 otherwise. ``train`` itself
        is left unmodified.
    """
    keys = ['user_id', 'item_id']

    # De-duplicate the positive pairs so the left join below cannot multiply
    # candidate rows; only the key columns are needed for the membership test.
    positives = get_last_item(test)[keys].drop_duplicates()

    merged = train[keys].merge(positives, on=keys, how='left', indicator=True)

    # `merged` carries a fresh RangeIndex, so the flags are attached by
    # position (NumPy array) rather than by index label; label alignment would
    # mislabel rows whenever `train` has a non-default index.
    is_positive = (merged['_merge'] == 'both').to_numpy().astype(int)

    return train.assign(target=is_positive)

In [None]:
# Attach the `target` label to the ranked candidates, using the last items taken from `test_smm`.
output = add_target(ranks, test_smm)

In [None]:
# Run the post-split feature step on the labelled candidates.
# NOTE(review): the columns added are decided inside make_features.py and are not visible here.
output_featured = make_features_after(output)

In [None]:
# Preview the first rows of the featured frame.
output_featured.head()

Unnamed: 0,user_id,item_id,prediction,target,rank
1995750,3298628,56394,15.52358,0,1
1938050,3231979,153868,15.092383,0,1
1938051,3231979,74961,15.04201,0,2
6936450,8941978,125722,15.023755,0,1
5043050,6779197,36553,14.931406,0,1


In [None]:
# CatBoost classifier: 100 boosting iterations, learning rate 0.1, fixed seed for repeatable runs.
model = cb.CatBoostClassifier(iterations=100, learning_rate=0.1, random_seed=42)

In [None]:
def train_model(X_train, y_train, model):
    """Fit ``model`` on the given features and labels and hand it back.

    Args:
        X_train: Feature data forwarded as the first argument to ``model.fit``.
        y_train: Labels forwarded as the second argument to ``model.fit``.
        model: Any object exposing ``fit(X, y)``.

    Returns:
        The same ``model`` object that was passed in, after ``fit`` has been
        called on it (the return value of ``fit`` itself is ignored).
    """
    estimator = model
    estimator.fit(X_train, y_train)
    return estimator

In [None]:
def predict(X_test, model):
    """Score every row of ``X_test`` with the positive-class probability.

    Args:
        X_test: Feature frame accepted by ``model.predict_proba``.
        model: Fitted classifier exposing ``predict_proba`` that returns one
            column per class.

    Returns:
        A copy of ``X_test`` with an added ``score`` column holding the
        probability from column 1 of ``predict_proba``. ``X_test`` itself is
        not modified, so it can be passed to the model again afterwards.
    """
    proba = model.predict_proba(X_test)

    # Work on a copy: adding `score` to the caller's frame would turn it into
    # an extra feature column on the next predict_proba call.
    scored = X_test.copy()

    # Column 1 is the probability of the positive class; column 0 is the
    # negative class and would invert the ranking.
    # Assumes a binary classifier whose class order is [0, 1].
    scored['score'] = proba[:, 1]
    return scored

In [None]:
def predict_list(X_test, model, k=10):
    """Build a top-``k`` recommendation list per user from model scores.

    Args:
        X_test: Feature frame accepted by ``model.predict_proba``; must
            contain ``user_id`` and ``item_id`` columns.
        model: Fitted classifier exposing ``predict_proba`` that returns one
            column per class.
        k: Maximum number of items kept per user (default 10).

    Returns:
        DataFrame with one row per user: ``user_id`` and ``item_id``, where
        ``item_id`` is a list of that user's items ordered by descending
        positive-class probability. ``X_test`` itself is not modified.
    """
    # Score on a copy so the caller's feature frame does not gain a column.
    scored = X_test.copy()

    # Column 1 is the positive-class probability; sorting on column 0 would
    # return each user's least likely items.
    # Assumes a binary classifier whose class order is [0, 1].
    scored['score'] = model.predict_proba(X_test)[:, 1]

    # One global stable sort, then take the first k rows of every user.
    ranked = scored.sort_values('score', ascending=False, kind='stable')
    top_k = ranked.groupby('user_id').head(k)
    per_user = top_k.groupby('user_id')['item_id'].apply(list)

    return pd.DataFrame({'user_id': per_user.index, 'item_id': per_user.values})

In [None]:
# Separate the label column from the rest of the featured frame:
# `y_train` is the `target` column, `X_train` is every other column.
y_train = output_featured['target']
X_train = output_featured.drop(columns='target')

In [None]:
# Fit the classifier on the full training frame; `model` is rebound to the value train_model returns.
model = train_model(X_train, y_train, model)

0:	learn: 0.5992761	total: 764ms	remaining: 1m 15s
1:	learn: 0.5227166	total: 1.31s	remaining: 1m 4s
2:	learn: 0.4593001	total: 1.88s	remaining: 1m
3:	learn: 0.4053060	total: 2.44s	remaining: 58.5s
4:	learn: 0.3592009	total: 2.88s	remaining: 54.7s
5:	learn: 0.3195233	total: 3.4s	remaining: 53.3s
6:	learn: 0.2851730	total: 4s	remaining: 53.1s
7:	learn: 0.2552152	total: 4.58s	remaining: 52.7s
8:	learn: 0.2291313	total: 5.11s	remaining: 51.6s
9:	learn: 0.2061326	total: 5.66s	remaining: 50.9s
10:	learn: 0.1858297	total: 6.19s	remaining: 50.1s
11:	learn: 0.1679449	total: 6.58s	remaining: 48.2s
12:	learn: 0.1520727	total: 7.14s	remaining: 47.8s
13:	learn: 0.1380449	total: 7.65s	remaining: 47s
14:	learn: 0.1255415	total: 8.21s	remaining: 46.5s
15:	learn: 0.1144395	total: 8.77s	remaining: 46s
16:	learn: 0.1045341	total: 9.22s	remaining: 45s
17:	learn: 0.0956989	total: 9.7s	remaining: 44.2s
18:	learn: 0.0878246	total: 10.2s	remaining: 43.7s
19:	learn: 0.0807786	total: 10.8s	remaining: 43.1s
20:

In [None]:
# Score the same rows the model was fitted on, so any metric computed from `predicted` is in-sample.
predicted = predict(X_train, model)

In [None]:
# Display the scored rows in index order (the sorted frame is shown, not stored).
predicted.sort_index()

Unnamed: 0,user_id,item_id,prediction,sequence_length,freq_item,freq_user_mean,freq_user_max,freq_user_quantile_25,freq_user_quantile_50,freq_user_quantile_75,rank,score
0,1000010,35107,10.983168,50,0.000098,0.000319,0.001337,0.000102,0.000235,0.000389,1,0.954032
1,1000010,2259,10.869387,50,0.001337,0.000319,0.001337,0.000102,0.000235,0.000389,2,0.976395
2,1000010,7748,10.736988,50,0.000217,0.000319,0.001337,0.000102,0.000235,0.000389,3,0.976813
3,1000010,112389,10.644200,50,0.000301,0.000319,0.001337,0.000102,0.000235,0.000389,4,0.983519
4,1000010,114081,10.595426,50,0.000210,0.000319,0.001337,0.000102,0.000235,0.000389,5,0.983593
...,...,...,...,...,...,...,...,...,...,...,...,...
7862695,9999982,102290,8.284018,50,0.000098,0.000674,0.002772,0.000322,0.000497,0.000900,46,0.996921
7862696,9999982,56543,8.247368,50,0.001351,0.000674,0.002772,0.000322,0.000497,0.000900,47,0.997553
7862697,9999982,56929,8.246140,50,0.000203,0.000674,0.002772,0.000322,0.000497,0.000900,48,0.997338
7862698,9999982,37454,8.241319,50,0.000343,0.000674,0.002772,0.000322,0.000497,0.000900,49,0.997516


In [None]:
# In-sample ROC AUC of `score` against the training labels. Both operands are
# put in index order first so labels and scores line up row by row.
# NOTE: a value well below 0.5 means the score is anti-correlated with the
# label (for example, the wrong probability column was used as the score).
roc_auc_score(
    y_train.sort_index(),
    predicted['score'].sort_index(),
)

0.21731249971387528