In [1]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error
import pickle

In [2]:
# Load the precomputed embedding matrix from disk; further down it is turned
# into a DataFrame (jokes_df) whose rows are matched against JID.
# NOTE(review): pickle.load can execute arbitrary code while deserialising, so
# this file must come from a trusted source.
with open('track_embeddings.pkl', 'rb') as f:
    track_embeddings = pickle.load(f)

In [3]:
# Quick look at the loaded embedding matrix.
track_embeddings

array([[-0.12850836,  0.05144424,  0.1324376 , ..., -0.0701643 ,
        -0.11606281, -0.1602558 ],
       [-0.12560181,  0.09551706,  0.10573198, ..., -0.11453907,
        -0.09670042, -0.14838946],
       [-0.13176899,  0.05360903,  0.10417848, ..., -0.09779049,
        -0.11299129, -0.15423141],
       ...,
       [-0.11718541,  0.15378292,  0.12776648, ..., -0.05870862,
        -0.10497359, -0.161321  ],
       [-0.10790123,  0.20756178,  0.10939675, ...,  0.12004782,
        -0.09874647, -0.13628687],
       [-0.12471136,  0.02359954,  0.11788659, ..., -0.09589922,
        -0.10256677, -0.17452659]], dtype=float32)

In [4]:
# Read the rating data. Forward slashes are used because they resolve on
# Windows as well as on POSIX systems, whereas a backslash-separated path is
# only understood on Windows.
train_joke_df = pd.read_csv('../data/recsys-in-practice/train_joke_df.csv')
# index_col=0 turns the first CSV column of the test file into the frame index.
test_joke_df_nofactrating = pd.read_csv(
    '../data/recsys-in-practice/test_joke_df_nofactrating.csv',
    index_col=0,
)

In [5]:
# Cast both id columns to integers with a single astype call.
train_joke_df = train_joke_df.astype({"UID": int, "JID": int})

In [6]:
# Move both id columns down by one so that JID can be matched against the
# 0-based index of the embedding frame built below.
# NOTE(review): not idempotent - every re-run of this cell shifts the ids again.
train_joke_df[["UID", "JID"]] = train_joke_df[["UID", "JID"]] - 1

In [7]:
# Wrap the embedding matrix in a DataFrame: rows are labelled 0..n-1 (the keys
# later matched against JID) and columns are named j_emb_0, j_emb_1, ...
# The number of columns is read from the array itself rather than fixed at 32,
# so an embedding file with a different width no longer raises a shape error.
jokes_df = pd.DataFrame(
    track_embeddings,
    index=range(track_embeddings.shape[0]),
    columns=[f'j_emb_{i}' for i in range(track_embeddings.shape[1])],
)
jokes_df

Unnamed: 0,j_emb_0,j_emb_1,j_emb_2,j_emb_3,j_emb_4,j_emb_5,j_emb_6,j_emb_7,j_emb_8,j_emb_9,...,j_emb_22,j_emb_23,j_emb_24,j_emb_25,j_emb_26,j_emb_27,j_emb_28,j_emb_29,j_emb_30,j_emb_31
0,-0.128508,0.051444,0.132438,-0.037298,-0.201775,-0.069636,-0.113742,-0.024279,-0.029537,-0.053748,...,-0.196729,-0.140467,0.230395,-0.003536,-0.060885,-0.165479,-0.068899,-0.070164,-0.116063,-0.160256
1,-0.125602,0.095517,0.105732,-0.028557,-0.183307,-0.077972,-0.131677,0.055064,0.003441,-0.082902,...,-0.188492,-0.215398,0.215273,0.228148,-0.045374,-0.168430,-0.083771,-0.114539,-0.096700,-0.148389
2,-0.131769,0.053609,0.104178,-0.046955,-0.200002,0.016845,-0.251092,0.009414,-0.036045,-0.055575,...,-0.190417,-0.176878,0.221158,-0.021510,-0.072628,-0.171978,-0.068281,-0.097790,-0.112991,-0.154231
3,-0.104805,0.194723,0.106647,-0.044433,-0.163179,-0.154253,-0.264780,0.097174,-0.029896,-0.076351,...,-0.150741,-0.001954,0.195477,0.139092,-0.032957,-0.137452,-0.067610,-0.081869,-0.098548,-0.121358
4,-0.110405,0.015444,0.141151,-0.056691,-0.191257,0.052613,0.081804,0.049767,-0.012462,-0.087378,...,-0.190938,-0.172349,0.216513,0.333815,-0.054110,-0.173943,-0.054134,-0.013408,-0.104885,-0.163853
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.142546,0.170686,0.125764,-0.063433,-0.220362,-0.036655,0.155781,0.014916,-0.052736,-0.086768,...,-0.207480,0.127158,0.234253,-0.107324,-0.099028,-0.204563,-0.098598,0.085497,-0.128372,-0.155286
96,-0.134356,0.132201,0.131486,-0.014839,-0.215539,0.029771,0.132847,0.096979,-0.090910,-0.071125,...,-0.202462,0.137410,0.239188,-0.027634,-0.074429,-0.189961,-0.078969,0.093237,-0.127856,-0.168573
97,-0.117185,0.153783,0.127766,-0.031634,-0.203616,-0.118698,0.015570,0.082631,-0.013541,-0.076063,...,-0.190877,-0.027533,0.231235,0.337730,-0.040640,-0.173552,-0.079145,-0.058709,-0.104974,-0.161321
98,-0.107901,0.207562,0.109397,-0.037075,-0.197782,-0.025386,0.032928,0.020050,-0.057390,-0.064281,...,-0.178827,0.125883,0.221786,-0.048951,-0.055854,-0.173269,-0.081489,0.120048,-0.098746,-0.136287


In [8]:
# Attach the embedding columns to every rating row by matching JID against the
# index of jokes_df; the inner join drops ratings that have no embedding row.
# NOTE(review): train_joke_df is rebound here, so re-running this cell merges
# the embeddings a second time.
train_joke_df = pd.merge(
    train_joke_df,
    jokes_df,
    how='inner',
    left_on='JID',
    right_index=True,
)
train_joke_df

Unnamed: 0,UID,JID,Rating,j_emb_0,j_emb_1,j_emb_2,j_emb_3,j_emb_4,j_emb_5,j_emb_6,...,j_emb_22,j_emb_23,j_emb_24,j_emb_25,j_emb_26,j_emb_27,j_emb_28,j_emb_29,j_emb_30,j_emb_31
0,18028,5,-1.26,-0.136766,-0.013906,0.140304,-0.045568,-0.215760,0.001899,-0.029915,...,-0.213626,-0.049605,0.246616,-0.196301,-0.056399,-0.182334,-0.088423,-0.072930,-0.110614,-0.178068
17,5620,5,3.16,-0.136766,-0.013906,0.140304,-0.045568,-0.215760,0.001899,-0.029915,...,-0.213626,-0.049605,0.246616,-0.196301,-0.056399,-0.182334,-0.088423,-0.072930,-0.110614,-0.178068
101,17250,5,-3.74,-0.136766,-0.013906,0.140304,-0.045568,-0.215760,0.001899,-0.029915,...,-0.213626,-0.049605,0.246616,-0.196301,-0.056399,-0.182334,-0.088423,-0.072930,-0.110614,-0.178068
142,2785,5,6.36,-0.136766,-0.013906,0.140304,-0.045568,-0.215760,0.001899,-0.029915,...,-0.213626,-0.049605,0.246616,-0.196301,-0.056399,-0.182334,-0.088423,-0.072930,-0.110614,-0.178068
273,14078,5,2.09,-0.136766,-0.013906,0.140304,-0.045568,-0.215760,0.001899,-0.029915,...,-0.213626,-0.049605,0.246616,-0.196301,-0.056399,-0.182334,-0.088423,-0.072930,-0.110614,-0.178068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1447099,22355,73,1.89,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1447290,4233,73,7.04,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1447538,16114,73,0.34,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1447727,11578,73,6.17,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149


In [9]:
# Display the merged training frame (same content as the previous cell's output).
train_joke_df

Unnamed: 0,UID,JID,Rating,j_emb_0,j_emb_1,j_emb_2,j_emb_3,j_emb_4,j_emb_5,j_emb_6,...,j_emb_22,j_emb_23,j_emb_24,j_emb_25,j_emb_26,j_emb_27,j_emb_28,j_emb_29,j_emb_30,j_emb_31
0,18028,5,-1.26,-0.136766,-0.013906,0.140304,-0.045568,-0.215760,0.001899,-0.029915,...,-0.213626,-0.049605,0.246616,-0.196301,-0.056399,-0.182334,-0.088423,-0.072930,-0.110614,-0.178068
17,5620,5,3.16,-0.136766,-0.013906,0.140304,-0.045568,-0.215760,0.001899,-0.029915,...,-0.213626,-0.049605,0.246616,-0.196301,-0.056399,-0.182334,-0.088423,-0.072930,-0.110614,-0.178068
101,17250,5,-3.74,-0.136766,-0.013906,0.140304,-0.045568,-0.215760,0.001899,-0.029915,...,-0.213626,-0.049605,0.246616,-0.196301,-0.056399,-0.182334,-0.088423,-0.072930,-0.110614,-0.178068
142,2785,5,6.36,-0.136766,-0.013906,0.140304,-0.045568,-0.215760,0.001899,-0.029915,...,-0.213626,-0.049605,0.246616,-0.196301,-0.056399,-0.182334,-0.088423,-0.072930,-0.110614,-0.178068
273,14078,5,2.09,-0.136766,-0.013906,0.140304,-0.045568,-0.215760,0.001899,-0.029915,...,-0.213626,-0.049605,0.246616,-0.196301,-0.056399,-0.182334,-0.088423,-0.072930,-0.110614,-0.178068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1447099,22355,73,1.89,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1447290,4233,73,7.04,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1447538,16114,73,0.34,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1447727,11578,73,6.17,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149


In [10]:
# Inspect the last 30 rows of the merged training frame.
train_joke_df.tail(30)

Unnamed: 0,UID,JID,Rating,j_emb_0,j_emb_1,j_emb_2,j_emb_3,j_emb_4,j_emb_5,j_emb_6,...,j_emb_22,j_emb_23,j_emb_24,j_emb_25,j_emb_26,j_emb_27,j_emb_28,j_emb_29,j_emb_30,j_emb_31
1441135,17615,73,-0.34,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1441275,16474,73,5.34,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1441462,14862,73,1.36,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1441506,10654,73,-8.45,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1441597,4791,73,0.58,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1442211,14972,73,0.68,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1443225,9487,73,-3.69,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1443233,19759,73,2.96,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1443376,5465,73,-5.19,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149
1443425,18782,73,3.5,-0.095238,0.240438,0.100838,-0.034751,-0.153996,-0.005386,-0.057446,...,-0.160656,0.074651,0.171376,0.004046,-0.051499,-0.146442,-0.076224,0.033242,-0.095028,-0.138149


In [11]:
# Hold out 15% of the rating rows for validation; the fixed random_state makes
# the split reproducible.
train_df, valid_df = train_test_split(
    train_joke_df,
    random_state=42,
    test_size=0.15,
)

In [12]:
# Number of rows in the validation split.
len(valid_df)

217255

In [13]:
from catboost import CatBoostRanker, Pool, MetricVisualizer, CatBoostRegressor
from copy import deepcopy

In [14]:
# Preview the feature columns (everything except the 'Rating' target). The
# result is only displayed; train_df itself is not modified.
train_df.drop(columns=['Rating'])

Unnamed: 0,UID,JID,j_emb_0,j_emb_1,j_emb_2,j_emb_3,j_emb_4,j_emb_5,j_emb_6,j_emb_7,...,j_emb_22,j_emb_23,j_emb_24,j_emb_25,j_emb_26,j_emb_27,j_emb_28,j_emb_29,j_emb_30,j_emb_31
565431,2985,38,-0.116937,0.027235,0.129899,-0.063150,-0.206269,-0.071847,-0.080700,0.055479,...,-0.218777,-0.011786,0.245097,0.065426,-0.047003,-0.180288,-0.071317,-0.105609,-0.114432,-0.162669
1163650,17010,58,-0.117519,0.182689,0.107013,-0.056716,-0.183489,-0.023965,-0.123453,0.069680,...,-0.178031,0.005362,0.206191,-0.114193,-0.046412,-0.159581,-0.059557,-0.026490,-0.098173,-0.139039
943563,21660,18,-0.117487,-0.035880,0.119207,-0.091285,-0.200062,0.119184,0.155513,0.091004,...,-0.189732,0.063080,0.228304,-0.012873,-0.059411,-0.179101,-0.083584,0.053878,-0.111008,-0.157622
470159,19446,75,-0.148170,0.051722,0.126933,-0.044049,-0.232752,-0.025153,0.170213,0.041790,...,-0.210623,0.063418,0.260662,-0.024532,-0.083576,-0.203099,-0.092486,0.006496,-0.115705,-0.188261
389968,16045,0,-0.128508,0.051444,0.132438,-0.037298,-0.201775,-0.069636,-0.113742,-0.024279,...,-0.196729,-0.140467,0.230395,-0.003536,-0.060885,-0.165479,-0.068899,-0.070164,-0.116063,-0.160256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1295182,15648,92,-0.137781,0.075025,0.146826,-0.059047,-0.236010,0.053895,0.176647,0.013335,...,-0.220567,0.065307,0.267877,-0.041336,-0.080997,-0.216796,-0.085432,0.052500,-0.132884,-0.187302
1120866,9812,24,-0.116069,0.134366,0.127302,-0.041653,-0.202310,-0.081045,-0.160158,0.063144,...,-0.189329,-0.109325,0.217381,0.122356,-0.043624,-0.178114,-0.081048,-0.070252,-0.105782,-0.158204
933959,5520,43,-0.084254,0.192816,0.102560,-0.016394,-0.161025,-0.096250,-0.226555,0.003800,...,-0.154741,-0.017546,0.167537,-0.093553,-0.023358,-0.128348,-0.047547,-0.063137,-0.081193,-0.107191
1194805,17007,14,-0.099288,-0.276777,0.090556,-0.034549,-0.156977,0.030101,-0.048292,0.022177,...,-0.157225,-0.016791,0.192767,0.200782,-0.065240,-0.130816,-0.058631,0.022297,-0.085119,-0.125974


In [15]:
# Columns passed to every Pool as categorical features.
cat_features = ['UID', 'JID']

In [16]:
def _labelled_pool(frame):
    """Build a Pool from `frame`, using 'Rating' as the label and the rest as features."""
    return Pool(
        frame.drop(columns=['Rating']),
        label=frame['Rating'],
        cat_features=cat_features,
    )


# Training split, validation split, and the full labelled data set.
train_pool = _labelled_pool(train_df)
valid_pool = _labelled_pool(valid_df)
main_pool = _labelled_pool(train_joke_df)

# NOTE(review): at this point the test frame has not been merged with the
# embeddings, so this pool does not have the feature columns of the pools above.
test_pool = Pool(test_joke_df_nofactrating, cat_features=cat_features)

In [17]:
# Disabled draft: Pool construction with a group_id per row.
# NOTE(review): X_train, y_train, queries_train and their *_test counterparts
# are not defined anywhere in the visible notebook, so this cannot be
# re-enabled as-is.
#train = Pool(
#    data=X_train,
#    label=y_train,
#    group_id=queries_train,
#    cat_features=[0, 1]
#)

#test = Pool(
#    data=X_test,
#    label=y_test,
#    group_id=queries_test,
#    cat_features=[0, 1]
#)

In [18]:
# Settings passed to CatBoostRegressor below: 1000 boosting iterations, a fixed
# seed, and RMSE for the objective as well as for every reported metric.
# 'train_dir' is the directory name handed to CatBoost for its training files.
default_parameters = dict(
    iterations=1000,
    custom_metric='RMSE',
    random_seed=0,
    train_dir='RMSE',
    objective='RMSE',
    loss_function='RMSE',
    eval_metric='RMSE',
)


In [19]:
# Fit on the training split and track the metric on the validation split;
# plot=True renders the interactive training chart.
model = CatBoostRegressor(**default_parameters)
model.fit(
    train_pool,
    plot=True,
    eval_set=valid_pool,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.155622
0:	learn: 5.0633657	test: 5.0548218	best: 5.0548218 (0)	total: 491ms	remaining: 8m 10s
1:	learn: 4.9352515	test: 4.9153827	best: 4.9153827 (1)	total: 939ms	remaining: 7m 48s
2:	learn: 4.8381996	test: 4.8103559	best: 4.8103559 (2)	total: 1.24s	remaining: 6m 53s
3:	learn: 4.7652815	test: 4.7288588	best: 4.7288588 (3)	total: 1.56s	remaining: 6m 28s
4:	learn: 4.7092760	test: 4.6653058	best: 4.6653058 (4)	total: 1.74s	remaining: 5m 46s
5:	learn: 4.6656000	test: 4.6150856	best: 4.6150856 (5)	total: 2.03s	remaining: 5m 36s
6:	learn: 4.6327270	test: 4.5778623	best: 4.5778623 (6)	total: 2.18s	remaining: 5m 9s
7:	learn: 4.6059856	test: 4.5461219	best: 4.5461219 (7)	total: 2.56s	remaining: 5m 18s
8:	learn: 4.5874769	test: 4.5241504	best: 4.5241504 (8)	total: 2.72s	remaining: 4m 59s
9:	learn: 4.5719586	test: 4.5055810	best: 4.5055810 (9)	total: 3.06s	remaining: 5m 2s
10:	learn: 4.5593925	test: 4.4896412	best: 4.4896412 (10)	total: 3.41s	remaining: 5m 6s
11:	learn: 4.5

93:	learn: 4.4616124	test: 4.3624238	best: 4.3624238 (93)	total: 35.4s	remaining: 5m 41s
94:	learn: 4.4613581	test: 4.3620770	best: 4.3620770 (94)	total: 35.7s	remaining: 5m 40s
95:	learn: 4.4610972	test: 4.3617340	best: 4.3617340 (95)	total: 36s	remaining: 5m 39s
96:	learn: 4.4609197	test: 4.3614945	best: 4.3614945 (96)	total: 36.5s	remaining: 5m 40s
97:	learn: 4.4608157	test: 4.3613858	best: 4.3613858 (97)	total: 37s	remaining: 5m 40s
98:	learn: 4.4606627	test: 4.3612176	best: 4.3612176 (98)	total: 37.3s	remaining: 5m 39s
99:	learn: 4.4598015	test: 4.3598883	best: 4.3598883 (99)	total: 37.8s	remaining: 5m 40s
100:	learn: 4.4591666	test: 4.3589758	best: 4.3589758 (100)	total: 38.2s	remaining: 5m 40s
101:	learn: 4.4588507	test: 4.3584549	best: 4.3584549 (101)	total: 38.7s	remaining: 5m 40s
102:	learn: 4.4585678	test: 4.3580177	best: 4.3580177 (102)	total: 39s	remaining: 5m 39s
103:	learn: 4.4582605	test: 4.3575396	best: 4.3575396 (103)	total: 39.3s	remaining: 5m 38s
104:	learn: 4.45811

185:	learn: 4.4351197	test: 4.3253437	best: 4.3253437 (185)	total: 1m 10s	remaining: 5m 8s
186:	learn: 4.4348562	test: 4.3248078	best: 4.3248078 (186)	total: 1m 10s	remaining: 5m 8s
187:	learn: 4.4345841	test: 4.3244452	best: 4.3244452 (187)	total: 1m 11s	remaining: 5m 7s
188:	learn: 4.4345291	test: 4.3244109	best: 4.3244109 (188)	total: 1m 11s	remaining: 5m 6s
189:	learn: 4.4342995	test: 4.3241273	best: 4.3241273 (189)	total: 1m 11s	remaining: 5m 6s
190:	learn: 4.4341855	test: 4.3240920	best: 4.3240920 (190)	total: 1m 12s	remaining: 5m 6s
191:	learn: 4.4340540	test: 4.3239235	best: 4.3239235 (191)	total: 1m 12s	remaining: 5m 5s
192:	learn: 4.4339907	test: 4.3238832	best: 4.3238832 (192)	total: 1m 12s	remaining: 5m 4s
193:	learn: 4.4338521	test: 4.3236509	best: 4.3236509 (193)	total: 1m 13s	remaining: 5m 5s
194:	learn: 4.4337453	test: 4.3235198	best: 4.3235198 (194)	total: 1m 13s	remaining: 5m 4s
195:	learn: 4.4334309	test: 4.3230676	best: 4.3230676 (195)	total: 1m 14s	remaining: 5m 3s

KeyboardInterrupt: 

In [None]:
# NOTE(review): this cell always raises AssertionError; it looks like a
# deliberate stop so that "Run All" does not continue past this point -
# confirm, and remove it before sharing the notebook.
assert False

In [None]:
# RMSE of the current model on the validation split. The square root is taken
# explicitly because the `squared=False` flag of mean_squared_error is
# deprecated in scikit-learn and removed in later releases.
predict = model.predict(valid_pool)
print(np.sqrt(mean_squared_error(valid_df['Rating'].values, predict)))


In [None]:

# Re-read the test file so the following cells start from an untouched frame.
# Forward slashes keep the path valid on both Windows and POSIX systems.
test_joke_df_nofactrating = pd.read_csv(
    '../data/recsys-in-practice/test_joke_df_nofactrating.csv',
    index_col=0,
)

In [None]:
# Put the test ids on the same 0-based scale as the training ids before joining.
# The training frame had UID and JID cast to int and reduced by one ahead of its
# merge; without the same shift here every test row would pick up the embedding
# of the neighbouring joke and carry categorical ids that differ from training.
# NOTE(review): assumes the test file uses the same id numbering as the raw
# training file - confirm against the data.
# The left join keeps every test row, so the frame still has one row per
# requested prediction.
test_joke_df_nofactrating = (
    test_joke_df_nofactrating
    .assign(
        UID=lambda frame: frame['UID'].astype(int) - 1,
        JID=lambda frame: frame['JID'].astype(int) - 1,
    )
    .merge(jokes_df, how='left', left_on='JID', right_index=True)
)
test_joke_df_nofactrating

In [None]:

# Build the test pool from the feature columns only. 'Rating' is dropped when
# present because this cell writes its predictions back into the same frame;
# without the drop, a re-run would feed the previous predictions to the model
# as an extra feature.
test_pool = Pool(
    test_joke_df_nofactrating.drop(columns=['Rating'], errors='ignore'),
    cat_features=cat_features,
)

predict = model.predict(test_pool)

# Store the predictions next to the features and export them, keyed by the
# frame index, as a one-column CSV.
test_joke_df_nofactrating['Rating'] = predict

display(test_joke_df_nofactrating['Rating'].to_frame().head(5))
test_joke_df_nofactrating['Rating'].to_frame().to_csv('catboost_with_item_emb.csv')

In [None]:
# Display the test frame in its current state.
test_joke_df_nofactrating

In [None]:

# Show the first 30 values of the 'Rating' column of the test frame.
display(test_joke_df_nofactrating['Rating'].to_frame().head(30))

In [None]:
# Final model: refit on every labelled row (main_pool).
# The settings are copied from default_parameters, which held exactly the same
# values as the literal that used to be repeated here, so the two runs cannot
# drift apart.
parameters = dict(default_parameters)

model = CatBoostRegressor(**parameters)
# No eval_set is passed: valid_pool is a subset of main_pool, so monitoring it
# (and letting it pick the best iteration) would measure in-sample error rather
# than generalisation.
model.fit(main_pool, plot=True)

In [None]:
# RMSE of the current model on valid_df. The square root is taken explicitly
# because the `squared=False` flag of mean_squared_error is deprecated in
# scikit-learn and removed in later releases.
# NOTE(review): if the model was fitted on main_pool, this number is in-sample,
# because valid_df was split off the same frame main_pool was built from.
predict = model.predict(valid_pool)
print(np.sqrt(mean_squared_error(valid_df['Rating'].values, predict)))

In [None]:
# Score the test pool with the current model and keep the result in the frame.
predict = model.predict(test_pool)
test_joke_df_nofactrating['Rating'] = predict

# Preview the first rows, then export the one-column frame keyed by the index.
display(test_joke_df_nofactrating[['Rating']].head(5))
test_joke_df_nofactrating[['Rating']].to_csv('catboost_with_rating_and_item_emb.csv')