In [38]:
import pandas as pd
import numpy as np
from scipy import sparse
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tqdm.notebook import tqdm
import pickle

In [5]:
train_joke_df = pd.read_csv(r'..\data\recsys-in-practice\train_joke_df.csv')

joke_df = pd.read_csv(r'..\part1\doc2vec\joke_doc2vec_vectors.csv')

In [6]:
train_joke_df["UID"] = train_joke_df["UID"].astype(int)
train_joke_df["JID"] = train_joke_df["JID"].astype(int)

In [7]:
train_joke_df = train_joke_df.merge(joke_df, left_on='JID', right_index=True)
train_joke_df

Unnamed: 0,UID,JID,Rating,joke_feature_1,joke_feature_2,joke_feature_3,joke_feature_4,joke_feature_5,joke_feature_6,joke_feature_7,...,joke_feature_91,joke_feature_92,joke_feature_93,joke_feature_94,joke_feature_95,joke_feature_96,joke_feature_97,joke_feature_98,joke_feature_99,joke_feature_100
0,18029,6,-1.26,-0.036663,-0.067076,0.023498,-0.033260,0.055063,-0.029962,0.007167,...,-0.057065,-0.026336,-0.019614,-0.023196,0.053547,0.026594,0.018127,-0.023290,-0.042454,-0.008761
17,5621,6,3.16,-0.036663,-0.067076,0.023498,-0.033260,0.055063,-0.029962,0.007167,...,-0.057065,-0.026336,-0.019614,-0.023196,0.053547,0.026594,0.018127,-0.023290,-0.042454,-0.008761
101,17251,6,-3.74,-0.036663,-0.067076,0.023498,-0.033260,0.055063,-0.029962,0.007167,...,-0.057065,-0.026336,-0.019614,-0.023196,0.053547,0.026594,0.018127,-0.023290,-0.042454,-0.008761
142,2786,6,6.36,-0.036663,-0.067076,0.023498,-0.033260,0.055063,-0.029962,0.007167,...,-0.057065,-0.026336,-0.019614,-0.023196,0.053547,0.026594,0.018127,-0.023290,-0.042454,-0.008761
273,14079,6,2.09,-0.036663,-0.067076,0.023498,-0.033260,0.055063,-0.029962,0.007167,...,-0.057065,-0.026336,-0.019614,-0.023196,0.053547,0.026594,0.018127,-0.023290,-0.042454,-0.008761
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1447099,22356,74,1.89,0.004980,0.003973,-0.010549,-0.004713,0.029614,0.002287,-0.001566,...,-0.014091,-0.017337,-0.000019,-0.010038,0.009264,0.035359,0.022170,-0.013675,0.009155,-0.004111
1447290,4234,74,7.04,0.004980,0.003973,-0.010549,-0.004713,0.029614,0.002287,-0.001566,...,-0.014091,-0.017337,-0.000019,-0.010038,0.009264,0.035359,0.022170,-0.013675,0.009155,-0.004111
1447538,16115,74,0.34,0.004980,0.003973,-0.010549,-0.004713,0.029614,0.002287,-0.001566,...,-0.014091,-0.017337,-0.000019,-0.010038,0.009264,0.035359,0.022170,-0.013675,0.009155,-0.004111
1447727,11579,74,6.17,0.004980,0.003973,-0.010549,-0.004713,0.029614,0.002287,-0.001566,...,-0.014091,-0.017337,-0.000019,-0.010038,0.009264,0.035359,0.022170,-0.013675,0.009155,-0.004111


In [8]:
train_df, valid_df = train_test_split(train_joke_df, test_size=0.5, random_state=42)
train_df["UID"] = train_df["UID"].astype(int)
train_df["JID"] = train_df["JID"].astype(int)
valid_df["UID"] = valid_df["UID"].astype(int)
valid_df["JID"] = valid_df["JID"].astype(int)

In [9]:
# сделаем сортировку и перепишем index
train_df = train_df.sort_values(by=['UID', 'JID'])
train_df = train_df.reset_index(drop=True)

valid_df = valid_df.sort_values(by=['UID', 'JID'])
valid_df = valid_df.reset_index(drop=True)

In [10]:
from catboost import CatBoostRanker, Pool, MetricVisualizer, CatBoostRegressor
from copy import deepcopy

In [11]:
cat_features = ['UID', 'JID']

In [12]:
train_pool = Pool(train_df.drop(columns='Rating'), label=train_df['Rating'], group_id=train_df['UID'],cat_features=cat_features)
valid_pool = Pool(valid_df.drop(columns='Rating'), label=valid_df['Rating'], group_id=valid_df['UID'],cat_features=cat_features)

In [13]:
default_parameters = {
    'iterations': 2000,
    'custom_metric': 'RMSE',
    'random_seed': 0,
    'train_dir':'RMSE',
    'objective':'RMSE',
    'loss_function':'RMSE',
    'eval_metric':'RMSE'
}


In [14]:
model = CatBoostRegressor(**default_parameters)
model.fit(train_pool, eval_set=valid_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.093732
0:	learn: 5.1380028	test: 5.1243726	best: 5.1243726 (0)	total: 518ms	remaining: 17m 14s
1:	learn: 5.0564060	test: 5.0325277	best: 5.0325277 (1)	total: 688ms	remaining: 11m 27s
2:	learn: 4.9885391	test: 4.9537839	best: 4.9537839 (2)	total: 946ms	remaining: 10m 29s
3:	learn: 4.9296650	test: 4.8862962	best: 4.8862962 (3)	total: 1.11s	remaining: 9m 12s
4:	learn: 4.8803754	test: 4.8299843	best: 4.8299843 (4)	total: 1.41s	remaining: 9m 21s
5:	learn: 4.8378240	test: 4.7800292	best: 4.7800292 (5)	total: 1.57s	remaining: 8m 42s
6:	learn: 4.8025761	test: 4.7383124	best: 4.7383124 (6)	total: 1.69s	remaining: 8m 2s
7:	learn: 4.7727960	test: 4.7027357	best: 4.7027357 (7)	total: 1.85s	remaining: 7m 39s
8:	learn: 4.7464854	test: 4.6718082	best: 4.6718082 (8)	total: 2.01s	remaining: 7m 25s
9:	learn: 4.7249214	test: 4.6460182	best: 4.6460182 (9)	total: 2.16s	remaining: 7m 10s
10:	learn: 4.7059087	test: 4.6226851	best: 4.6226851 (10)	total: 2.33s	remaining: 7m
11:	learn: 4.

93:	learn: 4.5792831	test: 4.4546929	best: 4.4546929 (93)	total: 24s	remaining: 8m 6s
94:	learn: 4.5791196	test: 4.4545193	best: 4.4545193 (94)	total: 24.4s	remaining: 8m 9s
95:	learn: 4.5790884	test: 4.4544885	best: 4.4544885 (95)	total: 24.6s	remaining: 8m 8s
96:	learn: 4.5789407	test: 4.4542654	best: 4.4542654 (96)	total: 25s	remaining: 8m 11s
97:	learn: 4.5788246	test: 4.4541489	best: 4.4541489 (97)	total: 25.5s	remaining: 8m 14s
98:	learn: 4.5786842	test: 4.4539848	best: 4.4539848 (98)	total: 25.9s	remaining: 8m 16s
99:	learn: 4.5785954	test: 4.4539092	best: 4.4539092 (99)	total: 26.4s	remaining: 8m 21s
100:	learn: 4.5785293	test: 4.4538519	best: 4.4538519 (100)	total: 26.7s	remaining: 8m 21s
101:	learn: 4.5784346	test: 4.4537483	best: 4.4537483 (101)	total: 27.1s	remaining: 8m 24s
102:	learn: 4.5783110	test: 4.4536601	best: 4.4536601 (102)	total: 27.6s	remaining: 8m 28s
103:	learn: 4.5782119	test: 4.4535561	best: 4.4535561 (103)	total: 27.8s	remaining: 8m 27s
104:	learn: 4.578116

185:	learn: 4.5718755	test: 4.4479897	best: 4.4479897 (185)	total: 55.6s	remaining: 9m 2s
186:	learn: 4.5717826	test: 4.4479283	best: 4.4479283 (186)	total: 56.1s	remaining: 9m 4s
187:	learn: 4.5716975	test: 4.4478425	best: 4.4478425 (187)	total: 56.3s	remaining: 9m 3s
188:	learn: 4.5716814	test: 4.4478205	best: 4.4478205 (188)	total: 56.7s	remaining: 9m 2s
189:	learn: 4.5714498	test: 4.4475620	best: 4.4475620 (189)	total: 57.1s	remaining: 9m 4s
190:	learn: 4.5714068	test: 4.4475386	best: 4.4475386 (190)	total: 57.6s	remaining: 9m 5s
191:	learn: 4.5713684	test: 4.4475095	best: 4.4475095 (191)	total: 57.9s	remaining: 9m 5s
192:	learn: 4.5713324	test: 4.4474933	best: 4.4474933 (192)	total: 58.1s	remaining: 9m 3s
193:	learn: 4.5712692	test: 4.4474821	best: 4.4474821 (193)	total: 58.4s	remaining: 9m 3s
194:	learn: 4.5711827	test: 4.4473885	best: 4.4473885 (194)	total: 58.9s	remaining: 9m 5s
195:	learn: 4.5710953	test: 4.4473214	best: 4.4473214 (195)	total: 59.2s	remaining: 9m 5s
196:	learn

276:	learn: 4.5664205	test: 4.4440012	best: 4.4440012 (276)	total: 1m 26s	remaining: 8m 56s
277:	learn: 4.5663812	test: 4.4439303	best: 4.4439303 (277)	total: 1m 26s	remaining: 8m 57s
278:	learn: 4.5663469	test: 4.4439102	best: 4.4439102 (278)	total: 1m 26s	remaining: 8m 55s
279:	learn: 4.5663143	test: 4.4439111	best: 4.4439102 (278)	total: 1m 27s	remaining: 8m 56s
280:	learn: 4.5662915	test: 4.4438969	best: 4.4438969 (280)	total: 1m 27s	remaining: 8m 56s
281:	learn: 4.5662650	test: 4.4438792	best: 4.4438792 (281)	total: 1m 27s	remaining: 8m 55s
282:	learn: 4.5662523	test: 4.4438607	best: 4.4438607 (282)	total: 1m 28s	remaining: 8m 55s
283:	learn: 4.5661995	test: 4.4438130	best: 4.4438130 (283)	total: 1m 28s	remaining: 8m 54s
284:	learn: 4.5661215	test: 4.4438123	best: 4.4438123 (284)	total: 1m 28s	remaining: 8m 53s
285:	learn: 4.5660527	test: 4.4437825	best: 4.4437825 (285)	total: 1m 29s	remaining: 8m 55s
286:	learn: 4.5660152	test: 4.4437721	best: 4.4437721 (286)	total: 1m 29s	remain

366:	learn: 4.5621648	test: 4.4413507	best: 4.4413507 (366)	total: 1m 57s	remaining: 8m 41s
367:	learn: 4.5621264	test: 4.4413164	best: 4.4413164 (367)	total: 1m 57s	remaining: 8m 41s
368:	learn: 4.5621003	test: 4.4412991	best: 4.4412991 (368)	total: 1m 57s	remaining: 8m 41s
369:	learn: 4.5620746	test: 4.4412960	best: 4.4412960 (369)	total: 1m 58s	remaining: 8m 40s
370:	learn: 4.5620377	test: 4.4412931	best: 4.4412931 (370)	total: 1m 58s	remaining: 8m 40s
371:	learn: 4.5619478	test: 4.4412129	best: 4.4412129 (371)	total: 1m 59s	remaining: 8m 41s
372:	learn: 4.5618948	test: 4.4412125	best: 4.4412125 (372)	total: 1m 59s	remaining: 8m 41s
373:	learn: 4.5618364	test: 4.4412001	best: 4.4412001 (373)	total: 1m 59s	remaining: 8m 41s
374:	learn: 4.5618033	test: 4.4412055	best: 4.4412001 (373)	total: 2m	remaining: 8m 40s
375:	learn: 4.5617712	test: 4.4412032	best: 4.4412001 (373)	total: 2m	remaining: 8m 40s
376:	learn: 4.5617272	test: 4.4411929	best: 4.4411929 (376)	total: 2m	remaining: 8m 40s


456:	learn: 4.5579806	test: 4.4388035	best: 4.4388035 (456)	total: 2m 29s	remaining: 8m 25s
457:	learn: 4.5579093	test: 4.4387357	best: 4.4387357 (457)	total: 2m 29s	remaining: 8m 24s
458:	learn: 4.5578573	test: 4.4387399	best: 4.4387357 (457)	total: 2m 30s	remaining: 8m 23s
459:	learn: 4.5578361	test: 4.4387352	best: 4.4387352 (459)	total: 2m 30s	remaining: 8m 23s
460:	learn: 4.5577916	test: 4.4387111	best: 4.4387111 (460)	total: 2m 31s	remaining: 8m 24s
461:	learn: 4.5577758	test: 4.4387074	best: 4.4387074 (461)	total: 2m 31s	remaining: 8m 23s
462:	learn: 4.5577560	test: 4.4387011	best: 4.4387011 (462)	total: 2m 31s	remaining: 8m 23s
463:	learn: 4.5577327	test: 4.4387043	best: 4.4387011 (462)	total: 2m 31s	remaining: 8m 22s
464:	learn: 4.5577093	test: 4.4386933	best: 4.4386933 (464)	total: 2m 32s	remaining: 8m 22s
465:	learn: 4.5576823	test: 4.4386841	best: 4.4386841 (465)	total: 2m 32s	remaining: 8m 22s
466:	learn: 4.5576536	test: 4.4386661	best: 4.4386661 (466)	total: 2m 32s	remain

546:	learn: 4.5544655	test: 4.4368914	best: 4.4368914 (546)	total: 2m 59s	remaining: 7m 57s
547:	learn: 4.5544242	test: 4.4368805	best: 4.4368805 (547)	total: 3m	remaining: 7m 57s
548:	learn: 4.5543571	test: 4.4368347	best: 4.4368347 (548)	total: 3m	remaining: 7m 57s
549:	learn: 4.5543150	test: 4.4368064	best: 4.4368064 (549)	total: 3m	remaining: 7m 56s
550:	learn: 4.5542751	test: 4.4367930	best: 4.4367930 (550)	total: 3m 1s	remaining: 7m 57s
551:	learn: 4.5542524	test: 4.4367808	best: 4.4367808 (551)	total: 3m 1s	remaining: 7m 56s
552:	learn: 4.5542271	test: 4.4367676	best: 4.4367676 (552)	total: 3m 2s	remaining: 7m 56s
553:	learn: 4.5542144	test: 4.4367635	best: 4.4367635 (553)	total: 3m 2s	remaining: 7m 56s
554:	learn: 4.5541288	test: 4.4367449	best: 4.4367449 (554)	total: 3m 3s	remaining: 7m 56s
555:	learn: 4.5541256	test: 4.4367419	best: 4.4367419 (555)	total: 3m 3s	remaining: 7m 56s
556:	learn: 4.5540952	test: 4.4367266	best: 4.4367266 (556)	total: 3m 3s	remaining: 7m 56s
557:	le

636:	learn: 4.5510532	test: 4.4350561	best: 4.4350561 (636)	total: 3m 33s	remaining: 7m 37s
637:	learn: 4.5510290	test: 4.4350687	best: 4.4350561 (636)	total: 3m 34s	remaining: 7m 37s
638:	learn: 4.5510061	test: 4.4350693	best: 4.4350561 (636)	total: 3m 34s	remaining: 7m 37s
639:	learn: 4.5509611	test: 4.4350366	best: 4.4350366 (639)	total: 3m 34s	remaining: 7m 36s
640:	learn: 4.5509173	test: 4.4350240	best: 4.4350240 (640)	total: 3m 35s	remaining: 7m 36s
641:	learn: 4.5508806	test: 4.4350088	best: 4.4350088 (641)	total: 3m 35s	remaining: 7m 36s
642:	learn: 4.5508566	test: 4.4349980	best: 4.4349980 (642)	total: 3m 35s	remaining: 7m 35s
643:	learn: 4.5508401	test: 4.4350015	best: 4.4349980 (642)	total: 3m 36s	remaining: 7m 34s
644:	learn: 4.5508152	test: 4.4349941	best: 4.4349941 (644)	total: 3m 36s	remaining: 7m 34s
645:	learn: 4.5507766	test: 4.4349822	best: 4.4349822 (645)	total: 3m 36s	remaining: 7m 34s
646:	learn: 4.5507099	test: 4.4349256	best: 4.4349256 (646)	total: 3m 36s	remain

726:	learn: 4.5479446	test: 4.4334506	best: 4.4334506 (726)	total: 4m 5s	remaining: 7m 9s
727:	learn: 4.5479229	test: 4.4334587	best: 4.4334506 (726)	total: 4m 5s	remaining: 7m 9s
728:	learn: 4.5479075	test: 4.4334640	best: 4.4334506 (726)	total: 4m 6s	remaining: 7m 9s
729:	learn: 4.5478554	test: 4.4334474	best: 4.4334474 (729)	total: 4m 6s	remaining: 7m 8s
730:	learn: 4.5478146	test: 4.4334356	best: 4.4334356 (730)	total: 4m 6s	remaining: 7m 8s
731:	learn: 4.5477872	test: 4.4334260	best: 4.4334260 (731)	total: 4m 7s	remaining: 7m 8s
732:	learn: 4.5477551	test: 4.4334280	best: 4.4334260 (731)	total: 4m 7s	remaining: 7m 8s
733:	learn: 4.5477250	test: 4.4334266	best: 4.4334260 (731)	total: 4m 8s	remaining: 7m 8s
734:	learn: 4.5476889	test: 4.4334054	best: 4.4334054 (734)	total: 4m 8s	remaining: 7m 7s
735:	learn: 4.5476675	test: 4.4334090	best: 4.4334054 (734)	total: 4m 8s	remaining: 7m 7s
736:	learn: 4.5476406	test: 4.4334111	best: 4.4334054 (734)	total: 4m 9s	remaining: 7m 6s
737:	learn

816:	learn: 4.5446244	test: 4.4317361	best: 4.4317361 (816)	total: 4m 40s	remaining: 6m 45s
817:	learn: 4.5445916	test: 4.4317201	best: 4.4317201 (817)	total: 4m 40s	remaining: 6m 45s
818:	learn: 4.5445521	test: 4.4316875	best: 4.4316875 (818)	total: 4m 40s	remaining: 6m 44s
819:	learn: 4.5445312	test: 4.4316835	best: 4.4316835 (819)	total: 4m 41s	remaining: 6m 44s
820:	learn: 4.5445193	test: 4.4316892	best: 4.4316835 (819)	total: 4m 41s	remaining: 6m 43s
821:	learn: 4.5444872	test: 4.4316762	best: 4.4316762 (821)	total: 4m 41s	remaining: 6m 43s
822:	learn: 4.5444446	test: 4.4316717	best: 4.4316717 (822)	total: 4m 41s	remaining: 6m 43s
823:	learn: 4.5443580	test: 4.4315669	best: 4.4315669 (823)	total: 4m 42s	remaining: 6m 43s
824:	learn: 4.5443295	test: 4.4315496	best: 4.4315496 (824)	total: 4m 42s	remaining: 6m 42s
825:	learn: 4.5442597	test: 4.4314909	best: 4.4314909 (825)	total: 4m 43s	remaining: 6m 42s
826:	learn: 4.5442300	test: 4.4314853	best: 4.4314853 (826)	total: 4m 43s	remain

906:	learn: 4.5415456	test: 4.4302997	best: 4.4302997 (906)	total: 5m 13s	remaining: 6m 17s
907:	learn: 4.5414526	test: 4.4301958	best: 4.4301958 (907)	total: 5m 14s	remaining: 6m 17s
908:	learn: 4.5414329	test: 4.4301726	best: 4.4301726 (908)	total: 5m 14s	remaining: 6m 17s
909:	learn: 4.5414051	test: 4.4301720	best: 4.4301720 (909)	total: 5m 14s	remaining: 6m 17s
910:	learn: 4.5413760	test: 4.4301575	best: 4.4301575 (910)	total: 5m 15s	remaining: 6m 16s
911:	learn: 4.5413561	test: 4.4301398	best: 4.4301398 (911)	total: 5m 15s	remaining: 6m 16s
912:	learn: 4.5413496	test: 4.4301405	best: 4.4301398 (911)	total: 5m 15s	remaining: 6m 15s
913:	learn: 4.5413211	test: 4.4301370	best: 4.4301370 (913)	total: 5m 16s	remaining: 6m 15s
914:	learn: 4.5412970	test: 4.4301131	best: 4.4301131 (914)	total: 5m 16s	remaining: 6m 15s
915:	learn: 4.5412662	test: 4.4300984	best: 4.4300984 (915)	total: 5m 16s	remaining: 6m 14s
916:	learn: 4.5412510	test: 4.4300941	best: 4.4300941 (916)	total: 5m 17s	remain

996:	learn: 4.5388552	test: 4.4290419	best: 4.4290419 (996)	total: 5m 48s	remaining: 5m 50s
997:	learn: 4.5388057	test: 4.4290246	best: 4.4290246 (997)	total: 5m 48s	remaining: 5m 50s
998:	learn: 4.5387834	test: 4.4290253	best: 4.4290246 (997)	total: 5m 49s	remaining: 5m 49s
999:	learn: 4.5387664	test: 4.4290266	best: 4.4290246 (997)	total: 5m 49s	remaining: 5m 49s
1000:	learn: 4.5387294	test: 4.4290263	best: 4.4290246 (997)	total: 5m 49s	remaining: 5m 49s
1001:	learn: 4.5386967	test: 4.4290042	best: 4.4290042 (1001)	total: 5m 50s	remaining: 5m 48s
1002:	learn: 4.5386595	test: 4.4289738	best: 4.4289738 (1002)	total: 5m 50s	remaining: 5m 48s
1003:	learn: 4.5386309	test: 4.4289724	best: 4.4289724 (1003)	total: 5m 50s	remaining: 5m 48s
1004:	learn: 4.5385646	test: 4.4289177	best: 4.4289177 (1004)	total: 5m 51s	remaining: 5m 47s
1005:	learn: 4.5385465	test: 4.4289182	best: 4.4289177 (1004)	total: 5m 51s	remaining: 5m 47s
1006:	learn: 4.5385004	test: 4.4288722	best: 4.4288722 (1006)	total: 

1084:	learn: 4.5360613	test: 4.4276995	best: 4.4276995 (1084)	total: 6m 22s	remaining: 5m 22s
1085:	learn: 4.5360413	test: 4.4276965	best: 4.4276965 (1085)	total: 6m 22s	remaining: 5m 21s
1086:	learn: 4.5360172	test: 4.4276758	best: 4.4276758 (1086)	total: 6m 22s	remaining: 5m 21s
1087:	learn: 4.5359976	test: 4.4276803	best: 4.4276758 (1086)	total: 6m 22s	remaining: 5m 21s
1088:	learn: 4.5359855	test: 4.4276829	best: 4.4276758 (1086)	total: 6m 23s	remaining: 5m 20s
1089:	learn: 4.5359541	test: 4.4276771	best: 4.4276758 (1086)	total: 6m 23s	remaining: 5m 20s
1090:	learn: 4.5359333	test: 4.4276809	best: 4.4276758 (1086)	total: 6m 24s	remaining: 5m 20s
1091:	learn: 4.5359192	test: 4.4276736	best: 4.4276736 (1091)	total: 6m 24s	remaining: 5m 19s
1092:	learn: 4.5358734	test: 4.4276493	best: 4.4276493 (1092)	total: 6m 24s	remaining: 5m 19s
1093:	learn: 4.5358088	test: 4.4276133	best: 4.4276133 (1093)	total: 6m 25s	remaining: 5m 19s
1094:	learn: 4.5357800	test: 4.4276136	best: 4.4276133 (1093

1172:	learn: 4.5336020	test: 4.4266825	best: 4.4266825 (1172)	total: 6m 54s	remaining: 4m 52s
1173:	learn: 4.5335515	test: 4.4266667	best: 4.4266667 (1173)	total: 6m 55s	remaining: 4m 52s
1174:	learn: 4.5335066	test: 4.4266215	best: 4.4266215 (1174)	total: 6m 55s	remaining: 4m 51s
1175:	learn: 4.5334816	test: 4.4266074	best: 4.4266074 (1175)	total: 6m 56s	remaining: 4m 51s
1176:	learn: 4.5334422	test: 4.4266000	best: 4.4266000 (1176)	total: 6m 56s	remaining: 4m 51s
1177:	learn: 4.5334267	test: 4.4265974	best: 4.4265974 (1177)	total: 6m 57s	remaining: 4m 51s
1178:	learn: 4.5334079	test: 4.4265957	best: 4.4265957 (1178)	total: 6m 57s	remaining: 4m 50s
1179:	learn: 4.5333744	test: 4.4265840	best: 4.4265840 (1179)	total: 6m 57s	remaining: 4m 50s
1180:	learn: 4.5333182	test: 4.4265441	best: 4.4265441 (1180)	total: 6m 58s	remaining: 4m 50s
1181:	learn: 4.5333010	test: 4.4265490	best: 4.4265441 (1180)	total: 6m 58s	remaining: 4m 49s
1182:	learn: 4.5332566	test: 4.4265360	best: 4.4265360 (1182

1260:	learn: 4.5306525	test: 4.4253431	best: 4.4253431 (1260)	total: 7m 30s	remaining: 4m 23s
1261:	learn: 4.5305861	test: 4.4252868	best: 4.4252868 (1261)	total: 7m 30s	remaining: 4m 23s
1262:	learn: 4.5305687	test: 4.4252837	best: 4.4252837 (1262)	total: 7m 30s	remaining: 4m 23s
1263:	learn: 4.5305216	test: 4.4252829	best: 4.4252829 (1263)	total: 7m 31s	remaining: 4m 22s
1264:	learn: 4.5305065	test: 4.4252796	best: 4.4252796 (1264)	total: 7m 31s	remaining: 4m 22s
1265:	learn: 4.5304383	test: 4.4252234	best: 4.4252234 (1265)	total: 7m 32s	remaining: 4m 22s
1266:	learn: 4.5303663	test: 4.4251748	best: 4.4251748 (1266)	total: 7m 32s	remaining: 4m 21s
1267:	learn: 4.5303142	test: 4.4251568	best: 4.4251568 (1267)	total: 7m 33s	remaining: 4m 21s
1268:	learn: 4.5302989	test: 4.4251580	best: 4.4251568 (1267)	total: 7m 33s	remaining: 4m 21s
1269:	learn: 4.5302601	test: 4.4251399	best: 4.4251399 (1269)	total: 7m 33s	remaining: 4m 20s
1270:	learn: 4.5302377	test: 4.4251482	best: 4.4251399 (1269

1348:	learn: 4.5279475	test: 4.4239517	best: 4.4239517 (1348)	total: 8m 6s	remaining: 3m 54s
1349:	learn: 4.5279179	test: 4.4239430	best: 4.4239430 (1349)	total: 8m 7s	remaining: 3m 54s
1350:	learn: 4.5278698	test: 4.4238914	best: 4.4238914 (1350)	total: 8m 7s	remaining: 3m 54s
1351:	learn: 4.5278507	test: 4.4238765	best: 4.4238765 (1351)	total: 8m 8s	remaining: 3m 53s
1352:	learn: 4.5278280	test: 4.4238792	best: 4.4238765 (1351)	total: 8m 8s	remaining: 3m 53s
1353:	learn: 4.5278131	test: 4.4238678	best: 4.4238678 (1353)	total: 8m 8s	remaining: 3m 53s
1354:	learn: 4.5277936	test: 4.4238679	best: 4.4238678 (1353)	total: 8m 9s	remaining: 3m 52s
1355:	learn: 4.5277735	test: 4.4238504	best: 4.4238504 (1355)	total: 8m 9s	remaining: 3m 52s
1356:	learn: 4.5277581	test: 4.4238435	best: 4.4238435 (1356)	total: 8m 9s	remaining: 3m 52s
1357:	learn: 4.5277204	test: 4.4238258	best: 4.4238258 (1357)	total: 8m 10s	remaining: 3m 51s
1358:	learn: 4.5277095	test: 4.4238268	best: 4.4238258 (1357)	total: 

1436:	learn: 4.5256761	test: 4.4233422	best: 4.4233422 (1436)	total: 8m 39s	remaining: 3m 23s
1437:	learn: 4.5256372	test: 4.4233446	best: 4.4233422 (1436)	total: 8m 39s	remaining: 3m 22s
1438:	learn: 4.5256076	test: 4.4233395	best: 4.4233395 (1438)	total: 8m 39s	remaining: 3m 22s
1439:	learn: 4.5255602	test: 4.4233040	best: 4.4233040 (1439)	total: 8m 40s	remaining: 3m 22s
1440:	learn: 4.5255463	test: 4.4233020	best: 4.4233020 (1440)	total: 8m 40s	remaining: 3m 21s
1441:	learn: 4.5255285	test: 4.4233019	best: 4.4233019 (1441)	total: 8m 40s	remaining: 3m 21s
1442:	learn: 4.5255039	test: 4.4233004	best: 4.4233004 (1442)	total: 8m 41s	remaining: 3m 21s
1443:	learn: 4.5254659	test: 4.4233004	best: 4.4233004 (1442)	total: 8m 41s	remaining: 3m 20s
1444:	learn: 4.5254151	test: 4.4232736	best: 4.4232736 (1444)	total: 8m 42s	remaining: 3m 20s
1445:	learn: 4.5253738	test: 4.4232578	best: 4.4232578 (1445)	total: 8m 42s	remaining: 3m 20s
1446:	learn: 4.5253591	test: 4.4232622	best: 4.4232578 (1445

1524:	learn: 4.5229991	test: 4.4219011	best: 4.4219011 (1524)	total: 9m 13s	remaining: 2m 52s
1525:	learn: 4.5229876	test: 4.4219047	best: 4.4219011 (1524)	total: 9m 14s	remaining: 2m 52s
1526:	learn: 4.5229643	test: 4.4218787	best: 4.4218787 (1526)	total: 9m 14s	remaining: 2m 51s
1527:	learn: 4.5229428	test: 4.4218811	best: 4.4218787 (1526)	total: 9m 14s	remaining: 2m 51s
1528:	learn: 4.5228975	test: 4.4218668	best: 4.4218668 (1528)	total: 9m 15s	remaining: 2m 51s
1529:	learn: 4.5228809	test: 4.4218598	best: 4.4218598 (1529)	total: 9m 15s	remaining: 2m 50s
1530:	learn: 4.5228694	test: 4.4218557	best: 4.4218557 (1530)	total: 9m 16s	remaining: 2m 50s
1531:	learn: 4.5228603	test: 4.4218561	best: 4.4218557 (1530)	total: 9m 16s	remaining: 2m 49s
1532:	learn: 4.5228137	test: 4.4218263	best: 4.4218263 (1532)	total: 9m 16s	remaining: 2m 49s
1533:	learn: 4.5228059	test: 4.4218222	best: 4.4218222 (1533)	total: 9m 17s	remaining: 2m 49s
1534:	learn: 4.5227980	test: 4.4218196	best: 4.4218196 (1534

1612:	learn: 4.5207933	test: 4.4209356	best: 4.4209356 (1612)	total: 9m 46s	remaining: 2m 20s
1613:	learn: 4.5207820	test: 4.4209347	best: 4.4209347 (1613)	total: 9m 47s	remaining: 2m 20s
1614:	learn: 4.5207734	test: 4.4209384	best: 4.4209347 (1613)	total: 9m 47s	remaining: 2m 20s
1615:	learn: 4.5207585	test: 4.4209296	best: 4.4209296 (1615)	total: 9m 47s	remaining: 2m 19s
1616:	learn: 4.5207388	test: 4.4209217	best: 4.4209217 (1616)	total: 9m 48s	remaining: 2m 19s
1617:	learn: 4.5207333	test: 4.4209237	best: 4.4209217 (1616)	total: 9m 48s	remaining: 2m 18s
1618:	learn: 4.5206881	test: 4.4208705	best: 4.4208705 (1618)	total: 9m 49s	remaining: 2m 18s
1619:	learn: 4.5206656	test: 4.4208670	best: 4.4208670 (1619)	total: 9m 49s	remaining: 2m 18s
1620:	learn: 4.5206259	test: 4.4208625	best: 4.4208625 (1620)	total: 9m 49s	remaining: 2m 17s
1621:	learn: 4.5206057	test: 4.4208639	best: 4.4208625 (1620)	total: 9m 50s	remaining: 2m 17s
1622:	learn: 4.5205775	test: 4.4208659	best: 4.4208625 (1620

1700:	learn: 4.5183861	test: 4.4201273	best: 4.4201273 (1700)	total: 10m 19s	remaining: 1m 48s
1701:	learn: 4.5183215	test: 4.4200425	best: 4.4200425 (1701)	total: 10m 20s	remaining: 1m 48s
1702:	learn: 4.5182950	test: 4.4200342	best: 4.4200342 (1702)	total: 10m 20s	remaining: 1m 48s
1703:	learn: 4.5182636	test: 4.4200273	best: 4.4200273 (1703)	total: 10m 20s	remaining: 1m 47s
1704:	learn: 4.5182447	test: 4.4200167	best: 4.4200167 (1704)	total: 10m 21s	remaining: 1m 47s
1705:	learn: 4.5182166	test: 4.4200152	best: 4.4200152 (1705)	total: 10m 21s	remaining: 1m 47s
1706:	learn: 4.5182045	test: 4.4200180	best: 4.4200152 (1705)	total: 10m 21s	remaining: 1m 46s
1707:	learn: 4.5181801	test: 4.4200125	best: 4.4200125 (1707)	total: 10m 22s	remaining: 1m 46s
1708:	learn: 4.5181624	test: 4.4200110	best: 4.4200110 (1708)	total: 10m 22s	remaining: 1m 45s
1709:	learn: 4.5181397	test: 4.4199950	best: 4.4199950 (1709)	total: 10m 22s	remaining: 1m 45s
1710:	learn: 4.5181231	test: 4.4199978	best: 4.419

1788:	learn: 4.5159323	test: 4.4190933	best: 4.4190933 (1788)	total: 10m 54s	remaining: 1m 17s
1789:	learn: 4.5159063	test: 4.4191006	best: 4.4190933 (1788)	total: 10m 54s	remaining: 1m 16s
1790:	learn: 4.5158809	test: 4.4190952	best: 4.4190933 (1788)	total: 10m 55s	remaining: 1m 16s
1791:	learn: 4.5158667	test: 4.4190877	best: 4.4190877 (1791)	total: 10m 55s	remaining: 1m 16s
1792:	learn: 4.5158422	test: 4.4190743	best: 4.4190743 (1792)	total: 10m 56s	remaining: 1m 15s
1793:	learn: 4.5158264	test: 4.4190732	best: 4.4190732 (1793)	total: 10m 56s	remaining: 1m 15s
1794:	learn: 4.5157941	test: 4.4190585	best: 4.4190585 (1794)	total: 10m 56s	remaining: 1m 15s
1795:	learn: 4.5157418	test: 4.4190149	best: 4.4190149 (1795)	total: 10m 57s	remaining: 1m 14s
1796:	learn: 4.5157078	test: 4.4190067	best: 4.4190067 (1796)	total: 10m 57s	remaining: 1m 14s
1797:	learn: 4.5156999	test: 4.4190110	best: 4.4190067 (1796)	total: 10m 58s	remaining: 1m 13s
1798:	learn: 4.5156636	test: 4.4190032	best: 4.419

1876:	learn: 4.5137933	test: 4.4183745	best: 4.4183745 (1876)	total: 11m 27s	remaining: 45.1s
1877:	learn: 4.5137574	test: 4.4183598	best: 4.4183598 (1877)	total: 11m 27s	remaining: 44.7s
1878:	learn: 4.5137435	test: 4.4183635	best: 4.4183598 (1877)	total: 11m 28s	remaining: 44.3s
1879:	learn: 4.5136239	test: 4.4182123	best: 4.4182123 (1879)	total: 11m 28s	remaining: 44s
1880:	learn: 4.5135829	test: 4.4181743	best: 4.4181743 (1880)	total: 11m 29s	remaining: 43.6s
1881:	learn: 4.5135345	test: 4.4181169	best: 4.4181169 (1881)	total: 11m 29s	remaining: 43.3s
1882:	learn: 4.5134528	test: 4.4180476	best: 4.4180476 (1882)	total: 11m 30s	remaining: 42.9s
1883:	learn: 4.5134366	test: 4.4180370	best: 4.4180370 (1883)	total: 11m 30s	remaining: 42.5s
1884:	learn: 4.5133951	test: 4.4180100	best: 4.4180100 (1884)	total: 11m 31s	remaining: 42.2s
1885:	learn: 4.5133491	test: 4.4180053	best: 4.4180053 (1885)	total: 11m 31s	remaining: 41.8s
1886:	learn: 4.5133353	test: 4.4179967	best: 4.4179967 (1886)	

1964:	learn: 4.5116271	test: 4.4174691	best: 4.4174604 (1957)	total: 11m 59s	remaining: 12.8s
1965:	learn: 4.5116041	test: 4.4174603	best: 4.4174603 (1965)	total: 11m 59s	remaining: 12.5s
1966:	learn: 4.5116002	test: 4.4174602	best: 4.4174602 (1966)	total: 12m	remaining: 12.1s
1967:	learn: 4.5115868	test: 4.4174618	best: 4.4174602 (1966)	total: 12m	remaining: 11.7s
1968:	learn: 4.5115557	test: 4.4174552	best: 4.4174552 (1968)	total: 12m	remaining: 11.3s
1969:	learn: 4.5115432	test: 4.4174536	best: 4.4174536 (1969)	total: 12m 1s	remaining: 11s
1970:	learn: 4.5115255	test: 4.4174556	best: 4.4174536 (1969)	total: 12m 1s	remaining: 10.6s
1971:	learn: 4.5114912	test: 4.4174443	best: 4.4174443 (1971)	total: 12m 2s	remaining: 10.3s
1972:	learn: 4.5114617	test: 4.4174466	best: 4.4174443 (1971)	total: 12m 2s	remaining: 9.89s
1973:	learn: 4.5114248	test: 4.4174096	best: 4.4174096 (1973)	total: 12m 3s	remaining: 9.52s
1974:	learn: 4.5114160	test: 4.4174135	best: 4.4174096 (1973)	total: 12m 3s	rem

<catboost.core.CatBoostRegressor at 0x24a02882dd0>

In [15]:
predict = model.predict(valid_pool)
print(mean_squared_error(valid_df['Rating'].values, predict, squared=False))

predict = model.predict(train_pool)
print(mean_squared_error(train_df['Rating'].values, predict, squared=False))


4.41730049584738
4.217502444175489


In [16]:
model_name = "catboost_doc2vec"

In [19]:
model.save_model(model_name)
from_file = CatBoostRegressor()

from_file.load_model(model_name)


<catboost.core.CatBoostRegressor at 0x24a0231dae0>

In [20]:
predict = from_file.predict(valid_pool)
print(mean_squared_error(valid_df['Rating'].values, predict, squared=False))

predict = from_file.predict(train_pool)
print(mean_squared_error(train_df['Rating'].values, predict, squared=False))


4.41730049584738
4.217502444175489


In [23]:
n_users = np.unique(train_joke_df['UID']).size
n_items = np.unique(train_joke_df['JID']).size

test_set_all = np.zeros((n_users * n_items, 2))
for u in tqdm(range(n_users)):
    for j in range(n_items):
        test_set_all[(n_items * u) + j, 0] = u
        test_set_all[(n_items * u) + j, 1] = j

df_test = pd.DataFrame(test_set_all, columns=['UID', 'JID'])
df_test

  0%|          | 0/24983 [00:00<?, ?it/s]

Unnamed: 0,UID,JID
0,0.0,0.0
1,0.0,1.0
2,0.0,2.0
3,0.0,3.0
4,0.0,4.0
...,...,...
2473312,24982.0,94.0
2473313,24982.0,95.0
2473314,24982.0,96.0
2473315,24982.0,97.0


In [24]:

df_test['UID'] = df_test['UID'].astype(int)
df_test['JID'] = df_test['JID'].astype(int)

In [25]:

df_test = df_test.merge(joke_df, how='left', left_on='JID', right_index=True)
display(df_test)
df_test = df_test.sort_values(by=['UID', 'JID'])

Unnamed: 0,UID,JID,joke_feature_1,joke_feature_2,joke_feature_3,joke_feature_4,joke_feature_5,joke_feature_6,joke_feature_7,joke_feature_8,...,joke_feature_91,joke_feature_92,joke_feature_93,joke_feature_94,joke_feature_95,joke_feature_96,joke_feature_97,joke_feature_98,joke_feature_99,joke_feature_100
0,0,0,0.019126,0.012131,-0.027477,0.019825,-0.007704,0.020013,-0.022230,-0.015224,...,0.013015,0.027021,-0.012433,0.006457,-0.014875,-0.015005,-0.018308,-0.001017,0.048864,0.007200
1,0,1,-0.011499,0.002285,0.003630,0.004051,0.003290,-0.000463,0.006562,-0.001884,...,-0.009672,-0.005080,-0.002575,-0.005094,0.006556,0.001787,0.013340,0.011139,-0.017809,-0.000959
2,0,2,-0.016668,-0.018199,0.012311,-0.010475,0.025088,0.000381,0.004331,0.023751,...,-0.019020,-0.012687,-0.009148,-0.009461,0.019125,0.026595,0.013386,-0.013205,-0.019013,-0.015191
3,0,3,-0.032417,0.002634,0.027978,-0.019963,0.023566,0.007097,0.021878,0.044580,...,-0.016706,-0.034643,0.006712,-0.007995,0.028974,0.030824,0.034675,0.001362,-0.046984,-0.017229
4,0,4,-0.014843,-0.046528,0.002575,-0.029940,0.030619,-0.011654,0.012663,0.048356,...,-0.025209,-0.016499,-0.004959,-0.026612,0.021617,0.048349,0.002544,-0.042855,-0.021552,-0.023502
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473312,24982,94,-0.000154,-0.006946,-0.010851,0.011448,0.004269,-0.007663,0.003276,-0.000208,...,-0.003951,-0.009226,-0.001803,-0.025048,0.003969,0.020448,0.011776,-0.013034,-0.003493,-0.020192
2473313,24982,95,-0.004857,0.012683,0.009205,-0.008275,-0.003818,0.006676,-0.003004,0.014902,...,0.011361,-0.006212,0.008240,-0.001582,0.010192,-0.002656,0.002402,0.000192,-0.013342,-0.000508
2473314,24982,96,0.034768,0.040293,-0.008734,0.023405,-0.038429,0.024317,-0.039087,-0.039802,...,0.036261,0.070944,-0.018116,0.070075,-0.064077,-0.088195,-0.061626,0.055512,0.110628,0.040841
2473315,24982,97,0.017260,0.013419,-0.001536,-0.003390,-0.020430,-0.008701,0.003307,-0.016211,...,0.025377,-0.005499,0.053408,0.017966,0.011246,-0.036009,0.007395,0.013810,-0.027661,0.000457


In [26]:

test_pool = Pool(df_test, group_id=df_test['UID'], cat_features=cat_features)



In [27]:
predictions = model.predict(test_pool)


In [28]:
df_test['Rating_pred'] = predictions
df_test

Unnamed: 0,UID,JID,joke_feature_1,joke_feature_2,joke_feature_3,joke_feature_4,joke_feature_5,joke_feature_6,joke_feature_7,joke_feature_8,...,joke_feature_92,joke_feature_93,joke_feature_94,joke_feature_95,joke_feature_96,joke_feature_97,joke_feature_98,joke_feature_99,joke_feature_100,Rating_pred
0,0,0,0.019126,0.012131,-0.027477,0.019825,-0.007704,0.020013,-0.022230,-0.015224,...,0.027021,-0.012433,0.006457,-0.014875,-0.015005,-0.018308,-0.001017,0.048864,0.007200,3.297993
1,0,1,-0.011499,0.002285,0.003630,0.004051,0.003290,-0.000463,0.006562,-0.001884,...,-0.005080,-0.002575,-0.005094,0.006556,0.001787,0.013340,0.011139,-0.017809,-0.000959,3.497294
2,0,2,-0.016668,-0.018199,0.012311,-0.010475,0.025088,0.000381,0.004331,0.023751,...,-0.012687,-0.009148,-0.009461,0.019125,0.026595,0.013386,-0.013205,-0.019013,-0.015191,5.748212
3,0,3,-0.032417,0.002634,0.027978,-0.019963,0.023566,0.007097,0.021878,0.044580,...,-0.034643,0.006712,-0.007995,0.028974,0.030824,0.034675,0.001362,-0.046984,-0.017229,3.986693
4,0,4,-0.014843,-0.046528,0.002575,-0.029940,0.030619,-0.011654,0.012663,0.048356,...,-0.016499,-0.004959,-0.026612,0.021617,0.048349,0.002544,-0.042855,-0.021552,-0.023502,3.870478
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473312,24982,94,-0.000154,-0.006946,-0.010851,0.011448,0.004269,-0.007663,0.003276,-0.000208,...,-0.009226,-0.001803,-0.025048,0.003969,0.020448,0.011776,-0.013034,-0.003493,-0.020192,-0.008525
2473313,24982,95,-0.004857,0.012683,0.009205,-0.008275,-0.003818,0.006676,-0.003004,0.014902,...,-0.006212,0.008240,-0.001582,0.010192,-0.002656,0.002402,0.000192,-0.013342,-0.000508,0.413416
2473314,24982,96,0.034768,0.040293,-0.008734,0.023405,-0.038429,0.024317,-0.039087,-0.039802,...,0.070944,-0.018116,0.070075,-0.064077,-0.088195,-0.061626,0.055512,0.110628,0.040841,0.496009
2473315,24982,97,0.017260,0.013419,-0.001536,-0.003390,-0.020430,-0.008701,0.003307,-0.016211,...,-0.005499,0.053408,0.017966,0.011246,-0.036009,0.007395,0.013810,-0.027661,0.000457,1.273655


In [29]:
mrg = df_test.merge(train_df, on=['UID', 'JID'], how="left", indicator=True)
mrg

Unnamed: 0,UID,JID,joke_feature_1_x,joke_feature_2_x,joke_feature_3_x,joke_feature_4_x,joke_feature_5_x,joke_feature_6_x,joke_feature_7_x,joke_feature_8_x,...,joke_feature_92_y,joke_feature_93_y,joke_feature_94_y,joke_feature_95_y,joke_feature_96_y,joke_feature_97_y,joke_feature_98_y,joke_feature_99_y,joke_feature_100_y,_merge
0,0,0,0.019126,0.012131,-0.027477,0.019825,-0.007704,0.020013,-0.022230,-0.015224,...,,,,,,,,,,left_only
1,0,1,-0.011499,0.002285,0.003630,0.004051,0.003290,-0.000463,0.006562,-0.001884,...,,,,,,,,,,left_only
2,0,2,-0.016668,-0.018199,0.012311,-0.010475,0.025088,0.000381,0.004331,0.023751,...,,,,,,,,,,left_only
3,0,3,-0.032417,0.002634,0.027978,-0.019963,0.023566,0.007097,0.021878,0.044580,...,,,,,,,,,,left_only
4,0,4,-0.014843,-0.046528,0.002575,-0.029940,0.030619,-0.011654,0.012663,0.048356,...,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473312,24982,94,-0.000154,-0.006946,-0.010851,0.011448,0.004269,-0.007663,0.003276,-0.000208,...,,,,,,,,,,left_only
2473313,24982,95,-0.004857,0.012683,0.009205,-0.008275,-0.003818,0.006676,-0.003004,0.014902,...,,,,,,,,,,left_only
2473314,24982,96,0.034768,0.040293,-0.008734,0.023405,-0.038429,0.024317,-0.039087,-0.039802,...,,,,,,,,,,left_only
2473315,24982,97,0.017260,0.013419,-0.001536,-0.003390,-0.020430,-0.008701,0.003307,-0.016211,...,,,,,,,,,,left_only


In [30]:
mrg = mrg[mrg['_merge'] == 'left_only']
mrg

Unnamed: 0,UID,JID,joke_feature_1_x,joke_feature_2_x,joke_feature_3_x,joke_feature_4_x,joke_feature_5_x,joke_feature_6_x,joke_feature_7_x,joke_feature_8_x,...,joke_feature_92_y,joke_feature_93_y,joke_feature_94_y,joke_feature_95_y,joke_feature_96_y,joke_feature_97_y,joke_feature_98_y,joke_feature_99_y,joke_feature_100_y,_merge
0,0,0,0.019126,0.012131,-0.027477,0.019825,-0.007704,0.020013,-0.022230,-0.015224,...,,,,,,,,,,left_only
1,0,1,-0.011499,0.002285,0.003630,0.004051,0.003290,-0.000463,0.006562,-0.001884,...,,,,,,,,,,left_only
2,0,2,-0.016668,-0.018199,0.012311,-0.010475,0.025088,0.000381,0.004331,0.023751,...,,,,,,,,,,left_only
3,0,3,-0.032417,0.002634,0.027978,-0.019963,0.023566,0.007097,0.021878,0.044580,...,,,,,,,,,,left_only
4,0,4,-0.014843,-0.046528,0.002575,-0.029940,0.030619,-0.011654,0.012663,0.048356,...,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2473312,24982,94,-0.000154,-0.006946,-0.010851,0.011448,0.004269,-0.007663,0.003276,-0.000208,...,,,,,,,,,,left_only
2473313,24982,95,-0.004857,0.012683,0.009205,-0.008275,-0.003818,0.006676,-0.003004,0.014902,...,,,,,,,,,,left_only
2473314,24982,96,0.034768,0.040293,-0.008734,0.023405,-0.038429,0.024317,-0.039087,-0.039802,...,,,,,,,,,,left_only
2473315,24982,97,0.017260,0.013419,-0.001536,-0.003390,-0.020430,-0.008701,0.003307,-0.016211,...,,,,,,,,,,left_only


In [31]:
def get_n_recommendations_for_user(df, user_id, n, sort_by):
    recommended_items = df[df['UID'] == user_id]
    recommended_items = recommended_items.sort_values(sort_by, ascending=False)  
    return recommended_items.iloc[:n]

In [32]:
n_recommendations = 10

In [33]:
frames = []
for user in tqdm(range(n_users)):
    frames.append(get_n_recommendations_for_user(mrg, user, n_recommendations, 'Rating_pred'))
    
df_rec = pd.concat(frames).reset_index()
df_rec = df_rec.drop(columns=['index'])
df_rec

  0%|          | 0/24983 [00:00<?, ?it/s]

Unnamed: 0,UID,JID,joke_feature_1_x,joke_feature_2_x,joke_feature_3_x,joke_feature_4_x,joke_feature_5_x,joke_feature_6_x,joke_feature_7_x,joke_feature_8_x,...,joke_feature_92_y,joke_feature_93_y,joke_feature_94_y,joke_feature_95_y,joke_feature_96_y,joke_feature_97_y,joke_feature_98_y,joke_feature_99_y,joke_feature_100_y,_merge
0,0,35,-0.033016,-0.022457,0.018897,-0.027672,0.037534,-0.006529,0.009649,0.038965,...,,,,,,,,,,left_only
1,0,38,0.004627,0.005074,-0.004401,0.010018,-0.008095,0.004768,-0.017488,-0.008671,...,,,,,,,,,,left_only
2,0,89,-0.061068,-0.044929,0.062493,-0.046146,0.059271,-0.023593,0.042571,0.111263,...,,,,,,,,,,left_only
3,0,6,-0.036663,-0.067076,0.023498,-0.033260,0.055063,-0.029962,0.007167,0.014843,...,,,,,,,,,,left_only
4,0,47,-0.010867,-0.000013,0.012010,-0.002431,0.009287,-0.000874,0.006839,0.026251,...,,,,,,,,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249825,24982,53,0.015199,0.006955,-0.026504,0.018141,-0.021094,0.002491,0.001258,-0.031133,...,,,,,,,,,,left_only
249826,24982,69,0.002121,-0.029449,-0.013974,-0.010344,0.027003,-0.011740,0.006695,0.002539,...,,,,,,,,,,left_only
249827,24982,62,-0.013547,0.011272,0.014504,0.005440,-0.011418,-0.024910,-0.009044,-0.016531,...,,,,,,,,,,left_only
249828,24982,42,0.041332,0.027141,-0.089489,-0.009382,-0.049431,-0.010458,0.083823,-0.065743,...,,,,,,,,,,left_only


In [34]:
df_rec.merge(train_df, on=['UID','JID'])

Unnamed: 0,UID,JID,joke_feature_1_x,joke_feature_2_x,joke_feature_3_x,joke_feature_4_x,joke_feature_5_x,joke_feature_6_x,joke_feature_7_x,joke_feature_8_x,...,joke_feature_91,joke_feature_92,joke_feature_93,joke_feature_94,joke_feature_95,joke_feature_96,joke_feature_97,joke_feature_98,joke_feature_99,joke_feature_100


In [35]:
df_rec.merge(valid_df, on=['UID','JID'])

Unnamed: 0,UID,JID,joke_feature_1_x,joke_feature_2_x,joke_feature_3_x,joke_feature_4_x,joke_feature_5_x,joke_feature_6_x,joke_feature_7_x,joke_feature_8_x,...,joke_feature_91,joke_feature_92,joke_feature_93,joke_feature_94,joke_feature_95,joke_feature_96,joke_feature_97,joke_feature_98,joke_feature_99,joke_feature_100
0,1,36,-0.001037,0.006190,-0.008002,0.009853,-0.003585,0.004941,0.003574,-0.010647,...,0.009863,0.006131,0.004225,-0.001143,-0.008263,-0.013172,-0.005562,0.007160,-0.000393,0.000634
1,1,32,0.012997,-0.019345,0.001703,0.007861,0.000268,-0.022598,-0.003993,-0.019360,...,-0.019923,-0.011656,0.026532,-0.005845,0.005256,-0.000017,-0.005756,0.003037,-0.005521,0.007523
2,1,53,0.015199,0.006955,-0.026504,0.018141,-0.021094,0.002491,0.001258,-0.031133,...,0.011483,0.014364,0.006693,-0.004073,-0.016496,-0.017518,-0.012153,0.005795,0.013575,0.006137
3,1,62,-0.013547,0.011272,0.014504,0.005440,-0.011418,-0.024910,-0.009044,-0.016531,...,0.013551,0.003050,-0.011050,0.025457,0.036402,-0.043246,-0.004150,0.011129,0.018653,0.030358
4,1,68,0.024309,0.001024,-0.026021,0.033786,-0.014048,-0.003762,-0.008327,-0.045336,...,-0.003169,0.005297,0.019142,-0.008203,0.000500,-0.023445,-0.012393,0.009742,-0.010909,0.009591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
143829,24982,36,-0.001037,0.006190,-0.008002,0.009853,-0.003585,0.004941,0.003574,-0.010647,...,0.009863,0.006131,0.004225,-0.001143,-0.008263,-0.013172,-0.005562,0.007160,-0.000393,0.000634
143830,24982,29,0.003673,0.001946,-0.014492,-0.002926,0.011435,-0.000562,0.013989,0.011275,...,0.010610,0.010710,-0.013466,-0.005950,-0.026658,0.012785,-0.008919,-0.040272,0.033426,0.008710
143831,24982,53,0.015199,0.006955,-0.026504,0.018141,-0.021094,0.002491,0.001258,-0.031133,...,0.011483,0.014364,0.006693,-0.004073,-0.016496,-0.017518,-0.012153,0.005795,0.013575,0.006137
143832,24982,69,0.002121,-0.029449,-0.013974,-0.010344,0.027003,-0.011740,0.006695,0.002539,...,-0.025404,-0.021444,0.002309,-0.033459,0.009489,0.027197,0.008491,-0.017330,-0.026968,-0.023833


In [36]:
catboost_predictions = list(df_rec.groupby('UID').agg({'JID':list})['JID'].values)

In [39]:
predictions_df_catboost_doc2vec = {'catboost_doc2vec':df_rec}
with open('predictions_df_catboost_doc2vec.pkl', 'wb') as f:
    pickle.dump(predictions_df_catboost_doc2vec, f)