## Train the model

In [1]:
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join

# Timestamp cutoff handed to the data loader; None presumably disables filtering.
# Previously tried value: int(datetime(2021, 2, 19, 0).timestamp())
filter_timestamp = None

# User index file given to the loader (alternative: "train_user_index.parquet").
use_user_index = join("indices", "user_index.parquet")

# Input directory for the loader and output directory for the trained models.
csv_data_location = join("data", "downloaded_data")
model_save_location = join("saved_models", "xgb_models_06_submission")

In [2]:
# Training-mode loader over the downloaded data, using the configured user index.
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    mode="train",
    filter_timestamp=filter_timestamp,
    verbose=2,
)

In [3]:
%%time
# Take only the first batch the loader yields; element 0 is used as the
# training features and element 1 as the labels.
data = next(iter(dl))
train_data, labels = data[0], data[1]

Loaded Batch Nr. 1 in 19.11
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of 1 in 0.81
Did prepro part 2 of 1 in 44.23
Did prepro part 3 of 1 in 0.17
Merged Users of 1 in 28.59
Extracted TE of 1 in 3.78
Finished Batch Nr. 1 from file part-00000.tsv in 104.56s!
CPU times: user 1min 39s, sys: 7.78 s, total: 1min 47s
Wall time: 1min 47s


In [5]:
# Fresh (untrained) model wrapper; training writes its models to model_save_location.
recsysxgb = utils.model.RecSysXGB1()
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'map',
}
recsysxgb.train_in_memory(
    train_data,
    labels,
    xgb_params,
    save_dir=model_save_location,
)

## Evaluate the model

In [2]:
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join

# Timestamp cutoff handed to the data loader; None presumably disables filtering.
# Previously tried value: int(datetime(2021, 2, 19, 0).timestamp())
filter_timestamp = None

# User index file given to the loader (alternative: "train_user_index.parquet").
use_user_index = join("indices", "user_index.parquet")

# Input directory for the loader and directory the saved models are read from.
csv_data_location = join("data", "downloaded_data")
model_save_location = join("saved_models", "xgb_models_06_submission")

In [3]:
# Build the model wrapper from the saved-model directory.
recsysxgb = utils.model.RecSysXGB1(model_save_location)

# Validation-mode loader; the part file is sampled at random.
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    mode="val",
    filter_timestamp=filter_timestamp,
    verbose=2,
    random_file_sampling=True,
)

In [4]:
res = recsysxgb.evaluate_validation_set(dl)

# res[0] and res[1] are walked in step with the model's target names and
# printed as "<target>: <ap> - <rce>".
# NOTE(review): the loop names suggest average precision and RCE; confirm in utils.model.
for target, ap, rce in zip(recsysxgb.targets__, res[0], res[1]):
    print(f"{target}: {ap} - {rce}")

Loaded Batch Nr. 1 in 21.90
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of 1 in 0.69
Did prepro part 2 of 1 in 37.14
Did prepro part 3 of 1 in 0.14
Merged Users of 1 in 26.60
Extracted TE of 1 in 3.66
Finished Batch Nr. 1 from file part-00192.tsv in 96.91s!
has_reply: 0.4645412232522657 - 32.06209101303632
has_retweet: 0.548032666389949 - 35.19578497942868
has_retweet_comment: 0.2900911843571008 - -3.437766391804664
has_like: 0.7747565166007264 - 28.360476124751344


In [5]:
# Gain-based feature scores of the "has_like" classifier, shown highest first.
like_gain = recsysxgb.clfs_["has_like"].get_score(importance_type='gain')
dict(sorted(like_gain.items(), key=lambda item: item[1], reverse=True))

{'TE_like_a_A': 24727.113911938868,
 'TE_retweet_b_B': 9309.686338033333,
 'n_reply_a_A': 6508.85156,
 'n_like_a_A': 4876.255566966666,
 'TE_reply_b_B': 4766.094137142857,
 'a_follower_count': 3706.351846145454,
 'type_encoding': 3542.090630823077,
 'n_present_a_A': 3339.263940088,
 'n_retweet_comment_a_A': 2824.6595866406665,
 'n_like_b_B': 2765.6549667135796,
 'n_retweet_b_A': 1918.0728423176472,
 'n_day_44_A': 1863.421875,
 'b_creation_delta': 1778.7288189846158,
 'n_present_b_A': 1662.668864988889,
 'n_photos': 1637.3705602454545,
 'a_follows_b': 1571.0743259666667,
 'TE_like_b_B': 1354.383182279663,
 'bert_token_len': 1121.31177,
 'n_day_42_A': 1111.78577,
 'TE_retweet_a_A': 1022.5263937859091,
 'n_retweet_a_A': 973.51355,
 'n_day_43_A': 879.901123,
 'b_following_count': 785.57483005,
 'n_retweet_b_B': 668.2960213,
 'TE_retweet_b_A': 653.8935736266667,
 'a_is_verified': 563.516724,
 'n_day_45_A': 403.98168960500004,
 'n_day_36_A': 367.082886,
 'n_present_b_B': 280.11854680312496,


## Try sample test run

In [None]:
import utils.model
import utils.dataloader

# Test-mode loader; load_n_batches=-1 presumably means "no batch limit" (confirm in the loader).
dl = utils.dataloader.RecSys2021TSVDataLoader(
    "test",
    "user_index.parquet",
    mode="test",
    load_n_batches=-1,
)
recsysxgb = utils.model.RecSysXGB1("xgb_models_05_submission")

# Run the model over the test loader, passing "res.csv" as the output file.
recsysxgb.evaluate_test_set(testLoader=dl, output_file="res.csv")

## Testing Custom batch sizes

In [1]:
from utils import dataloader
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join

# Timestamp cutoff handed to the data loader; None presumably disables filtering.
# Previously tried value: int(datetime(2021, 2, 19, 0).timestamp())
filter_timestamp = None

# User index file given to the loader (alternative: "train_user_index.parquet").
use_user_index = join("indices", "user_index.parquet")

# Input directory for the loader and the saved-model directory.
csv_data_location = join("data", "downloaded_data")
model_save_location = join("saved_models", "xgb_models_06_submission")

In [2]:
# Training-mode loader restricted to a single batch with batch_size=1000,
# reading from a randomly sampled part file.
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    mode="train",
    filter_timestamp=filter_timestamp,
    load_n_batches=1,
    batch_size=1000,
    verbose=2,
    random_file_sampling=True,
)

In [3]:
# Exhaust the loader, keeping only element 0 of every batch it yields.
r = [batch[0] for batch in dl]

Loaded Batch Nr. 1 in 0.05
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of 1 in 0.00
Did prepro part 2 of 1 in 0.02
Did prepro part 3 of 1 in 0.00
Merged Users of 1 in 20.69
Extracted TE of 1 in 3.09
Finished Batch Nr. 1 from file part-00072.tsv in 23.92s!


In [5]:
# Display element 0 of the first batch collected from the loader.
r[0]

Unnamed: 0,a_follower_count,a_following_count,a_is_verified,b_follower_count,b_following_count,b_is_verified,a_follows_b,bert_token_len,n_photos,n_videos,...,TE_like_b_A,TE_like_b_B,TE_retweet_a_A,TE_retweet_a_B,TE_retweet_b_A,TE_retweet_b_B,TE_retweet_comment_a_A,TE_retweet_comment_a_B,TE_retweet_comment_b_A,TE_retweet_comment_b_B
0,526918,163,False,220,638,False,False,3,0.693147,0.0,...,0.655940,0.582249,0.040542,0.050395,0.158352,0.006139,0.001334,0.005083,0.011672,0.000292
1,3062,1695,False,392,487,False,True,3,-1.000000,-1.0,...,0.718410,0.644231,0.020698,0.055194,0.173433,0.053160,0.019945,0.005567,0.012784,0.001226
2,2298,346,False,1306,5000,False,True,2,-1.000000,-1.0,...,0.285400,0.757480,0.035474,0.074269,0.031520,0.103158,0.000508,0.002088,0.007086,0.005966
3,4813450,662,True,90,91,False,False,3,0.693147,0.0,...,0.754331,0.705587,0.031772,0.052660,0.132105,0.148972,0.004921,0.002851,0.013423,0.012081
4,2740151,15255,True,158,411,False,False,3,-1.000000,-1.0,...,0.754331,0.803464,0.016482,0.057954,0.132105,0.105684,0.002477,0.005845,0.013423,0.010738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1699225,1682398,4939,True,288,390,False,False,3,0.693147,0.0,...,0.754331,0.371785,0.026459,0.035124,0.132105,0.348737,0.007123,0.003542,0.013423,0.000831
1699226,138545,267,False,197,450,False,False,4,-1.000000,-1.0,...,0.699418,0.504699,0.176268,0.105287,0.114874,0.089691,0.005643,0.002386,0.011672,0.000226
1699227,4018,4048,False,112,85,False,True,3,-1.000000,-1.0,...,0.731210,0.763234,0.017553,0.022727,0.120095,0.036060,0.000950,0.002292,0.012203,0.002658
1699228,523356,6145,True,0,73,False,False,3,-1.000000,-1.0,...,0.753609,0.837485,0.095438,0.057954,0.110087,0.043313,0.006496,0.005845,0.011186,0.004401


Small batch sizes are disproportionately slow. The left merge with the user index is probably the bottleneck (it took 20.69 s of the 23.92 s total above), but this has not been verified.

In [17]:
import utils.features as fe
import utils.constants as co
import pandas as pd
import numpy as np

In [12]:
# Only pass dtypes for columns that are actually named in co.all_features.
feature_dtypes = {
    name: dtype
    for name, dtype in co.dtypes_of_features.items()
    if name in co.all_features
}

# The file has no header row and uses \x01 as the field separator.
current_file = pd.read_csv(
    "data/test_files/part-00002.csv",
    sep='\x01',
    header=None,
    names=co.all_features,
    dtype=feature_dtypes,
)

In [14]:
# Replace missing values with empty strings in the listed columns
# (presumably so later string handling never sees NaN).
# NOTE(review): "medias" was filled twice in the earlier version of this cell;
# the repeat was a no-op. If another column was meant, add it to the tuple.
for column in ("medias", "hashtags", "links", "domains"):
    current_file[column] = current_file[column].fillna("")