## Train the model

In [5]:
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join
from importlib import reload

# Identifier of this experiment; reused for the model directory name and as
# the validation run name when results are logged.
method_name = "43_Prior_Baseline"

# Timestamp filtering is disabled for this run.
# Previously used cutoff: int(datetime(2021, 2, 19, 0).timestamp())
filter_timestamp = None

# User index handed to the data loaders.
# Earlier alternative: "train_user_index.parquet"
use_user_index = join("indices", "user_index_w_type.parquet")

# Directory holding the training data files.
csv_data_location = join("data", "downloaded_data")

# Directory for the trained models of this run.
model_save_location = join("saved_models", f"xgb_models_{method_name}")

# Result log files.
result_logfile = join("logs", "3batches_results.csv")
full_result_logfile = join("logs", "results.csv")

In [6]:
# Per-target smoothing values handed to the loader as TE_smoothing
# (TE presumably stands for target encoding -- confirm in utils.dataloader).
train_te_smoothing = {"reply": 40, "like": 40, "retweet": 20, "retweet_comment": 15}

# Training data loader: reads from csv_data_location using the user index at
# use_user_index. Only a single batch of 1.5M rows is requested here.
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    mode="train",
    filter_timestamp=filter_timestamp,
    verbose=2,
    remove_day_counts=True,
    keep_user_percent=0.98,
    batch_size=1500000,
    load_n_batches=1,
    TE_smoothing=train_te_smoothing,
    remove_user_counts=True,
    add_normal_TE_noise_std=0.0,
)

Loading User Index
Randomly keeping only 98.0% of the users.
Removing day counts
Extracting combined user counts
Created Dataloader in 59.51 seconds!


  TE_smoothing = {"reply":40, "like":40, "retweet":20, "retweet_comment":15}

In [8]:
# XGBoost-based model wrapper; model_dir is set to model_save_location.
recsysxgb = utils.model.RecSysXGB1(model_dir=model_save_location)
# Disabled neural alternative:
#   recsys_neural = utils.model.RecSysNeural1(model_dir = model_save_location, n_input_features=90, device="cuda")

# Booster settings; passed to recsysxgb.fit as xgb_parameters.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "map",
    "max_depth": 8,
    "max_delta_step": 0,
}

In [7]:
# Baseline model for this run (method_name is "43_Prior_Baseline"); it is
# fitted on the training loader `dl` and later used for validation.
model = utils.model.PriorPredModel()
model.fit(dl)

Loaded Batch Nr. 1 in 10.30
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of Batch Nr. 1 in 0.34
Did prepro part 2 of Batch Nr. 1 in 18.93
Did prepro part 3 of Batch Nr. 1 in 0.11
Merged Users of Batch Nr. 1 in 25.70
Extracted TE of Batch Nr. 1 in 16.39
Finished Batch Nr. 1 from file part-00000.tsv in 74.86s!


In [9]:
%%time
recsysxgb.fit(
    train_loader= dl,
    xgb_parameters = xgb_params,
    boost_rounds_per_iteration = 20,
    verbose = 1,
    n_epochs = 1
)
#recsys_neural.fit(dl, 1, lr = 5 * 1e-5)

Loaded Batch Nr. 1 in 10.82
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of Batch Nr. 1 in 0.33
Did prepro part 2 of Batch Nr. 1 in 18.52
Did prepro part 3 of Batch Nr. 1 in 0.11
Merged Users of Batch Nr. 1 in 27.01
Extracted TE of Batch Nr. 1 in 16.05
Finished Batch Nr. 1 from file part-00000.tsv in 75.91s!
Finished 1 epochs.
CPU times: user 53min 17s, sys: 33 s, total: 53min 50s
Wall time: 8min 37s


## Evaluate the model

In [8]:
# Disabled alternatives for the model under evaluation:
#   recsysxgb = utils.model.RecSysXGB1(model_save_location)
#   model = utils.model.RandomBaselineModel()

# Directory holding the validation data files.
val_data_location = join("data", "validation_data")

# Every target uses the same smoothing value (20) for validation.
val_te_smoothing = dict.fromkeys(("reply", "like", "retweet", "retweet_comment"), 20)

# NOTE: this rebinds `dl`; from here on it refers to the validation loader,
# not the training loader created earlier in the notebook.
dl = dataloader.RecSys2021TSVDataLoader(
    data_directory=val_data_location,
    user_index_location=use_user_index,
    mode="val",
    filter_timestamp=filter_timestamp,
    verbose=2,
    random_file_sampling=False,
    load_n_batches=-1,  # presumably "no limit on batches" -- confirm in utils.dataloader
    remove_day_counts=True,
    TE_smoothing=val_te_smoothing,
    batch_size=1000000,
    keep_user_percent=1,
    remove_user_counts=True,
    # normalize_batch=True  (disabled)
)

Loading User Index
Removing day counts
Extracting combined user counts
Created Dataloader in 20.17 seconds!


In [9]:
# Evaluate `model` on the loader currently bound to `dl`; the results file
# and run name are passed through, and the returned metrics are kept in `res`.
res = model.evaluate_validation_set(
    dl,
    store_results_file=full_result_logfile,
    validation_run_name=method_name,
)

Loaded Batch Nr. 1 in 8.19
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of Batch Nr. 1 in 0.23
Did prepro part 2 of Batch Nr. 1 in 12.68
Did prepro part 3 of Batch Nr. 1 in 0.08
Merged Users of Batch Nr. 1 in 24.41
Extracted TE of Batch Nr. 1 in 15.34
Finished Batch Nr. 1 from file part-00000.csv in 62.98s!
Loaded Batch Nr. 2 in 9.65
Timestamp Filtered Batch Nr. 2 in 0.00
Did prepro part 1 of Batch Nr. 2 in 0.23
Did prepro part 2 of Batch Nr. 2 in 12.59
Did prepro part 3 of Batch Nr. 2 in 0.08
Merged Users of Batch Nr. 2 in 23.79
Extracted TE of Batch Nr. 2 in 15.38
Finished Batch Nr. 2 from file part-00000.csv in 63.84s!
Loaded Batch Nr. 3 in 9.88
Timestamp Filtered Batch Nr. 3 in 0.00
Did prepro part 1 of Batch Nr. 3 in 0.22
Did prepro part 2 of Batch Nr. 3 in 12.56
Did prepro part 3 of Batch Nr. 3 in 0.08
Merged Users of Batch Nr. 3 in 23.82
Extracted TE of Batch Nr. 3 in 15.35
Finished Batch Nr. 3 from file part-00000.csv in 64.05s!
Loaded Batch Nr. 4 in 10.94
Timestamp

In [12]:
# Print one "name: value" line per entry in `res`, with the name
# left-aligned and padded to 32 characters so the values line up.
for metric_name in res:
    metric_value = res[metric_name]
    print(f"{metric_name:32}: {metric_value}")

Q1_reply_rce                    : -12.585719490603209
Q1_retweet_rce                  : 0.4125306471494561
Q1_retweet_comment_rce          : -6.146190616662284
Q1_like_rce                     : -12.167885394376965
Q1_reply_avg_prec               : 0.1211784738452119
Q1_retweet_avg_prec             : 0.3180248997974516
Q1_retweet_comment_avg_prec     : 0.024446132102799913
Q1_like_avg_prec                : 0.5592943225962173
Q2_reply_rce                    : -2.3790892398693586
Q2_retweet_rce                  : 7.739039833443629
Q2_retweet_comment_rce          : -4.303713220733707
Q2_like_rce                     : -3.241459100826116
Q2_reply_avg_prec               : 0.15038604849459117
Q2_retweet_avg_prec             : 0.3225702729386979
Q2_retweet_comment_avg_prec     : 0.024064927702804485
Q2_like_avg_prec                : 0.5561679729119796
Q3_reply_rce                    : 6.126517344773886
Q3_retweet_rce                  : 13.491213765462906
Q3_retweet_comment_rce          : -0.707

In [13]:
# For each target, print the features of its booster ordered by
# gain-based importance, highest first.
for target in recsysxgb.targets__:
    print(f"\n{target}:")
    gain_by_feature = recsysxgb.clfs_[target].get_score(importance_type='gain')
    ranked_features = sorted(
        gain_by_feature.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    print(dict(ranked_features))


has_reply:
{'TE_reply_b_B': 735.350947829531, 'TE_reply_a_A': 376.37462088576865, 'TE_like_b_TopLevel_B': 100.13497553691674, 'type_encoding': 97.2133077297846, 'a_follows_b': 86.46060753400002, 'TE_reply_b_TopLevel_B': 84.86845846827272, 'TE_reply_b_Quote_B': 65.84351171276433, 'TE_retweet_b_TopLevel_B': 60.80687721988095, 'TE_like_b_Quote_B': 52.61863481170639, 'TE_reply_b_Retweet_B': 49.23593328271971, 'TE_like_b_A': 45.121439091351355, 'photo_count': 40.56001616166667, 'TE_like_a_TopLevel_A': 37.43380112304284, 'TE_retweet_comment_a_TopLevel_A': 36.820448405644065, 'TE_like_b_B': 36.22819135241667, 'TE_reply_a_TopLevel_A': 35.549748509981555, 'TE_reply_a_Quote_A': 32.027620360622954, 'a_follower_count': 31.441350596188673, 'TE_reply_a_B': 30.992988409957448, 'TE_retweet_comment_b_TopLevel_B': 30.766983620190477, 'TE_retweet_comment_a_A': 30.348533817285723, 'TE_retweet_comment_a_TopLevel_B': 27.112423234565224, 'TE_like_a_TopLevel_B': 26.035899335342858, 'TE_like_b_Retweet_B': 25.