## Train the model

In [1]:
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join
from importlib import reload

# Identifier of this experiment; reused as the suffix of the model save directory.
method_name = "54_All_20_TE_98percent_user_no_tweet_features"

# No timestamp filter for this run.
# Previously used alternative: int(datetime(2021, 2, 19, 0).timestamp())
filter_timestamp = None

# User index passed to the data loaders.
# Previously used alternative: "train_user_index.parquet"
use_user_index = join("indices", "user_index_w_type.parquet")

# Input data location and output directory for the trained models.
csv_data_location = join("data", "downloaded_data")
model_save_location = join("saved_models", f"xgb_models_{method_name}")

# Result log files (CSV).
batches1_result_logfile = join("logs", "1batches_results.csv")
batches3_result_logfile = join("logs", "3batches_results.csv")
full_result_logfile = join("logs", "results.csv")

In [2]:
# Training data loader over the downloaded data, using the configured user index.
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    mode="train",
    verbose=2,
    filter_timestamp=filter_timestamp,
    # Batching.
    batch_size=1500000,
    load_n_batches=1,
    # User sub-sampling and removal of count features.
    keep_user_percent=0.98,
    remove_day_counts=True,
    remove_user_counts=True,
    # TE settings (TE presumably means target encoding -- confirm in utils.dataloader).
    TE_smoothing={"reply": 20, "like": 20, "retweet": 20, "retweet_comment": 20},
    add_normal_TE_noise_std=0.05,
    # Features excluded from the model input.
    drop_features=["hour_of_day", "day_of_week"],
)

Loading User Index
Randomly keeping only 98.0% of the users.
Removing day counts
Extracting combined user counts
Created Dataloader in 59.73 seconds!


  TE_smoothing = {"reply":40, "like":40, "retweet":20, "retweet_comment":15}

In [3]:
# XGBoost model wrapper that uses this run's save directory as its model_dir.
recsysxgb = utils.model.RecSysXGB1(model_dir=model_save_location)
# Disabled neural-network alternative:
# recsys_neural = utils.model.RecSysNeural1(model_dir = model_save_location, n_input_features=90, device="cuda")

# Parameter dictionary passed to recsysxgb.fit as xgb_parameters.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "map",
    "max_depth": 8,
    "max_delta_step": 0,
}

In [4]:
%%time
recsysxgb.fit(
    train_loader= dl,
    xgb_parameters = xgb_params,
    boost_rounds_per_iteration = 20,
    verbose = 1,
    n_epochs = 1
)
#recsys_neural.fit(dl, 1, lr = 5 * 1e-5)

Loaded Batch Nr. 1 in 11.02
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of Batch Nr. 1 in 0.33
Did prepro part 2 of Batch Nr. 1 in 19.19
Did prepro part 3 of Batch Nr. 1 in 0.11
Merged Users of Batch Nr. 1 in 25.34
Extracted TE of Batch Nr. 1 in 19.00
Finished Batch Nr. 1 from file part-00000.tsv in 78.08s!
Finished 1 epochs.
CPU times: user 1h 26min 31s, sys: 41.8 s, total: 1h 27min 13s
Wall time: 13min 23s


## Evaluate the model

In [19]:
# Load a previously saved model and build the validation data loader.
# NOTE: this cell rebinds both `recsysxgb` and `dl`. The model loaded here
# ("45_No_Item_Based_Features") is a different saved model from the one that
# `model_save_location` points to, so every later cell that reads `recsysxgb`
# (e.g. feature importances) reports on this model.
# The path is built with join() like every other path in this notebook instead
# of a hard-coded "/" separator.
recsysxgb = utils.model.RecSysXGB1(join("saved_models", "xgb_models_45_No_Item_Based_Features"))
# Disabled alternative model:
# model = utils.model.RandomBaselineModel()
val_data_location = join("data", "validation_data")

dl = dataloader.RecSys2021TSVDataLoader(
    data_directory=val_data_location,
    user_index_location=use_user_index,
    mode="val",
    filter_timestamp=filter_timestamp,
    verbose=2,
    # Do not sample files randomly; load_n_batches=-1 presumably means
    # "all batches" -- confirm in utils.dataloader.
    random_file_sampling=False,
    load_n_batches=-1,
    batch_size=1000000,
    # Keep every user (no sub-sampling) for validation.
    keep_user_percent=1,
    remove_day_counts=True,
    remove_user_counts=True,
    TE_smoothing={"reply": 15, "like": 15, "retweet": 15, "retweet_comment": 15},
    drop_features=["hour_of_day", "day_of_week"],
    # Disabled option: normalize_batch=True
)

Loading User Index
Removing day counts
Extracting combined user counts
Created Dataloader in 15.55 seconds!


In [20]:
# Evaluate the loaded model on the validation loader. The run name labels this
# evaluation; store_results_file presumably receives the logged metrics (confirm
# in utils.model).
res = recsysxgb.evaluate_validation_set(
    dl,
    store_results_file=full_result_logfile,
    validation_run_name="45_No_Item_Based_Features_15TE",
)

Loaded Batch Nr. 1 in 5.65
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of Batch Nr. 1 in 0.22
Did prepro part 2 of Batch Nr. 1 in 12.99
Did prepro part 3 of Batch Nr. 1 in 0.08
Merged Users of Batch Nr. 1 in 24.03
Extracted TE of Batch Nr. 1 in 15.33
Finished Batch Nr. 1 from file part-00000.csv in 60.39s!
Loaded Batch Nr. 2 in 8.84
Timestamp Filtered Batch Nr. 2 in 0.00
Did prepro part 1 of Batch Nr. 2 in 0.23
Did prepro part 2 of Batch Nr. 2 in 13.03
Did prepro part 3 of Batch Nr. 2 in 0.08
Merged Users of Batch Nr. 2 in 23.59
Extracted TE of Batch Nr. 2 in 15.34
Finished Batch Nr. 2 from file part-00000.csv in 63.19s!
Loaded Batch Nr. 3 in 11.98
Timestamp Filtered Batch Nr. 3 in 0.00
Did prepro part 1 of Batch Nr. 3 in 0.22
Did prepro part 2 of Batch Nr. 3 in 13.48
Did prepro part 3 of Batch Nr. 3 in 0.08
Merged Users of Batch Nr. 3 in 26.97
Extracted TE of Batch Nr. 3 in 16.15
Finished Batch Nr. 3 from file part-00000.csv in 71.09s!
Loaded Batch Nr. 4 in 12.01
Timestam

In [7]:
# Print each validation metric, with the metric name padded to a 32-character column.
for key in res:
    value = res[key]
    print(f"{key:32}: {value}")

Q1_reply_rce                    : 14.013823473473263
Q1_retweet_rce                  : 16.404863358793374
Q1_retweet_comment_rce          : 8.382256077513139
Q1_like_rce                     : 2.2255545096960594
Q1_reply_avg_prec               : 0.1618842827217921
Q1_retweet_avg_prec             : 0.35054776613360816
Q1_retweet_comment_avg_prec     : 0.05265133649580212
Q1_like_avg_prec                : 0.5629720154270186
Q2_reply_rce                    : 17.42610515679328
Q2_retweet_rce                  : 19.088782129516304
Q2_retweet_comment_rce          : 8.580979222956975
Q2_like_rce                     : 4.616886495217698
Q2_reply_avg_prec               : 0.19057418117297945
Q2_retweet_avg_prec             : 0.3542256415570396
Q2_retweet_comment_avg_prec     : 0.04753390248413203
Q2_like_avg_prec                : 0.5529952328129224
Q3_reply_rce                    : 20.240779997182745
Q3_retweet_rce                  : 20.96650504393377
Q3_retweet_comment_rce          : 9.74567553456

In [8]:
# For every target, print the per-feature "gain" importance of its classifier,
# ordered from highest to lowest gain.
for target in recsysxgb.targets__:
    print(f"\n{target}:")
    gain_by_feature = recsysxgb.clfs_[target].get_score(importance_type='gain')
    ranked_features = sorted(
        gain_by_feature.items(),
        key=lambda item: item[1],
        reverse=True,
    )
    print(dict(ranked_features))


has_reply:
{'type_encoding': 284.1526326651887, 'TE_reply_b_B': 234.93349084924142, 'a_follows_b': 137.81420806428574, 'TE_reply_a_TopLevel_A': 101.19735677495663, 'TE_reply_b_TopLevel_B': 82.40309073139193, 'TE_reply_a_A': 77.3664276162817, 'photo_count': 45.331092485999996, 'TE_like_b_TopLevel_B': 44.9664293289007, 'a_follower_count': 33.57327131576975, 'TE_like_b_B': 30.77871013116574, 'TE_retweet_b_B': 28.04354604183333, 'TE_like_b_A': 27.012651586562495, 'TE_like_a_TopLevel_A': 26.403389915533335, 'TE_like_a_A': 25.371632450597218, 'TE_reply_b_Quote_B': 24.86108129402778, 'TE_like_a_B': 24.682872706244897, 'TE_reply_b_Retweet_B': 23.67892222197575, 'a_b_follower_ratio': 23.094230736636366, 'TE_like_a_TopLevel_B': 21.888018830631577, 'TE_like_b_Quote_B': 19.976625707301586, 'b_follower_count': 18.718336541999996, 'TE_reply_a_Quote_A': 18.619800717777782, 'language_encoding': 17.93715884722222, 'TE_retweet_b_TopLevel_B': 17.747888495444442, 'TE_like_a_Retweet_A': 17.43989414268519,