In [7]:
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join
from importlib import reload

# --- Run configuration -------------------------------------------------------

# Location of the training files (handed to the "train" data loader).
csv_data_location = join("data", "downloaded_data")

# User index file shared by the train and validation loaders.
# Commented-out alternative kept for reference: "train_user_index.parquet"
use_user_index = join("indices", "user_index_w_type.parquet")

# File passed as `store_results_file` when the validation set is evaluated.
result_logfile = join("logs", "3batches_results.csv")

# Timestamp filter forwarded to the data loaders; None for this run.
# Commented-out alternative kept for reference:
#   int(datetime(2021, 2, 19, 0).timestamp())
filter_timestamp = None

In [8]:
# Base model of the ensemble; `model_dir` names the directory associated with
# this model under saved_models/.
# NOTE(review): the directory name mentions "0.05_noise", while the training
# loader in this notebook is built with add_normal_TE_noise_std = 0 — confirm
# the name still describes this run.
recsysxgb = utils.model.RecSysXGB1(
    model_dir="saved_models/xgb_models_33_0.05_noise_0maxdeltastep",
)

In [9]:
# Super model built from a list of base models (only `recsysxgb` here).
# The second positional argument is a path string — presumably the directory
# where the super model is stored; confirm against utils.model.
supermodel = utils.model.RecSysSuperModel(
    [recsysxgb],
    "saved_models/super_models/super_model_2",
)

In [10]:
# Training data loader: reads from `csv_data_location` with the user index at
# `use_user_index`, in "train" mode.
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    mode="train",
    verbose=2,
    # --- batching ---
    batch_size=1_500_000,
    load_n_batches=1,
    # --- filtering / subsampling ---
    filter_timestamp=filter_timestamp,
    keep_user_percent=1,
    # --- count handling (the loader logs "Removing day counts") ---
    remove_day_counts=True,
    remove_user_counts=True,
    # --- TE settings (TE presumably means target encoding — confirm) ---
    # One smoothing value per engagement type; noise std is 0 for this run.
    TE_smoothing={"reply": 40, "like": 40, "retweet": 20, "retweet_comment": 15},
    add_normal_TE_noise_std=0,
)

Loading User Index
Removing day counts
Extracting combined user counts
Created Dataloader in 11.84 seconds!


In [11]:
# Parameter set handed to the fit call as `xgb_parameters`.
xgb_params = dict(
    objective="binary:logistic",
    eval_metric="logloss",
    max_depth=6,
    max_delta_step=0,
)
# Fit the super model on the batches served by `dl`, using the XGBoost
# parameters defined in this cell.
supermodel.fit(
    dl,
    xgb_parameters=xgb_params,
    boost_rounds_per_iteration=12,
    verbose=1,
)

Loaded Batch Nr. 1 in 7.68
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of 1 in 0.35
Did prepro part 2 of 1 in 20.14
Did prepro part 3 of 1 in 0.11
Merged Users of 1 in 23.04
Extracted TE of 1 in 16.23
Finished Batch Nr. 1 from file part-00000.tsv in 70.58s!


In [12]:
# Directory the validation loader reads from.
val_data_location = join("data", "validation_data")

# Validation data loader: same loader class and user index as for training,
# but in "val" mode and reading from `val_data_location`.
# NOTE(review): this rebinds `dl`, which held the training loader before.
# NOTE(review): unlike the training loader, no TE_smoothing or
# add_normal_TE_noise_std is passed here, so the loader's defaults apply —
# confirm they are what validation should use.
dl = dataloader.RecSys2021TSVDataLoader(
    data_directory=val_data_location,
    user_index_location=use_user_index,
    mode="val",
    verbose=2,
    # --- batching ---
    batch_size=1_000_000,
    load_n_batches=3,
    random_file_sampling=False,
    # --- filtering / subsampling ---
    filter_timestamp=filter_timestamp,
    keep_user_percent=1,
    # --- count handling ---
    remove_day_counts=True,
    remove_user_counts=True,
)

Loading User Index
Removing day counts
Extracting combined user counts
Created Dataloader in 11.59 seconds!


In [13]:
# Evaluate the super model on the batches served by `dl`; the results file
# path and a label for this run are passed along, and the return value is
# kept in `res`.
res = supermodel.evaluate_validation_set(
    dl,
    store_results_file=result_logfile,
    validation_run_name="second_try_ensemble",
)

Loaded Batch Nr. 1 in 5.05
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of 1 in 0.28
Did prepro part 2 of 1 in 13.17
Did prepro part 3 of 1 in 0.08
Merged Users of 1 in 23.95
Extracted TE of 1 in 15.46
Finished Batch Nr. 1 from file part-00000.csv in 60.10s!
Loaded Batch Nr. 2 in 6.02
Timestamp Filtered Batch Nr. 2 in 0.00
Did prepro part 1 of 2 in 0.23
Did prepro part 2 of 2 in 13.34
Did prepro part 3 of 2 in 0.08
Merged Users of 2 in 23.60
Extracted TE of 2 in 15.54
Finished Batch Nr. 2 from file part-00000.csv in 60.90s!
Loaded Batch Nr. 3 in 6.93
Timestamp Filtered Batch Nr. 3 in 0.00
Did prepro part 1 of 3 in 0.23
Did prepro part 2 of 3 in 13.06
Did prepro part 3 of 3 in 0.07
Merged Users of 3 in 23.63
Extracted TE of 3 in 15.29
Finished Batch Nr. 3 from file part-00000.csv in 61.32s!
