In [1]:
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join
from importlib import reload

# --- Run configuration ---------------------------------------------------
# Value forwarded as `filter_timestamp` to the data loaders below.
# Alternative kept from the original: int(datetime(2021, 2, 19, 0).timestamp())
filter_timestamp = None

# User index file forwarded to the data loaders below.
# Alternative kept from the original: "train_user_index.parquet"
use_user_index = join("indices", "user_index_w_type.parquet")

# Directory handed to the training data loader.
csv_data_location = join("data", "downloaded_data")

# Results log path. NOTE(review): no cell visible here reads this name —
# confirm whether the evaluation cell was meant to use it.
result_logfile = join("logs", "3batches_results.csv")

In [2]:
# Base models for the ensemble: one RecSysXGB1 per saved-model directory,
# constructed in the order listed.
models = [
    utils.model.RecSysXGB1(model_dir=saved_dir)
    for saved_dir in (
        "saved_models/xgb_models_33_0.05_noise_0maxdeltastep",
        "saved_models/xgb_models_52_All_20_TE_95percent_user",
        "saved_models/xgb_models_53_All_20_TE_95percent_user_100boost_rounds",
        "saved_models/xgb_models_50_Different_TE_weight_different_reply15",
        "saved_models/xgb_models_49_Different_TE_weight_different_reply30",
        "saved_models/xgb_models_48_Different_TE_weight",
        "saved_models/xgb_models_46_6_Max_depth",
        "saved_models/xgb_models_47_10_Max_depth",
    )
]

In [3]:
# Wrap the base models in a RecSysSuperModel.
# NOTE(review): the second argument is presumably the super model's own
# storage directory — confirm against utils.model.
supermodel = utils.model.RecSysSuperModel(
    models,
    "saved_models/super_models/super_model_3_many",
)

In [4]:
# Training data loader (mode="train") that is fed to supermodel.fit.
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    mode="train",
    filter_timestamp=filter_timestamp,
    # Batching: a single batch of 1.5M rows.
    batch_size=1500000,
    load_n_batches=1,
    # User / count handling.
    keep_user_percent=1,
    remove_day_counts=True,
    remove_user_counts=True,
    # Target-encoding settings: the same smoothing value for every target,
    # and the noise std explicitly set to 0 for this run.
    TE_smoothing={"reply": 20, "like": 20, "retweet": 20, "retweet_comment": 20},
    add_normal_TE_noise_std=0,
    verbose=2,
)

Loading User Index
Removing day counts
Extracting combined user counts
Created Dataloader in 26.43 seconds!


In [5]:
# XGBoost parameters passed to the super model's fit call.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "max_depth": 6,
    "max_delta_step": 0,
}

# Fit the super model on the batches served by the training data loader.
supermodel.fit(
    dl,
    xgb_parameters=xgb_params,
    boost_rounds_per_iteration=12,
    verbose=1,
)

Loaded Batch Nr. 1 in 11.02
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of Batch Nr. 1 in 0.33
Did prepro part 2 of Batch Nr. 1 in 18.23
Did prepro part 3 of Batch Nr. 1 in 0.11
Merged Users of Batch Nr. 1 in 25.97
Extracted TE of Batch Nr. 1 in 16.00
Finished Batch Nr. 1 from file part-00000.tsv in 74.66s!


In [6]:
# Directory holding the validation files.
val_data_location = join("data", "validation_data")

# Validation data loader (mode="val").
# NOTE: this rebinds `dl`, which previously referred to the training loader;
# re-running the fit cell after this one would use the validation loader.
# NOTE(review): unlike the training loader, no TE_smoothing is passed here,
# so the loader's default applies — confirm that this is intended.
dl = dataloader.RecSys2021TSVDataLoader(
    data_directory=val_data_location,
    user_index_location=use_user_index,
    mode="val",
    filter_timestamp=filter_timestamp,
    # Batching: a single batch of 1M rows, no random file sampling.
    batch_size=1000000,
    load_n_batches=1,
    random_file_sampling=False,
    # User / count handling.
    keep_user_percent=1,
    remove_day_counts=True,
    remove_user_counts=True,
    verbose=2,
)

Loading User Index
Removing day counts
Extracting combined user counts
Created Dataloader in 12.83 seconds!


In [7]:
# Evaluate the fitted super model on the validation loader and keep the result.
# NOTE(review): the results path is hard-coded here and differs from the
# `result_logfile` constant defined in the configuration cell — confirm which
# file is the intended destination.
res = supermodel.evaluate_validation_set(
    dl,
    store_results_file="logs/1batches_results.csv",
    validation_run_name="big3_try_ensemble",
)

Loaded Batch Nr. 1 in 9.94
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of Batch Nr. 1 in 0.22
Did prepro part 2 of Batch Nr. 1 in 12.34
Did prepro part 3 of Batch Nr. 1 in 0.08
Merged Users of Batch Nr. 1 in 26.12
Extracted TE of Batch Nr. 1 in 15.28
Finished Batch Nr. 1 from file part-00000.csv in 66.06s!
