## Train the model

In [1]:
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join
from importlib import reload

# Cut-off handed to the data loaders as `filter_timestamp`: midnight on 2021-02-19 as a Unix timestamp.
# NOTE(review): the datetime is naive, so the resulting integer depends on the
# local timezone of the machine running the notebook — confirm this is intended.
filter_timestamp = int(datetime(year=2021, month=2, day=19, hour=0).timestamp())

# Paths used by the training cells: the user index file, the data directory
# passed to the loader, and the directory the trained models are saved to.
use_user_index = join("indices", "train_user_index.parquet")
csv_data_location = join("data", "downloaded_data")
model_save_location = join("saved_models", "xgb_models_13_user_sampling")

In [2]:
# Training-mode loader over the data directory, using the training user index
# and the timestamp cut-off configured above.
# NOTE(review): keep_user_percent=0.6 presumably subsamples users to 60% — verify against the loader.
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    mode="train",
    filter_timestamp=filter_timestamp,
    verbose=2,
    remove_day_counts=True,
    keep_user_percent=0.6,
)

In [3]:
%%time
# Pull only the first batch from the loader and split it, by position, into
# the training frame, the quantiles and the labels.
batch_iter = iter(dl)
data = next(batch_iter)
train_data, quantiles, labels = data[0], data[1], data[2]

Loaded Batch Nr. 1 in 17.80
Timestamp Filtered Batch Nr. 1 in 0.57
Did prepro part 1 of 1 in 0.62
Did prepro part 2 of 1 in 31.76
Did prepro part 3 of 1 in 0.14
Merged Users of 1 in 12.06
Extracted TE of 1 in 2.12
Finished Batch Nr. 1 from file part-00000.tsv in 70.94s!
CPU times: user 1min 9s, sys: 3.52 s, total: 1min 12s
Wall time: 1min 12s


In [10]:
%%time
# Fit a fresh RecSysXGB1 on the in-memory batch loaded above and save the
# result under `model_save_location`.
xgb_params = dict(objective='binary:logistic', eval_metric='map')
recsysxgb = utils.model.RecSysXGB1()
recsysxgb.train_in_memory(
    train_set=train_data,
    quantiles=quantiles,
    targets=labels,
    xgb_parameters=xgb_params,
    save_dir=model_save_location,
)

CPU times: user 20min 27s, sys: 17.7 s, total: 20min 44s
Wall time: 3min 38s


## Evaluate the model

In [15]:
# Evaluation setup: switch from the training user index to `user_index.parquet`,
# construct RecSysXGB1 from the directory the models were saved to
# (presumably this loads them — verify), and build a validation-mode loader
# that reads a single batch.
use_user_index = join("indices", "user_index.parquet")
recsysxgb = utils.model.RecSysXGB1(model_save_location)
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    mode="val",
    filter_timestamp=filter_timestamp,
    verbose=2,
    random_file_sampling=True,
    load_n_batches=1,
    remove_day_counts=True,
)

In [16]:
# Evaluate the model on the validation loader; `res` is iterated by key in the
# following cell to print one metric per line.
res = recsysxgb.evaluate_validation_set(dl)

Loaded Batch Nr. 1 in 21.60
Timestamp Filtered Batch Nr. 1 in 0.40
Did prepro part 1 of 1 in 0.26
Did prepro part 2 of 1 in 10.67
Did prepro part 3 of 1 in 0.04
Merged Users of 1 in 20.85
Extracted TE of 1 in 3.16
Finished Batch Nr. 1 from file part-00063.tsv in 59.10s!


In [17]:
# Print one "name: value" line per metric, padding the name to 32 characters
# so the values line up in a single column.
metric_names = list(res)
for key in metric_names:
    print(f"{key:32}: {res[key]}")

Q1_reply_rce                    : 40.43066941723738
Q1_retweet_rce                  : 42.23354816056091
Q1_retweet_comment_rce          : 10.557050634796173
Q1_like_rce                     : 35.15080642986911
Q1_reply_avg_prec               : 0.5880017132565781
Q1_retweet_avg_prec             : 0.6506332673833364
Q1_retweet_comment_avg_prec     : 0.4505410907293112
Q1_like_avg_prec                : 0.8408528715639593
Q2_reply_rce                    : 33.46047612682707
Q2_retweet_rce                  : 37.24967154428882
Q2_retweet_comment_rce          : 0.3776908172082427
Q2_like_rce                     : 29.864388034567046
Q2_reply_avg_prec               : 0.483154777305578
Q2_retweet_avg_prec             : 0.5871992837757639
Q2_retweet_comment_avg_prec     : 0.3466507803607606
Q2_like_avg_prec                : 0.7956592362537639
Q3_reply_rce                    : 31.12794796967223
Q3_retweet_rce                  : 34.203633448816205
Q3_retweet_comment_rce          : -4.0004203168899855

In [14]:
# Gain-based feature scores of the "has_reply" classifier, displayed as a dict
# ordered from the highest score to the lowest.
reply_gain = recsysxgb.clfs_["has_reply"].get_score(importance_type='gain')
dict(sorted(reply_gain.items(), key=lambda item: item[1], reverse=True))

{'TE_reply_b_B': 1185.576607681645,
 'TE_retweet_comment_a_B': 743.8687929222223,
 'TE_reply_a_A': 728.9023049449255,
 'type_encoding': 498.3373325551724,
 'TE_like_b_B': 459.2703493112501,
 'a_follows_b': 404.63523348333337,
 'n_like_b_B': 377.2458648152173,
 'n_reply_b_B': 358.38123173125,
 'TE_retweet_a_B': 327.93947554,
 'n_like_a_A': 271.8788388888889,
 'TE_like_b_A': 261.53769621279065,
 'n_present_a_A': 179.29027033843744,
 'a_follower_count': 152.18059728488234,
 'TE_retweet_b_A': 151.9155927442857,
 'n_reply_a_A': 151.32305655166667,
 'n_present_b_B': 123.1579262490909,
 'n_retweet_a_A': 108.057861,
 'TE_retweet_b_B': 83.203886915,
 'b_follower_count': 66.69645941666666,
 'TE_like_a_A': 64.15599807692307,
 'n_present_a_B': 58.981594083333334,
 'n_retweet_b_B': 58.04761208,
 'b_creation_delta': 50.548776246,
 'TE_reply_a_B': 47.35697772857143,
 'n_present_b_A': 32.950494757499996,
 'TE_retweet_comment_b_A': 29.50127816,
 'n_retweet_b_A': 26.04238934727273,
 'TE_reply_b_A': 22.2

## Try a sample test run

In [None]:
import utils.model
import utils.dataloader

# Sample submission run: build a test-mode loader over the "test" directory,
# construct the model from "xgb_models_05_submission" and write the test-set
# results to "res.csv".
dl = utils.dataloader.RecSys2021TSVDataLoader(
    "test",
    "user_index.parquet",
    mode="test",
    load_n_batches=-1,
)
recsysxgb = utils.model.RecSysXGB1("xgb_models_05_submission")
recsysxgb.evaluate_test_set(testLoader=dl, output_file="res.csv")

## Testing custom batch sizes

In [1]:
from datetime import datetime
from os.path import join

import utils.model
from utils import dataloader

# No timestamp cut-off for the batch-size experiment. The training cells use
# int(datetime(2021, 2, 19, 0).timestamp()) instead.
filter_timestamp = None

# Paths for this experiment: the user index, the directory holding the test
# files, and the model directory.
use_user_index = join("indices", "user_index.parquet")
csv_data_location = join("data", "test_files")
model_save_location = join("saved_models", "xgb_models_06_submission")

In [2]:
# Test-mode loader over the test files with an explicit batch size of one
# million rows.
# NOTE(review): load_n_batches=-1 presumably means "all batches" — verify against the loader.
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    mode="test",
    filter_timestamp=filter_timestamp,
    load_n_batches=-1,
    batch_size=1000000,
    verbose=2,
    random_file_sampling=True,
)

In [3]:
# Exhaust the loader, keeping only the element at index 1 of every batch.
# All kept elements stay in memory at once.
r = [batch[1] for batch in dl]

just one
1000000
Loaded Batch Nr. 1 in 6.10
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of 1 in 0.20
Did prepro part 2 of 1 in 11.80
Did prepro part 3 of 1 in 0.05
Merged Users of 1 in 21.76
Extracted TE of 1 in 3.21
Finished Batch Nr. 1 from file part-00003.csv in 45.47s!
just one
1000000
Loaded Batch Nr. 2 in 7.32
Timestamp Filtered Batch Nr. 2 in 0.00
Did prepro part 1 of 2 in 0.20
Did prepro part 2 of 2 in 11.87
Did prepro part 3 of 2 in 0.05
Merged Users of 2 in 22.05
Extracted TE of 2 in 3.20
Finished Batch Nr. 2 from file part-00003.csv in 47.02s!
just one
1000000
Loaded Batch Nr. 3 in 8.79
Timestamp Filtered Batch Nr. 3 in 0.00
Did prepro part 1 of 3 in 0.20
Did prepro part 2 of 3 in 11.85
Did prepro part 3 of 3 in 0.05
Merged Users of 3 in 22.73
Extracted TE of 3 in 3.20
Finished Batch Nr. 3 from file part-00003.csv in 49.16s!
more than one 1000000
1000000
Loaded Batch Nr. 4 in 12.99
Timestamp Filtered Batch Nr. 4 in 0.00
Did prepro part 1 of 4 in 0.20
Did prepro 

In [4]:
# Print the length of each collected batch element to check the batch sizes
# the loader actually produced.
for df in r:
    row_count = len(df)
    print(row_count)

1000000
1000000
1000000
1000000
1000000
1000000
21788


In [17]:
import utils.features as fe
import utils.constants as co
import pandas as pd
import numpy as np

In [12]:
# Restrict the project dtype mapping to the columns that are actually read.
feature_dtypes = {
    name: dtype
    for name, dtype in co.dtypes_of_features.items()
    if name in co.all_features
}

# Read one headerless test file whose fields are separated by the \x01
# control character, naming the columns after `co.all_features`.
current_file = pd.read_csv(
    "data/test_files/part-00002.csv",
    sep='\x01',
    header=None,
    names=co.all_features,
    dtype=feature_dtypes,
)

In [14]:
# Replace missing values with empty strings in each of these columns.
# NOTE(review): the original cell filled "medias" twice; the second fill was a
# no-op and may have been intended for a different column — confirm.
for column in ("medias", "hashtags", "links", "domains"):
    current_file[column] = current_file[column].fillna("")

In [27]:
import numpy as np
# Distinct values of exp(n_gifs + n_photos + n_videos) - 1 over the training frame.
# NOTE(review): `train_data` is only bound by the training cells earlier in the
# notebook, so this cell relies on that state still being in the kernel.
media_feature_sum = train_data["n_gifs"] + train_data["n_photos"] + train_data["n_videos"]
set(np.exp(media_feature_sum) - 1)

{-0.9502129554748535, 1.0, 2.0, 3.0, 4.000000476837158, 5.0, 7.0}

# Other