## Train the model

In [1]:
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join
from importlib import reload

# Cut-off handed to the data loaders as `filter_timestamp` (epoch seconds for 2021-02-19 00:00).
# NOTE(review): the datetime is naive, so the resulting value depends on the
# machine's local timezone -- confirm that this is intended.
filter_timestamp = int(datetime(year=2021, month=2, day=19, hour=0).timestamp())

# User index, data directory and model directory used by the cells below.
use_user_index = join("indices", "train_user_index.parquet")
csv_data_location = join("data", "downloaded_data")
# The empty last component makes the joined path end with a separator.
model_save_location = join("saved_models", "")

In [2]:
# Training loader over the downloaded data, restricted by the timestamp cut-off.
# keep_user_percent=0.6 makes the loader randomly keep 60% of the users
# (see the "Randomly keeping only 60.0% of the users." message it prints).
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    mode="train",
    filter_timestamp=filter_timestamp,
    verbose=2,
    remove_day_counts=True,
    keep_user_percent=0.6,
)

Loading User Index
Removing day counts
Randomly keeping only 60.0% of the users.
Created Dataloader in 20.15 seconds!


In [3]:
%%time
# Pull only the first batch from the loader and split it into its three parts.
data = next(iter(dl))
train_data, quantiles, labels = data[0], data[1], data[2]

Loaded Batch Nr. 1 in 24.30
Timestamp Filtered Batch Nr. 1 in 0.53
Did prepro part 1 of 1 in 0.59
Did prepro part 2 of 1 in 31.77
Did prepro part 3 of 1 in 0.12
Merged Users of 1 in 10.86
Extracted TE of 1 in 2.05
Finished Batch Nr. 1 from file part-00000.tsv in 75.77s!
CPU times: user 1min 9s, sys: 3.58 s, total: 1min 12s
Wall time: 1min 17s


In [4]:
%%time
# Parameters forwarded to train_in_memory as `xgb_parameters`.
xgb_params = {
    "objective": "binary:logistic",
    "eval_metric": "map",
    "max_depth": 15,
}

# Train on the single in-memory batch; `save_dir` is set to model_save_location.
recsysxgb = utils.model.RecSysXGB1()
recsysxgb.train_in_memory(
    train_set=train_data,
    quantiles=quantiles,
    targets=labels,
    xgb_parameters=xgb_params,
    save_dir=model_save_location,
)

CPU times: user 59min 13s, sys: 40 s, total: 59min 53s
Wall time: 8min 45s


## Evaluate the model

In [9]:
# Re-create the model wrapper from the directory used as `save_dir` during training.
recsysxgb = utils.model.RecSysXGB1(model_save_location)

# Validation loader: one randomly sampled file, one batch, 40% of the users.
# It reads indices/user_index.parquet rather than `use_user_index`.
dl = dataloader.RecSys2021TSVDataLoader(
    data_directory=csv_data_location,
    user_index_location=join("indices", "user_index.parquet"),
    mode="val",
    filter_timestamp=filter_timestamp,
    verbose=2,
    random_file_sampling=True,
    load_n_batches=1,
    remove_day_counts=True,
    keep_user_percent=0.4,
)

Loading User Index
Removing day counts
Randomly keeping only 40.0% of the users.
Created Dataloader in 11.35 seconds!


In [10]:
# Run the validation evaluation of the RecSysXGB1 instance on the "val" loader.
# `res` is iterated by key and indexed in the following cell.
res = recsysxgb.evaluate_validation_set(dl)

Loaded Batch Nr. 1 in 21.37
Timestamp Filtered Batch Nr. 1 in 0.40
Did prepro part 1 of 1 in 0.23
Did prepro part 2 of 1 in 11.15
Did prepro part 3 of 1 in 0.04
Merged Users of 1 in 7.34
Extracted TE of 1 in 1.40
Finished Batch Nr. 1 from file part-00197.tsv in 44.24s!


In [11]:
# One line per metric: the name left-aligned in a 32-character column, then its value.
for key, value in res.items():
    print(f"{key:32}: {value}")

Q1_reply_rce                    : 22.729220073921987
Q1_retweet_rce                  : 26.151030010784282
Q1_retweet_comment_rce          : -9.780654431664804
Q1_like_rce                     : 21.09037327931468
Q1_reply_avg_prec               : 0.3336286918695288
Q1_retweet_avg_prec             : 0.46209248120849516
Q1_retweet_comment_avg_prec     : 0.15533765934336405
Q1_like_avg_prec                : 0.740558434702396
Q2_reply_rce                    : 19.618270593603004
Q2_retweet_rce                  : 22.28028927275042
Q2_retweet_comment_rce          : -14.346606375277581
Q2_like_rce                     : 17.487896762235643
Q2_reply_avg_prec               : 0.27621584326400306
Q2_retweet_avg_prec             : 0.4043430392308498
Q2_retweet_comment_avg_prec     : 0.12160858236590194
Q2_like_avg_prec                : 0.6889167457883101
Q3_reply_rce                    : 17.192485094446873
Q3_retweet_rce                  : 19.775637805773293
Q3_retweet_comment_rce          : -17.132469

In [12]:
# Gain scores of the model stored under clfs_["has_reply"], as a dict ordered
# from the highest score to the lowest.
dict(
    sorted(
        recsysxgb.clfs_["has_reply"].get_score(importance_type="gain").items(),
        key=lambda name_and_gain: name_and_gain[1],
        reverse=True,
    )
)

{'type_encoding': 107.09208680142642,
 'a_follows_b': 71.13019857198053,
 'TE_reply_b_B': 62.4715877740013,
 'TE_reply_a_A': 30.63367578810191,
 'n_reply_b_B': 25.503888169851507,
 'n_like_b_B': 18.849285707841,
 'TE_like_b_A': 17.795373948141187,
 'n_reply_a_A': 15.689319692110734,
 'TE_retweet_comment_a_B': 11.730206456717488,
 'n_present_a_A': 11.281252995845886,
 'TE_like_b_B': 8.907177979841684,
 'TE_like_a_B': 6.787289813129181,
 'n_present_b_B': 6.184094191282754,
 'TE_retweet_a_B': 5.310141026260722,
 'n_like_a_A': 5.234576240058882,
 'TE_retweet_b_B': 4.947445741521798,
 'TE_like_a_A': 4.72830560476688,
 'language_encoding': 4.628020425325446,
 'n_retweet_b_A': 4.139456082878347,
 'bert_token_len': 4.132712419106702,
 'a_follower_count': 4.079159386188082,
 'n_photos': 3.9480506393699124,
 'TE_reply_a_B': 3.9379830537976117,
 'n_present_a_B': 3.883223687553503,
 'n_reply_a_B': 3.805701231468646,
 'n_like_a_B': 3.8025541985251543,
 'n_present_b_A': 3.7770048679311787,
 'TE_repl

## Try a sample test run

In [None]:
import utils.model
import utils.dataloader

# Test-mode loader over the "test" directory; load_n_batches is set to -1.
dl = utils.dataloader.RecSys2021TSVDataLoader(
    "test",
    "user_index.parquet",
    mode="test",
    load_n_batches=-1,
)
recsysxgb = utils.model.RecSysXGB1("xgb_models_05_submission")

# Run the test-set evaluation with "res.csv" as the output file.
recsysxgb.evaluate_test_set(
    testLoader=dl,
    output_file="res.csv",
)

## Testing custom batch sizes

In [1]:
from utils import dataloader
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join

# No timestamp cut-off for this experiment
# (the training section uses int(datetime(2021, 2, 19, 0).timestamp()) instead).
filter_timestamp = None

# This section uses user_index.parquet (not train_user_index.parquet) and the test files.
use_user_index = join("indices", "user_index.parquet")
csv_data_location = join("data", "test_files")
model_save_location = join("saved_models", "xgb_models_06_submission")

In [2]:
# Test-mode loader with an explicit batch size of 1,000,000 and random file sampling;
# load_n_batches is set to -1.
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    mode="test",
    filter_timestamp=filter_timestamp,
    load_n_batches=-1,
    batch_size=1000000,
    verbose=2,
    random_file_sampling=True,
)

In [3]:
# Exhaust the loader and keep element 1 of every batch it yields.
r = [
    batch[1]
    for batch in dl
]

just one
1000000
Loaded Batch Nr. 1 in 6.10
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of 1 in 0.20
Did prepro part 2 of 1 in 11.80
Did prepro part 3 of 1 in 0.05
Merged Users of 1 in 21.76
Extracted TE of 1 in 3.21
Finished Batch Nr. 1 from file part-00003.csv in 45.47s!
just one
1000000
Loaded Batch Nr. 2 in 7.32
Timestamp Filtered Batch Nr. 2 in 0.00
Did prepro part 1 of 2 in 0.20
Did prepro part 2 of 2 in 11.87
Did prepro part 3 of 2 in 0.05
Merged Users of 2 in 22.05
Extracted TE of 2 in 3.20
Finished Batch Nr. 2 from file part-00003.csv in 47.02s!
just one
1000000
Loaded Batch Nr. 3 in 8.79
Timestamp Filtered Batch Nr. 3 in 0.00
Did prepro part 1 of 3 in 0.20
Did prepro part 2 of 3 in 11.85
Did prepro part 3 of 3 in 0.05
Merged Users of 3 in 22.73
Extracted TE of 3 in 3.20
Finished Batch Nr. 3 from file part-00003.csv in 49.16s!
more than one 1000000
1000000
Loaded Batch Nr. 4 in 12.99
Timestamp Filtered Batch Nr. 4 in 0.00
Did prepro part 1 of 4 in 0.20
Did prepro 

In [4]:
# Print the length of every collected batch element, one per line, to check the
# sizes produced with batch_size=1000000.
for df in r:
    print(len(df))

1000000
1000000
1000000
1000000
1000000
1000000
21788


In [17]:
import utils.features as fe
import utils.constants as co
import pandas as pd
import numpy as np

In [12]:
# Read one test file directly: fields are separated by the \x01 control character
# and the file has no header row, so column names come from co.all_features.
current_file = pd.read_csv(
    "data/test_files/part-00002.csv",
    sep="\x01",
    header=None,
    names=co.all_features,
    # Only pass dtypes for columns that are actually being read.
    dtype={
        feature: feature_dtype
        for feature, feature_dtype in co.dtypes_of_features.items()
        if feature in co.all_features
    },
)

In [14]:
# Replace missing values in these columns with empty strings.
# The original cell filled "medias" twice; the repeated statement had no effect
# and is dropped here.
# NOTE(review): if the duplicate was meant to be a fifth column, add it to the tuple.
for column in ("medias", "hashtags", "links", "domains"):
    current_file[column] = current_file[column].fillna("")

In [27]:
import numpy as np
# Distinct values of exp(n_gifs + n_photos + n_videos) - 1 over the training batch.
# NOTE(review): presumably these columns hold log-scaled counts -- confirm; the
# result below also contains a negative value.
set(
    np.exp(
        train_data["n_gifs"]
        + train_data["n_photos"]
        + train_data["n_videos"]
    )
    - 1
)

{-0.9502129554748535, 1.0, 2.0, 3.0, 4.000000476837158, 5.0, 7.0}

# Other