## Train the model

In [2]:
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join
from importlib import reload

# --- Run configuration for the XGBoost section ------------------------------

# Cut-off passed to the data loaders as `filter_timestamp`, in epoch seconds.
# NOTE(review): the datetime is naive, so `.timestamp()` interprets it in the
# machine's local timezone — confirm that this is intended.
filter_timestamp = int(datetime(year=2021, month=2, day=19, hour=0).timestamp())

# User index file handed to the training loader.
# Previously used alternative: "train_user_index.parquet".
use_user_index = join("indices", "train_user_index_w_type.parquet")

# Directory passed to the loader as its data location.
csv_data_location = join("data", "downloaded_data")

# Directory handed to the model wrapper as `model_dir`.
model_save_location = join("saved_models", "xgb_models_17_long_training")

In [14]:
# Training data loader for the XGBoost run.
# NOTE(review): no random seed is set anywhere visible, although the loader
# reports that it samples users randomly — runs may not be reproducible.
train_loader_options = dict(
    mode="train",
    filter_timestamp=filter_timestamp,
    verbose=2,
    remove_day_counts=True,
    keep_user_percent=0.5,  # logged as "Randomly keeping only 50.0% of the users."
    batch_size=1_000_000,
    load_n_batches=30,
)
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    **train_loader_options,
)

Loading User Index
Randomly keeping only 50.0% of the users.
Removing day counts
Extracting combined user counts
Created Dataloader in 22.36 seconds!


In [16]:
%%time
recsysxgb = utils.model.RecSysXGB1(model_dir = model_save_location)
xgb_params = {'objective': 'binary:logistic', 'eval_metric':'map', 'max_depth':10}

CPU times: user 1.71 ms, sys: 78 µs, total: 1.79 ms
Wall time: 23.6 ms


In [17]:
# Train on the batches produced by `dl` using the parameters defined above.
boost_rounds = 10  # forwarded as `boost_rounds_per_iteration`
epochs = 1         # forwarded as `n_epochs`
recsysxgb.fit(train_loader=dl, xgb_parameters=xgb_params, boost_rounds_per_iteration=boost_rounds, verbose=1, n_epochs=epochs)

Loaded Batch Nr. 1 in 4.86
Timestamp Filtered Batch Nr. 1 in 0.16
Did prepro part 1 of 1 in 0.19
Did prepro part 2 of 1 in 8.70
Did prepro part 3 of 1 in 0.04
Merged Users of 1 in 9.34
Extracted TE of 1 in 6.24
Finished Batch Nr. 1 from file part-00000.tsv in 31.81s!
Loaded Batch Nr. 2 in 5.80
Timestamp Filtered Batch Nr. 2 in 0.17
Did prepro part 1 of 2 in 0.17
Did prepro part 2 of 2 in 8.63
Did prepro part 3 of 2 in 0.04
Merged Users of 2 in 9.32
Extracted TE of 2 in 6.23
Finished Batch Nr. 2 from file part-00000.tsv in 32.69s!
Loaded Batch Nr. 3 in 9.20
Timestamp Filtered Batch Nr. 3 in 0.16
Did prepro part 1 of 3 in 0.18
Did prepro part 2 of 3 in 8.75
Did prepro part 3 of 3 in 0.04
Merged Users of 3 in 9.46
Extracted TE of 3 in 6.24
Finished Batch Nr. 3 from file part-00000.tsv in 36.36s!
Loaded Batch Nr. 4 in 10.07
Timestamp Filtered Batch Nr. 4 in 0.38
Did prepro part 1 of 4 in 0.16
Did prepro part 2 of 4 in 8.29
Did prepro part 3 of 4 in 0.03
Merged Users of 4 in 8.53
Extracted 

## Evaluate the model

In [3]:
# Re-create the model wrapper from the save directory
# (presumably this picks up the models stored there — TODO confirm).
recsysxgb = utils.model.RecSysXGB1(model_save_location)

# Validation loader. It deliberately does not reuse `use_user_index`; the
# "user_index_w_type.parquet" file is passed instead.
# Note that this rebinds `dl`, replacing any previously built loader.
val_loader_options = dict(
    data_directory=csv_data_location,
    user_index_location=join("indices", "user_index_w_type.parquet"),
    mode="val",
    filter_timestamp=filter_timestamp,
    verbose=2,
    random_file_sampling=True,
    load_n_batches=3,
    remove_day_counts=True,
    keep_user_percent=1,
)
dl = dataloader.RecSys2021TSVDataLoader(**val_loader_options)

Loading User Index
Removing day counts
Extracting combined user counts
Created Dataloader in 13.79 seconds!


In [4]:
# Evaluate the model wrapper on the batches produced by `dl`.
# `res` is consumed by the next cell as a mapping of metric name -> value.
res = recsysxgb.evaluate_validation_set(dl)

Loaded Batch Nr. 1 in 20.61
Timestamp Filtered Batch Nr. 1 in 0.39
Did prepro part 1 of 1 in 0.23
Did prepro part 2 of 1 in 10.31
Did prepro part 3 of 1 in 0.04
Merged Users of 1 in 23.14
Extracted TE of 1 in 12.76
Finished Batch Nr. 1 from file part-00142.tsv in 69.88s!
Loaded Batch Nr. 2 in 20.76
Timestamp Filtered Batch Nr. 2 in 0.40
Did prepro part 1 of 2 in 0.23
Did prepro part 2 of 2 in 10.36
Did prepro part 3 of 2 in 0.04
Merged Users of 2 in 22.74
Extracted TE of 2 in 12.74
Finished Batch Nr. 2 from file part-00038.tsv in 69.70s!
Loaded Batch Nr. 3 in 20.18
Timestamp Filtered Batch Nr. 3 in 0.39
Did prepro part 1 of 3 in 0.22
Did prepro part 2 of 3 in 10.88
Did prepro part 3 of 3 in 0.04
Merged Users of 3 in 24.62
Extracted TE of 3 in 13.11
Finished Batch Nr. 3 from file part-00151.tsv in 71.98s!


In [5]:
# One line per metric: the name left-aligned in a 32-character column,
# followed by its value.
for metric in res:
    score = res[metric]
    print(f"{metric:32}: {score}")

Q1_reply_rce                    : 31.965043874533517
Q1_retweet_rce                  : 37.19519389850131
Q1_retweet_comment_rce          : -6.613239380417002
Q1_like_rce                     : 38.690061700877095
Q1_reply_avg_prec               : 0.46727806000774486
Q1_retweet_avg_prec             : 0.601238406188639
Q1_retweet_comment_avg_prec     : 0.2244352033494541
Q1_like_avg_prec                : 0.842852978835866
Q2_reply_rce                    : 21.181695912925434
Q2_retweet_rce                  : 28.63620842474095
Q2_retweet_comment_rce          : -19.73141715700617
Q2_like_rce                     : 30.576676249946388
Q2_reply_avg_prec               : 0.3657215431893473
Q2_retweet_avg_prec             : 0.5112469616250706
Q2_retweet_comment_avg_prec     : 0.14863262523337073
Q2_like_avg_prec                : 0.7910485712599772
Q3_reply_rce                    : 12.699308295528123
Q3_retweet_rce                  : 21.819434032414275
Q3_retweet_comment_rce          : -28.1148042840

In [5]:
# Gain-based importance scores of the model stored under "has_reply",
# ordered from the highest gain to the lowest.
reply_gain = recsysxgb.clfs_["has_reply"].get_score(importance_type='gain')
dict(sorted(reply_gain.items(), key=lambda entry: entry[1], reverse=True))

{'type_encoding': 61.62220535634151,
 'a_follows_b': 46.650430013040015,
 'TE_reply_b_B': 37.44942618886715,
 'n_reply_b_B': 23.095458300374673,
 'TE_reply_a_A': 22.246823007659398,
 'TE_retweet_comment_a_Quote_B': 20.542540637438734,
 'n_reply_a_A': 18.34867954385838,
 'TE_reply_b_TopLevel_B': 14.378844105746493,
 'n_like_b_B': 14.172060342488278,
 'n_reply_b_TopLevel_B': 13.956302263336331,
 'n_retweet_comment_b_Quote_B': 13.947119664857142,
 'TE_reply_a_TopLevel_A': 13.631062231180488,
 'n_reply_b_Quote_B': 13.437879992250243,
 'n_retweet_comment_a_Quote_B': 13.342948623333335,
 'TE_like_b_Quote_A': 13.217098923865748,
 'TE_reply_b_Retweet_B': 12.908840923630692,
 'n_present_a_A': 12.234450315147056,
 'n_retweet_b_Quote_A': 12.091599398267604,
 'n_retweet_a_Retweet_B': 12.080321862633584,
 'n_retweet_comment_a_B': 12.050153463317075,
 'TE_retweet_a_Quote_B': 11.95957932136457,
 'n_like_b_TopLevel_B': 11.939778022310408,
 'n_retweet_comment_b_TopLevel_A': 11.923213499012984,
 'TE_lik

## Try a sample test run

In [22]:
# Take only the first batch that the loader yields.
# NOTE(review): `dl` is whichever loader was bound last in the kernel —
# confirm it is the intended one before relying on this cell.
out = next(iter(dl))

Loaded Batch Nr. 1 in 20.16
Timestamp Filtered Batch Nr. 1 in 0.41
Did prepro part 1 of 1 in 0.27
Did prepro part 2 of 1 in 10.67
Did prepro part 3 of 1 in 0.05
Merged Users of 1 in 23.01
Extracted TE of 1 in 13.02
Finished Batch Nr. 1 from file part-00073.tsv in 70.12s!


In [26]:
# Run inference on the first element of the batch fetched above.
# `pred` is indexed by position in the next cell (`pred[3]`).
pred = recsysxgb.infer(out[0])

In [37]:
# Pull the values of `pred[3]` towards 0.5 by a factor just below 1 and show
# the smallest result.
# Presumably a check that the rescaled predictions stay strictly above 0 —
# TODO confirm the intent.
shrink_factor = 0.9999999
centered = pred[3] - 0.5
(centered * shrink_factor + 0.5).min()

5.9604645e-08

## Testing custom batch sizes

In [1]:
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join

# --- Configuration for the test-file / custom batch size experiment ---------

# No timestamp filtering here. The cut-off used in the other sections is
# int(datetime(2021, 2, 19, 0).timestamp()).
filter_timestamp = None

# User index file handed to the loader.
# Previously used alternative: "train_user_index.parquet".
use_user_index = join("indices", "user_index.parquet")

# Directory passed to the loader as its data location.
csv_data_location = join("data", "test_files")

# Save directory of the submission models.
model_save_location = join("saved_models", "xgb_models_06_submission")

In [2]:
# Loader in "test" mode over the configured data directory.
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    mode="test",
    filter_timestamp=filter_timestamp,
    load_n_batches=-1,  # presumably "no batch limit" — TODO confirm in the loader
    batch_size=1_000_000,
    verbose=2,
    random_file_sampling=True,
)

In [3]:
# Exhaust the loader and keep the element at index 1 of every batch;
# the following cell prints the length of each collected item.
r = [batch[1] for batch in dl]

just one
1000000
Loaded Batch Nr. 1 in 6.10
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of 1 in 0.20
Did prepro part 2 of 1 in 11.80
Did prepro part 3 of 1 in 0.05
Merged Users of 1 in 21.76
Extracted TE of 1 in 3.21
Finished Batch Nr. 1 from file part-00003.csv in 45.47s!
just one
1000000
Loaded Batch Nr. 2 in 7.32
Timestamp Filtered Batch Nr. 2 in 0.00
Did prepro part 1 of 2 in 0.20
Did prepro part 2 of 2 in 11.87
Did prepro part 3 of 2 in 0.05
Merged Users of 2 in 22.05
Extracted TE of 2 in 3.20
Finished Batch Nr. 2 from file part-00003.csv in 47.02s!
just one
1000000
Loaded Batch Nr. 3 in 8.79
Timestamp Filtered Batch Nr. 3 in 0.00
Did prepro part 1 of 3 in 0.20
Did prepro part 2 of 3 in 11.85
Did prepro part 3 of 3 in 0.05
Merged Users of 3 in 22.73
Extracted TE of 3 in 3.20
Finished Batch Nr. 3 from file part-00003.csv in 49.16s!
more than one 1000000
1000000
Loaded Batch Nr. 4 in 12.99
Timestamp Filtered Batch Nr. 4 in 0.00
Did prepro part 1 of 4 in 0.20
Did prepro 

In [4]:
# Show how many rows each collected batch holds, in loading order.
for batch_part in r:
    print(len(batch_part))

1000000
1000000
1000000
1000000
1000000
1000000
21788


In [17]:
import utils.features as fe
import utils.constants as co
import pandas as pd
import numpy as np

In [12]:
# Column dtypes restricted to the names listed in `co.all_features`.
feature_dtypes = {
    name: dtype
    for name, dtype in co.dtypes_of_features.items()
    if name in co.all_features
}

# Read one raw test file: header-less, fields separated by the \x01 character,
# columns named after `co.all_features`.
current_file = pd.read_csv(
    "data/test_files/part-00002.csv",
    sep='\x01',
    header=None,
    names=co.all_features,
    dtype=feature_dtypes,
)

In [14]:
# Replace missing values with empty strings in each of these columns.
# The original cell repeated the "medias" line twice; one pass per column
# gives the same result.
for column in ("medias", "hashtags", "links", "domains"):
    current_file[column] = current_file[column].fillna("")

In [27]:
import numpy as np
# Distinct values of exp(n_gifs + n_photos + n_videos) - 1.
# NOTE(review): `train_data` is not defined in any visible cell of this
# notebook; this cell relies on leftover kernel state.
media_sum = train_data["n_gifs"] + train_data["n_photos"] + train_data["n_videos"]
set(np.exp(media_sum) - 1)

{-0.9502129554748535, 1.0, 2.0, 3.0, 4.000000476837158, 5.0, 7.0}

## Other: neural network model (RecSysNeural1)

In [1]:
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join
from importlib import reload
import torch

# --- Run configuration for the neural network section -----------------------

# Cut-off passed to the data loaders as `filter_timestamp`, in epoch seconds.
# NOTE(review): the datetime is naive, so `.timestamp()` interprets it in the
# machine's local timezone — confirm that this is intended.
filter_timestamp = int(datetime(year=2021, month=2, day=19, hour=0).timestamp())

# User index file handed to the loaders in this section.
# Previously used alternative: "train_user_index.parquet".
use_user_index = join("indices", "train_user_index_w_type.parquet")

# Directory passed to the loaders as their data location.
csv_data_location = join("data", "downloaded_data")

# Save directory of the long-training XGBoost models.
model_save_location = join("saved_models", "xgb_models_17_long_training")

In [2]:
# Training loader for the neural model: smaller batches than the XGBoost run,
# plus the `minibatches_size` and `normalize_batch` options.
nn_train_loader_options = dict(
    mode="train",
    filter_timestamp=filter_timestamp,
    verbose=2,
    remove_day_counts=True,
    keep_user_percent=0.5,  # logged as "Randomly keeping only 50.0% of the users."
    batch_size=100_000,
    minibatches_size=64,
    load_n_batches=2,
    normalize_batch=True,
)
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    **nn_train_loader_options,
)

Loading User Index
Randomly keeping only 50.0% of the users.
Removing day counts
Extracting combined user counts
Created Dataloader in 21.51 seconds!


In [3]:
# Build the neural model and move it to the GPU when one is available;
# otherwise fall back to the CPU so the cell does not fail on machines
# without CUDA.
# The arguments are a path under "saved_models" and 166 (presumably the number
# of input features — TODO confirm against RecSysNeural1).
# NOTE(review): the name `nn` shadows the conventional alias for `torch.nn`.
device = "cuda" if torch.cuda.is_available() else "cpu"
nn = utils.model.RecSysNeural1("saved_models/nn1", 166).to(device)

In [4]:
# Train the neural model on the batches produced by `dl`.
# The second positional argument is presumably the number of epochs —
# TODO confirm against RecSysNeural1.fit.
nn.fit(dl, 1)

Loaded Batch Nr. 1 in 0.53
Timestamp Filtered Batch Nr. 1 in 0.02
Did prepro part 1 of 1 in 0.02
Did prepro part 2 of 1 in 1.00
Did prepro part 3 of 1 in 0.01
Merged Users of 1 in 8.74
Extracted TE of 1 in 5.33
Finished Batch Nr. 1 from file part-00000.tsv in 15.91s!
Loaded Batch Nr. 2 in 1.33
Timestamp Filtered Batch Nr. 2 in 0.02
Did prepro part 1 of 2 in 0.02
Did prepro part 2 of 2 in 1.00
Did prepro part 3 of 2 in 0.01
Merged Users of 2 in 8.78
Extracted TE of 2 in 5.30
Finished Batch Nr. 2 from file part-00000.tsv in 16.72s!


In [6]:
# Validation loader for the neural model (a single batch, all users kept).
# NOTE(review): this passes `use_user_index`, whereas the XGBoost validation
# cell passes "user_index_w_type.parquet" — confirm which index is intended.
nn_val_loader_options = dict(
    mode="val",
    filter_timestamp=filter_timestamp,
    verbose=2,
    remove_day_counts=True,
    keep_user_percent=1,
    batch_size=100_000,
    minibatches_size=64,
    load_n_batches=1,
    normalize_batch=True,
)
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    **nn_val_loader_options,
)

# Last expression of the cell, so the returned metrics are displayed.
nn.evaluate_validation_set(dl)

Loading User Index
Removing day counts
Extracting combined user counts
Created Dataloader in 8.69 seconds!
Loaded Batch Nr. 1 in 0.52
Timestamp Filtered Batch Nr. 1 in 0.02
Did prepro part 1 of 1 in 0.01
Did prepro part 2 of 1 in 0.39
Did prepro part 3 of 1 in 0.00
Merged Users of 1 in 19.52
Extracted TE of 1 in 10.67
Finished Batch Nr. 1 from file part-00000.tsv in 31.25s!


{'Q1_reply_rce': 8.68125501103274,
 'Q1_retweet_rce': 3.400062978361129,
 'Q1_retweet_comment_rce': -44.404965645466746,
 'Q1_like_rce': 8.125657632640237,
 'Q1_reply_avg_prec': 0.1315984137720102,
 'Q1_retweet_avg_prec': 0.3321852703862555,
 'Q1_retweet_comment_avg_prec': 0.004715542128430863,
 'Q1_like_avg_prec': 0.6609896239976686,
 'Q2_reply_rce': 12.13713074310232,
 'Q2_retweet_rce': 8.197141367376236,
 'Q2_retweet_comment_rce': -33.55661430274217,
 'Q2_like_rce': 12.078644737768263,
 'Q2_reply_avg_prec': 0.14269045272667516,
 'Q2_retweet_avg_prec': 0.3180733598775403,
 'Q2_retweet_comment_avg_prec': 0.013366630634222066,
 'Q2_like_avg_prec': 0.6636784282153383,
 'Q3_reply_rce': 12.486972645735438,
 'Q3_retweet_rce': 9.905978649974767,
 'Q3_retweet_comment_rce': -18.88081583461232,
 'Q3_like_rce': 14.159144487434638,
 'Q3_reply_avg_prec': 0.1476714204382969,
 'Q3_retweet_avg_prec': 0.3590855036125054,
 'Q3_retweet_comment_avg_prec': 0.008938643475619129,
 'Q3_like_avg_prec': 0.654