## Train the model

In [21]:
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join
from importlib import reload

# Identifier of this run; it also forms the suffix of the model directory name.
method_name = "25_more_hyperparams"

# No timestamp cut-off for this run (alternative: int(datetime(2021, 2, 19, 0).timestamp())).
filter_timestamp = None

# Input and output locations, all relative to the working directory.
use_user_index = join("indices", "user_index_w_type.parquet")  # alternative: "train_user_index.parquet"
csv_data_location = join("data", "downloaded_data")
model_save_location = join("saved_models", f"xgb_models_{method_name}")
result_logfile = join("logs", "3batches_results.txt")

In [22]:
# Keyword options for the training loader, gathered in one place.
train_loader_options = dict(
    mode="train",
    filter_timestamp=filter_timestamp,
    verbose=2,
    remove_day_counts=True,
    keep_user_percent=0.98,
    batch_size=1500000,
    load_n_batches=1,
    TE_smoothing={"reply": 50, "like": 50, "retweet": 50, "retweet_comment": 50},
    remove_user_counts=True,
)

# The data location and the user index path are passed positionally.
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    **train_loader_options,
)

Loading User Index
Randomly keeping only 98.0% of the users.
Removing day counts
Extracting combined user counts
Created Dataloader in 58.95 seconds!


In [23]:
# Construct the RecSysXGB1 model object, passing the configured save location as `model_dir`.
recsysxgb = utils.model.RecSysXGB1(model_dir = model_save_location)
# Parameter dictionary passed as `xgb_parameters` to the fit call.
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=8,
    subsample=0.5,
    max_delta_step=5,
)

In [24]:
%%time
# Training options collected up front so the call itself stays short.
fit_options = {
    "train_loader": dl,
    "xgb_parameters": xgb_params,
    "boost_rounds_per_iteration": 25,
    "verbose": 1,
    "n_epochs": 1,
}
recsysxgb.fit(**fit_options)

Loaded Batch Nr. 1 in 11.52
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of 1 in 0.35
Did prepro part 2 of 1 in 18.96
Did prepro part 3 of 1 in 0.09
Merged Users of 1 in 24.92
Extracted TE of 1 in 13.80
Finished Batch Nr. 1 from file part-00000.tsv in 74.43s!
Finished 1 epochs.
CPU times: user 1h 3min 44s, sys: 41.6 s, total: 1h 4min 26s
Wall time: 10min 35s


## Evaluate the model

In [25]:
# Directory holding the validation files.
val_data_location = join("data", "validation_data")

# Model object constructed with the same location that the training section used as `model_dir`.
recsysxgb = utils.model.RecSysXGB1(model_save_location)

# All loader arguments are given by keyword for the validation run.
val_loader_options = dict(
    data_directory=val_data_location,
    user_index_location=use_user_index,
    mode="val",
    filter_timestamp=filter_timestamp,
    verbose=2,
    random_file_sampling=False,
    load_n_batches=3,
    remove_day_counts=True,
    batch_size=1000000,
    keep_user_percent=1,
    remove_user_counts=True,
)
dl = dataloader.RecSys2021TSVDataLoader(**val_loader_options)

Loading User Index
Removing day counts
Extracting combined user counts
Created Dataloader in 26.28 seconds!


In [26]:
# Evaluate on the loader `dl`; `result_logfile` is passed as the results file and
# `method_name` as the name of this validation run. The returned value is kept in `res`.
res = recsysxgb.evaluate_validation_set(dl, store_results_file=result_logfile, validation_run_name=method_name)

Loaded Batch Nr. 1 in 10.29
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of 1 in 0.23
Did prepro part 2 of 1 in 12.80
Did prepro part 3 of 1 in 0.06
Merged Users of 1 in 24.80
Extracted TE of 1 in 13.39
Finished Batch Nr. 1 from file part-00000.csv in 64.74s!
Loaded Batch Nr. 2 in 9.38
Timestamp Filtered Batch Nr. 2 in 0.00
Did prepro part 1 of 2 in 0.23
Did prepro part 2 of 2 in 12.54
Did prepro part 3 of 2 in 0.06
Merged Users of 2 in 23.57
Extracted TE of 2 in 13.38
Finished Batch Nr. 2 from file part-00000.csv in 62.27s!
Loaded Batch Nr. 3 in 10.07
Timestamp Filtered Batch Nr. 3 in 0.00
Did prepro part 1 of 3 in 0.23
Did prepro part 2 of 3 in 12.80
Did prepro part 3 of 3 in 0.06
Merged Users of 3 in 23.94
Extracted TE of 3 in 13.45
Finished Batch Nr. 3 from file part-00000.csv in 63.85s!


In [27]:
# One line per entry: the name left-aligned in a 32-character field, followed by its value.
for metric_name in res:
    metric_value = res[metric_name]
    print(f"{metric_name:32}: {metric_value}")

Q1_reply_rce                    : -6.848454990409425
Q1_retweet_rce                  : 2.3079910628311495
Q1_retweet_comment_rce          : -9.774465983757352
Q1_like_rce                     : -6.104001368411605
Q1_reply_avg_prec               : 0.1287411639947537
Q1_retweet_avg_prec             : 0.32520413516251934
Q1_retweet_comment_avg_prec     : 0.027912266512364683
Q1_like_avg_prec                : 0.5587119580927447
Q2_reply_rce                    : 1.7375350414364554
Q2_retweet_rce                  : 8.82767005801357
Q2_retweet_comment_rce          : -7.8136136118959865
Q2_like_rce                     : -2.7123691771427705
Q2_reply_avg_prec               : 0.15520109280658845
Q2_retweet_avg_prec             : 0.32766757124291285
Q2_retweet_comment_avg_prec     : 0.025836250962125076
Q2_like_avg_prec                : 0.5487933743852569
Q3_reply_rce                    : 9.034945005253636
Q3_retweet_rce                  : 12.700955333577046
Q3_retweet_comment_rce          : -3.967

In [28]:
# For every target, print its gain scores ordered from largest to smallest.
for target in recsysxgb.targets__:
    print(f"\n{target}:")
    gain_by_feature = recsysxgb.clfs_[target].get_score(importance_type='gain')
    ranked_pairs = sorted(gain_by_feature.items(), key=lambda pair: pair[1], reverse=True)
    print(dict(ranked_pairs))


has_reply:
{'TE_reply_b_B': 234.43800379276223, 'TE_reply_a_A': 183.38751511208744, 'type_encoding': 71.84508312773193, 'a_follows_b': 50.33437807333334, 'TE_reply_b_TopLevel_B': 40.942810946663315, 'TE_reply_b_Quote_B': 30.013468765991863, 'TE_reply_b_Retweet_B': 29.084792003413217, 'TE_like_b_B': 26.218162188920633, 'TE_reply_a_TopLevel_A': 25.532681905691888, 'TE_like_b_A': 25.31703953156383, 'TE_like_a_TopLevel_A': 23.85139826032968, 'TE_like_b_TopLevel_B': 22.611456997065787, 'TE_retweet_comment_a_TopLevel_A': 22.26193084683582, 'TE_retweet_comment_b_TopLevel_B': 20.90299531067442, 'TE_like_a_TopLevel_B': 20.426115327222227, 'TE_reply_a_Quote_A': 20.07289814727272, 'TE_retweet_comment_b_B': 19.965633638179476, 'TE_reply_a_Retweet_A': 19.919135052306824, 'TE_retweet_b_TopLevel_B': 19.721829861950003, 'TE_retweet_comment_b_Quote_B': 19.33468691243678, 'a_follower_count': 18.273520042440563, 'TE_like_b_Retweet_B': 17.65043320066667, 'TE_retweet_b_B': 17.410525958259996, 'a_is_verifi

## Try sample test run

In [22]:
# Take the first item yielded by the loader `dl`.
out = next(iter(dl))

Loaded Batch Nr. 1 in 20.16
Timestamp Filtered Batch Nr. 1 in 0.41
Did prepro part 1 of 1 in 0.27
Did prepro part 2 of 1 in 10.67
Did prepro part 3 of 1 in 0.05
Merged Users of 1 in 23.01
Extracted TE of 1 in 13.02
Finished Batch Nr. 1 from file part-00073.tsv in 70.12s!


In [26]:
# Run inference on the first element of the item taken from the loader.
pred = recsysxgb.infer(out[0])

In [37]:
# Pull the values of pred[3] very slightly towards 0.5, then show the smallest one.
shrink_factor = 0.9999999
shrunk = (pred[3] - 0.5) * shrink_factor + 0.5
shrunk.min()

5.9604645e-08

## Testing Custom batch sizes

In [1]:
from utils import dataloader
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join

# Timestamp filtering is switched off here (alternative: int(datetime(2021, 2, 19, 0).timestamp())).
filter_timestamp = None

# Paths used by the custom-batch-size test run.
use_user_index = join("indices", "user_index.parquet")  # alternative: "train_user_index.parquet"
csv_data_location = join("data", "test_files")
model_save_location = join("saved_models", "xgb_models_06_submission")

In [2]:
# Loader in "test" mode; load_n_batches is -1 and random file sampling is switched on.
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    mode="test",
    filter_timestamp=filter_timestamp,
    load_n_batches=-1,
    batch_size=1000000,
    verbose=2,
    random_file_sampling=True,
)

In [3]:
# Keep only the element at index 1 of every item the loader yields.
r = list(batch[1] for batch in dl)

just one
1000000
Loaded Batch Nr. 1 in 6.10
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of 1 in 0.20
Did prepro part 2 of 1 in 11.80
Did prepro part 3 of 1 in 0.05
Merged Users of 1 in 21.76
Extracted TE of 1 in 3.21
Finished Batch Nr. 1 from file part-00003.csv in 45.47s!
just one
1000000
Loaded Batch Nr. 2 in 7.32
Timestamp Filtered Batch Nr. 2 in 0.00
Did prepro part 1 of 2 in 0.20
Did prepro part 2 of 2 in 11.87
Did prepro part 3 of 2 in 0.05
Merged Users of 2 in 22.05
Extracted TE of 2 in 3.20
Finished Batch Nr. 2 from file part-00003.csv in 47.02s!
just one
1000000
Loaded Batch Nr. 3 in 8.79
Timestamp Filtered Batch Nr. 3 in 0.00
Did prepro part 1 of 3 in 0.20
Did prepro part 2 of 3 in 11.85
Did prepro part 3 of 3 in 0.05
Merged Users of 3 in 22.73
Extracted TE of 3 in 3.20
Finished Batch Nr. 3 from file part-00003.csv in 49.16s!
more than one 1000000
1000000
Loaded Batch Nr. 4 in 12.99
Timestamp Filtered Batch Nr. 4 in 0.00
Did prepro part 1 of 4 in 0.20
Did prepro 

In [4]:
# Report the length of each collected element, one per line.
for collected in r:
    print(len(collected))

1000000
1000000
1000000
1000000
1000000
1000000
21788


In [17]:
import utils.features as fe
import utils.constants as co
import pandas as pd
import numpy as np

In [12]:
# dtype overrides, restricted to the names that appear in co.all_features.
feature_dtypes = {
    name: dtype
    for name, dtype in co.dtypes_of_features.items()
    if name in co.all_features
}

# The file has no header row and uses '\x01' as the field separator.
current_file = pd.read_csv(
    "data/test_files/part-00002.csv",
    sep='\x01',
    header=None,
    names=co.all_features,
    dtype=feature_dtypes,
)

In [14]:
# Replace missing values in these columns with empty strings.
# Each column is listed exactly once; filling the same column a second time would be a no-op.
for text_column in ("medias", "hashtags", "links", "domains"):
    current_file[text_column] = current_file[text_column].fillna("")

In [27]:
import numpy as np
# NOTE(review): `train_data` is not defined in any visible cell — presumably it comes from an
# earlier kernel session; confirm where it originates before re-running this cell.
# Sum the three media columns, then show the distinct values of exp(sum) - 1.
media_sum = train_data["n_gifs"] + train_data["n_photos"] + train_data["n_videos"]
set(np.exp(media_sum) - 1)

{-0.9502129554748535, 1.0, 2.0, 3.0, 4.000000476837158, 5.0, 7.0}

# Other: neural network model experiments

In [3]:
from utils import dataloader
import utils.model
from datetime import datetime
from os.path import join
from importlib import reload
import torch

# Cut-off at 2021-02-19 00:00, converted to an integer epoch value.
# NOTE(review): the datetime is naive, so .timestamp() interprets it in the machine's local
# timezone — confirm that this is the intended reference for the data's timestamps.
cutoff = datetime(2021, 2, 19, 0)
filter_timestamp = int(cutoff.timestamp())

# Paths used by the neural-network experiments.
use_user_index = join("indices", "train_user_index_w_type.parquet")  # alternative: "train_user_index.parquet"
csv_data_location = join("data", "downloaded_data")
model_save_location = join("saved_models", "xgb_models_15_better_index")

In [2]:
# Keyword options for the training loader used by the neural model.
nn_train_loader_options = dict(
    mode="train",
    filter_timestamp=filter_timestamp,
    verbose=2,
    remove_day_counts=True,
    keep_user_percent=0.5,
    batch_size=100000,
    minibatches_size=64,
    load_n_batches=20,
    normalize_batch=True,
)

# The data location and the user index path are passed positionally.
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    **nn_train_loader_options,
)

Loading User Index
Randomly keeping only 50.0% of the users.
Removing day counts
Extracting combined user counts
Created Dataloader in 21.99 seconds!


In [3]:
# Construct RecSysNeural1 with a save path and the value 166, then call .to("cuda") on it
# (presumably moving the model to the GPU — confirm).
# NOTE(review): 166 is presumably the number of input features — confirm against the loader output.
nn = utils.model.RecSysNeural1("saved_models/nn1",166).to("cuda")

In [4]:
# Fit on the loader; the second positional argument is 1 (presumably the number of epochs — TODO confirm).
nn.fit(dl, 1)

Loaded Batch Nr. 1 in 0.53
Timestamp Filtered Batch Nr. 1 in 0.02
Did prepro part 1 of 1 in 0.02
Did prepro part 2 of 1 in 0.88
Did prepro part 3 of 1 in 0.01
Merged Users of 1 in 8.78
Extracted TE of 1 in 5.36
Finished Batch Nr. 1 from file part-00000.tsv in 15.86s!
Loaded Batch Nr. 2 in 0.61
Timestamp Filtered Batch Nr. 2 in 0.02
Did prepro part 1 of 2 in 0.02
Did prepro part 2 of 2 in 0.88
Did prepro part 3 of 2 in 0.01
Merged Users of 2 in 8.81
Extracted TE of 2 in 5.29
Finished Batch Nr. 2 from file part-00000.tsv in 15.91s!
Loaded Batch Nr. 3 in 1.41
Timestamp Filtered Batch Nr. 3 in 0.02
Did prepro part 1 of 3 in 0.02
Did prepro part 2 of 3 in 0.86
Did prepro part 3 of 3 in 0.00
Merged Users of 3 in 8.80
Extracted TE of 3 in 5.32
Finished Batch Nr. 3 from file part-00000.tsv in 16.70s!
Loaded Batch Nr. 4 in 1.51
Timestamp Filtered Batch Nr. 4 in 0.02
Did prepro part 1 of 4 in 0.02
Did prepro part 2 of 4 in 0.87
Did prepro part 3 of 4 in 0.01
Merged Users of 4 in 8.77
Extracted T

In [5]:
# Keyword options for the loader in "val" mode.
nn_val_loader_options = dict(
    mode="val",
    filter_timestamp=filter_timestamp,
    verbose=2,
    remove_day_counts=True,
    keep_user_percent=1,
    batch_size=100000,
    minibatches_size=64,
    load_n_batches=2,
    normalize_batch=True,
)
dl = dataloader.RecSys2021TSVDataLoader(
    csv_data_location,
    use_user_index,
    **nn_val_loader_options,
)

# Last expression of the cell, so the returned value is displayed as the cell output.
nn.evaluate_validation_set(dl)

Loading User Index
Removing day counts
Extracting combined user counts
Created Dataloader in 9.23 seconds!
Loaded Batch Nr. 1 in 0.50
Timestamp Filtered Batch Nr. 1 in 0.01
Did prepro part 1 of 1 in 0.01
Did prepro part 2 of 1 in 0.35
Did prepro part 3 of 1 in 0.00
Merged Users of 1 in 20.75
Extracted TE of 1 in 10.51
Finished Batch Nr. 1 from file part-00000.tsv in 32.27s!
Loaded Batch Nr. 2 in 0.60
Timestamp Filtered Batch Nr. 2 in 0.01
Did prepro part 1 of 2 in 0.01
Did prepro part 2 of 2 in 0.36
Did prepro part 3 of 2 in 0.00
Merged Users of 2 in 19.24
Extracted TE of 2 in 10.39
Finished Batch Nr. 2 from file part-00000.tsv in 30.75s!


{'Q1_reply_rce': -55.06375529232655,
 'Q1_retweet_rce': 2.022846040964088,
 'Q1_retweet_comment_rce': -1779.8738613419546,
 'Q1_like_rce': -33.556324489285075,
 'Q1_reply_avg_prec': 0.15847665901759944,
 'Q1_retweet_avg_prec': 0.3183992278567487,
 'Q1_retweet_comment_avg_prec': 0.012456490593981896,
 'Q1_like_avg_prec': 0.6371322559900724,
 'Q2_reply_rce': -69.87847181000113,
 'Q2_retweet_rce': 6.4290856674497965,
 'Q2_retweet_comment_rce': -1713.128849104653,
 'Q2_like_rce': -26.586873867452155,
 'Q2_reply_avg_prec': 0.15787283026510363,
 'Q2_retweet_avg_prec': 0.30418172967899715,
 'Q2_retweet_comment_avg_prec': 0.0320185234437078,
 'Q2_like_avg_prec': 0.6575847322885718,
 'Q3_reply_rce': -68.04658336242227,
 'Q3_retweet_rce': 11.167206860620848,
 'Q3_retweet_comment_rce': -1096.8625199070543,
 'Q3_like_rce': -15.533225203282308,
 'Q3_reply_avg_prec': 0.17978344581739844,
 'Q3_retweet_avg_prec': 0.3482818652909233,
 'Q3_retweet_comment_avg_prec': 0.015857833740217535,
 'Q3_like_avg_p

In [2]:
from utils.constants import user_group_weights
import torch

In [7]:
# Keys that are looked up in `user_group_weights` in the following cell.
groups = [1,2,4,5]

In [8]:
# Weight of each selected group, in the same order as `groups`.
[user_group_weights[group_id] for group_id in groups]

[0.9870774328685435,
 0.9705157535873209,
 0.8466576397178028,
 0.4343195950966685]

# Other: downloading and inspecting the validation data

In [1]:
import utils.download

In [2]:
# Call the download helper with "validation_urls.txt" and the target directory data/validation_data
# (presumably the text file lists the URLs to fetch — confirm). delete_compressed is set to False.
utils.download.download_data("validation_urls.txt","data/validation_data", delete_compressed = False)

Downloading https://storage.googleapis.com to data/validation_data/part-00000.lzo... done
Uncompressing data/validation_data/part-00000.lzo... done
Downloading https://storage.googleapis.com to data/validation_data/part-00001.lzo... done
Uncompressing data/validation_data/part-00001.lzo... done


In [10]:
import pandas as pd
from utils.constants import all_columns, dtypes_of_features, all_features

In [11]:
# dtype overrides, restricted to the names that appear in all_features.
feature_dtypes = {
    name: dtype
    for name, dtype in dtypes_of_features.items()
    if name in all_features
}

# Read at most 100000 rows; the file has no header row and uses '\x01' as the separator.
pd.read_csv(
    "data/validation_data/part-00000.lzo",
    sep='\x01',
    header=None,
    names=all_columns,
    dtype=feature_dtypes,
    nrows=100000,
)

Unnamed: 0,bert_base_multilingual_cased_tokens,hashtags,tweet_id,medias,links,domains,type,language,timestamp,a_user_id,...,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,a_follows_b,reply,retweet,retweet_comment,like
0,101\t157\t39554\t117\t51747\t10479\t27874\t101...,,0C8E7372269942BB173EED7C0A72DA09,,,,Quote,488B32D24BD4BB44172EB981C1BCA6FA,1614523782,8B7BB615A39DF112B0037E960C27F220,...,3044AFDB7E977FB7F62D49C5C97794B7,1128,4028,False,1419763073,False,,,,1.614526e+09
1,101\t14120\t131\t120\t120\t188\t119\t11170\t12...,,858720854891DFED04A5B91758049833,Photo,,,TopLevel,313ECD3A1E5BB07406E4249475C2D6D6,1614335436,74B09D5BC3FCE5CC4DEE34BF049A0EE5,...,23ACD97ACEF57BF6416670BE1133A664,72,115,False,1600058407,False,,,,1.614379e+09
2,101\t56898\t137\t10279\t10908\t10138\t168\t398...,90C52DDF506D1C98EE678E84C08C36AB\tCD8639BABE54...,1988AA4069C46F57990B6983FBD427A6,,D249E94F6D37A7D5E37185E02C94CDFF,9EFF000CDB18B710CDDB43EE1D8C300B,Retweet,E7F038DE3EAD397AEC9193686C911677,1614618153,316C2BE47E445DA930E56C12F3AF31C5,...,99F829F88A12BF8B92EDF11A10B6533C,130,638,False,1257076693,False,,,,
3,101\t56898\t137\t12275\t10738\t11534\t15417\t9...,D704B4128E35F9BC995E701523676542\tCFD3AF9039C6...,89C86150124016236B7D4A286B041210,,,,Retweet,E7F038DE3EAD397AEC9193686C911677,1614714932,C19FC40FBF0AA0BAC4797BD47483349B,...,A3A7B6D928FAD730BA9967B53EA7CC28,234,640,False,1599725409,False,,,,1.614715e+09
4,101\t109821\t24093\t14703\t30118\t11259\t160\t...,,971197A8E595128D8BAF8F3F9D20CFF3,Photo\tPhoto,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1614559113,46846BF25E56F8BA11C0AE36ED31B28F,...,BA94D75FC03B42F88CAE61485E075960,8,208,False,1532879982,False,,,,1.614618e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,101\t49056\t27354\t34935\t69618\t39752\t10230\...,E27EDC9E620D43F17663DCAEAA958585,B434BB652FBBA4D4CA2C8B75CD65D60D,Photo\tPhoto\tPhoto\tPhoto,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1614747844,967E7C2A6A7F277E37C8E77D224B7E18,...,F9414BF0E778E51C5BC48655D0C52583,4542,1401,False,1344160517,True,,,,1.614755e+09
99996,101\t10911\t146\t100\t172\t16863\t29597\t86607...,,710CA52FDB579F8C375D21CF967DD32A,,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1614670776,04F8A6F207D69EF1E63F4926D1D4DBAB,...,D8462FB3BBD84B0431CF79EC2D106983,738,677,False,1288417538,False,,,,
99997,101\t56898\t137\t14424\t168\t48742\t131\t108\t...,4A99F523FECC5224DFB8B55027794D21\t18B9EA3B7FDC...,ACF411715253EA6B18794E067D62D530,,,,Retweet,E7F038DE3EAD397AEC9193686C911677,1614768600,3AD09AAD3F97E386B12E14772CAA5E67,...,7A7A62AC9D03EC812B0971FFCE113A9B,324,1736,False,1339671557,True,,,,
99998,101\t1894\t88218\t7860\t2316\t1895\t108\t2087\...,35825087788754DC26ED3A6364FD3A55\tD1E4BFB9BD83...,0F6E4AF7D5A70C4C9519D224604EB61C,,A33C298D80264AFAB5612248F785024C,1CE85CC733BA6144D83EDF75F95DCA4E,TopLevel,E7F038DE3EAD397AEC9193686C911677,1614238664,2D2E9B9AD37B72325FDEA86D1197975D,...,3EA329F4500E2FB2DFEF0C482E43C2D5,8,57,False,1548231263,False,,,,


In [3]:
# Take the first item yielded by the loader.
# NOTE(review): `dl` is not created in any visible cell of this section — presumably it was
# left over from an earlier cell or kernel session; confirm before re-running.
res = next(iter(dl))

Loaded Batch Nr. 1 in 8.59
Timestamp Filtered Batch Nr. 1 in 0.00
Did prepro part 1 of 1 in 0.36
Did prepro part 2 of 1 in 20.40
Did prepro part 3 of 1 in 0.09
Merged Users of 1 in 25.17
Extracted TE of 1 in 14.11
Finished Batch Nr. 1 from file part-00000.tsv in 73.28s!


In [5]:
# Print the column names of res[0] on a single line, each padded to 60 characters.
# Only names starting with `column_prefix` are shown; the empty prefix matches every name,
# so by default all columns are listed. Set it to e.g. "n_" to narrow the listing.
column_prefix = ""
for col in res[0].columns:
    if col.startswith(column_prefix):
        print(f"{col:60}", end="")

a_follower_count                                            a_following_count                                           a_is_verified                                               b_follower_count                                            b_following_count                                           b_is_verified                                               a_follows_b                                                 bert_token_len                                              photo_count                                                 video_count                                                 gif_count                                                   type_encoding                                               language_encoding                                           a_followers                                                 a_following                                                 b_followers                                                 b_following                             

In [None]:
# Print each item of `res` left-aligned in a 32-character field, followed by res[item].
# NOTE(review): this cell has no execution count. The nearest earlier visible cells bind `res`
# to next(iter(dl)) and index it as res[0], whereas this loop indexes `res` by the items it
# iterates over — confirm which `res` is meant before running.
for key in res:
    print(f"{key:32}: {res[key]}")