In [1]:
import os
import pandas as pd
import numpy as np
import gc

from os.path import join
from pathlib import Path
from datetime import datetime
import tqdm

In [2]:
all_features = ["bert_base_multilingual_cased_tokens",
                "hashtags",
                "tweet_id",
                "medias",
                "links",
                "domains",
                "type",
                "language",
                "timestamp",
                "a_user_id",
                "a_follower_count",
                "a_following_count",
                "a_is_verified",
                "a_account_creation",
                "b_user_id",
                "b_follower_count",
                "b_following_count",
                "b_is_verified",
                "b_account_creation",
                "a_follows_b"]

all_labels = ["reply",
              "retweet",
              "retweet_comment",
              "like"]

dtypes_of_features = {
    "bert_base_multilingual_cased_tokens": str,
    "hashtags": str,
    "tweet_id": str,
    "medias": str,
    "links": str,
    "domains": str,
    "type": str,
    "language": str,
    "timestamp": np.uint32,
    "a_user_id": str,
    "a_follower_count": np.uint32,
    "a_following_count": np.uint32,
    "a_is_verified": bool,
    "a_account_creation": np.uint32,
    "b_user_id": str,
    "b_follower_count": np.uint32,
    "b_following_count": np.uint32,
    "b_is_verified": bool,
    "b_account_creation": np.uint32,
    "a_follows_b": bool,
    "reply": np.uint32,
    "retweet": np.uint32,
    "retweet_comment": np.uint32,
    "like": np.uint32
}
all_columns = all_features + all_labels

user_centric_cols = ["a_user_id",
                "a_follower_count",
                "a_following_count",
                "a_is_verified",
                "a_account_creation",
                "b_user_id",
                "b_follower_count",
                "b_following_count",
                "b_is_verified",
                "b_account_creation",
                "reply",
                "retweet",
                "retweet_comment",
                "like",
                "timestamp"]

training_csv = "downloaded_data"
temp_csv = "train_temp_user_mappings"
index_file = "train_user_index.parquet"
md = 2**64

cutoff_timestamp = int(datetime(2021, 2, 19, 0).timestamp())#then we have first 14 days training and last 7 validation

In [3]:
%%time


Path(temp_csv).mkdir(exist_ok=True, parents=True)

for file in tqdm.tqdm(os.listdir(training_csv)):
    if ".csv" not in file and ".tsv" not in file:
        continue
    print(f"Reading CSV {file}...", end="")
    df = pd.read_csv(join(training_csv, file), sep='\x01', header=None, names=all_columns, 
        dtype={k: v for k, v in dtypes_of_features.items() if k in all_features}, usecols=user_centric_cols)

    
    print(f"Applying filters for {file}...", end="")
    df = df[
        (df["timestamp"] < cutoff_timestamp) & 
        ((df["reply"] < cutoff_timestamp) | (df["reply"].isnull())) &
        ((df["retweet"] < cutoff_timestamp) | (df["retweet"].isnull())) &
        ((df["like"] < cutoff_timestamp) | (df["like"].isnull())) &
        ((df["retweet_comment"] < cutoff_timestamp) | (df["retweet_comment"].isnull()))
        ]

    
    print(f"\rCreating User Maps for {file}...", end="")
    df["a_user_id"] = df["a_user_id"].apply(lambda x: int(x, base=16)%md).astype(np.uint64)
    df["b_user_id"] = df["b_user_id"].apply(lambda x: int(x, base=16)%md).astype(np.uint64)

    user_dfs = []
    cols = ["user_id", "follower_count", "following_count", "verified", "account_creation", "timestamp", "action_type"]


    df_a = df[["a_user_id", "a_follower_count", "a_following_count", "a_is_verified", "a_account_creation","timestamp"]].copy()
    df_a.loc[:,"action_type"] = 0
    df_a.columns = cols
    df_a["day"] = df_a["timestamp"].apply(lambda x: datetime.fromtimestamp(x).timetuple().tm_yday).astype(np.uint16)#day of year
    user_dfs.append(df_a)

    df_b = df[["b_user_id", "b_follower_count", "b_following_count", "b_is_verified", "b_account_creation", "timestamp"]].copy()
    df_b.loc[:,"action_type"] = 1
    df_b.columns = cols
    user_dfs.append(df_b)

    for idx, col in enumerate(['reply',"retweet","retweet_comment","like"]):
        #userb_encode
        temp_df = df[["b_user_id", "b_follower_count", "b_following_count", "b_is_verified", "b_account_creation", col]].copy()
        temp_df = temp_df.dropna(subset=[col])
        temp_df.loc[:,"action_type"] = idx + 2
        temp_df.columns = cols
        temp_df["day"] = temp_df["timestamp"].apply(lambda x: datetime.fromtimestamp(x).timetuple().tm_yday).astype(np.uint16)#day of year
        user_dfs.append(temp_df)
        #usera_encode
        temp_df = df[["a_user_id", "a_follower_count", "a_following_count", "a_is_verified", "a_account_creation", col]].copy()
        temp_df = temp_df.dropna(subset=[col])
        temp_df.loc[:,"action_type"] = idx + 6
        temp_df.columns = cols
        user_dfs.append(temp_df)


    user_df = pd.concat(user_dfs)

    gb = user_df.groupby("user_id")
    gb_cnt = user_df.groupby(["user_id", "action_type"])
    gb_day_cnt = user_df.groupby(["user_id", "day"])

    print(f"\rExtracting Features for {file}...", end="")


    res = gb.agg({
        'follower_count': "first", 
        'following_count':'first', 
        'verified':'first', 
        'account_creation': "first"
        })

    print(f"\rExtracting Counts for {file}...", end="")

    cnt_res = gb_cnt.size().unstack(fill_value=0)
    day_cnt = gb_day_cnt.size().unstack(fill_value=0)
    cnt_res.columns =  ["n_present_a","n_present_b","n_reply_b","n_retweet_b","n_retweet_comment_b","n_like_b","n_reply_a","n_retweet_a","n_retweet_comment_a","n_like_a"]
    day_cnt.columns = ["n_day_"+str(int(a)) for a in day_cnt.columns]

    print(f"\rMerging {file}...", end="")

    user_df = pd.merge(res, cnt_res, how='inner', left_index=True, right_index=True)
    user_df = pd.merge(user_df, day_cnt, how="inner", left_index=True, right_index=True)
    print(f"\rWriting File {file}...", end="")
    user_df.to_parquet(join(temp_csv, file.split(".")[0]+".parquet"))
    gc.collect()

100%|██████████| 253/253 [2:04:03<00:00, 29.42s/it]CPU times: user 1h 40min 12s, sys: 5min 30s, total: 1h 45min 43s
Wall time: 2h 4min 3s



In [4]:
%%time

big_user_df = None
for file in tqdm.tqdm(os.listdir(temp_csv)):
    print()

    if ".parquet" not in file:
        continue

    df = pd.read_parquet(join(temp_csv,file))
    for col in df.columns:
        if col.startswith("n_"):
            df[col] = df[col].astype(np.uint16)

    print(f"\rReading temp file {file}...", end="")
    if big_user_df is None:
        big_user_df = df
        continue

    user_df = df
    user_df.columns = ["next_"+a for a in user_df.columns]

    print(f"\r Collecting overlaps of {file}...", end="")
    from_left = big_user_df.loc[big_user_df.index.difference(user_df.index)]
    from_right = user_df.loc[user_df.index.difference(big_user_df.index)]
    both_extracted = pd.merge(big_user_df, user_df, how="inner", left_index=True, right_index=True)

    print(f"\r Accumulating Features {file}...", end="")
    for col in both_extracted.columns:
        if col.startswith("n_"):
            if str("next_"+col) in both_extracted.columns:#safety
                both_extracted[col] = both_extracted[col]  +  both_extracted["next_"+col]


    from_right.columns = [a[5:] for a in from_right.columns]
    both_extracted = both_extracted[from_left.columns]

    print(f"\r Final Concat with sort {file}...", end="")
    big_user_df = pd.concat([from_left, from_right, both_extracted])
    big_user_df = big_user_df.sort_index()
    gc.collect()
big_user_df.to_parquet(index_file)

  0%|          | 0/253 [00:00<?, ?it/s]
Reading temp file part-00000.parquet...
  1%|          | 2/253 [00:06<14:31,  3.47s/it]
  1%|          | 3/253 [00:10<15:25,  3.70s/it]
  2%|▏         | 4/253 [00:15<16:29,  3.97s/it]
  2%|▏         | 5/253 [00:20<18:02,  4.36s/it]
  2%|▏         | 6/253 [00:25<19:24,  4.72s/it]
  3%|▎         | 7/253 [00:31<20:35,  5.02s/it]
  3%|▎         | 8/253 [00:37<21:48,  5.34s/it]
  4%|▎         | 9/253 [00:43<22:58,  5.65s/it]
  4%|▍         | 10/253 [00:50<23:56,  5.91s/it]
  4%|▍         | 11/253 [00:57<24:55,  6.18s/it]
  5%|▍         | 12/253 [01:04<25:58,  6.47s/it]
  5%|▌         | 13/253 [01:11<26:54,  6.73s/it]
  6%|▌         | 14/253 [01:19<28:01,  7.03s/it]
  6%|▌         | 15/253 [01:27<28:50,  7.27s/it]
  6%|▋         | 16/253 [01:35<29:56,  7.58s/it]
  7%|▋         | 17/253 [01:43<30:42,  7.81s/it]
  7%|▋         | 18/253 [01:52<31:38,  8.08s/it]
  8%|▊         | 19/253 [02:01<33:06,  8.49s/it]
  8%|▊         | 20/253 [02:11<34:08,  8.79s/i

In [6]:
big_user_df.columns

Index(['follower_count', 'following_count', 'verified', 'account_creation',
       'n_present_a', 'n_present_b', 'n_reply_b', 'n_retweet_b',
       'n_retweet_comment_b', 'n_like_b', 'n_reply_a', 'n_retweet_a',
       'n_retweet_comment_a', 'n_like_a', 'n_day_35', 'n_day_36', 'n_day_37',
       'n_day_38', 'n_day_39', 'n_day_40', 'n_day_41', 'n_day_42', 'n_day_43',
       'n_day_44', 'n_day_45', 'n_day_46', 'n_day_47', 'n_day_48', 'n_day_49'],
      dtype='object')

In [7]:
56 - 35

21

In [3]:
df = pd.read_csv("downloaded_data/part-00002.tsv", sep='\x01', header=None, names=all_columns, 
        dtype={k: v for k, v in dtypes_of_features.items() if k in all_features})
        

In [4]:
df

Unnamed: 0,bert_base_multilingual_cased_tokens,hashtags,tweet_id,medias,links,domains,type,language,timestamp,a_user_id,...,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,a_follows_b,reply,retweet,retweet_comment,like
0,101\t441\t111044\t111038\t111046\t60290\t11103...,1880EF63C6378F1A58536496C38C1E2B,F8B58B5C5C837EE788E87E2DB08BF370,Photo,,,TopLevel,159541FA269CA8A9CDB93658CAEC4CA2,1613424456,57D6DE65A2B5AD62DA70C9E8DDF1D4BC,...,0E44892D10402A32CEE33B4D294C7F60,12,13,False,1578683724,False,,,,1.613430e+09
1,101\t56898\t137\t31167\t20124\t77799\t131\t501...,,422F7C7FD5CB87E07508EC9787559B42,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1614061810,12C71367708636D0DFADF8DD40A26102,...,486F4F49293D7E3A38D1CA3FE9EDD67D,80,414,False,1594108738,False,,,,
2,101\t1996\t77689\t15221\t69652\t15355\t16867\t...,,A1BEEE5D330515750D34D6C3BCBF5706,Photo\tPhoto,,,TopLevel,E7F038DE3EAD397AEC9193686C911677,1613537772,FF3783575DD7398A0EF488F60B0A2AEC,...,B8BA94E831328CBEF68CCADE3E841725,105,90,False,1549622775,True,1.613543e+09,,,
3,101\t11469\t51747\t10189\t28615\t10531\t117\t1...,,2D5ABBE5A3755375AB127C4430F49F29,,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1613350368,1F41391FBC081B285AA2D63D6228EE7A,...,6E003F1DE7A6CB86456659F0B3F7914F,195,256,False,1514080158,True,,,,1.613359e+09
4,101\t56898\t137\t15308\t10858\t10526\t13034\t1...,,7D9F397C19835537E4FD19ABA888DFE6,Photo\tPhoto,,,Retweet,E7F038DE3EAD397AEC9193686C911677,1613292595,EC0B5ADCC7D439FE5CC616C2CDB58A15,...,1A896884B0694A05DB7DCD9E819CD008,61,115,False,1537607022,True,,,,1.613297e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3011119,101\t56898\t137\t184\t10418\t12387\t98990\t103...,,78DE3B76804E53A1A895CC5351040C5D,,,,Retweet,B8B04128918BBF54E2E178BFF1ABA833,1613178362,A78C43293CEA7B39DCC2DECF1088558A,...,6B1A4D84D470E34E27FC16CD535B0D39,2467,1969,False,1492973912,True,,1.613179e+09,,1.613179e+09
3011120,101\t11831\t2210\t1942\t18628\t12236\t22445\t1...,,A319149BCCF2856E05D664542D6C88D2,,,,Quote,E7F038DE3EAD397AEC9193686C911677,1614075963,0F7BC2AAC000E519F6A5D623B8D280F1,...,4ACF8F56AB33782F55557AFCE062E59D,739,718,False,1292318026,False,,,,1.614077e+09
3011121,101\t100\t102,,05E60FC2A4B8C52D04B2380A6F230C8E,,,,Quote,E7F038DE3EAD397AEC9193686C911677,1612960957,760562C97E84EA40C0ADA52317A8256A,...,DF701E480D733700492127A4A0F9B94C,1277,184,False,1581601326,True,,,,
3011122,101\t1855\t2822\t3642\t2215\t5618\t100\t1855\t...,,8ECBE25DC5A9F0DFA9EC2AB8A78C3087,Photo\tPhoto\tPhoto\tPhoto,,,TopLevel,E7F038DE3EAD397AEC9193686C911677,1613999568,4F78CF00A0CA5AAF4EA0D4A2A1CC7C38,...,8C595EB1E97A97656A3A57E4DDEFE53D,319,362,False,1532788568,False,,,,1.614002e+09


In [22]:
df["reply"].fillna(0).astype(int).max()

1614211162

In [25]:
datetime.fromtimestamp(1613688965).timetuple().tm_yday

49

In [5]:
df["like"] < cutoff_timestamp

0           True
1          False
2          False
3           True
4           True
           ...  
3011119     True
3011120    False
3011121    False
3011122    False
3011123    False
Name: like, Length: 3011124, dtype: bool

In [16]:
df[(df["reply"] < cutoff_timestamp) | (df["reply"].isnull())]["reply"].max()

1613689181.0

In [11]:
(df["reply"] == np.nan).sum()

0

In [13]:
df["reply"].isnull()

0           True
1           True
2          False
3           True
4           True
           ...  
3011119     True
3011120     True
3011121     True
3011122     True
3011123     True
Name: reply, Length: 3011124, dtype: bool

In [23]:
df[
        (df["timestamp"] < cutoff_timestamp) & 
        ((df["reply"] < cutoff_timestamp) | (df["reply"].isnull())) &
        ((df["retweet"] < cutoff_timestamp) | (df["retweet"].isnull())) &
        ((df["like"] < cutoff_timestamp) | (df["like"].isnull())) &
        ((df["retweet_comment"] < cutoff_timestamp) | (df["retweet_comment"].isnull()))
        ].max()

bert_base_multilingual_cased_tokens    101\t99996\t13987\t86074\t10114\t19261\t10189\...
tweet_id                                                FFFFE6854FD78FF99C0F98BB919589F9
type                                                                            TopLevel
language                                                F9D8F1DB5A398E1225A2C42E34A51DF6
timestamp                                                                     1613689199
a_user_id                                               FFFFF8D0A1E9C84BD356965A0C74ADB8
a_follower_count                                                               129240509
a_following_count                                                                4256371
a_is_verified                                                                       True
a_account_creation                                                            4290301696
b_user_id                                               FFFFFEF4B15A35C97D998A0929A770E7
b_follower_count     

In [20]:
df.__len__()

3011124