In [3]:
import os
import pandas as pd
import numpy as np
import gc

from os.path import join
from datetime import datetime
import tqdm

In [4]:
# --- Dataset schema -------------------------------------------------------
# Feature columns, in the order they appear in every raw input record.
all_features = [
    "bert_base_multilingual_cased_tokens",
    "hashtags",
    "tweet_id",
    "medias",
    "links",
    "domains",
    "type",
    "language",
    "timestamp",
    "a_user_id",
    "a_follower_count",
    "a_following_count",
    "a_is_verified",
    "a_account_creation",
    "b_user_id",
    "b_follower_count",
    "b_following_count",
    "b_is_verified",
    "b_account_creation",
    "a_follows_b",
]

# Label columns, which follow the feature columns in each record.
all_labels = ["reply", "retweet", "retweet_comment", "like"]

# Full column list handed to the CSV reader as `names`.
all_columns = all_features + all_labels

# Columns parsed as text and as booleans; every remaining column is uint32.
_text_columns = {
    "bert_base_multilingual_cased_tokens",
    "hashtags",
    "tweet_id",
    "medias",
    "links",
    "domains",
    "type",
    "language",
    "a_user_id",
    "b_user_id",
}
_flag_columns = {"a_is_verified", "b_is_verified", "a_follows_b"}

# Column name -> dtype, keyed in the same order as `all_columns`.
dtypes_of_features = {
    name: (str if name in _text_columns
           else bool if name in _flag_columns
           else np.uint32)
    for name in all_columns
}

# Subset of columns needed for the per-user aggregation: the profile fields of
# both users (everything prefixed a_/b_ except the follow flag), then the
# labels, then the tweet timestamp.
user_centric_cols = (
    [name for name in all_features
     if name.startswith(("a_", "b_")) and name != "a_follows_b"]
    + all_labels
    + ["timestamp"]
)

# --- Locations --------------------------------------------------------------
training_csv = "downloaded_data"  # directory holding the raw input files
temp_csv = "temp_user_mappings"   # directory for per-file intermediate tables

In [3]:
%%time

# Per-file pass: turn every raw input file into one table indexed by user id
# that holds the user's profile fields plus counts of how often the user was
# seen in each role/action and on each day of the month.

md = 2**64  # modulus applied to the hexadecimal user ids so they fit in uint64

# Unified column names shared by every per-role slice built below.
cols = ["user_id", "follower_count", "following_count", "verified", "account_creation", "timestamp", "action_type"]

# Label columns; their position determines the action_type code (see loop).
label_cols = ['reply', "retweet", "retweet_comment", "like"]

# Output names for action_type codes 0..9, in code order:
#   0/1  -> user seen as a / as b on a row
#   2..5 -> user is b and the given label is present
#   6..9 -> user is a and the given label is present
count_names = ["n_present_a", "n_present_b",
               "n_reply_b", "n_retweet_b", "n_retweet_comment_b", "n_like_b",
               "n_reply_a", "n_retweet_a", "n_retweet_comment_a", "n_like_a"]


def _user_rows(df, side, time_col, action_type, with_day):
    """Return one role-specific slice of `df` in the unified `cols` schema.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame read from one input file (user ids already converted to uint64).
    side : str
        "a" or "b" - which user's profile columns to take.
    time_col : str
        Column that becomes "timestamp" in the result: either the tweet
        "timestamp" or one of the label columns. Rows where it is missing
        are dropped.
    action_type : int
        Code stored in the "action_type" column of every returned row.
    with_day : bool
        When True, add a uint8 "day" column holding the day of month of
        "timestamp".

    Returns
    -------
    pandas.DataFrame
        Independent copy with columns `cols` (+ "day" when requested).
    """
    picked = [side + "_user_id", side + "_follower_count", side + "_following_count",
              side + "_is_verified", side + "_account_creation", time_col]
    rows = df[picked].dropna(subset=[time_col]).copy()
    rows["action_type"] = action_type
    rows.columns = cols
    if with_day:
        # NOTE(review): datetime.fromtimestamp uses the machine's local
        # timezone, so the day buckets depend on where this runs.
        rows["day"] = rows["timestamp"].apply(lambda x: datetime.fromtimestamp(x).day).astype(np.uint8)
    return rows


# Make sure the output directory exists before the (long) loop starts writing.
os.makedirs(temp_csv, exist_ok=True)

for file in tqdm.tqdm(os.listdir(training_csv)):
    print()
    if ".csv" not in file and ".tsv" not in file:
        continue
    print(f"\rReading CSV {file}...", end="")
    df = pd.read_csv(join(training_csv, file), sep='\x01', header=None, names=all_columns,
        dtype={k: v for k, v in dtypes_of_features.items() if k in all_features}, usecols=user_centric_cols)

    print(f"\rCreating User Maps for {file}...", end="")
    for id_col in ("a_user_id", "b_user_id"):
        df[id_col] = df[id_col].apply(lambda x: int(x, base=16) % md).astype(np.uint64)

    # Presence rows: only the a-side slice carries a "day" column.
    user_dfs = [
        _user_rows(df, "a", "timestamp", 0, with_day=True),
        _user_rows(df, "b", "timestamp", 1, with_day=False),
    ]
    # Label rows: the label value takes the place of "timestamp"; only the
    # b-side slice carries a "day" column.
    for idx, col in enumerate(label_cols):
        user_dfs.append(_user_rows(df, "b", col, idx + 2, with_day=True))
        user_dfs.append(_user_rows(df, "a", col, idx + 6, with_day=False))

    user_df = pd.concat(user_dfs)

    gb = user_df.groupby("user_id")
    gb_cnt = user_df.groupby(["user_id", "action_type"])
    # Rows without a "day" value are NaN after the concat and are left out of
    # this grouping.
    gb_day_cnt = user_df.groupby(["user_id", "day"])

    print(f"\rExtracting Features for {file}...", end="")

    res = gb.agg({
        'follower_count': "first",
        'following_count': 'first',
        'verified': 'first',
        'account_creation': "first"
        })

    print(f"\rExtracting Counts for {file}...", end="")

    # Reindex to all ten action codes so a file that lacks one of them still
    # yields the full, correctly ordered set of count columns (zero-filled)
    # instead of failing on the rename below.
    cnt_res = gb_cnt.size().unstack(fill_value=0).reindex(columns=range(len(count_names)), fill_value=0)
    day_cnt = gb_day_cnt.size().unstack(fill_value=0)
    cnt_res.columns = count_names
    day_cnt.columns = ["n_day_"+str(int(a)) for a in day_cnt.columns]

    print(f"\rMerging {file}...", end="")

    user_df = pd.merge(res, cnt_res, how='inner', left_index=True, right_index=True)
    # NOTE(review): this inner join drops users that have no row with a "day"
    # value (i.e. users seen only as b without any label) - confirm intended.
    user_df = pd.merge(user_df, day_cnt, how="inner", left_index=True, right_index=True)
    print(f"\rWriting File {file}...", end="")
    user_df.to_parquet(join(temp_csv, file.split(".")[0]+".parquet"))
    gc.collect()

  0%|          | 0/253 [00:00<?, ?it/s]
  0%|          | 1/253 [00:42<3:00:22, 42.94s/it]
  1%|          | 2/253 [01:23<2:53:02, 41.36s/it]
  1%|          | 3/253 [02:01<2:45:37, 39.75s/it]
  2%|▏         | 4/253 [02:38<2:41:13, 38.85s/it]
  2%|▏         | 5/253 [03:15<2:38:03, 38.24s/it]
  2%|▏         | 6/253 [03:51<2:34:02, 37.42s/it]
  3%|▎         | 7/253 [04:29<2:33:33, 37.45s/it]
  3%|▎         | 8/253 [05:06<2:32:24, 37.32s/it]
  4%|▎         | 9/253 [05:42<2:30:21, 36.97s/it]
  4%|▍         | 10/253 [06:19<2:30:35, 37.18s/it]
  4%|▍         | 11/253 [06:55<2:28:12, 36.75s/it]
  5%|▍         | 12/253 [07:30<2:24:55, 36.08s/it]
  5%|▌         | 13/253 [08:04<2:22:44, 35.69s/it]
  6%|▌         | 14/253 [08:39<2:21:08, 35.43s/it]
  6%|▌         | 15/253 [09:14<2:19:42, 35.22s/it]
  6%|▋         | 16/253 [09:50<2:19:27, 35.31s/it]
  7%|▋         | 17/253 [10:24<2:18:18, 35.16s/it]
  7%|▋         | 18/253 [10:59<2:17:17, 35.05s/it]
  8%|▊         | 19/253 [11:34<2:16:20, 34.96s/it]


In [5]:
%%time

# Fold all per-file user tables into one table indexed by user id:
#   * count columns (prefix "n_") are summed across files,
#   * the remaining (profile) columns keep the value from the first file in
#     which the user was seen.

# Count columns are stored as uint16; sums are capped at this value instead
# of silently wrapping around.
COUNT_MAX = np.iinfo(np.uint16).max

big_user_df = None
# Sorted so that "first file in which the user was seen" is reproducible.
for file in tqdm.tqdm(sorted(os.listdir(temp_csv))):
    print()

    if ".parquet" not in file:
        continue

    user_df = pd.read_parquet(join(temp_csv, file))
    user_df = user_df.astype({c: np.uint16 for c in user_df.columns if c.startswith("n_")})

    print(f"\rReading temp file {file}...", end="")
    if big_user_df is None:
        big_user_df = user_df
        continue

    # A count column (e.g. a particular day) may exist in only one of the two
    # frames; treat it as zero in the other so nothing is lost or mis-keyed.
    for c in user_df.columns:
        if c.startswith("n_") and c not in big_user_df.columns:
            big_user_df[c] = np.uint16(0)
    for c in big_user_df.columns:
        if c.startswith("n_") and c not in user_df.columns:
            user_df[c] = np.uint16(0)
    user_df = user_df[big_user_df.columns]
    count_cols = [c for c in big_user_df.columns if c.startswith("n_")]

    print(f"\r Collecting overlaps of {file}...", end="")
    shared = big_user_df.index.intersection(user_df.index)
    from_left = big_user_df.loc[big_user_df.index.difference(user_df.index)]
    from_right = user_df.loc[user_df.index.difference(big_user_df.index)]
    # Users present in both: profile columns come from the accumulated table.
    both_extracted = big_user_df.loc[shared].copy()
    incoming = user_df.loc[shared, count_cols]

    print(f"\r Accumulating Features {file}...", end="")
    for col in count_cols:
        # Add in a wider type, cap, then narrow back to uint16.
        total = both_extracted[col].astype(np.uint32) + incoming[col].astype(np.uint32)
        both_extracted[col] = total.clip(upper=COUNT_MAX).astype(np.uint16)

    print(f"\r Final Concat with sort {file}...", end="")
    big_user_df = pd.concat([from_left, from_right, both_extracted]).sort_index()
    gc.collect()

if big_user_df is None:
    raise FileNotFoundError(f"No .parquet files found in {temp_csv!r}")
big_user_df.to_parquet("user_index.parquet")

  0%|          | 0/253 [00:00<?, ?it/s]
Reading temp file part-00000.parquet...
  1%|          | 2/253 [00:09<20:31,  4.91s/it]
  1%|          | 3/253 [00:15<23:07,  5.55s/it]
  2%|▏         | 4/253 [00:22<25:36,  6.17s/it]
  2%|▏         | 5/253 [00:30<27:54,  6.75s/it]
  2%|▏         | 6/253 [00:38<29:47,  7.24s/it]
  3%|▎         | 7/253 [00:47<31:36,  7.71s/it]
  3%|▎         | 8/253 [00:56<33:10,  8.12s/it]
  4%|▎         | 9/253 [01:06<35:24,  8.71s/it]
  4%|▍         | 10/253 [01:16<37:22,  9.23s/it]
  4%|▍         | 11/253 [01:27<39:12,  9.72s/it]
  5%|▍         | 12/253 [01:38<40:49, 10.17s/it]
  5%|▌         | 13/253 [01:51<43:08, 10.78s/it]
  6%|▌         | 14/253 [02:03<44:17, 11.12s/it]
  6%|▌         | 15/253 [02:14<44:50, 11.31s/it]
  6%|▋         | 16/253 [02:27<45:55, 11.63s/it]
  7%|▋         | 17/253 [02:39<47:09, 11.99s/it]
  7%|▋         | 18/253 [02:52<47:31, 12.14s/it]
  8%|▊         | 19/253 [03:06<49:13, 12.62s/it]
  8%|▊         | 20/253 [03:19<49:45, 12.81s/i

In [6]:
# Show the accumulated per-user table (rendered truncated by pandas).
big_user_df

Unnamed: 0_level_0,follower_count,following_count,verified,account_creation,n_present_a,n_present_b,n_reply_b,n_retweet_b,n_retweet_comment_b,n_like_b,...,n_day_16,n_day_17,n_day_18,n_day_19,n_day_20,n_day_21,n_day_22,n_day_23,n_day_24,n_day_25
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
135818039135,1976,1816,False,1349313262,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
172576553826,12,23,False,1602322817,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
682710903775,487,477,False,1417309686,3,4,0,0,0,4,...,0,0,0,0,0,0,0,1,0,0
2049088614082,532,717,False,1403031460,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2307705279631,1,65,False,1598286192,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18446740944060768855,55,33,False,1604570426,3,4,2,2,0,0,...,0,0,0,0,0,3,0,0,1,0
18446741747024410398,0,165,False,1604089009,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
18446742836979068699,1164,1673,False,1417526046,9,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
18446743027311136627,27,46,False,1596738343,1,7,4,0,0,3,...,0,0,0,0,1,0,0,0,0,0


In [7]:
# Read the saved user index back from disk to check that it round-trips.
df = pd.read_parquet("user_index.parquet")

In [8]:
# Show the frame that was read back from "user_index.parquet".
df

Unnamed: 0_level_0,follower_count,following_count,verified,account_creation,n_present_a,n_present_b,n_reply_b,n_retweet_b,n_retweet_comment_b,n_like_b,...,n_day_16,n_day_17,n_day_18,n_day_19,n_day_20,n_day_21,n_day_22,n_day_23,n_day_24,n_day_25
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
135818039135,1976,1816,False,1349313262,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
172576553826,12,23,False,1602322817,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
682710903775,487,477,False,1417309686,3,4,0,0,0,4,...,0,0,0,0,0,0,0,1,0,0
2049088614082,532,717,False,1403031460,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2307705279631,1,65,False,1598286192,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18446740944060768855,55,33,False,1604570426,3,4,2,2,0,0,...,0,0,0,0,0,3,0,0,1,0
18446741747024410398,0,165,False,1604089009,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
18446742836979068699,1164,1673,False,1417526046,9,1,0,1,0,0,...,0,0,0,1,1,0,0,0,0,0
18446743027311136627,27,46,False,1596738343,1,7,4,0,0,3,...,0,0,0,0,1,0,0,0,0,0
