In [1]:
import os
import pandas as pd
import numpy as np
import gc

from os.path import join
from datetime import datetime
import tqdm

In [2]:
# Names of the non-label columns, in the order they are handed to the CSV reader:
# tweet-level fields, the "a_" user fields, the "b_" user fields, then the follow flag.
all_features = [
    "bert_base_multilingual_cased_tokens", "hashtags", "tweet_id", "medias", "links", "domains",
    "type", "language", "timestamp",
    "a_user_id", "a_follower_count", "a_following_count", "a_is_verified", "a_account_creation",
    "b_user_id", "b_follower_count", "b_following_count", "b_is_verified", "b_account_creation",
    "a_follows_b",
]

# Label columns; they follow the feature columns in every row.
all_labels = ["reply", "retweet", "retweet_comment", "like"]

all_columns = all_features + all_labels

# dtype for every column: flags are bool, counts / creation times / timestamps /
# labels are uint32, everything else is kept as a string.
dtypes_of_features = {
    column: (
        bool
        if column.endswith("_is_verified") or column == "a_follows_b"
        else np.uint32
        if column == "timestamp"
        or column in all_labels
        or column.endswith(("_count", "_account_creation"))
        else str
    )
    for column in all_columns
}

# Subset of columns needed to build per-user statistics: every "a_"/"b_" user
# attribute (but not the follow flag), the four labels and the tweet timestamp.
user_centric_cols = (
    [name for name in all_features if name[:2] in ("a_", "b_") and name != "a_follows_b"]
    + all_labels
    + ["timestamp"]
)

# Directory scanned for the raw .csv/.tsv files.
training_csv = "downloaded_data"
# Directory that receives one per-user parquet file per raw file.
temp_csv = "temp_user_mappings"

In [18]:
%%time

md = 2**64


for file in tqdm.tqdm(os.listdir(training_csv)):
    print()
    if ".csv" not in file and ".tsv" not in file:
        continue
    print(f"\rReading CSV {file}...", end="")
    df = pd.read_csv(join(training_csv, file), sep='\x01', header=None, names=all_columns, 
        dtype={k: v for k, v in dtypes_of_features.items() if k in all_features}, usecols=user_centric_cols)
    
    print(f"\rCreating User Maps for {file}...", end="")
    df["a_user_id"] = df["a_user_id"].apply(lambda x: int(x, base=16)%md).astype(np.uint64)
    df["b_user_id"] = df["b_user_id"].apply(lambda x: int(x, base=16)%md).astype(np.uint64)

    user_dfs = []
    cols = ["user_id", "follower_count", "following_count", "verified", "account_creation", "timestamp", "action_type"]


    df_a = df[["a_user_id", "a_follower_count", "a_following_count", "a_is_verified", "a_account_creation","timestamp"]].copy()
    df_a.loc[:,"action_type"] = 0
    df_a.columns = cols
    df_a["day"] = df_a["timestamp"].apply(lambda x: datetime.fromtimestamp(x).day).astype(np.uint8)
    user_dfs.append(df_a)

    df_b = df[["b_user_id", "b_follower_count", "b_following_count", "b_is_verified", "b_account_creation", "timestamp"]].copy()
    df_b.loc[:,"action_type"] = 1
    df_b.columns = cols
    user_dfs.append(df_b)

    for idx, col in enumerate(['reply',"retweet","retweet_comment","like"]):
        #userb_encode
        temp_df = df[["b_user_id", "b_follower_count", "b_following_count", "b_is_verified", "b_account_creation", col]].copy()
        temp_df = temp_df.dropna(subset=[col])
        temp_df.loc[:,"action_type"] = idx + 2
        temp_df.columns = cols
        temp_df["day"] = temp_df["timestamp"].apply(lambda x: datetime.fromtimestamp(x).day).astype(np.uint8)
        user_dfs.append(temp_df)
        #usera_encode
        temp_df = df[["a_user_id", "a_follower_count", "a_following_count", "a_is_verified", "a_account_creation", col]].copy()
        temp_df = temp_df.dropna(subset=[col])
        temp_df.loc[:,"action_type"] = idx + 6
        temp_df.columns = cols
        user_dfs.append(temp_df)


    user_df = pd.concat(user_dfs)

    gb = user_df.groupby("user_id")
    gb_cnt = user_df.groupby(["user_id", "action_type"])
    gb_day_cnt = user_df.groupby(["user_id", "day"])

    print(f"\rExtracting Features for {file}...", end="")


    res = gb.agg({
        'follower_count': "first", 
        'following_count':'first', 
        'verified':'first', 
        'account_creation': "first"
        })

    print(f"\rExtracting Counts for {file}...", end="")

    cnt_res = gb_cnt.size().unstack(fill_value=0)
    day_cnt = gb_day_cnt.size().unstack(fill_value=0)
    cnt_res.columns =  ["n_present_a","n_present_b","n_reply_b","n_retweet_b","n_retweet_comment_b","n_like_b","n_reply_a","n_retweet_a","n_retweet_comment_a","n_like_a"]
    day_cnt.columns = ["n_day_"+str(int(a)) for a in day_cnt.columns]

    print(f"\rMerging {file}...", end="")

    user_df = pd.merge(res, cnt_res, how='inner', left_index=True, right_index=True)
    user_df = pd.merge(user_df, day_cnt, how="inner", left_index=True, right_index=True)
    print(f"\rWriting File {file}...", end="")
    user_df.to_parquet(join(temp_csv, file.split(".")[0]+".parquet"))
    gc.collect()

100%|██████████| 6/6 [03:44<00:00, 37.35s/it]CPU times: user 3min 1s, sys: 7.42 s, total: 3min 9s
Wall time: 3min 44s



In [13]:
%%time

big_user_df = None
for file in tqdm.tqdm(os.listdir(temp_csv)):
    print()

    if ".parquet" not in file:
        continue

    df = pd.read_parquet(join(temp_csv,file))
    for col in df.columns:
        if col.startswith("n_"):
            df[col] = df[col].astype(np.uint16)

    print(f"\rReading temp file {file}...", end="")
    if big_user_df is None:
        big_user_df = df
        continue

    user_df = df
    user_df.columns = ["next_"+a for a in user_df.columns]

    print(f"\r Collecting overlaps of {file}...", end="")
    from_left = big_user_df.loc[big_user_df.index.difference(user_df.index)]
    from_right = user_df.loc[user_df.index.difference(big_user_df.index)]
    both_extracted = pd.merge(big_user_df, user_df, how="inner", left_index=True, right_index=True)

    print(f"\r Accumulating Features {file}...", end="")
    for col in both_extracted.columns:
        if col.startswith("n_"):
            both_extracted[col] = both_extracted[col]  +  both_extracted["next_"+col] 


    from_right.columns = [a[5:] for a in from_right.columns]
    both_extracted = both_extracted[from_left.columns]

    print(f"\r Final Concat with sort {file}...", end="")
    big_user_df = pd.concat([from_left, from_right, both_extracted])
    big_user_df = big_user_df.sort_index()
    gc.collect()
big_user_df.to_parquet("user_index.parquet")

  0%|          | 0/6 [00:00<?, ?it/s]
Reading temp file part-00000.parquet...
 33%|███▎      | 2/6 [00:08<00:17,  4.40s/it]
 50%|█████     | 3/6 [00:14<00:16,  5.35s/it]
 67%|██████▋   | 4/6 [00:21<00:12,  6.01s/it]
 83%|████████▎ | 5/6 [00:29<00:06,  6.64s/it]
100%|██████████| 6/6 [00:37<00:00,  6.32s/it]
CPU times: user 37.8 s, sys: 13.7 s, total: 51.4 s
Wall time: 43.8 s


In [14]:
# Display the accumulated per-user frame (pandas truncates the rendered rows and columns).
big_user_df

Unnamed: 0_level_0,follower_count,following_count,verified,account_creation,n_present_a,n_present_b,n_reply_b,n_retweet_b,n_retweet_comment_b,n_like_b,...,n_day_16,n_day_17,n_day_18,n_day_19,n_day_20,n_day_21,n_day_22,n_day_23,n_day_24,n_day_25
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5095999849877,134,194,False,1326631751,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
5606560318682,1255,119,False,1466269220,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5676759707981,34,389,False,1457276045,0,1,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
7580329235296,257,271,False,1580153934,1,2,0,0,0,1,...,0,0,1,0,0,0,0,0,0,0
14004575296401,2654,16,False,1528433836,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18446740438450378374,43,37,False,1569514047,0,1,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
18446740470616118922,9771,151,False,1541038995,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18446740827010683372,419,519,False,1471697010,0,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
18446740944060768855,55,33,False,1604570426,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [11]:
# Sanity check: user ids that occur both in the accumulated index and in `user_df`.
# NOTE(review): `user_df` is whatever frame a previously executed loop cell left
# behind, so the result of this cell depends on execution order.
big_user_df.index.intersection(user_df.index)

UInt64Index([      14005750574332,       14209190929472,       15004493922570,
                   32242587922418,       34830347432300,       37940756972326,
                   40246125534723,       43825937760880,       44255161551108,
                   44365871515907,
             ...
             18446665961312770570, 18446678100392651362, 18446685301021175419,
             18446690783397560162, 18446699437962483015, 18446725783297247363,
             18446728088360070848, 18446735550274496652, 18446736382489131107,
             18446743388464047409],
            dtype='uint64', name='user_id', length=2537083)