In [1]:
import os
import pandas as pd
import numpy as np
import gc

from os.path import join
from datetime import datetime

In [2]:
# Names given to the feature columns of the raw files, in positional order.
all_features = [
    "bert_base_multilingual_cased_tokens", "hashtags", "tweet_id",
    "medias", "links", "domains", "type", "language", "timestamp",
    "a_user_id", "a_follower_count", "a_following_count",
    "a_is_verified", "a_account_creation",
    "b_user_id", "b_follower_count", "b_following_count",
    "b_is_verified", "b_account_creation",
    "a_follows_b",
]

# Names given to the label columns, which come after the features.
all_labels = ["reply", "retweet", "retweet_comment", "like"]

all_columns = all_features + all_labels

# Column name -> dtype, keyed in the same order as `all_columns`.
# Every column that is neither in the uint32 group nor in the bool group
# is mapped to `str`.
_uint32_columns = {
    "timestamp",
    "a_follower_count", "a_following_count", "a_account_creation",
    "b_follower_count", "b_following_count", "b_account_creation",
    "reply", "retweet", "retweet_comment", "like",
}
_bool_columns = {"a_is_verified", "b_is_verified", "a_follows_b"}

dtypes_of_features = {}
for _name in all_columns:
    if _name in _uint32_columns:
        dtypes_of_features[_name] = np.uint32
    elif _name in _bool_columns:
        dtypes_of_features[_name] = bool
    else:
        dtypes_of_features[_name] = str

# Columns needed to build the per-user tables: the profile fields of both
# users (a first, then b), followed by the labels and the tweet timestamp.
_profile_fields = ("user_id", "follower_count", "following_count",
                   "is_verified", "account_creation")
user_centric_cols = [
    f"{_side}_{_field}" for _side in ("a", "b") for _field in _profile_fields
]
user_centric_cols.extend(all_labels)
user_centric_cols.append("timestamp")

# Directory holding the raw input files.
training_csv = "training_files"
# Directory receiving the intermediate per-file user tables.
temp_csv = "temp_user_mappings"

In [5]:
%%time

md = 2**64  # user ids are reduced modulo this value so they fit in np.uint64

# Number of input files to process before stopping; None processes all of them.
# This cell used to end its loop body with an unconditional `break`, so it only
# ever handled a single file. 1 keeps that behaviour but makes it explicit.
max_input_files = 1

# Column layout shared by every per-user row, whichever side it came from.
user_table_cols = ["user_id", "follower_count", "following_count", "verified",
                   "account_creation", "timestamp", "action_type"]

# Names of the per-user counters, indexed by action_type code:
# 0/1 = user appears as a/b, 2..5 = b has a value in the matching label column.
action_count_names = ["n_present_a", "n_present_b", "n_reply", "n_retweet",
                      "n_retweet_comment", "n_like"]


def hex_ids_to_uint64(ids):
    """Parse a Series of hexadecimal id strings and reduce each modulo `md`.

    Returns a Series of dtype np.uint64 aligned with `ids`.
    """
    return ids.apply(lambda x: int(x, base=16) % md).astype(np.uint64)


def extract_user_rows(df, side, time_col, action_type):
    """Build the per-user rows for one side of `df` and one action type.

    Parameters
    ----------
    df : DataFrame holding the `<side>_*` profile columns and `time_col`.
    side : "a" or "b", selects which user's profile columns are taken.
    time_col : column whose value becomes "timestamp"; rows where it is
        missing are dropped.
    action_type : integer code stored in the "action_type" column.

    Returns
    -------
    DataFrame with the columns of `user_table_cols` plus "day", the day of
    the month of "timestamp".
    """
    source_cols = [f"{side}_user_id", f"{side}_follower_count",
                   f"{side}_following_count", f"{side}_is_verified",
                   f"{side}_account_creation", time_col]
    rows = df[source_cols].dropna(subset=[time_col]).copy()
    rows["action_type"] = action_type
    rows.columns = user_table_cols
    # Vectorised and pinned to UTC. The previous per-row
    # datetime.fromtimestamp() call used the machine's local timezone, so the
    # day buckets depended on where the notebook was run.
    seconds = rows["timestamp"].astype(np.int64)
    rows["day"] = pd.to_datetime(seconds, unit="s", utc=True).dt.day.astype(np.uint8)
    return rows


# to_parquet does not create missing directories.
os.makedirs(temp_csv, exist_ok=True)

n_processed = 0
# Sorted so that the file(s) picked under `max_input_files` are deterministic.
for file in sorted(os.listdir(training_csv)):
    if ".csv" not in file and ".tsv" not in file:
        continue
    print(f"\rReading CSV {file}...", end="")
    # Label columns are left out of the dtype map on purpose: they are read
    # with pandas' inferred dtype so that missing values stay representable.
    df = pd.read_csv(join(training_csv, file), sep='\x01', header=None, names=all_columns,
        dtype={k: v for k, v in dtypes_of_features.items() if k in all_features}, usecols=user_centric_cols)

    print(f"\rCreating User Maps for {file}...", end="")
    df["a_user_id"] = hex_ids_to_uint64(df["a_user_id"])
    df["b_user_id"] = hex_ids_to_uint64(df["b_user_id"])

    # One row per appearance of a user as a (0) or b (1), stamped with the
    # tweet timestamp ...
    user_dfs = [
        extract_user_rows(df, "a", "timestamp", 0),
        extract_user_rows(df, "b", "timestamp", 1),
    ]
    # ... and one row per non-missing label value of user b (2..5), stamped
    # with the value of that label column.
    for idx, col in enumerate(['reply', "retweet", "retweet_comment", "like"]):
        user_dfs.append(extract_user_rows(df, "b", col, idx + 2))

    user_df = pd.concat(user_dfs)

    print(f"\rExtracting Features for {file}...", end="")

    # Profile fields: keep the first value seen for each user.
    res = user_df.groupby("user_id").agg({
        'follower_count': "first",
        'following_count': 'first',
        'verified': 'first',
        'account_creation': "first"
        })

    print(f"\rExtracting Counts for {file}...", end="")

    cnt_res = user_df.groupby(["user_id", "action_type"]).size().unstack(fill_value=0)
    # Guarantee all six action columns exist even when a file has no rows for
    # some action type; otherwise the rename below fails on a length mismatch.
    cnt_res = cnt_res.reindex(columns=range(len(action_count_names)), fill_value=0)
    cnt_res.columns = action_count_names

    day_cnt = user_df.groupby(["user_id", "day"]).size().unstack(fill_value=0)
    day_cnt.columns = ["n_day_"+str(a) for a in day_cnt.columns]

    print(f"\rMerging {file}...", end="")

    user_df = pd.merge(res, cnt_res, how='inner', left_index=True, right_index=True)
    user_df = pd.merge(user_df, day_cnt, how="inner", left_index=True, right_index=True)
    print(f"\rWriting File {file}...", end="")
    user_df.to_parquet(join(temp_csv, file.split(".")[0]+".parquet"))
    gc.collect()

    n_processed += 1
    if max_input_files is not None and n_processed >= max_input_files:
        break

Merging part-00000.csv...Wall time: 43.6 s


In [3]:
%%time

# Number of per-file tables to fold into the first one before stopping; None
# folds in all of them. This cell used to end its loop body with an
# unconditional `break`, i.e. it stopped after one merge. 1 keeps that
# behaviour but makes it explicit.
max_merges = 1


def merge_user_tables(acc, nxt):
    """Fold one per-file user table into the accumulated user table.

    Both frames are indexed by user id. Columns whose name starts with "n_"
    are counters and are summed per user; a counter missing from one frame
    counts as 0. All other columns are profile fields: for a user present in
    both frames the value from `acc` is kept.

    Returns a new frame sorted by index; neither input is modified.
    """
    # The "n_" prefix (not just "n") keeps the test specific to the counters.
    # The previous `startswith("n")` check also matched the "next_*" columns
    # created by the merge and raised KeyError on "next_next_*".
    static_cols = [c for c in acc.columns if not c.startswith("n_")]
    count_cols = [c for c in acc.columns if c.startswith("n_")]
    count_cols += [c for c in nxt.columns
                   if c.startswith("n_") and c not in count_cols]

    # Profile fields: stack both tables, keep the first row per user.
    static = pd.concat([acc[static_cols], nxt[static_cols]])
    static = static[~static.index.duplicated(keep="first")]

    # Counters: align both tables on the union of counter columns (e.g. an
    # n_day_* column seen in only one file), treat gaps as 0, sum per user.
    counts = (
        pd.concat([acc.reindex(columns=count_cols),
                   nxt.reindex(columns=count_cols)])
        .fillna(0)
        .groupby(level=0)
        .sum()
        .astype(np.int64)
    )

    return static.join(counts).sort_index()


big_user_df = None
n_merged = 0
# Sorted so that the tables picked under `max_merges` are deterministic.
for file in sorted(os.listdir(temp_csv)):
    if ".parquet" not in file:
        continue

    print(f"\rReading temp file {file}...", end="")
    user_df = pd.read_parquet(join(temp_csv, file))
    if big_user_df is None:
        # The first table seeds the accumulator.
        big_user_df = user_df
        continue

    print(f"\r Accumulating Features {file}...", end="")
    big_user_df = merge_user_tables(big_user_df, user_df)
    gc.collect()

    n_merged += 1
    if max_merges is not None and n_merged >= max_merges:
        break
#big_user_df.to_csv("user_index.csv")

 Collecting overlaps of part-00080.parquet...
KeyboardInterrupt



In [6]:
# Show the column labels of `user_df`, as left by whichever cell assigned it last.
user_df.columns

Index(['follower_count', 'following_count', 'verified', 'account_creation',
       'n_present_a', 'n_present_b', 'n_reply', 'n_retweet',
       'n_retweet_comment', 'n_like', 'n_day_4', 'n_day_5', 'n_day_6',
       'n_day_7', 'n_day_8', 'n_day_9', 'n_day_10', 'n_day_11', 'n_day_12',
       'n_day_13', 'n_day_14', 'n_day_15', 'n_day_16', 'n_day_17', 'n_day_18',
       'n_day_19', 'n_day_20', 'n_day_21', 'n_day_22', 'n_day_23', 'n_day_24',
       'n_day_25'],
      dtype='object')