In [8]:
import pandas as pd
import numpy as np
from transformers import BertModel, BertConfig, PreTrainedTokenizer, BertTokenizer
from datetime import datetime
import os
from os.path import join
#   import gc       # see no reason to manually invoke garbage collector
from pathlib import Path
from tqdm.notebook import tqdm
import itertools
import dask
import dask.dataframe as dd
from dask.array import from_array as fa
import ast

pd.options.display.expand_frame_repr = True
tok = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

basepath = "."

# Set Data Locations

In [5]:
compressed_dir = "compressed_data"
uncompressed_dir = "uncompressed_data"

# Extract Data Pipline

### Rename Compressed Files to make them uncompress-able using zlop

In [6]:
%%time
files = os.listdir(compressed_dir)
for file in files:
    if "@" in file:
        old_file = os.path.join(compressed_dir, file)
        new_file = os.path.join(compressed_dir, file.split('@')[0])
        os.rename(old_file, new_file)

CPU times: user 648 µs, sys: 0 ns, total: 648 µs
Wall time: 412 µs


### Uncompress lzo files and delete compressed files

In [10]:
%%time
Path(uncompressed_dir).mkdir(parents=True, exist_ok=True)

for file in tqdm(os.listdir(compressed_dir)):
    if ".index" not in file:
        old_file = os.path.join(compressed_dir, file)
        #THIS DELETES THE COMPRESSED FILES!!!(careful)
        !lzop -U -p$uncompressed_dir -d $old_file
        
for file in os.listdir(uncompressed_dir):
    if ".tsv" not in file:
        os.rename(os.path.join(uncompressed_dir, file), os.path.join(uncompressed_dir, f"{file}.tsv"))

  0%|          | 0/3 [00:00<?, ?it/s]

CPU times: user 631 ms, sys: 171 ms, total: 802 ms
Wall time: 20.5 s


# Reduce Data Size Step 1 Pipeline

### Define Columns

In [11]:
all_features = ["bert_base_multilingual_cased_tokens",
                "hashtags",
                "tweet_id",
                "medias",
                "links",
                "domains",
                "type",
                "language",
                "timestamp",
                "a_user_id",
                "a_follower_count",
                "a_following_count",
                "a_is_verified",
                "a_account_creation",
                "b_user_id",
                "b_follower_count",
                "b_following_count",
                "b_is_verified",
                "b_account_creation",
                "a_follows_b"] #as far as I know from the forum (b always follows a in this dataset according to the forum)

all_labels = ["reply",
              "retweet",
              "retweet_comment",
              "like"]

all_columns = all_features + all_labels

### Helper Function for backing up sets (since otherwise I could not recover them)

In [12]:
preprocessing_backup = join(basepath,"preprocessing_backup")
Path(preprocessing_backup).mkdir(parents=True, exist_ok=True)

In [13]:
reduced_1_data = "reduced_1_data"
Path(reduced_1_data).mkdir(parents=True, exist_ok=True)

media_types = set()
language_types = set()
type_types = set()

users = set()
links = set()
hashtags = set()
domains = set()

highest_id = 0

#media_types, language_types, type_types, users, links, hashtags, domains = load_backup_sets()

In [19]:
from dask.distributed import wait
from dask.distributed import Client, progress
import subprocess

# this way each client has 2GB / 5 clients - maximum of 10 gb, it will notify you if the clients run dry on memory, there are also opportunities to cache things
# but quite frankly I havent figured the cache out even on the local machine.
n_cores = 8
client = Client(memory_limit="2GB", n_workers=n_cores//2, threads_per_worker=2, processes=True)
client


Perhaps you already have a cluster running?
Hosting the HTTP server on port 34815 instead


0,1
Client  Scheduler: tcp://127.0.0.1:43451  Dashboard: http://127.0.0.1:34815/status,Cluster  Workers: 4  Cores: 8  Memory: 7.45 GiB


In [20]:
%%time
from os import listdir
from os.path import isfile, join
unpacked_files = [join(uncompressed_dir, f) for f in listdir(uncompressed_dir) if isfile(join(uncompressed_dir, f))]

df = dd.read_csv(unpacked_files, sep='\x01', header=None, names=all_columns, blocksize="128MB")
print(f"Data is split into {df.npartitions} partitions.")

Data is split into 36 partitions.
CPU times: user 16.9 ms, sys: 0 ns, total: 16.9 ms
Wall time: 17.9 ms


In [21]:
%%time
a = df['bert_base_multilingual_cased_tokens'].apply(len)
future = a.persist()
progress(future)
c = future.compute()
c

CPU times: user 1.2 s, sys: 357 ms, total: 1.56 s
Wall time: 15.6 s


0          90
1         575
2         164
3         124
4         137
         ... 
165408     95
165409     41
165410     83
165411    309
165412    411
Name: bert_base_multilingual_cased_tokens, Length: 9033210, dtype: int64

In [11]:
ddf = dd.read_parquet("reduced_2_data/part-00000.parquet/part.0.parquet")
bert_ddf = ddf[["bert_base_multilingual_cased_tokens", "tweet_id"]]

In [13]:
bert_ddf.to_parquet("reduced_2_data/part-00000-bert.parquet")

In [10]:
df = pd.read_parquet("reduced_1_data/nobert/part.1.parquet")
df
#df.drop("bert_base_multilingual_cased_tokens", axis="columns", inplace=True)
#df.to_csv("reduced_2_data/part.0.nobert.csv")
#df.to_parquet("reduced_2_data/part.0.nobert.parquet")

Unnamed: 0_level_0,hashtags,tweet_id,medias,links,domains,type,language,timestamp,a_user_id,a_follower_count,...,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,a_follows_b,reply,retweet,retweet_comment,like
__null_dask_index__,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,[],200001,[],[],[],Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1612437373,B568C5D3550C163F0B68E00697FC0B02,34914,...,5E7D64F8D600D235BBC19BD1170B2004,17,40,False,1582132790,False,0,0,0,1612510899
1,[],200002,[],[],[],TopLevel,7F4FAB1EB12CD95EDCD9DB2A6634EFCE,1612787476,D825FBE47DD50ED4AAD2C40B2BF313AA,7481763,...,2F671518B76A34756F38B3F11CA83EAA,120,129,False,1582658246,False,0,0,0,0
2,[67BF485A82F99AC4A3736E8AC2BB04A1],200003,[],[],[],Retweet,B8B04128918BBF54E2E178BFF1ABA833,1613768762,0CE3555615F92EA4C6089A537F09FE42,1082,...,6C459413729F0F9067501F29A092F040,343,223,False,1435319013,False,0,1613769754,0,0
3,[],200004,[],[],[],TopLevel,E7F038DE3EAD397AEC9193686C911677,1613773165,09209394D7DE5D2C33F5E6928073DED2,1538,...,1A2DC720AB62E285D4DEA362C1470F83,62,104,False,1407726700,True,0,0,0,0
4,"[34262C6327B347C949F30919D57BC11B, CF7B68CE933...",200005,[],"[88B2E60A9DDAB405908F7C95AEAE9187, FFE452F3A5E...","[F595B7DE8992A3D8C7948B4E81419D78, 145F12B39F0...",TopLevel,E7F038DE3EAD397AEC9193686C911677,1613290655,6954E916AA6EC7FA5A980CE15905E3BA,5484,...,8D35D02F41E4D9DBF21CB8F4EDE1F010,259,603,False,1514292948,False,0,1613290819,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129943,[],329944,[],[E7983A01A5C6F58F1549E00FA3276C22],[716B873CFEF929AEE7B4CC042529DB85],TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612503046,1EE693DC547F0156D7FCA40AC3188464,17543760,...,997D3AEB4DF0E4BC4A6F4163EB03D669,607,723,False,1241802581,False,0,0,0,1612531386
129944,[],329945,[],[],[],TopLevel,E7F038DE3EAD397AEC9193686C911677,1613092409,A8E10ABE3D5863908A7A06662D6D889A,1268,...,F93DC93EFB13C7C4FB476684904AF44B,478,252,False,1485342566,True,0,0,0,1613092939
129945,[],329946,[],[],[],Retweet,E7F038DE3EAD397AEC9193686C911677,1612520264,93B6E5AB3B82C6C1744BF877C85B7CCC,3775,...,C98BD4DDC44EEE7705A408CA3A0716AC,327,376,False,1512357511,False,0,0,0,0
129946,"[A7A5787E47304786559B666CCE2BA6C5, 77A38A94E20...",329947,[Photo],[],[],TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1613403865,ADEB9E0EDF1A701A594D003CCC0B67AA,4062,...,17670A9B2BC62BC754F41CE736D7D70A,727,663,False,1379447030,True,0,0,0,1613404982


In [14]:
dd.read_parquet("reduced_2_data/part-00000.parquet/part.0.parquet").head()to_parquet

Unnamed: 0,bert_base_multilingual_cased_tokens,hashtags,tweet_id,medias,links,domains,type,language,timestamp,a_user_id,...,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,a_follows_b,reply,retweet,retweet_comment,like
0,101\t56898\t137\t14657\t11462\t11460\t66730\t1...,[0],0,0,[0],,1,0,1613492182,2745122,...,902515,59,78,False,1555621137,True,0,0,0,0
1,101\t56898\t137\t52544\t10147\t12396\t11233\t1...,[0],1,5,[0],,1,7,1613259606,1310558,...,2113572,189,262,False,1592608530,True,0,0,0,1613261933
2,101\t56991\t216\t216\t19318\t11301\t14120\t131...,[0],2,4,[0],,0,7,1612860156,1320508,...,256593,467,296,False,1417481116,True,0,0,0,0
3,101\t56898\t137\t18087\t11205\t11090\t11010\t1...,[0],3,0,[0],,1,7,1613601599,2667630,...,429403,422,626,False,1245594023,True,0,1613601757,0,0
4,101\t56898\t137\t12882\t31604\t10291\t89525\t1...,[0],4,0,[0],,1,7,1612458567,87684,...,2492071,1901,888,False,1404223230,True,0,0,0,0


In [32]:
%%time
df["timestamp"] = df["timestamp"].astype(np.uint32)
df["a_follower_count"] = df["a_follower_count"].astype(np.uint32)
df["a_following_count"] = df["a_following_count"].astype(np.uint32)
df["a_account_creation"] = df["a_account_creation"].astype(np.uint32)
df["b_follower_count"] = df["b_follower_count"].astype(np.uint32)
df["b_following_count"] = df["b_following_count"].astype(np.uint32)
df["b_account_creation"] = df["b_account_creation"].astype(np.uint32)

df, = dask.persist(df)
_ = wait(df)

Wall time: 2.44 s


In [33]:
%%time
    df['reply']   = df['reply'].fillna(0)
    df['retweet'] = df['retweet'].fillna(0)
    df['retweet_comment'] = df['retweet_comment'].fillna(0)
    df['like']    = df['like'].fillna(0)

    df['reply']   = df['reply'].astype(np.uint32)
    df['retweet'] = df['retweet'].astype(np.uint32)
    df['retweet_comment'] = df['retweet_comment'].astype(np.uint32)
    df['like']    = df['like'].astype(np.uint32)
    gc.collect()
    
    df, = dask.persist(df)
    _ = wait(df)

Wall time: 2.39 s


In [36]:
%%time
df["tweet_id"] = 1
df["tweet_id"] = df["tweet_id"].astype(np.uint32)
df["tweet_id"] = df["tweet_id"].cumsum()
df["tweet_id"] = df["tweet_id"] + highest_id - 1
highest_id+=len(df)

df, = dask.persist(df)
_ = wait(df)

Wall time: 2.32 s


In [37]:
%%time
    #Collect Statistics from this Dataframe
    df['medias'] = df['medias'].fillna("")
    df['medias'] = df['medias'].apply(lambda x:  '_'.join(x.split('\t')[:3]))
    gc.collect()

    media_types.update(list(set(df['medias'])))
    language_types.update(list(set(df['language'])))
    type_types.update(list(set(df["type"])))
    gc.collect()
    
    users.update(list(set(df["a_user_id"])))
    users.update(list(set(df["b_user_id"])))
    gc.collect()
    
    df['hashtags'] = df['hashtags'].fillna("")
    df['hashtags'] = df['hashtags'].apply(lambda x: list(x.split("\t")))
    hashtags.update(list(itertools.chain.from_iterable(df['hashtags'].tolist())))
    gc.collect()

    df['links'] = df['links'].fillna("")
    df['links'] = df['links'].apply(lambda x: list(x.split("\t")))
    links.update(list(itertools.chain.from_iterable(df["links"].tolist())))
    gc.collect()
    
    df['domains'] = df['domains'].fillna("")
    df['domains'] = df['domains'].apply(lambda x: list(x.split("\t")))
    domains.update(list(itertools.chain.from_iterable(df["domains"].tolist())))
    gc.collect()
    
    df, = dask.persist(df)
    _ = wait(df)

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('medias', 'object'))

You did not provide metadata, so Dask is running your function on a small dataset to guess output types. It is possible that Dask will guess incorrectly.
To provide an explicit output types or to silence this message, please provide the `meta=` keyword, as described in the map or apply function that you are using.
  Before: .apply(func)
  After:  .apply(func, meta=('hashtags', 'object'))



AttributeError: 'Series' object has no attribute 'tolist'

In [5]:
def partition_indexing(df, partition_info=None):
    df["tweet_id"] = 1
    df["tweet_id"] = df["tweet_id"].cumsum()
    df["tweet_id"] = df["tweet_id"] + partition_info["number"] * 200000
    return df["tweet_id"]


In [None]:
%%time
ddf = dd.read_csv(join("test", "part-*.tsv"), sep='\x01', header=None, names=all_columns, blocksize="64MB")
gc.collect()

ddf["tweet_id"] = ddf[["tweet_id"]].map_partitions(partition_indexing, meta=pd.Series(dtype=np.uint32))
ddf["tweet_id"] = ddf["tweet_id"].astype(np.uint32)

#bert_ddf = ddf[["bert_base_multilingual_cased_tokens", "tweet_id"]]
#bert_ddf.to_parquet("reduced_1_data/bert")
ddf = ddf.drop("bert_base_multilingual_cased_tokens", axis="columns")


#Remove empties
ddf['reply']   = ddf['reply'].fillna(0)
ddf['retweet'] = ddf['retweet'].fillna(0)
ddf['retweet_comment'] = ddf['retweet_comment'].fillna(0)
ddf['like']    = ddf['like'].fillna(0)

#Change dtypes
ddf["timestamp"] = ddf["timestamp"].astype(np.uint32)
ddf["a_follower_count"] = ddf["a_follower_count"].astype(np.uint32)
ddf["a_following_count"] = ddf["a_following_count"].astype(np.uint32)
ddf["a_account_creation"] = ddf["a_account_creation"].astype(np.uint32)
ddf["b_follower_count"] = ddf["b_follower_count"].astype(np.uint32)
ddf["b_following_count"] = ddf["b_following_count"].astype(np.uint32)
ddf["b_account_creation"] = ddf["b_account_creation"].astype(np.uint32)
ddf['reply'] = ddf['reply'].astype(np.uint32)
ddf['retweet'] = ddf['retweet'].astype(np.uint32)
ddf['retweet_comment'] = ddf['retweet_comment'].astype(np.uint32)
ddf['like'] = ddf['like'].astype(np.uint32)

ddf['medias'] = ddf['medias'].fillna("")
ddf['medias'] = ddf['medias'].map_partitions(lambda x: list(x.str.split("\t")), meta=list)

ddf['hashtags'] = ddf['hashtags'].fillna("")
ddf['hashtags'] = ddf['hashtags'].map_partitions(lambda x: list(x.str.split("\t")), meta=list)

ddf['links'] = ddf['links'].fillna("")
ddf['links'] = ddf['links'].map_partitions(lambda x: list(x.str.split("\t")), meta=list)

ddf['domains'] = ddf['domains'].fillna("")
ddf['domains'] = ddf['domains'].map_partitions(lambda x: list(x.str.split("\t")), meta=list)

lists_task = ddf.map_partitions(lambda x: (list(set([a for b in x.medias.tolist() for a in b]))), meta=list)
media_set = set(itertools.chain.from_iterable(lists_task.compute()))
media_types_mapping = dict((o, idx) for idx, o in enumerate(media_set))
del media_set
gc.collect()
ddf['medias'] = ddf["medias"].map_partitions(lambda x: [media_types_mapping[item] for item in x], meta=list)


ddf.to_parquet("reduced_1_data/nobert")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tweet_id"] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tweet_id"] = df["tweet_id"].cumsum()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tweet_id"] = df["tweet_id"] + partition_info["number"] * 200000


In [17]:
big_list = list(itertools.chain.from_iterable(lists))

4

In [10]:
##OLD !!!!
%%time
for file in tqdm(os.listdir("test")):
    #Read in Data from one of the files
    df = dd.read_csv(os.path.join(uncompressed_dir, file), sep='\x01', header=None, names=all_columns)
    gc.collect()
    
    #Choose Simpler Types
    df["timestamp"] = df["timestamp"].astype(np.uint32)
    df["a_follower_count"] = df["a_follower_count"].astype(np.uint32)
    df["a_following_count"] = df["a_following_count"].astype(np.uint32)
    df["a_account_creation"] = df["a_account_creation"].astype(np.uint32)
    df["b_follower_count"] = df["b_follower_count"].astype(np.uint32)
    df["b_following_count"] = df["b_following_count"].astype(np.uint32)
    df["b_account_creation"] = df["b_account_creation"].astype(np.uint32)
    
    df['reply']   = df['reply'].fillna(0)
    df['retweet'] = df['retweet'].fillna(0)
    df['retweet_comment'] = df['retweet_comment'].fillna(0)
    df['like']    = df['like'].fillna(0)

    df['reply']   = df['reply'].astype(np.uint32)
    df['retweet'] = df['retweet'].astype(np.uint32)
    df['retweet_comment'] = df['retweet_comment'].astype(np.uint32)
    df['like']    = df['like'].astype(np.uint32)
    gc.collect()

    #Here I simply replace the tweet id by a (unique) number in the this dataset
    df["tweet_id"] = np.arange(start=highest_id, stop=highest_id + len(df))
    df["tweet_id"] = df["tweet_id"].astype(np.uint32)
    highest_id+=len(df)
    gc.collect()
    
    #Collect Statistics from this Dataframe
    df['medias'] = df['medias'].fillna("")
    df['medias'] = df['medias'].apply(lambda x:  '_'.join(x.split('\t')[:3]))
    gc.collect()

    media_types.update(list(set(df['medias'])))
    language_types.update(list(set(df['language'])))
    type_types.update(list(set(df["type"])))
    gc.collect()
    
    users.update(list(set(df["a_user_id"])))
    users.update(list(set(df["b_user_id"])))
    gc.collect()
    
    df['hashtags'] = df['hashtags'].fillna("")
    df['hashtags'] = df['hashtags'].apply(lambda x: list(x.split("\t")))
    hashtags.update(list(itertools.chain.from_iterable(df['hashtags'].tolist())))
    gc.collect()

    df['links'] = df['links'].fillna("")
    df['links'] = df['links'].apply(lambda x: list(x.split("\t")))
    links.update(list(itertools.chain.from_iterable(df["links"].tolist())))
    gc.collect()
    
    df['domains'] = df['domains'].fillna("")
    df['domains'] = df['domains'].apply(lambda x: list(x.split("\t")))
    domains.update(list(itertools.chain.from_iterable(df["domains"].tolist())))
    gc.collect()

    backup_sets(media_types, language_types, type_types, users, links, hashtags, domains)
    #Save Dataframe to filesysystem and delete tsv
    df.to_parquet(os.path.join(reduced_1_data, "{0}.parquet".format(file.split(".")[0])))
    #os.remove(os.path.join(uncompressed_dir, file)) #CAREFUL, first check if everything works fine
    gc.collect()
    break

  0%|          | 0/139 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Reduce Data Size Step 2 Pipeline

### Create Mappings for Sets to replace string ids by numeric ids

In [41]:
#media_types, language_types, type_types, users, links, hashtags, domains = load_backup_sets()

In [18]:
media_types_mapping = dict((o, idx) for idx, o in enumerate(media_types))
language_types_mapping = dict((o, idx) for idx, o in enumerate(language_types))
type_types_mapping = dict((o, idx) for idx, o in enumerate(type_types))
users_mapping = dict((o, idx) for idx, o in enumerate(users))
links_mapping = dict((o, idx) for idx, o in enumerate(links))
hashtags_mapping = dict((o, idx) for idx, o in enumerate(hashtags))
domains_mapping = dict((o, idx) for idx, o in enumerate(domains))

In [42]:
%%time

reduced_2_data = "reduced_2_data"
Path(reduced_2_data).mkdir(parents=True, exist_ok=True)

for file in tqdm(os.listdir(reduced_1_data)):
    df = dd.read_parquet(os.path.join(reduced_1_data, file))
    #df = pd.read_parquet()
    gc.collect()
    print("mapping medias, languages and tweet types")
    df["medias"] = df["medias"].map(media_types_mapping)
    df["medias"] = df["medias"].astype(np.uint8)
    df["language"] = df["language"].map(language_types_mapping)
    df["language"] = df["language"].astype(np.uint8)
    df["type"] = df["type"].map(type_types_mapping)
    df["type"] = df["type"].astype(np.uint8)
    gc.collect()
    
    #these will be problematic for test or validation sets!!! Also might take a long time!!
    print("mapping a users")
    df["a_user_id"] = df["a_user_id"].map(users_mapping)
    df["a_user_id"] = df["a_user_id"].astype(np.uint32)
    gc.collect()
    print("mapping b users")
    df["b_user_id"] = df["b_user_id"].map(users_mapping)
    df["b_user_id"] = df["b_user_id"].astype(np.uint32)
    gc.collect()
    
    print("mapping links")
    df["links"] = df["links"].apply(lambda x: [links_mapping[item] for item in x], meta=list)
    gc.collect()
    print("mapping hashtags")
    df["hashtags"] = df["hashtags"].apply(lambda x: [hashtags_mapping[item] for item in x], meta=list)
    gc.collect()
    print("mapping domains")
    df["domains"] = df["domains"].apply(lambda x: [domains_mapping[item] for item in x], meta=list)
    gc.collect()
    
    print("persisting")
    df = dask.persist(df)[0]
    df.to_parquet(os.path.join(reduced_2_data, file))
    gc.collect()
    break

  0%|          | 0/1 [00:00<?, ?it/s]

mapping medias, languages and tweet types
mapping a users
mapping b users
mapping links
mapping hashtags
Wall time: 2min 2s


In [46]:
df["domains"].head(10)

0                                None
1                                None
2                                None
3                                None
4                                None
5                                None
6                                None
7                                None
8    B878DFE1B736AFD9F6777AF38F8DE61D
9                                None
Name: domains, dtype: object

# Preprocess Data (Simple)

In [29]:
all_features = ["bert_base_multilingual_cased_tokens",
                "hashtags",
                "tweet_id",
                "medias",
                "links",
                "domains",
                "type",
                "language",
                "timestamp",
                "a_user_id",
                "a_follower_count",
                "a_following_count",
                "a_is_verified",
                "a_account_creation",
                "b_user_id",
                "b_follower_count",
                "b_following_count",
                "b_is_verified",
                "b_account_creation",
                "a_follows_b"] #as far as I know from the forum (b always follows a in this dataset according to the forum)

all_labels = ["reply",
              "retweet",
              "retweet_comment",
              "like"]

all_columns = all_features + all_labels

In [34]:
%%time

df = pd.read_csv(os.path.join("data","part-00000.tsv"), sep='\x01', header=None, names=all_columns)
gc.collect()
print(df.dtypes)
df.head(10)

bert_base_multilingual_cased_tokens     object
hashtags                                object
tweet_id                                object
medias                                  object
links                                   object
domains                                 object
type                                    object
language                                object
timestamp                                int64
a_user_id                               object
a_follower_count                         int64
a_following_count                        int64
a_is_verified                             bool
a_account_creation                       int64
b_user_id                               object
b_follower_count                         int64
b_following_count                        int64
b_is_verified                             bool
b_account_creation                       int64
a_follows_b                               bool
reply                                  float64
retweet      

Unnamed: 0,bert_base_multilingual_cased_tokens,hashtags,tweet_id,medias,links,domains,type,language,timestamp,a_user_id,...,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,a_follows_b,reply,retweet,retweet_comment,like
0,101\t56898\t137\t14657\t11462\t11460\t66730\t1...,,D6621E1038904DA83CBBA1DE9F4FFA7A,,,,Retweet,B0FA488F2911701DD8EC5B1EA5E322D8,1613492182,80711DFD42F98EBC140C98081AB45CE4,...,D6E115617E251242E4F52331088CB095,59,78,False,1555621137,True,,,,
1,101\t56898\t137\t52544\t10147\t12396\t11233\t1...,,1F0C624B6B3455AA8C14A7C4EF6B342E,Photo,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1613259606,0DD55916F81777368210F2560772F7EB,...,5E045FAFEAF68D25A88EB7B1206204C4,189,262,False,1592608530,True,,,,1613262000.0
2,101\t56991\t216\t216\t19318\t11301\t14120\t131...,,F944E479EBDEEECBFBA03F47D8B5B79A,Photo\tPhoto,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612860156,69473ED3A7A25C8736BE4238BE5D94E4,...,8E521197048F755F77F443CB0BA68561,467,296,False,1417481116,True,,,,
3,101\t56898\t137\t18087\t11205\t11090\t11010\t1...,,0F98BD50C159E189E7F6F8203227FC36,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1613601599,16DA9077158CD9C50EC570402BA41BCB,...,0074BD9F61EB60EA4442B960FE75160B,422,626,False,1245594023,True,,1613602000.0,,
4,101\t56898\t137\t12882\t31604\t10291\t89525\t1...,,3C1A2B662FBA0436DCBCCD488B08E2D4,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1612458567,C499B39D013A6466A83E46C73A26162E,...,7741F23566F50AAFD70C0AA3A39B32CD,1901,888,False,1404223230,True,,,,
5,101\t56898\t137\t189\t10237\t73099\t10627\t131...,,03FF0F1C5A96792542723EFF028112D3,Photo\tPhoto,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1612903265,C99224CDB27B8557B848F5C8C8EF085B,...,1BB8A3C18DA42B85DC88AB6D8DD6CA32,367,208,False,1485276466,True,,,,1612905000.0
6,101\t56898\t137\t85849\t11274\t54781\t84846\t4...,,0CA2709A523CBB7D669A11689B915EB1,Photo,,,Retweet,313ECD3A1E5BB07406E4249475C2D6D6,1614150437,2776742A9AC6DCD218C9EAFFDA2BD3D5,...,C0F833C0FC7EB90723AE1819D84283A2,160,156,False,1585449675,True,,,,1614176000.0
7,101\t56898\t137\t97801\t26960\t18369\t131\t112...,,0F715EB73B87B625B538208433C1A518,,,,Retweet,B8B04128918BBF54E2E178BFF1ABA833,1614035001,57BEF3F2526333A21AD46FC8D35DF276,...,772668C750B34CBAD5A5A5A50669DFAA,2548,3619,False,1553168686,True,,1614035000.0,,1614035000.0
8,101\t100\t100\t216\t216\t1901\t56287\t16558\t1...,,533F0566ACC1897FD4D66E5E00F0598A,Photo,D583E03AD7F9E478C61C5F8D4C6E142D,B878DFE1B736AFD9F6777AF38F8DE61D,TopLevel,E7F038DE3EAD397AEC9193686C911677,1613814835,A3CF5031ABA0C8D3055918865B5670C3,...,56ACD194C47AC4DC1F27082C65EEC259,388,471,False,1295352783,True,,,,
9,101\t56898\t137\t10117\t11565\t35819\t11183\t1...,D6F1E57D13C227F4D3019B6D2D4F0993\t3FD24C610015...,09CEBA79430D9B9FE22C9FBEF8D7416F,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1613599591,B10C4DE37CCC2C64DF5F33E80C4AC808,...,51E7EFF8FEBC258B5F45A59C36F7423F,1131,1573,False,1315763646,False,,,,


In [35]:
%%time

df["timestamp"] = df["timestamp"].astype(np.uint32)
df["a_follower_count"] = df["a_follower_count"].astype(np.uint32)
df["a_following_count"] = df["a_following_count"].astype(np.uint32)
df["a_account_creation"] = df["a_account_creation"].astype(np.uint32)
df["b_follower_count"] = df["b_follower_count"].astype(np.uint32)
df["b_following_count"] = df["b_following_count"].astype(np.uint32)
df["b_account_creation"] = df["b_account_creation"].astype(np.uint32)


df['reply']   = df['reply'].fillna(0)
df['retweet'] = df['retweet'].fillna(0)
df['retweet_comment'] = df['retweet_comment'].fillna(0)
df['like']    = df['like'].fillna(0)

df['reply']   = df['reply'].astype(np.uint32)
df['retweet'] = df['retweet'].astype(np.uint32)
df['retweet_comment'] = df['retweet_comment'].astype(np.uint32)
df['like']    = df['like'].astype(np.uint32)

gc.collect()
df.head(10)

Wall time: 2.81 s


Unnamed: 0,bert_base_multilingual_cased_tokens,hashtags,tweet_id,medias,links,domains,type,language,timestamp,a_user_id,...,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,a_follows_b,reply,retweet,retweet_comment,like
0,101\t56898\t137\t14657\t11462\t11460\t66730\t1...,,D6621E1038904DA83CBBA1DE9F4FFA7A,,,,Retweet,B0FA488F2911701DD8EC5B1EA5E322D8,1613492182,80711DFD42F98EBC140C98081AB45CE4,...,D6E115617E251242E4F52331088CB095,59,78,False,1555621137,True,0,0,0,0
1,101\t56898\t137\t52544\t10147\t12396\t11233\t1...,,1F0C624B6B3455AA8C14A7C4EF6B342E,Photo,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1613259606,0DD55916F81777368210F2560772F7EB,...,5E045FAFEAF68D25A88EB7B1206204C4,189,262,False,1592608530,True,0,0,0,1613261933
2,101\t56991\t216\t216\t19318\t11301\t14120\t131...,,F944E479EBDEEECBFBA03F47D8B5B79A,Photo\tPhoto,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612860156,69473ED3A7A25C8736BE4238BE5D94E4,...,8E521197048F755F77F443CB0BA68561,467,296,False,1417481116,True,0,0,0,0
3,101\t56898\t137\t18087\t11205\t11090\t11010\t1...,,0F98BD50C159E189E7F6F8203227FC36,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1613601599,16DA9077158CD9C50EC570402BA41BCB,...,0074BD9F61EB60EA4442B960FE75160B,422,626,False,1245594023,True,0,1613601757,0,0
4,101\t56898\t137\t12882\t31604\t10291\t89525\t1...,,3C1A2B662FBA0436DCBCCD488B08E2D4,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1612458567,C499B39D013A6466A83E46C73A26162E,...,7741F23566F50AAFD70C0AA3A39B32CD,1901,888,False,1404223230,True,0,0,0,0
5,101\t56898\t137\t189\t10237\t73099\t10627\t131...,,03FF0F1C5A96792542723EFF028112D3,Photo\tPhoto,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1612903265,C99224CDB27B8557B848F5C8C8EF085B,...,1BB8A3C18DA42B85DC88AB6D8DD6CA32,367,208,False,1485276466,True,0,0,0,1612904516
6,101\t56898\t137\t85849\t11274\t54781\t84846\t4...,,0CA2709A523CBB7D669A11689B915EB1,Photo,,,Retweet,313ECD3A1E5BB07406E4249475C2D6D6,1614150437,2776742A9AC6DCD218C9EAFFDA2BD3D5,...,C0F833C0FC7EB90723AE1819D84283A2,160,156,False,1585449675,True,0,0,0,1614175795
7,101\t56898\t137\t97801\t26960\t18369\t131\t112...,,0F715EB73B87B625B538208433C1A518,,,,Retweet,B8B04128918BBF54E2E178BFF1ABA833,1614035001,57BEF3F2526333A21AD46FC8D35DF276,...,772668C750B34CBAD5A5A5A50669DFAA,2548,3619,False,1553168686,True,0,1614035221,0,1614035221
8,101\t100\t100\t216\t216\t1901\t56287\t16558\t1...,,533F0566ACC1897FD4D66E5E00F0598A,Photo,D583E03AD7F9E478C61C5F8D4C6E142D,B878DFE1B736AFD9F6777AF38F8DE61D,TopLevel,E7F038DE3EAD397AEC9193686C911677,1613814835,A3CF5031ABA0C8D3055918865B5670C3,...,56ACD194C47AC4DC1F27082C65EEC259,388,471,False,1295352783,True,0,0,0,0
9,101\t56898\t137\t10117\t11565\t35819\t11183\t1...,D6F1E57D13C227F4D3019B6D2D4F0993\t3FD24C610015...,09CEBA79430D9B9FE22C9FBEF8D7416F,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1613599591,B10C4DE37CCC2C64DF5F33E80C4AC808,...,51E7EFF8FEBC258B5F45A59C36F7423F,1131,1573,False,1315763646,False,0,0,0,0


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3021422 entries, 0 to 3021421
Data columns (total 24 columns):
 #   Column                               Dtype 
---  ------                               ----- 
 0   bert_base_multilingual_cased_tokens  object
 1   hashtags                             object
 2   tweet_id                             object
 3   medias                               object
 4   links                                object
 5   domains                              object
 6   type                                 object
 7   language                             object
 8   timestamp                            uint32
 9   a_user_id                            object
 10  a_follower_count                     uint32
 11  a_following_count                    uint32
 12  a_is_verified                        bool  
 13  a_account_creation                   uint32
 14  b_user_id                            object
 15  b_follower_count                     uint32
 16  

## Here is assumtion that there are no more combinations in the other datasets

In [37]:
%%time

df['medias'] = df['medias'].fillna("")
df['medias'] = df['medias'].apply(lambda x:  '_'.join(x.split('\t')[:3]))
df['medias'] = pd.factorize( df['medias'], sort=True)[0]
df['medias'] = df['medias'].astype(np.uint8)

df['language'] = pd.factorize( df['language'], sort=True)[0]
df['language'] = df['language'].astype(np.uint8)

df['type'] = pd.factorize( df['type'], sort=True)[0]
df['type'] = df['type'].astype(np.uint8)

Wall time: 2.63 s


In [38]:
df.head(10)

Unnamed: 0,bert_base_multilingual_cased_tokens,hashtags,tweet_id,medias,links,domains,type,language,timestamp,a_user_id,...,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,a_follows_b,reply,retweet,retweet_comment,like
0,101\t56898\t137\t14657\t11462\t11460\t66730\t1...,,D6621E1038904DA83CBBA1DE9F4FFA7A,0,,,1,45,1613492182,80711DFD42F98EBC140C98081AB45CE4,...,D6E115617E251242E4F52331088CB095,59,78,False,1555621137,True,0,0,0,0
1,101\t56898\t137\t52544\t10147\t12396\t11233\t1...,,1F0C624B6B3455AA8C14A7C4EF6B342E,5,,,1,19,1613259606,0DD55916F81777368210F2560772F7EB,...,5E045FAFEAF68D25A88EB7B1206204C4,189,262,False,1592608530,True,0,0,0,1613261933
2,101\t56991\t216\t216\t19318\t11301\t14120\t131...,,F944E479EBDEEECBFBA03F47D8B5B79A,6,,,2,19,1612860156,69473ED3A7A25C8736BE4238BE5D94E4,...,8E521197048F755F77F443CB0BA68561,467,296,False,1417481116,True,0,0,0,0
3,101\t56898\t137\t18087\t11205\t11090\t11010\t1...,,0F98BD50C159E189E7F6F8203227FC36,0,,,1,19,1613601599,16DA9077158CD9C50EC570402BA41BCB,...,0074BD9F61EB60EA4442B960FE75160B,422,626,False,1245594023,True,0,1613601757,0,0
4,101\t56898\t137\t12882\t31604\t10291\t89525\t1...,,3C1A2B662FBA0436DCBCCD488B08E2D4,0,,,1,19,1612458567,C499B39D013A6466A83E46C73A26162E,...,7741F23566F50AAFD70C0AA3A39B32CD,1901,888,False,1404223230,True,0,0,0,0
5,101\t56898\t137\t189\t10237\t73099\t10627\t131...,,03FF0F1C5A96792542723EFF028112D3,6,,,1,19,1612903265,C99224CDB27B8557B848F5C8C8EF085B,...,1BB8A3C18DA42B85DC88AB6D8DD6CA32,367,208,False,1485276466,True,0,0,0,1612904516
6,101\t56898\t137\t85849\t11274\t54781\t84846\t4...,,0CA2709A523CBB7D669A11689B915EB1,5,,,1,10,1614150437,2776742A9AC6DCD218C9EAFFDA2BD3D5,...,C0F833C0FC7EB90723AE1819D84283A2,160,156,False,1585449675,True,0,0,0,1614175795
7,101\t56898\t137\t97801\t26960\t18369\t131\t112...,,0F715EB73B87B625B538208433C1A518,0,,,1,47,1614035001,57BEF3F2526333A21AD46FC8D35DF276,...,772668C750B34CBAD5A5A5A50669DFAA,2548,3619,False,1553168686,True,0,1614035221,0,1614035221
8,101\t100\t100\t216\t216\t1901\t56287\t16558\t1...,,533F0566ACC1897FD4D66E5E00F0598A,5,D583E03AD7F9E478C61C5F8D4C6E142D,B878DFE1B736AFD9F6777AF38F8DE61D,2,60,1613814835,A3CF5031ABA0C8D3055918865B5670C3,...,56ACD194C47AC4DC1F27082C65EEC259,388,471,False,1295352783,True,0,0,0,0
9,101\t56898\t137\t10117\t11565\t35819\t11183\t1...,D6F1E57D13C227F4D3019B6D2D4F0993\t3FD24C610015...,09CEBA79430D9B9FE22C9FBEF8D7416F,0,,,1,19,1613599591,B10C4DE37CCC2C64DF5F33E80C4AC808,...,51E7EFF8FEBC258B5F45A59C36F7423F,1131,1573,False,1315763646,False,0,0,0,0


In [5]:
print(tok.convert_tokens_to_string(tok.convert_ids_to_tokens(df.iloc[df["text_tokens"].map(len).argmax(),0].split("\t"))), df["language"][df["text_tokens"].map(len).argmax()])
print(tok.convert_tokens_to_string(tok.convert_ids_to_tokens(df.iloc[129,0].split("\t"))), df["language"][129])
for i in range(0, 10):
    print("{0}: {1}; l={2}".format(i, tok.convert_tokens_to_string(tok.convert_ids_to_tokens(df.iloc[i,0].split("\t"))), df["language"][i]))

[CLS] Dad bods & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; abs & amp ; v line [SEP] 488B32D24BD4BB44172EB981C1BCA6FA
[CLS] RT @ jennyprioli : LUMENA SEM MAKE ¶ ¶ 

'\t' (tab) is the split symbol in the data. Dataset has no nan entries.

The engaging user is the one that posts the tweet and the engaged user is the one who reacts or does not react. (at least how I understand it)

In [6]:
df.isna().sum() 

text_tokens                           0
hashtags                              0
tweet_id                              0
present_media                         0
present_links                         0
present_domains                       0
tweet_type                            0
language                              0
tweet_timestamp                       0
engaged_with_user_id                  0
engaged_with_user_follower_count      0
engaged_with_user_following_count     0
engaged_with_user_is_verified         0
engaged_with_user_account_creation    0
engaging_user_id                      0
engaging_user_follower_count          0
engaging_user_following_count         0
engaging_user_is_verified             0
engaging_user_account_creation        0
engagee_follows_engager               0
reply_timestamp                       0
retweet_timestamp                     0
retweet_with_comment_timestamp        0
like_timestamp                        0
dtype: int64

In [87]:
prettyprintrow(df.iloc[127])

[CLS] [UNK] @ wise shipmentressedPA _ 7 : ứ [UNK] [UNK] [UNK]az [UNK] competitors LED „ LED strategies LED Flat [UNK] 184 1816 LED Hanna LED competitors [UNK] ‡ [UNK] liberty LEDaz [UNK] Théâtre realised LED ‒ LEDaz LED Santo [UNK]rra [UNK] Cairo [UNK] 184 Théâtre ‡ [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] accord [UNK]rra LED ợ strategies LEDrra [UNK] Rivera ῦ LEDaz [UNK] Théâtre Rivera ‒ LEDaz LED Legislature [UNK] competitors 184 „ LEDnian LEDaz 184 Théâtre strategies [UNK] Rivera ỳ LEDnian [UNK]az [UNK] [UNK] [SEP]
2021-02-19 22:49:52
0F1C8CDCF6ED710DC85FD57F7D70E426


In [80]:
def prettyprintrow(row):
    print(tok.convert_tokens_to_string(tok.convert_ids_to_tokens(row["text_tokens"].split("\t"))))
    print(datetime.fromtimestamp(int(row["tweet_timestamp"])))
    print(row["engaged_with_user_id"])

In [79]:
df.loc[df['enaging_user_id'] == some_value]

text_tokens                           0
hashtags                              0
tweet_id                              0
present_media                         0
present_links                         0
present_domains                       0
tweet_type                            0
language                              0
tweet_timestamp                       0
engaged_with_user_id                  0
engaged_with_user_follower_count      0
engaged_with_user_following_count     0
engaged_with_user_is_verified         0
engaged_with_user_account_creation    0
enaging_user_id                       0
enaging_user_follower_count           0
enaging_user_following_count          0
enaging_user_is_verified              0
enaging_user_account_creation         0
engagee_follows_engager               0
reply_timestamp                       0
retweet_timestamp                     0
retweet_with_comment_timestamp        0
like_timestamp                        0
dtype: int64

In [11]:
print(datetime.fromtimestamp(int(min(df["tweet_timestamp"]))))
print(datetime.fromtimestamp(int(max(df["tweet_timestamp"]))))

2021-02-04 01:00:00
2021-02-25 00:59:59
