In [17]:
import pandas as pd
import numpy as np
from transformers import BertModel, BertConfig, PreTrainedTokenizer, BertTokenizer
from datetime import datetime
import os
import gc

pd.options.display.expand_frame_repr = True
tok = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

In [29]:
all_features = ["bert_base_multilingual_cased_tokens",
                "hashtags",
                "tweet_id",
                "medias",
                "links",
                "domains",
                "type",
                "language",
                "timestamp",
                "a_user_id",
                "a_follower_count",
                "a_following_count",
                "a_is_verified",
                "a_account_creation",
                "b_user_id",
                "b_follower_count",
                "b_following_count",
                "b_is_verified",
                "b_account_creation",
                "a_follows_b"] #as far as I know from the forum (b always follows a in this dataset according to the forum)

all_labels = ["reply",
              "retweet",
              "retweet_comment",
              "like"]

all_columns = all_features + all_labels

In [34]:
%%time

df = pd.read_csv(os.path.join("data","part-00000.tsv"), sep='\x01', header=None, names=all_columns)
gc.collect()
print(df.dtypes)
df.head(10)

bert_base_multilingual_cased_tokens     object
hashtags                                object
tweet_id                                object
medias                                  object
links                                   object
domains                                 object
type                                    object
language                                object
timestamp                                int64
a_user_id                               object
a_follower_count                         int64
a_following_count                        int64
a_is_verified                             bool
a_account_creation                       int64
b_user_id                               object
b_follower_count                         int64
b_following_count                        int64
b_is_verified                             bool
b_account_creation                       int64
a_follows_b                               bool
reply                                  float64
retweet      

Unnamed: 0,bert_base_multilingual_cased_tokens,hashtags,tweet_id,medias,links,domains,type,language,timestamp,a_user_id,...,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,a_follows_b,reply,retweet,retweet_comment,like
0,101\t56898\t137\t14657\t11462\t11460\t66730\t1...,,D6621E1038904DA83CBBA1DE9F4FFA7A,,,,Retweet,B0FA488F2911701DD8EC5B1EA5E322D8,1613492182,80711DFD42F98EBC140C98081AB45CE4,...,D6E115617E251242E4F52331088CB095,59,78,False,1555621137,True,,,,
1,101\t56898\t137\t52544\t10147\t12396\t11233\t1...,,1F0C624B6B3455AA8C14A7C4EF6B342E,Photo,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1613259606,0DD55916F81777368210F2560772F7EB,...,5E045FAFEAF68D25A88EB7B1206204C4,189,262,False,1592608530,True,,,,1613262000.0
2,101\t56991\t216\t216\t19318\t11301\t14120\t131...,,F944E479EBDEEECBFBA03F47D8B5B79A,Photo\tPhoto,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612860156,69473ED3A7A25C8736BE4238BE5D94E4,...,8E521197048F755F77F443CB0BA68561,467,296,False,1417481116,True,,,,
3,101\t56898\t137\t18087\t11205\t11090\t11010\t1...,,0F98BD50C159E189E7F6F8203227FC36,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1613601599,16DA9077158CD9C50EC570402BA41BCB,...,0074BD9F61EB60EA4442B960FE75160B,422,626,False,1245594023,True,,1613602000.0,,
4,101\t56898\t137\t12882\t31604\t10291\t89525\t1...,,3C1A2B662FBA0436DCBCCD488B08E2D4,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1612458567,C499B39D013A6466A83E46C73A26162E,...,7741F23566F50AAFD70C0AA3A39B32CD,1901,888,False,1404223230,True,,,,
5,101\t56898\t137\t189\t10237\t73099\t10627\t131...,,03FF0F1C5A96792542723EFF028112D3,Photo\tPhoto,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1612903265,C99224CDB27B8557B848F5C8C8EF085B,...,1BB8A3C18DA42B85DC88AB6D8DD6CA32,367,208,False,1485276466,True,,,,1612905000.0
6,101\t56898\t137\t85849\t11274\t54781\t84846\t4...,,0CA2709A523CBB7D669A11689B915EB1,Photo,,,Retweet,313ECD3A1E5BB07406E4249475C2D6D6,1614150437,2776742A9AC6DCD218C9EAFFDA2BD3D5,...,C0F833C0FC7EB90723AE1819D84283A2,160,156,False,1585449675,True,,,,1614176000.0
7,101\t56898\t137\t97801\t26960\t18369\t131\t112...,,0F715EB73B87B625B538208433C1A518,,,,Retweet,B8B04128918BBF54E2E178BFF1ABA833,1614035001,57BEF3F2526333A21AD46FC8D35DF276,...,772668C750B34CBAD5A5A5A50669DFAA,2548,3619,False,1553168686,True,,1614035000.0,,1614035000.0
8,101\t100\t100\t216\t216\t1901\t56287\t16558\t1...,,533F0566ACC1897FD4D66E5E00F0598A,Photo,D583E03AD7F9E478C61C5F8D4C6E142D,B878DFE1B736AFD9F6777AF38F8DE61D,TopLevel,E7F038DE3EAD397AEC9193686C911677,1613814835,A3CF5031ABA0C8D3055918865B5670C3,...,56ACD194C47AC4DC1F27082C65EEC259,388,471,False,1295352783,True,,,,
9,101\t56898\t137\t10117\t11565\t35819\t11183\t1...,D6F1E57D13C227F4D3019B6D2D4F0993\t3FD24C610015...,09CEBA79430D9B9FE22C9FBEF8D7416F,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1613599591,B10C4DE37CCC2C64DF5F33E80C4AC808,...,51E7EFF8FEBC258B5F45A59C36F7423F,1131,1573,False,1315763646,False,,,,


In [35]:
%%time

df["timestamp"] = df["timestamp"].astype(np.uint32)
df["a_follower_count"] = df["a_follower_count"].astype(np.uint32)
df["a_following_count"] = df["a_following_count"].astype(np.uint32)
df["a_account_creation"] = df["a_account_creation"].astype(np.uint32)
df["b_follower_count"] = df["b_follower_count"].astype(np.uint32)
df["b_following_count"] = df["b_following_count"].astype(np.uint32)
df["b_account_creation"] = df["b_account_creation"].astype(np.uint32)


df['reply']   = df['reply'].fillna(0)
df['retweet'] = df['retweet'].fillna(0)
df['retweet_comment'] = df['retweet_comment'].fillna(0)
df['like']    = df['like'].fillna(0)

df['reply']   = df['reply'].astype(np.uint32)
df['retweet'] = df['retweet'].astype(np.uint32)
df['retweet_comment'] = df['retweet_comment'].astype(np.uint32)
df['like']    = df['like'].astype(np.uint32)

gc.collect()
df.head(10)

Wall time: 2.81 s


Unnamed: 0,bert_base_multilingual_cased_tokens,hashtags,tweet_id,medias,links,domains,type,language,timestamp,a_user_id,...,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,a_follows_b,reply,retweet,retweet_comment,like
0,101\t56898\t137\t14657\t11462\t11460\t66730\t1...,,D6621E1038904DA83CBBA1DE9F4FFA7A,,,,Retweet,B0FA488F2911701DD8EC5B1EA5E322D8,1613492182,80711DFD42F98EBC140C98081AB45CE4,...,D6E115617E251242E4F52331088CB095,59,78,False,1555621137,True,0,0,0,0
1,101\t56898\t137\t52544\t10147\t12396\t11233\t1...,,1F0C624B6B3455AA8C14A7C4EF6B342E,Photo,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1613259606,0DD55916F81777368210F2560772F7EB,...,5E045FAFEAF68D25A88EB7B1206204C4,189,262,False,1592608530,True,0,0,0,1613261933
2,101\t56991\t216\t216\t19318\t11301\t14120\t131...,,F944E479EBDEEECBFBA03F47D8B5B79A,Photo\tPhoto,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1612860156,69473ED3A7A25C8736BE4238BE5D94E4,...,8E521197048F755F77F443CB0BA68561,467,296,False,1417481116,True,0,0,0,0
3,101\t56898\t137\t18087\t11205\t11090\t11010\t1...,,0F98BD50C159E189E7F6F8203227FC36,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1613601599,16DA9077158CD9C50EC570402BA41BCB,...,0074BD9F61EB60EA4442B960FE75160B,422,626,False,1245594023,True,0,1613601757,0,0
4,101\t56898\t137\t12882\t31604\t10291\t89525\t1...,,3C1A2B662FBA0436DCBCCD488B08E2D4,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1612458567,C499B39D013A6466A83E46C73A26162E,...,7741F23566F50AAFD70C0AA3A39B32CD,1901,888,False,1404223230,True,0,0,0,0
5,101\t56898\t137\t189\t10237\t73099\t10627\t131...,,03FF0F1C5A96792542723EFF028112D3,Photo\tPhoto,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1612903265,C99224CDB27B8557B848F5C8C8EF085B,...,1BB8A3C18DA42B85DC88AB6D8DD6CA32,367,208,False,1485276466,True,0,0,0,1612904516
6,101\t56898\t137\t85849\t11274\t54781\t84846\t4...,,0CA2709A523CBB7D669A11689B915EB1,Photo,,,Retweet,313ECD3A1E5BB07406E4249475C2D6D6,1614150437,2776742A9AC6DCD218C9EAFFDA2BD3D5,...,C0F833C0FC7EB90723AE1819D84283A2,160,156,False,1585449675,True,0,0,0,1614175795
7,101\t56898\t137\t97801\t26960\t18369\t131\t112...,,0F715EB73B87B625B538208433C1A518,,,,Retweet,B8B04128918BBF54E2E178BFF1ABA833,1614035001,57BEF3F2526333A21AD46FC8D35DF276,...,772668C750B34CBAD5A5A5A50669DFAA,2548,3619,False,1553168686,True,0,1614035221,0,1614035221
8,101\t100\t100\t216\t216\t1901\t56287\t16558\t1...,,533F0566ACC1897FD4D66E5E00F0598A,Photo,D583E03AD7F9E478C61C5F8D4C6E142D,B878DFE1B736AFD9F6777AF38F8DE61D,TopLevel,E7F038DE3EAD397AEC9193686C911677,1613814835,A3CF5031ABA0C8D3055918865B5670C3,...,56ACD194C47AC4DC1F27082C65EEC259,388,471,False,1295352783,True,0,0,0,0
9,101\t56898\t137\t10117\t11565\t35819\t11183\t1...,D6F1E57D13C227F4D3019B6D2D4F0993\t3FD24C610015...,09CEBA79430D9B9FE22C9FBEF8D7416F,,,,Retweet,488B32D24BD4BB44172EB981C1BCA6FA,1613599591,B10C4DE37CCC2C64DF5F33E80C4AC808,...,51E7EFF8FEBC258B5F45A59C36F7423F,1131,1573,False,1315763646,False,0,0,0,0


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3021422 entries, 0 to 3021421
Data columns (total 24 columns):
 #   Column                               Dtype 
---  ------                               ----- 
 0   bert_base_multilingual_cased_tokens  object
 1   hashtags                             object
 2   tweet_id                             object
 3   medias                               object
 4   links                                object
 5   domains                              object
 6   type                                 object
 7   language                             object
 8   timestamp                            uint32
 9   a_user_id                            object
 10  a_follower_count                     uint32
 11  a_following_count                    uint32
 12  a_is_verified                        bool  
 13  a_account_creation                   uint32
 14  b_user_id                            object
 15  b_follower_count                     uint32
 16  

## Here is assumtion that there are no more combinations in the other datasets

In [37]:
%%time

df['medias'] = df['medias'].fillna("")
df['medias'] = df['medias'].apply(lambda x:  '_'.join(x.split('\t')[:3]))
df['medias'] = pd.factorize( df['medias'], sort=True)[0]
df['medias'] = df['medias'].astype(np.uint8)

df['language'] = pd.factorize( df['language'], sort=True)[0]
df['language'] = df['language'].astype(np.uint8)

df['type'] = pd.factorize( df['type'], sort=True)[0]
df['type'] = df['type'].astype(np.uint8)

Wall time: 2.63 s


In [38]:
df.head(10)

Unnamed: 0,bert_base_multilingual_cased_tokens,hashtags,tweet_id,medias,links,domains,type,language,timestamp,a_user_id,...,b_user_id,b_follower_count,b_following_count,b_is_verified,b_account_creation,a_follows_b,reply,retweet,retweet_comment,like
0,101\t56898\t137\t14657\t11462\t11460\t66730\t1...,,D6621E1038904DA83CBBA1DE9F4FFA7A,0,,,1,45,1613492182,80711DFD42F98EBC140C98081AB45CE4,...,D6E115617E251242E4F52331088CB095,59,78,False,1555621137,True,0,0,0,0
1,101\t56898\t137\t52544\t10147\t12396\t11233\t1...,,1F0C624B6B3455AA8C14A7C4EF6B342E,5,,,1,19,1613259606,0DD55916F81777368210F2560772F7EB,...,5E045FAFEAF68D25A88EB7B1206204C4,189,262,False,1592608530,True,0,0,0,1613261933
2,101\t56991\t216\t216\t19318\t11301\t14120\t131...,,F944E479EBDEEECBFBA03F47D8B5B79A,6,,,2,19,1612860156,69473ED3A7A25C8736BE4238BE5D94E4,...,8E521197048F755F77F443CB0BA68561,467,296,False,1417481116,True,0,0,0,0
3,101\t56898\t137\t18087\t11205\t11090\t11010\t1...,,0F98BD50C159E189E7F6F8203227FC36,0,,,1,19,1613601599,16DA9077158CD9C50EC570402BA41BCB,...,0074BD9F61EB60EA4442B960FE75160B,422,626,False,1245594023,True,0,1613601757,0,0
4,101\t56898\t137\t12882\t31604\t10291\t89525\t1...,,3C1A2B662FBA0436DCBCCD488B08E2D4,0,,,1,19,1612458567,C499B39D013A6466A83E46C73A26162E,...,7741F23566F50AAFD70C0AA3A39B32CD,1901,888,False,1404223230,True,0,0,0,0
5,101\t56898\t137\t189\t10237\t73099\t10627\t131...,,03FF0F1C5A96792542723EFF028112D3,6,,,1,19,1612903265,C99224CDB27B8557B848F5C8C8EF085B,...,1BB8A3C18DA42B85DC88AB6D8DD6CA32,367,208,False,1485276466,True,0,0,0,1612904516
6,101\t56898\t137\t85849\t11274\t54781\t84846\t4...,,0CA2709A523CBB7D669A11689B915EB1,5,,,1,10,1614150437,2776742A9AC6DCD218C9EAFFDA2BD3D5,...,C0F833C0FC7EB90723AE1819D84283A2,160,156,False,1585449675,True,0,0,0,1614175795
7,101\t56898\t137\t97801\t26960\t18369\t131\t112...,,0F715EB73B87B625B538208433C1A518,0,,,1,47,1614035001,57BEF3F2526333A21AD46FC8D35DF276,...,772668C750B34CBAD5A5A5A50669DFAA,2548,3619,False,1553168686,True,0,1614035221,0,1614035221
8,101\t100\t100\t216\t216\t1901\t56287\t16558\t1...,,533F0566ACC1897FD4D66E5E00F0598A,5,D583E03AD7F9E478C61C5F8D4C6E142D,B878DFE1B736AFD9F6777AF38F8DE61D,2,60,1613814835,A3CF5031ABA0C8D3055918865B5670C3,...,56ACD194C47AC4DC1F27082C65EEC259,388,471,False,1295352783,True,0,0,0,0
9,101\t56898\t137\t10117\t11565\t35819\t11183\t1...,D6F1E57D13C227F4D3019B6D2D4F0993\t3FD24C610015...,09CEBA79430D9B9FE22C9FBEF8D7416F,0,,,1,19,1613599591,B10C4DE37CCC2C64DF5F33E80C4AC808,...,51E7EFF8FEBC258B5F45A59C36F7423F,1131,1573,False,1315763646,False,0,0,0,0


In [5]:
print(tok.convert_tokens_to_string(tok.convert_ids_to_tokens(df.iloc[df["text_tokens"].map(len).argmax(),0].split("\t"))), df["language"][df["text_tokens"].map(len).argmax()])
print(tok.convert_tokens_to_string(tok.convert_ids_to_tokens(df.iloc[129,0].split("\t"))), df["language"][129])
for i in range(0, 10):
    print("{0}: {1}; l={2}".format(i, tok.convert_tokens_to_string(tok.convert_ids_to_tokens(df.iloc[i,0].split("\t"))), df["language"][i]))

[CLS] Dad bods & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; & gt ; abs & amp ; v line [SEP] 488B32D24BD4BB44172EB981C1BCA6FA
[CLS] RT @ jennyprioli : LUMENA SEM MAKE ¶ ¶ 

'\t' (tab) is the split symbol in the data. Dataset has no nan entries.

The engaging user is the one that posts the tweet and the engaged user is the one who reacts or does not react. (at least how I understand it)

In [6]:
df.isna().sum() 

text_tokens                           0
hashtags                              0
tweet_id                              0
present_media                         0
present_links                         0
present_domains                       0
tweet_type                            0
language                              0
tweet_timestamp                       0
engaged_with_user_id                  0
engaged_with_user_follower_count      0
engaged_with_user_following_count     0
engaged_with_user_is_verified         0
engaged_with_user_account_creation    0
engaging_user_id                      0
engaging_user_follower_count          0
engaging_user_following_count         0
engaging_user_is_verified             0
engaging_user_account_creation        0
engagee_follows_engager               0
reply_timestamp                       0
retweet_timestamp                     0
retweet_with_comment_timestamp        0
like_timestamp                        0
dtype: int64

In [87]:
prettyprintrow(df.iloc[127])

[CLS] [UNK] @ wise shipmentressedPA _ 7 : ứ [UNK] [UNK] [UNK]az [UNK] competitors LED „ LED strategies LED Flat [UNK] 184 1816 LED Hanna LED competitors [UNK] ‡ [UNK] liberty LEDaz [UNK] Théâtre realised LED ‒ LEDaz LED Santo [UNK]rra [UNK] Cairo [UNK] 184 Théâtre ‡ [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] [UNK] accord [UNK]rra LED ợ strategies LEDrra [UNK] Rivera ῦ LEDaz [UNK] Théâtre Rivera ‒ LEDaz LED Legislature [UNK] competitors 184 „ LEDnian LEDaz 184 Théâtre strategies [UNK] Rivera ỳ LEDnian [UNK]az [UNK] [UNK] [SEP]
2021-02-19 22:49:52
0F1C8CDCF6ED710DC85FD57F7D70E426


In [80]:
def prettyprintrow(row):
    print(tok.convert_tokens_to_string(tok.convert_ids_to_tokens(row["text_tokens"].split("\t"))))
    print(datetime.fromtimestamp(int(row["tweet_timestamp"])))
    print(row["engaged_with_user_id"])

In [79]:
df.loc[df['enaging_user_id'] == some_value]

text_tokens                           0
hashtags                              0
tweet_id                              0
present_media                         0
present_links                         0
present_domains                       0
tweet_type                            0
language                              0
tweet_timestamp                       0
engaged_with_user_id                  0
engaged_with_user_follower_count      0
engaged_with_user_following_count     0
engaged_with_user_is_verified         0
engaged_with_user_account_creation    0
enaging_user_id                       0
enaging_user_follower_count           0
enaging_user_following_count          0
enaging_user_is_verified              0
enaging_user_account_creation         0
engagee_follows_engager               0
reply_timestamp                       0
retweet_timestamp                     0
retweet_with_comment_timestamp        0
like_timestamp                        0
dtype: int64

In [11]:
print(datetime.fromtimestamp(int(min(df["tweet_timestamp"]))))
print(datetime.fromtimestamp(int(max(df["tweet_timestamp"]))))

2021-02-04 01:00:00
2021-02-25 00:59:59
