In [1]:
import io
import re
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
# English stop words plus 'rt', the retweet marker that survives tweet cleaning.
stop_words = {'rt'}.union(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Semon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import torch
from transformers import AutoTokenizer, AutoModel

In [23]:
# Default to the CPU and switch to the first GPU only when CUDA is usable.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"
device

'cuda:0'

In [24]:
import gc
# Collect unreachable Python objects first, then ask PyTorch to release its cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

In [25]:
# Load the tokenizer and encoder weights of the cointegrated/LaBSE-en-ru checkpoint.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/LaBSE-en-ru")
model = AutoModel.from_pretrained("cointegrated/LaBSE-en-ru")
# The move to `device` is disabled, so the model stays where from_pretrained placed it.
#model = model.to(device)

Some weights of the model checkpoint at cointegrated/LaBSE-en-ru were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
%%time

# Smoke test: embed one English and one Russian sentence and print the normalised vectors.
sentences = ["Hello World " * 50, "Привет Мир " * 50]
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=64, return_tensors='pt')
# Keep the inputs on whichever device currently holds the model.
encoded_input = encoded_input.to(model.device)
with torch.no_grad():
    model_output = model(**encoded_input)

embeddings = torch.nn.functional.normalize(model_output.pooler_output)
# Go through the CPU explicitly: converting a CUDA tensor straight to NumPy raises.
print(embeddings.cpu().numpy())

[[-0.05211287 -0.02777303 -0.0515324  ...  0.05114975 -0.00069914
   0.00683167]
 [-0.01679946 -0.01778223 -0.05142329 ...  0.0425858   0.0050984
   0.01522529]]
CPU times: total: 1.84 s
Wall time: 477 ms


In [35]:
# Date window applied to the tweet datasets below: start_date is inclusive, end_date is exclusive.
start_date = '2019-01-01'
end_date = '2022-01-01'

In [32]:
def text_preprocessor(v: str) -> str:
    """Normalise a raw tweet into lower-case words separated by single spaces.

    Steps, in order: lower-case the text, drop @mentions, drop http(s) links,
    drop HTML entities such as ``&amp;``, collapse every run of non-word
    characters (punctuation, emoji, whitespace) into one space, and strip the
    ends.

    Args:
        v: Raw tweet text.

    Returns:
        The cleaned text; an empty string when nothing but mentions, links,
        entities or non-word characters was present.
    """
    v = v.lower()
    v = re.sub(r'@\S+', '', v)           # mentions
    v = re.sub(r'https?://\S+', '', v)   # links
    v = re.sub(r'&\w+;', '', v)          # HTML entities
    # Raw string on purpose: '\W' inside a plain literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    v = re.sub(r'\W+', ' ', v)
    return v.strip()


def tweet_embedding(
    text: str
) -> np.ndarray:
    """Embed a single tweet as a normalised sentence vector.

    The tweet is cleaned with ``text_preprocessor``, tokenised (truncated to
    64 tokens) and passed through the module-level ``model``. The model's
    ``pooler_output`` is then normalised with
    ``torch.nn.functional.normalize``.

    Args:
        text: Raw tweet text.

    Returns:
        1-D NumPy array holding the normalised pooled embedding.
    """
    text = text_preprocessor(text)
    encoded_input = tokenizer(text, padding=True, truncation=True, max_length=64, return_tensors='pt')
    # Keep the inputs on whatever device the model lives on, so this function
    # keeps working if the model is moved to the GPU.
    encoded_input = encoded_input.to(model.device)
    with torch.no_grad():
        model_output = model(**encoded_input)
    embeddings = torch.nn.functional.normalize(model_output.pooler_output)
    # Go through the CPU explicitly: np.array() on a CUDA tensor raises.
    return embeddings.cpu().numpy().flatten()

## Elon Musk

In [36]:
# Tweets are stored as one CSV per year (2010-2022). Load them all, keep the two
# columns used below, truncate timestamps to whole days, restrict to the analysis
# window and drop exact (text, date) duplicates.
# Everything is chained so no frame is mutated in place after filtering.
elon_df = (
    pd.concat(
        [pd.read_csv(f'data/elon/{year}.csv', index_col='id') for year in range(2010, 2023)]
    )[['tweet', 'date']]
    .rename(columns={'tweet': 'text'})
    .assign(date=lambda df: pd.to_datetime(pd.to_datetime(df['date']).dt.date))
    .query(f"date >= '{start_date}' and date < '{end_date}'")
    .drop_duplicates()
)
elon_df.head()

Unnamed: 0_level_0,text,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1211071324518531072,@Joe__Wakefield @austinbarnard45 @tjq1190 @tyg...,2019-12-28
1211069054779301894,@austinbarnard45 @tjq1190 @tyger_cyber @fawful...,2019-12-28
1211064937004589056,@IrinaAntony @tjq1190 @tyger_cyber @fawfulfan ...,2019-12-28
1211054942192119808,@tjq1190 @tyger_cyber @fawfulfan @_Mikemo He d...,2019-12-28
1211051740562366464,@geofficient Pretty much,2019-12-28


In [37]:
# A tweet counts as "empty" when nothing but stop words (or nothing at all)
# is left after cleaning, e.g. tweets made only of mentions, links or emoji.
empty_tweets = (
    elon_df.text
    .apply(text_preprocessor)
    .apply(lambda v: ' '.join(word for word in v.split() if word not in stop_words))
    # Raw string avoids the invalid '\W' escape; bool() replaces the `not not` idiom.
    .apply(lambda v: bool(re.fullmatch(r'\W*', v)))
)
print(elon_df[empty_tweets].shape)
elon_df[empty_tweets]

(732, 2)


Unnamed: 0_level_0,text,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1210263196457504768,@teslaownersSV @tesla_unplugged @TeslaOwnersof...,2019-12-26
1209141597629243393,@markets 🤣🤣,2019-12-23
1205682780178280448,https://t.co/LxZAmZIcIS,2019-12-14
1205680930779590657,https://t.co/aAEFph9G6D,2019-12-14
1205050955927932928,@SamTalksTesla https://t.co/z6zljrfUmb,2019-12-12
...,...,...
1348017194349694978,@BoredElonMusk 🤣🤣,2021-01-10
1348007662491820033,@erigganewmoney 😢,2021-01-10
1347976356135518211,@Christo49699922 @sean18743005 @Erdayastronaut...,2021-01-09
1347254119237439495,https://t.co/2ja38Z8MRE,2021-01-07


In [38]:
# Keep only tweets with content. The explicit copy makes the result an independent
# frame, so adding columns to it later cannot trigger SettingWithCopyWarning.
elon_df = elon_df[~empty_tweets].copy()
elon_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8643 entries, 1211071324518531072 to 1344810193952014336
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   text    8643 non-null   object        
 1   date    8643 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 202.6+ KB


In [39]:
# Store the cleaned version of every tweet next to the raw text.
elon_df['cleared_text'] = [text_preprocessor(raw_text) for raw_text in elon_df['text']]
elon_df.head()

Unnamed: 0_level_0,text,date,cleared_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1211071324518531072,@Joe__Wakefield @austinbarnard45 @tjq1190 @tyg...,2019-12-28,this is a pretty awful lie i left south africa...
1211069054779301894,@austinbarnard45 @tjq1190 @tyger_cyber @fawful...,2019-12-28,this person blocked me so can t read the tweet...
1211064937004589056,@IrinaAntony @tjq1190 @tyger_cyber @fawfulfan ...,2019-12-28,we started zip2 with 2k from me plus my overcl...
1211054942192119808,@tjq1190 @tyger_cyber @fawfulfan @_Mikemo He d...,2019-12-28,he didn t own an emerald mine i worked my way ...
1211051740562366464,@geofficient Pretty much,2019-12-28,pretty much


In [40]:
# Vocabulary of the cleaned tweets without stop words.
# `stop_words` is already a set, so it is used directly instead of being
# rebuilt with set(...) for every single word.
elon_tweet_words = {
    word
    for raw_text in elon_df.text
    for word in text_preprocessor(raw_text).split()
    if word not in stop_words
}
len(elon_tweet_words)

9651

In [41]:
# Embed every tweet one at a time (tqdm shows progress) and stack the vectors row-wise.
text_embeddings = np.stack([
    tweet_embedding(text)
    for text in tqdm(elon_df.text.values)
])
text_embeddings.shape

100%|██████████████████████████████████████████████████████████████████████████████| 8643/8643 [16:56<00:00,  8.50it/s]


(8643, 768)

In [42]:
# Put the embeddings in a frame that shares the tweet index (columns V1..Vn),
# then attach it to the tweet frame column-wise.
elon_embedding_df = pd.DataFrame(
    text_embeddings,
    index=elon_df.index,
    columns=[f'V{position}' for position in range(1, text_embeddings.shape[1] + 1)],
)
elon_with_enb_df = pd.concat([elon_df, elon_embedding_df], axis=1)
elon_with_enb_df.head()

Unnamed: 0_level_0,text,date,cleared_text,V1,V2,V3,V4,V5,V6,V7,...,V759,V760,V761,V762,V763,V764,V765,V766,V767,V768
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1211071324518531072,@Joe__Wakefield @austinbarnard45 @tjq1190 @tyg...,2019-12-28,this is a pretty awful lie i left south africa...,0.008623,-0.038959,-0.064938,-0.065588,-0.066482,0.041533,-0.046693,...,0.000482,0.031413,-0.0187,0.015388,0.007253,-0.021971,0.013284,-0.064772,-0.061415,0.006583
1211069054779301894,@austinbarnard45 @tjq1190 @tyger_cyber @fawful...,2019-12-28,this person blocked me so can t read the tweet...,-0.021382,0.033233,-0.019205,-0.063174,0.038882,0.045739,-0.050074,...,0.042554,-0.046082,-0.046342,0.020049,-0.061177,0.057064,-0.056571,-0.003063,0.036571,0.006675
1211064937004589056,@IrinaAntony @tjq1190 @tyger_cyber @fawfulfan ...,2019-12-28,we started zip2 with 2k from me plus my overcl...,0.007762,-0.028661,-0.041164,-0.054028,-0.016149,0.00304,-0.039877,...,0.054301,-0.0444,-0.031991,0.021364,-0.007534,-0.032499,-0.03112,0.003252,-0.049795,0.03092
1211054942192119808,@tjq1190 @tyger_cyber @fawfulfan @_Mikemo He d...,2019-12-28,he didn t own an emerald mine i worked my way ...,-0.032627,-0.034146,-0.067866,-0.067204,-0.046921,0.02416,-0.069717,...,0.058897,-0.004664,-0.013007,0.039658,-0.02399,0.011486,0.028584,-0.055376,-0.025286,0.007455
1211051740562366464,@geofficient Pretty much,2019-12-28,pretty much,0.029701,-0.054455,-0.056146,-0.05867,-0.007973,-0.011959,-0.008488,...,0.01132,0.013753,-0.019156,-0.015933,0.002867,0.03474,-0.044341,-0.02383,-0.021823,0.00562


In [43]:
# Sanity check of the combined frame: row count, column count and dtypes.
elon_with_enb_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8643 entries, 1211071324518531072 to 1344810193952014336
Columns: 771 entries, text to V768
dtypes: datetime64[ns](1), float32(768), object(2)
memory usage: 25.6+ MB


In [44]:
# Persist the tweets together with their embeddings; the index is written under the 'id' header.
elon_with_enb_df.to_csv('data/elon_prepared_tweets_transformer.csv', index_label='id')

## Donald Trump

In [45]:
# Load the tweet archive, parse the dates and keep only the analysis window.
trump_df = (
    pd.read_csv('data/trump_tweets.csv', dtype={'id': np.int64}, index_col='id')[['text', 'date']]
    .assign(date=lambda df: pd.to_datetime(df['date']))
    .query(f"date >= '{start_date}' and date < '{end_date}'")
)
trump_df.head()

Unnamed: 0_level_0,text,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1234653427789070336,I was thrilled to be back in the Great city of...,2020-03-03
1218010753434820608,RT @CBS_Herridge: READ: Letter to surveillance...,2020-01-17
1304875170860015616,The Unsolicited Mail In Ballot Scam is a major...,2020-09-12
1218159531554897920,RT @MZHemingway: Very friendly telling of even...,2020-01-17
1217962723234983936,RT @WhiteHouse: President @realDonaldTrump ann...,2020-01-17


In [46]:
# A tweet counts as "empty" when nothing but stop words (or nothing at all)
# is left after cleaning, e.g. tweets made only of links or retweeted links.
trump_empty_tweets = (
    trump_df.text
    .apply(text_preprocessor)
    .apply(lambda v: ' '.join(word for word in v.split() if word not in stop_words))
    # Raw string avoids the invalid '\W' escape; bool() replaces the `not not` idiom.
    .apply(lambda v: bool(re.fullmatch(r'\W*', v)))
)
print(trump_df[trump_empty_tweets].shape)
trump_df[trump_empty_tweets]

(1561, 2)


Unnamed: 0_level_0,text,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1319501865625784320,https://t.co/4qwCKQOiOw,2020-10-23
1319500520126664704,https://t.co/VlEu8yyovv,2020-10-23
1319500501269041152,https://t.co/z5CRqHO8vg,2020-10-23
1319500486370877440,https://t.co/TQCQiDrVOB,2020-10-23
1319496349092511488,RT @EricTrump: https://t.co/NcrNdOSfIV,2020-10-23
...,...,...
1082823394234716160,https://t.co/Ft6FqQmYfI,2019-01-09
1080923073820282752,https://t.co/jsOrDtwdEa,2019-01-03
1080858959404240768,https://t.co/JzfXMAPwKP,2019-01-03
1213316629666435072,RT @realDonaldTrump: https://t.co/VXeKiVzpTf,2020-01-04


In [47]:
# Keep only tweets with content. The explicit copy makes the result an independent
# frame, so adding columns to it later cannot trigger SettingWithCopyWarning.
trump_df = trump_df[~trump_empty_tweets].copy()
trump_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18649 entries, 1234653427789070336 to 1319345719829008384
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   text    18649 non-null  object        
 1   date    18649 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 437.1+ KB


In [48]:
# Store the cleaned version of every tweet next to the raw text.
trump_df['cleared_text'] = [text_preprocessor(raw_text) for raw_text in trump_df['text']]
trump_df.head()

Unnamed: 0_level_0,text,date,cleared_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1234653427789070336,I was thrilled to be back in the Great city of...,2020-03-03,i was thrilled to be back in the great city of...
1218010753434820608,RT @CBS_Herridge: READ: Letter to surveillance...,2020-01-17,rt read letter to surveillance court obtained ...
1304875170860015616,The Unsolicited Mail In Ballot Scam is a major...,2020-09-12,the unsolicited mail in ballot scam is a major...
1218159531554897920,RT @MZHemingway: Very friendly telling of even...,2020-01-17,rt very friendly telling of events here about ...
1217962723234983936,RT @WhiteHouse: President @realDonaldTrump ann...,2020-01-17,rt president announced historic steps to prote...


In [49]:
# Vocabulary of the cleaned tweets without stop words.
# `stop_words` is already a set, so it is used directly instead of being
# rebuilt with set(...) for every single word.
trump_tweet_words = {
    word
    for raw_text in trump_df.text
    for word in text_preprocessor(raw_text).split()
    if word not in stop_words
}
len(trump_tweet_words)

17529

In [50]:
# Embed every tweet one at a time (tqdm shows progress) and stack the vectors row-wise.
text_embeddings = np.stack([
    tweet_embedding(text)
    for text in tqdm(trump_df.text.values)
])
text_embeddings.shape

100%|████████████████████████████████████████████████████████████████████████████| 18649/18649 [59:30<00:00,  5.22it/s]


(18649, 768)

In [51]:
# Put the embeddings in a frame that shares the tweet index (columns V1..Vn),
# then attach it to the tweet frame column-wise.
trump_embedding_df = pd.DataFrame(
    text_embeddings,
    index=trump_df.index,
    columns=[f'V{position}' for position in range(1, text_embeddings.shape[1] + 1)],
)
trump_with_enb_df = pd.concat([trump_df, trump_embedding_df], axis=1)
trump_with_enb_df.head()

Unnamed: 0_level_0,text,date,cleared_text,V1,V2,V3,V4,V5,V6,V7,...,V759,V760,V761,V762,V763,V764,V765,V766,V767,V768
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1234653427789070336,I was thrilled to be back in the Great city of...,2020-03-03,i was thrilled to be back in the great city of...,0.041417,-0.042731,0.006217,-0.067131,-0.014536,-0.020198,-0.037637,...,0.000968,-0.05807,-0.010195,-0.045221,-0.025572,-0.058799,-0.067292,-0.004863,-0.048326,-0.021921
1218010753434820608,RT @CBS_Herridge: READ: Letter to surveillance...,2020-01-17,rt read letter to surveillance court obtained ...,0.047121,0.012321,0.008286,-0.054554,0.031976,-0.01174,-0.06322,...,-0.006211,0.009399,-0.067262,-0.020455,-0.038795,-0.029703,-0.072416,0.035931,0.036871,0.038481
1304875170860015616,The Unsolicited Mail In Ballot Scam is a major...,2020-09-12,the unsolicited mail in ballot scam is a major...,-0.035107,0.001505,-0.007224,-0.01543,0.014132,0.044485,-0.048967,...,0.048927,-0.058868,-0.0008,-0.024277,-0.022527,-0.01761,0.020487,0.006837,-0.013474,0.061693
1218159531554897920,RT @MZHemingway: Very friendly telling of even...,2020-01-17,rt very friendly telling of events here about ...,0.016298,-0.068092,-0.004709,-0.085732,0.04095,-0.024538,-0.013751,...,0.028047,-0.00376,-0.032635,0.037088,-0.001459,0.018904,-0.076467,0.007178,-0.01285,0.008524
1217962723234983936,RT @WhiteHouse: President @realDonaldTrump ann...,2020-01-17,rt president announced historic steps to prote...,0.007337,-0.059023,0.021116,-0.036806,0.004089,-0.016398,-0.053553,...,0.046635,0.053444,0.002423,-0.04241,0.017107,-0.038225,-0.016442,-0.031505,0.020988,0.01922


In [52]:
# Sanity check of the combined frame: row count, column count and dtypes.
trump_with_enb_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18649 entries, 1234653427789070336 to 1319345719829008384
Columns: 771 entries, text to V768
dtypes: datetime64[ns](1), float32(768), object(2)
memory usage: 55.2+ MB


In [53]:
# Persist the tweets together with their embeddings; the index is written under the 'id' header.
trump_with_enb_df.to_csv('data/trump_prepared_tweets_transformer.csv', index_label='id')