In [1]:
import io
import re
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english')) | {'rt'}

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Semon\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [3]:
start_date = '2019-01-01'
end_date = '2022-01-01'

In [4]:
def text_preprocessor(v: str) -> str:
    v = v.lower()
    v = re.sub(r'@[^\s]+', '', v)
    v = re.sub(r'https?://[^\s]+', '', v)
    v = re.sub(r'&\w+;', '', v)
    v = re.sub('\W+', ' ', v)
    v = v.strip()
    return v

def load_vectors(fname, words: set):
    fin = io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in tqdm(fin, total=n):
        tokens = line.rstrip().split(' ')
        word = tokens[0].lower()
        if word in words:
            data[word] = np.array([*map(float, tokens[1:])])
    return data

def tfidf_dict(tfidf_vectorizer: TfidfVectorizer, text: str) -> dict:
    return {
        tfidf_vectorizer.get_feature_names_out()[i]: tfidf_v
        for (_, i), tfidf_v in (
            tfidf_vectorizer
            .transform([text])
            .todok()
            .items()
        )
    }

def tweet_embedding(
    word_embeddings: dict, 
    tfidf_vectorizer: TfidfVectorizer, 
    text: str
) -> np.array:
    embedding = np.zeros(300)
    tfidf = 0
    
    text = text_preprocessor(text)
    words_tfidf = tfidf_dict(tfidf_vectorizer, text)
    
    for word in text.split(' '):
        if word in word_embeddings and word in words_tfidf:
            word_tfidf = words_tfidf[word]
            tfidf += word_tfidf
            embedding += word_tfidf * word_embeddings[word]
    
    if tfidf:
        embedding /= tfidf
    
    return embedding

## Elon Musk

In [5]:
elon_df = pd.concat([
    pd.read_csv('data/elon/2010.csv', index_col='id'),
    pd.read_csv('data/elon/2011.csv', index_col='id'),
    pd.read_csv('data/elon/2012.csv', index_col='id'),
    pd.read_csv('data/elon/2013.csv', index_col='id'),
    pd.read_csv('data/elon/2014.csv', index_col='id'),
    pd.read_csv('data/elon/2015.csv', index_col='id'),
    pd.read_csv('data/elon/2016.csv', index_col='id'),
    pd.read_csv('data/elon/2017.csv', index_col='id'),
    pd.read_csv('data/elon/2018.csv', index_col='id'),
    pd.read_csv('data/elon/2019.csv', index_col='id'),
    pd.read_csv('data/elon/2020.csv', index_col='id'),
    pd.read_csv('data/elon/2021.csv', index_col='id'),
    pd.read_csv('data/elon/2022.csv', index_col='id'),
])[['tweet', 'date']]
elon_df.rename(columns={'tweet': 'text'}, inplace=True)
elon_df['date'] = pd.to_datetime(pd.to_datetime(elon_df['date']).dt.date)
elon_df = elon_df.query(f"date >= '{start_date}' and date < '{end_date}'")
elon_df.drop_duplicates(inplace=True)
elon_df.head()

Unnamed: 0_level_0,text,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1211071324518531072,@Joe__Wakefield @austinbarnard45 @tjq1190 @tyg...,2019-12-28
1211069054779301894,@austinbarnard45 @tjq1190 @tyger_cyber @fawful...,2019-12-28
1211064937004589056,@IrinaAntony @tjq1190 @tyger_cyber @fawfulfan ...,2019-12-28
1211054942192119808,@tjq1190 @tyger_cyber @fawfulfan @_Mikemo He d...,2019-12-28
1211051740562366464,@geofficient Pretty much,2019-12-28


In [6]:
empty_tweets = (
    elon_df.text
    .apply(text_preprocessor)
    .apply(lambda v: ' '.join(word for word in v.split() if word not in stop_words))
    .apply(lambda v: not not re.fullmatch('\W*', v))
)
print(elon_df[empty_tweets].shape)
elon_df[empty_tweets]

(732, 2)


Unnamed: 0_level_0,text,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1210263196457504768,@teslaownersSV @tesla_unplugged @TeslaOwnersof...,2019-12-26
1209141597629243393,@markets 🤣🤣,2019-12-23
1205682780178280448,https://t.co/LxZAmZIcIS,2019-12-14
1205680930779590657,https://t.co/aAEFph9G6D,2019-12-14
1205050955927932928,@SamTalksTesla https://t.co/z6zljrfUmb,2019-12-12
...,...,...
1348017194349694978,@BoredElonMusk 🤣🤣,2021-01-10
1348007662491820033,@erigganewmoney 😢,2021-01-10
1347976356135518211,@Christo49699922 @sean18743005 @Erdayastronaut...,2021-01-09
1347254119237439495,https://t.co/2ja38Z8MRE,2021-01-07


In [7]:
elon_df = elon_df[~empty_tweets]
elon_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8643 entries, 1211071324518531072 to 1344810193952014336
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   text    8643 non-null   object        
 1   date    8643 non-null   datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 202.6+ KB


In [8]:
elon_df['cleared_text'] = elon_df.text.apply(text_preprocessor)
elon_df.head()

Unnamed: 0_level_0,text,date,cleared_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1211071324518531072,@Joe__Wakefield @austinbarnard45 @tjq1190 @tyg...,2019-12-28,this is a pretty awful lie i left south africa...
1211069054779301894,@austinbarnard45 @tjq1190 @tyger_cyber @fawful...,2019-12-28,this person blocked me so can t read the tweet...
1211064937004589056,@IrinaAntony @tjq1190 @tyger_cyber @fawfulfan ...,2019-12-28,we started zip2 with 2k from me plus my overcl...
1211054942192119808,@tjq1190 @tyger_cyber @fawfulfan @_Mikemo He d...,2019-12-28,he didn t own an emerald mine i worked my way ...
1211051740562366464,@geofficient Pretty much,2019-12-28,pretty much


In [9]:
elon_tweet_words = set()
elon_df.text.apply(
    text_preprocessor
).str.split().apply(
    lambda arr: [v for v in arr if v not in set(stop_words)]
).apply(elon_tweet_words.update)
len(elon_tweet_words)

9651

In [10]:
elon_word_embeddings = load_vectors('wiki-news-300d-1M.vec', elon_tweet_words)

100%|███████████████████████████████████████████████████████████████████████| 999994/999994 [00:25<00:00, 38782.11it/s]


In [11]:
elon_tfidf_vectorizer = TfidfVectorizer( # leakage here
    stop_words=stop_words,
    preprocessor=text_preprocessor
)
elon_tfidf_vectorizer.fit(elon_df.text)
len(elon_tfidf_vectorizer.vocabulary_)

9615

In [12]:
text_embeddings = list()


for text in tqdm(elon_df.text.values):
    text_embeddings.append(tweet_embedding(
        elon_word_embeddings, 
        elon_tfidf_vectorizer,
        text,
    ))

text_embeddings = np.stack(text_embeddings)
text_embeddings.shape

100%|██████████████████████████████████████████████████████████████████████████████| 8643/8643 [07:03<00:00, 20.42it/s]


(8643, 300)

In [13]:
elon_with_enb_df = pd.concat(
    [
        elon_df,
        pd.DataFrame(
            text_embeddings, 
            columns=[f'V{i+1}' for i in range(text_embeddings.shape[1])],
        ).set_index(elon_df.index),
    ],
    axis = 1
)
elon_with_enb_df.head()

Unnamed: 0_level_0,text,date,cleared_text,V1,V2,V3,V4,V5,V6,V7,...,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1211071324518531072,@Joe__Wakefield @austinbarnard45 @tjq1190 @tyg...,2019-12-28,this is a pretty awful lie i left south africa...,-0.048342,-0.014244,0.016088,0.002211,-0.012234,0.082633,-0.016118,...,-0.000773,-0.019107,-0.039366,0.015982,0.049104,-0.056251,-0.057072,-0.057435,-0.011789,0.036748
1211069054779301894,@austinbarnard45 @tjq1190 @tyger_cyber @fawful...,2019-12-28,this person blocked me so can t read the tweet...,-0.01273,-0.017078,0.028585,0.025572,-0.045578,0.065623,-0.013617,...,-0.01255,-0.046741,-0.000691,0.019577,0.077184,-0.057226,-0.047901,0.0287,0.011547,0.033419
1211064937004589056,@IrinaAntony @tjq1190 @tyger_cyber @fawfulfan ...,2019-12-28,we started zip2 with 2k from me plus my overcl...,-0.042071,-0.075559,-0.029521,0.020394,-0.014714,0.073634,-0.013487,...,-0.02204,-0.001922,-0.011946,0.02849,0.086696,-0.028041,-0.038507,-0.005319,0.075069,-0.058273
1211054942192119808,@tjq1190 @tyger_cyber @fawfulfan @_Mikemo He d...,2019-12-28,he didn t own an emerald mine i worked my way ...,-0.026122,-0.068418,0.031035,-0.038473,-0.031547,0.080213,-0.034022,...,0.003521,-0.032675,-0.035726,-0.00644,0.051075,-0.005771,-0.002485,-0.073661,-0.000422,0.012104
1211051740562366464,@geofficient Pretty much,2019-12-28,pretty much,-0.061854,-0.111202,0.020066,0.037324,-0.120143,0.163722,-0.013768,...,0.025169,-0.046883,-0.015783,0.062415,0.068658,-0.104441,-0.167005,0.059559,0.263472,0.011272


In [14]:
elon_with_enb_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8643 entries, 1211071324518531072 to 1344810193952014336
Columns: 303 entries, text to V300
dtypes: datetime64[ns](1), float64(300), object(2)
memory usage: 20.0+ MB


In [15]:
elon_with_enb_df.to_csv('data/elon_prepared_tweets.csv', index_label='id')

## Donald Trump

In [16]:
trump_df = pd.read_csv('data/trump_tweets.csv', dtype={'id': np.int64}, index_col='id')[['text', 'date']]
trump_df['date'] = pd.to_datetime(trump_df['date'])
trump_df = trump_df.query(f"date >= '{start_date}' and date < '{end_date}'")
trump_df.head()

Unnamed: 0_level_0,text,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1234653427789070336,I was thrilled to be back in the Great city of...,2020-03-03
1218010753434820608,RT @CBS_Herridge: READ: Letter to surveillance...,2020-01-17
1304875170860015616,The Unsolicited Mail In Ballot Scam is a major...,2020-09-12
1218159531554897920,RT @MZHemingway: Very friendly telling of even...,2020-01-17
1217962723234983936,RT @WhiteHouse: President @realDonaldTrump ann...,2020-01-17


In [17]:
trump_empty_tweets = (
    trump_df.text
    .apply(text_preprocessor)
    .apply(lambda v: ' '.join(word for word in v.split() if word not in stop_words))
    .apply(lambda v: not not re.fullmatch('\W*', v))
)
print(trump_df[trump_empty_tweets].shape)
trump_df[trump_empty_tweets]

(1561, 2)


Unnamed: 0_level_0,text,date
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1319501865625784320,https://t.co/4qwCKQOiOw,2020-10-23
1319500520126664704,https://t.co/VlEu8yyovv,2020-10-23
1319500501269041152,https://t.co/z5CRqHO8vg,2020-10-23
1319500486370877440,https://t.co/TQCQiDrVOB,2020-10-23
1319496349092511488,RT @EricTrump: https://t.co/NcrNdOSfIV,2020-10-23
...,...,...
1082823394234716160,https://t.co/Ft6FqQmYfI,2019-01-09
1080923073820282752,https://t.co/jsOrDtwdEa,2019-01-03
1080858959404240768,https://t.co/JzfXMAPwKP,2019-01-03
1213316629666435072,RT @realDonaldTrump: https://t.co/VXeKiVzpTf,2020-01-04


In [18]:
trump_df = trump_df[~trump_empty_tweets]
trump_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18649 entries, 1234653427789070336 to 1319345719829008384
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   text    18649 non-null  object        
 1   date    18649 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 437.1+ KB


In [19]:
trump_df['cleared_text'] = trump_df.text.apply(text_preprocessor)
trump_df.head()

Unnamed: 0_level_0,text,date,cleared_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1234653427789070336,I was thrilled to be back in the Great city of...,2020-03-03,i was thrilled to be back in the great city of...
1218010753434820608,RT @CBS_Herridge: READ: Letter to surveillance...,2020-01-17,rt read letter to surveillance court obtained ...
1304875170860015616,The Unsolicited Mail In Ballot Scam is a major...,2020-09-12,the unsolicited mail in ballot scam is a major...
1218159531554897920,RT @MZHemingway: Very friendly telling of even...,2020-01-17,rt very friendly telling of events here about ...
1217962723234983936,RT @WhiteHouse: President @realDonaldTrump ann...,2020-01-17,rt president announced historic steps to prote...


In [20]:
trump_tweet_words = set()
trump_df.text.apply(
    text_preprocessor
).str.split().apply(
    lambda arr: [v for v in arr if v not in set(stop_words)]
).apply(trump_tweet_words.update)
len(trump_tweet_words)

17529

In [21]:
trump_word_embeddings = load_vectors('wiki-news-300d-1M.vec', trump_tweet_words)

100%|███████████████████████████████████████████████████████████████████████| 999994/999994 [00:31<00:00, 32112.91it/s]


In [22]:
trump_tfidf_vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    preprocessor=text_preprocessor
)
trump_tfidf_vectorizer.fit(trump_df.text)
len(trump_tfidf_vectorizer.vocabulary_)

17469

In [23]:
text_embeddings = list()

for text in tqdm(trump_df.text.values):
    text_embeddings.append(tweet_embedding(
        elon_word_embeddings, 
        elon_tfidf_vectorizer,
        text,
    ))

text_embeddings = np.stack(text_embeddings)
text_embeddings.shape

100%|████████████████████████████████████████████████████████████████████████████| 18649/18649 [18:14<00:00, 17.04it/s]


(18649, 300)

In [24]:
trump_with_enb_df = pd.concat(
    [
        trump_df,
        pd.DataFrame(
            text_embeddings, 
            columns=[f'V{i+1}' for i in range(text_embeddings.shape[1])],
        ).set_index(trump_df.index),
    ],
    axis = 1
)
trump_with_enb_df.head()

Unnamed: 0_level_0,text,date,cleared_text,V1,V2,V3,V4,V5,V6,V7,...,V291,V292,V293,V294,V295,V296,V297,V298,V299,V300
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1234653427789070336,I was thrilled to be back in the Great city of...,2020-03-03,i was thrilled to be back in the great city of...,-0.025369,0.013185,0.025902,0.013779,-0.020144,0.085133,-0.006004,...,0.055005,-0.022654,-0.032455,-0.001537,0.075067,0.033545,-0.043231,-0.002946,0.027171,0.038785
1218010753434820608,RT @CBS_Herridge: READ: Letter to surveillance...,2020-01-17,rt read letter to surveillance court obtained ...,-0.147595,-0.055836,0.082966,0.016695,0.066793,0.136151,0.045195,...,0.058701,-0.063616,0.009711,-0.024108,0.090426,-0.083737,-0.04495,-0.058626,-0.031472,0.004875
1304875170860015616,The Unsolicited Mail In Ballot Scam is a major...,2020-09-12,the unsolicited mail in ballot scam is a major...,0.009686,-0.055457,0.049761,0.053444,-0.045312,0.108593,-0.006554,...,0.023672,-0.045594,0.020217,-0.005316,0.05825,-0.091956,-0.111169,0.030698,0.04521,0.012122
1218159531554897920,RT @MZHemingway: Very friendly telling of even...,2020-01-17,rt very friendly telling of events here about ...,-0.091189,-0.005541,-0.00108,0.007113,-0.039241,0.091224,-0.000484,...,0.054797,-0.031643,0.044863,-0.032381,0.018727,-0.135519,-0.089895,-0.002928,0.003388,0.023797
1217962723234983936,RT @WhiteHouse: President @realDonaldTrump ann...,2020-01-17,rt president announced historic steps to prote...,-0.014173,-0.064739,0.054706,-0.007583,-0.037187,0.064135,0.042061,...,0.062528,-0.075422,0.042154,0.042574,-0.00105,-0.067678,-0.079042,0.003974,-0.036825,0.043779


In [25]:
trump_with_enb_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18649 entries, 1234653427789070336 to 1319345719829008384
Columns: 303 entries, text to V300
dtypes: datetime64[ns](1), float64(300), object(2)
memory usage: 43.3+ MB


In [26]:
trump_with_enb_df.to_csv('data/trump_prepared_tweets.csv', index_label='id')