# 02 - Exploratory Data Analysis (EDA)

In [1]:
import pickle
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

In [2]:
# Load the cleaned test-set tweets; later cells read its `cleaned` and `category` columns.
df = pd.read_pickle('../data/cleaned_tweets_test.pkl')

In [3]:
# Bag of words for the 'true' class: every whitespace-separated token of every
# cleaned tweet, in row order, duplicates kept.
bow_true = [
    token
    for tokens in df[df.category == 'true'].cleaned.str.split()
    for token in tokens
]


In [4]:
# Bag of words for the 'unreliable' class: every whitespace-separated token of
# every cleaned tweet, in row order, duplicates kept.
bow_unreliable = [
    token
    for tokens in df[df.category == 'unreliable'].cleaned.str.split()
    for token in tokens
]

In [5]:
# Total number of tokens (duplicates included) in the 'true' tweets.
len(bow_true)

2829

In [6]:
# Total number of tokens (duplicates included) in the 'unreliable' tweets.
len(bow_unreliable)

3294

In [7]:
# Number of distinct tokens in the 'true' tweets.
len(set(bow_true))

1089

In [8]:
# Number of distinct tokens in the 'unreliable' tweets.
len(set(bow_unreliable))

1583

In [9]:
# Combined token list for both classes ('true' tokens first, then 'unreliable').
words = bow_true + bow_unreliable

In [10]:
# Pair each cleaned tweet with its category, in DataFrame row order.
texts = list(zip(df.cleaned.to_list(), df.category.to_list()))

In [11]:
# Unzip the (tweet, category) pairs into two parallel tuples.
# Unpacking raises ValueError if `texts` is empty.
tweets, cats = (zip(*texts))

## Word Embeddings

In [12]:
# Distinct vocabulary of the test tweets (built from the 'true' and 'unreliable' tokens).
testing_words = set(words)

In [13]:
# Load the embeddings produced from the training data (CBOW and skip-gram);
# later cells index them by word.
# `with` guarantees each file handle is closed once it has been read.
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open('../data/we_cbow_training.pickle', 'rb') as fh:
    we_training_cbow = pickle.load(fh)
with open('../data/we_sg_training.pickle', 'rb') as fh:
    we_training_sg = pickle.load(fh)

In [14]:
# Load the training vocabulary used for the membership tests below.
# `with` guarantees the file handle is closed once it has been read.
# NOTE: pickle.load can execute arbitrary code -- only load files you created yourself.
with open('../data/training_words_set.pickle', 'rb') as fh:
    training_words = pickle.load(fh)

In [15]:
# Count the test-vocabulary words that do not occur in the training vocabulary.
# A large count might create a significant issue: these words are the ones that
# later receive embeddings trained on the test data instead.
sum(1 for word in testing_words if word not in training_words)

596

In [16]:
# Tokenise every tweet on whitespace; the resulting list of token lists is the
# corpus passed to Word2Vec.
corpus = list(map(str.split, tweets))

In [17]:
# Train CBOW and skip-gram embeddings on the test corpus; only the vectors of
# words that are missing from the training vocabulary are extracted afterwards.
# NOTE(review): vectors learned by a separately trained model do not
# necessarily share a coordinate space with the training embeddings -- confirm
# that mixing them is intended.
#
# workers=1: `seed` alone does not make gensim training reproducible; the model
# must also be limited to a single worker thread (and PYTHONHASHSEED has to be
# fixed before the kernel starts for reproducibility across interpreter launches).
test_cbow = Word2Vec(corpus, vector_size=300, min_count=1, epochs=10, seed=42, workers=1)
test_sg = Word2Vec(corpus, sg=1, vector_size=300, min_count=1, epochs=10, seed=42, workers=1)

In [18]:
# For each test-trained model, keep only the vectors of words that are absent
# from the training vocabulary (same filter for CBOW and skip-gram).
we_test_cbow, we_test_sg = (
    {token: model.wv[token] for token in testing_words if token not in training_words}
    for model in (test_cbow, test_sg)
)

In [19]:
# Merge the lookups: start from a copy of the training vectors, then add the
# test-only vectors (on a key clash the test vector wins, exactly as with `|`).
we_cbow = we_training_cbow.copy()
we_cbow.update(we_test_cbow)

we_sg = we_training_sg.copy()
we_sg.update(we_test_sg)

In [20]:
def meaner(word_embedding, tweets):
    """Average the word vectors of each tweet into one tweet-level vector.

    Parameters
    ----------
    word_embedding : mapping
        Lookup from word to embedding vector; accessed as ``word_embedding[word]``.
    tweets : iterable of str
        Tweet texts, each tokenised with ``str.split()``. Any iterable is
        accepted (list, tuple, generator); it is traversed exactly once.

    Returns
    -------
    dict
        Maps each tweet text to the element-wise mean (``axis=0``) of its word
        vectors, in first-seen order. Because the tweet text itself is the key,
        repeated tweets collapse into a single entry, so the result can be
        shorter than ``tweets``.

    Notes
    -----
    * A token missing from ``word_embedding`` propagates the lookup error
      (``KeyError`` for a plain dict).
    * A tweet without tokens yields NaN, since NumPy's mean of an empty array
      is NaN (NumPy also emits a RuntimeWarning).
    """
    # Use the loop variable directly instead of re-indexing `tweets`, so that
    # non-subscriptable iterables work as well.
    return {
        tweet: np.mean(np.array([word_embedding[word] for word in tweet.split()]), axis=0)
        for tweet in tweets
    }

In [21]:
# Tweet-level embeddings: one mean word vector per tweet, for each embedding type.
te_cbow = meaner(we_cbow, tweets)
te_sg = meaner(we_sg, tweets)

In [22]:
# Number of tweet embeddings per model. The dicts are keyed by tweet text, so
# duplicate tweets would make these counts smaller than the number of tweets.
len(te_cbow), len(te_sg)

(557, 557)

In [23]:
def _labelled_embedding_frame(tweet_embedding, categories):
    """Turn a tweet -> vector dict into a shuffled, labelled DataFrame.

    Parameters
    ----------
    tweet_embedding : dict
        Maps tweet text to its embedding vector (as returned by ``meaner``).
    categories : sequence
        Category of each tweet, in the same order as the dict's keys. It is
        assigned positionally as a column, so its length must equal the number
        of entries in ``tweet_embedding``.

    Returns
    -------
    pandas.DataFrame
        One row per tweet: a ``tweet`` column, one column per embedding
        dimension, ``category`` and a numeric ``label`` (unreliable -> 0,
        true -> 1; any other category maps to NaN). Rows are shuffled with a
        fixed ``random_state`` and the index is reset.
    """
    # The dict constructor puts tweets in columns; transpose to get one row per tweet.
    frame = (
        pd.DataFrame(tweet_embedding)
        .T
        .reset_index()
        .rename(columns={'index': 'tweet'})
    )
    frame['category'] = categories
    frame['label'] = frame.category.map({'unreliable': 0, 'true': 1})
    return frame.sample(frac=1, random_state=42).reset_index(drop=True)


# Both frames are built by the same pipeline and shuffled with the same seed.
te_cbow_df = _labelled_embedding_frame(te_cbow, cats)
te_sg_df = _labelled_embedding_frame(te_sg, cats)

In [24]:
# Preview the shuffled CBOW frame: tweet text, embedding columns, category and label.
te_cbow_df.head()

Unnamed: 0,tweet,0,1,2,3,4,5,6,7,8,...,292,293,294,295,296,297,298,299,category,label
0,nation health professional continue manage cor...,-0.000578,-0.037394,0.004011,-0.067333,0.074218,-0.015869,0.181918,0.168126,-0.073878,...,0.016714,0.090119,0.063498,0.176591,0.260096,0.038692,-0.030767,0.040254,unreliable,0
1,toronto public health set coronavirus hotline ...,-0.00287,-0.045381,0.022516,-0.095714,0.118483,-0.024555,0.288207,0.276162,-0.104954,...,0.025484,0.145816,0.100598,0.28157,0.413442,0.074131,-0.048276,0.067406,true,1
2,hey trumptrain official warns trump ignorant c...,0.000579,-0.038867,0.018511,-0.079097,0.090032,-0.019367,0.219561,0.199829,-0.086984,...,0.02366,0.109519,0.072719,0.218561,0.321472,0.05853,-0.03728,0.053943,unreliable,0
3,lie us coronaviruse,-0.004057,-0.025749,0.011759,-0.054163,0.067528,-0.011438,0.159022,0.16085,-0.06079,...,0.014135,0.083286,0.062069,0.143551,0.212194,0.041828,-0.028897,0.037737,unreliable,0
4,maga hat made china may infected coronavirus s...,-0.005614,-0.033827,0.017325,-0.067794,0.076886,-0.017765,0.195124,0.180815,-0.074958,...,0.017363,0.097263,0.069475,0.198881,0.293775,0.051552,-0.034167,0.047908,unreliable,0


In [25]:
# Persist both tweet-embedding frames. `with` ensures each file is flushed and
# closed as soon as the dump finishes, rather than whenever the handle happens
# to be garbage-collected.
with open("../data/te_cbow_df_testing_extra.pickle", "wb") as fh:
    pickle.dump(te_cbow_df, fh)
with open("../data/te_sg_df_testing_extra.pickle", "wb") as fh:
    pickle.dump(te_sg_df, fh)