# Information Warfare
## Russia’s use of Twitter during the 2016 US Presidential Election
---

In [1]:
import os
import numpy as np
import pandas as pd
import spacy
from spacy.tokens import Token
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

from processing_modules.io_data import ImportData
from processing_modules.text_processing import link_remover, hashtag_token, account_token, group_lists,\
                                                lemmatize_and_clean_document, clean_document, retweet_remover

## Import Data

In [2]:
# All tweets. The bucket name is taken from the PORTFOLIO_DATA_BUCKET
# environment variable (os.getenv returns None when it is not set).
portfolio_data_bucket = os.getenv("PORTFOLIO_DATA_BUCKET")

five_38_data_retrieval = ImportData(
    prefix="disinformation-project/raw-data",
    bucket=portfolio_data_bucket,
)

# ImportData is a project helper: presumably this fetches every object under
# the prefix and combines them into a single DataFrame -- confirm in io_data.
df = (
    five_38_data_retrieval
    .retrieve_objects()
    .return_dataframe()
)

## Sample and clean data

In [4]:
# Keep only English-language tweets from the four account categories studied.
account_categories = ['RightTroll', 'LeftTroll', 'HashtagGamer', 'NewsFeed']
is_english = df.language == 'English'
in_scope_category = df.account_category.isin(account_categories)
df = df[is_english & in_scope_category]

# Drop low-volume accounts: an author is kept only when they have MORE than
# MIN_TWEETS_PER_AUTHOR tweets (counted via non-null tweet_id values).
# NOTE(review): the earlier comment here said 400, but the code filters on
# 200. The code's value is kept -- confirm which threshold was intended.
MIN_TWEETS_PER_AUTHOR = 200
author_counts = df.groupby('author')['tweet_id'].count()
relevant_authors = author_counts[
    author_counts > MIN_TWEETS_PER_AUTHOR
].index.values.tolist()
df = df[df.author.isin(relevant_authors)]

# Draw 30% of each (author, account_category) group's tweets without
# replacement. The fixed seed makes the sample -- and therefore everything
# derived from it downstream -- identical from one run to the next.
SAMPLE_FRACTION = .3
SAMPLE_SEED = 42
sample = (
    df.groupby(['author', 'account_category'])[['content']]
    .apply(
        lambda x: x.sample(
            frac=SAMPLE_FRACTION, replace=False, random_state=SAMPLE_SEED
        )
    )
    .reset_index()
    # reset_index exposes the original row index as "level_2"; it is not needed.
    .drop("level_2", axis='columns')
)

# Normalise the text: cast to str, trim surrounding whitespace, then run the
# project helper link_remover (presumably strips URLs -- see text_processing).
# Note that astype(str) turns missing content into the literal string 'nan'.
sample.content = (
    sample.content
    .astype(str).str.strip()
    .apply(link_remover)
)

## Tokenization

In [5]:
# Register two custom boolean attributes on spaCy tokens (read back as
# token._.is_hashtag / token._.is_account). Both default to False, and
# force=True overwrites an earlier registration so this cell can be re-run.
for extension_name in ('is_hashtag', 'is_account'):
    Token.set_extension(extension_name, default=False, force=True)

# Pipeline components that are not needed here are disabled to save time.
unused_components = ['parser', 'ner', 'textcat']
nlp = spacy.load("en_core_web_sm", disable=unused_components)

# Append the two project components to the pipeline: hashtags first,
# accounts second.
for custom_component in (hashtag_token, account_token):
    nlp.add_pipe(custom_component)

# Run every sampled tweet through the pipeline and keep the parsed documents.
parsed_tweets = list(nlp.pipe(sample.content))

#### Additional text cleaning and aggregation

Here we:
- further clean tweets by filtering tokens (removing punctuation, spaces, etc.), and
- group tweet documents by author 

In [6]:
# Apply the project cleaner to each parsed tweet, one result per row of sample.
# Alternative kept for reference: use clean_document in place of
# lemmatize_and_clean_document (presumably the non-lemmatizing variant).
sample['processed_tweets'] = [
    lemmatize_and_clean_document(tweet) for tweet in parsed_tweets
]

# Collapse the per-tweet results into one entry per (author, account_category)
# using the project helper group_lists, then restore the keys as columns.
tweets_by_author = sample.groupby(['author', 'account_category'])['processed_tweets']
grouped_tweets = tweets_by_author.apply(group_lists).reset_index()

#### Document vectors

In [7]:
# Each row of grouped_tweets (one per author/account_category pair) becomes a
# training document, tagged with its row position so the learned vector can be
# matched back to that row afterwards.
documents = [
    TaggedDocument(doc, [i])
    for i, doc in enumerate(grouped_tweets.processed_tweets)
]

# Train 300-dimensional document vectors with a 5-token context window;
# min_count=3 ignores words that occur fewer than three times.
model = Doc2Vec(documents, vector_size=300, window=5, min_count=3, workers=6)

# Save the model to disk. The target directory is created first so that a
# missing path does not throw away a completed training run.
model_path = './models/doc2vec/disinformation-project-doc2vec'
os.makedirs(os.path.dirname(model_path), exist_ok=True)
model.save(model_path)

# Collect the learned vectors in tag order (tag i corresponds to row i of
# grouped_tweets).
document_vectors = [model.docvecs[index] for index in range(len(model.docvecs))]

In [8]:
# Stack the per-document vectors into a 2-D array: one row per document.
vector_matrix = np.stack(document_vectors)
embedding_dataframe = pd.DataFrame(vector_matrix)

# Put the identifying columns in front of the vector columns:
# author first, account_category second.
embedding_dataframe.insert(0, 'author', grouped_tweets.author)
embedding_dataframe.insert(1, 'account_category', grouped_tweets.account_category)

# Write the labelled embeddings out without the row index.
embedding_dataframe.to_csv("./data/tweet-embeddings.csv", index=False)