In [237]:
import pandas as pd
import numpy as np
import torch
import pickle
from tqdm.auto import tqdm
from transformers import pipeline, AutoModel, AutoTokenizer
from collections import defaultdict
import re
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD

### Create train and test sets of tweets
Before feature extraction, we do the train-dev-test split, so that these datasets are constant across feature extraction methods. For binary classification, the positive examples come from the Russian troll tweets dataset. Negative examples are a combination of a sentiment dataset, and a dataset of tweets from Republican and Democratic politicians. (We want to make sure that there are negative examples that still have a "political" orientation, since our goal is to tell troll tweets from real tweets, rather than political from non-political.)

In [12]:
# Positive examples: the troll tweets.
pos_examples = pd.read_csv("data/preprocessed-text/preprocessed-troll-tweets.csv")

# Negative examples. Any "account_type" column is renamed to "account_category"
# so that the negative frames use the same column name.
neg_sentiment_examples = pd.read_csv(
    "data/preprocessed-text/sentiment-preprocessed.csv", encoding="latin"
)
neg_sentiment_examples = neg_sentiment_examples.rename(
    columns={"account_type": "account_category"}
)

neg_political_examples = pd.read_csv("data/preprocessed-text/big-political-preprocessed.csv")
neg_political_examples = neg_political_examples.rename(
    columns={"account_type": "account_category"}
)

In [13]:
# Row counts, in the order (troll, sentiment, political).
tuple(map(len, (pos_examples, neg_sentiment_examples, neg_political_examples)))

(1970780, 1600498, 1243370)

In [14]:
# Show the column names of the sentiment frame after the rename above.
neg_sentiment_examples.columns

Index(['content', 'account_category', 'troll'], dtype='object')

In [15]:
# One seed drives both the negative sampling and the final shuffle.
random_state = 229

# Keep every positive example and draw 1,000,000 rows from each negative
# corpus, then shuffle all rows and renumber the index from zero.
combined = pd.concat(
    [pos_examples]
    + [
        negatives.sample(n=1000000, random_state=random_state)
        for negatives in (neg_sentiment_examples, neg_political_examples)
    ]
)
combined = combined.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [16]:
# Only the last expression of a cell is displayed, so the row count has to be
# printed explicitly; the preview stays as the cell's rich output.
print(len(combined))
combined.head(20)

Unnamed: 0,content,account_category,troll
0,.@BilgeEbiri really nails why many villains in...,RightTroll,True
1,"Clinton, Trump lead 2016 delegate race https:/...",NewsFeed,True
2,RT @the_intercept: How our reporter @JuanMThom...,LeftTroll,True
3,"Cruz, Colbert debate Reagan, gay marriage #en...",NewsFeed,True
4,'@420omnivore @leyalouisee May be this girl wi...,LeftTroll,True
5,More to Jamaica than 'anti-gay Gestapos': Man ...,NewsFeed,True
6,Darkness cannot drive out darkness; only light...,RightTroll,True
7,I'm thrilled to be here @ #CBCFALC2012 hosting...,NotTroll,False
8,"Nearly all men can stand adversity, but if you...",LeftTroll,True
9,‘Reprehensible’ fondling of 7-year-old girl by...,NewsFeed,True


In [17]:
# Per-group row counts of the combined dataset, pickled alongside the data.
# NOTE(review): the key names look swapped relative to their contents --
# "troll" holds the counts grouped by account_category and "categories" holds
# the counts grouped by troll. Left unchanged because readers of this pickle
# may rely on the existing keys; confirm before renaming.
metadata = {
    "troll": combined.groupby("account_category").count(),
    "categories": combined.groupby("troll").count()
}
# Use a context manager so the file is flushed and closed deterministically
# instead of leaking the handle returned by open().
with open("data/preprocessed-text/combined-metadata.pickle", "wb") as metadata_file:
    pickle.dump(metadata, metadata_file)

In [18]:
# Row positions that end the train (70%) and dev (85%) portions; the
# remaining rows form the test split.
train_cutoff, dev_cutoff = (int(len(combined) * fraction) for fraction in (0.7, 0.85))
train_tweets = combined.iloc[:train_cutoff]
dev_tweets = combined.iloc[train_cutoff:dev_cutoff]
test_tweets = combined.iloc[dev_cutoff:]

In [19]:
# Persist each split (without the index column) so later sections can reload it.
for split_path, split_frame in (
    ("data/preprocessed-text/train_tweets.csv", train_tweets),
    ("data/preprocessed-text/dev_tweets.csv", dev_tweets),
    ("data/preprocessed-text/test_tweets.csv", test_tweets),
):
    split_frame.to_csv(split_path, index=False)

### BERT features
Take these raw tweets for train, dev, and test sets and use pretrained BERT to create features.

In [275]:
# Reload the three splits with explicit nullable dtypes and drop any row that
# has a missing value in any column.
train_tweets, dev_tweets, test_tweets = (
    pd.read_csv(
        split_path,
        dtype={'content': 'string', 'account_category': 'string', 'troll': 'boolean'},
    ).dropna()
    for split_path in (
        "data/preprocessed-text/train_tweets.csv",
        "data/preprocessed-text/dev_tweets.csv",
        "data/preprocessed-text/test_tweets.csv",
    )
)

In [276]:
# Pretrained checkpoint used for both the tokenizer and the encoder.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
# Unused alternative kept for reference:
# feature_extractor = pipeline('feature-extraction', model=model, tokenizer=tokenizer)

In [277]:
def bert_featurize(tweets, model, tokenizer):
    """Encode a batch of tweets and return the model output at position 0.

    Args:
        tweets: pandas Series (or other iterable with a length) of tweet strings.
        model: callable taking ``(input_ids, attention_mask=...)`` and returning
            a sequence whose first element is a ``(batch, seq_len, dim)`` tensor.
        tokenizer: object with ``encode(text, ...)`` returning a list of token
            ids and, optionally, a ``pad_token_id`` attribute.

    Returns:
        DataFrame with one row per tweet and one column per feature, taken from
        sequence position 0 of the model output (the [CLS] token for BERT-style
        tokenizers). An empty DataFrame is returned for an empty batch.
    """
    if len(tweets) == 0:
        return pd.DataFrame()

    # Use the tokenizer's own padding id instead of assuming it is 0.
    pad_id = getattr(tokenizer, "pad_token_id", None)
    if pad_id is None:
        pad_id = 0

    # truncation=True caps each sequence at the tokenizer's maximum length so
    # an unusually long tweet cannot overflow the model's position embeddings.
    encoded = [tokenizer.encode(tweet, truncation=True) for tweet in tweets]
    max_len = max(len(ids) for ids in encoded)

    input_ids = torch.full((len(encoded), max_len), pad_id, dtype=torch.long)
    # Build the mask from the true sequence lengths rather than from
    # ``input_ids == 0``, which would also mask a genuine token with id 0.
    attention_mask = torch.zeros((len(encoded), max_len), dtype=torch.long)
    for row, ids in enumerate(encoded):
        input_ids[row, :len(ids)] = torch.tensor(ids, dtype=torch.long)
        attention_mask[row, :len(ids)] = 1

    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)
    return pd.DataFrame(output[0][:, 0, :].numpy())

In [278]:
def bert_featurize_df(df, model, tokenizer, batch_size, outfile, header=False):
    """Featurize ``df["content"]`` in batches and append the results to a CSV.

    Each batch of ``batch_size`` rows is passed to ``bert_featurize``; the
    original columns are joined with the feature columns and appended to
    ``outfile``. Nothing is returned -- the CSV is the only output.

    Args:
        df: DataFrame with a "content" column of tweets.
        model, tokenizer: forwarded to ``bert_featurize``.
        batch_size: number of rows featurized and written per step.
        outfile: CSV path opened in append mode, so an interrupted run can be
            continued by calling again with the remaining rows.
        header: when True, write the column names before the first batch of
            this call. Defaults to False, which keeps the file headerless.
    """
    for idx in tqdm(range(0, len(df), batch_size)):
        chunk = df.iloc[idx:idx + batch_size, :]
        bert_features = bert_featurize(chunk["content"], model, tokenizer)
        # Reset the chunk index so the join lines rows up by position.
        featurized = chunk.reset_index(drop=True).join(bert_features.reset_index(drop=True))
        featurized.to_csv(outfile, mode='a', index=False, header=(header and idx == 0))
    

In [30]:
# Smoke test: featurize the first 200 training rows in batches of 10.
bert_featurize_df(
    df=train_tweets[:200],
    model=model,
    tokenizer=tokenizer,
    batch_size=10,
    outfile="data/transformer-binary/tiny.csv",
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=20.0), HTML(value='')))




Smaller datasets, built from the first rows of each (already shuffled) split.

In [None]:
# First 100,000 training rows.
# NOTE(review): this file is named "bert_train_small" while the dev/test
# files use "<split>_bert_small" -- confirm which name downstream code expects.
bert_featurize_df(
    df=train_tweets[:100000],
    model=model,
    tokenizer=tokenizer,
    batch_size=50,
    outfile="data/transformer-binary/bert_train_small.csv",
)

In [195]:
# First 15,000 dev rows.
bert_featurize_df(
    df=dev_tweets[:15000],
    model=model,
    tokenizer=tokenizer,
    batch_size=50,
    outfile="data/transformer-binary/dev_bert_small.csv",
)

In [None]:
# First 15,000 test rows.
bert_featurize_df(
    df=test_tweets[:15000],
    model=model,
    tokenizer=tokenizer,
    batch_size=50,
    outfile="data/transformer-binary/test_bert_small.csv",
)

Larger datasets, using the entirety of each split. (Takes a *long* time to run.)

In [37]:
# Featurize the training split from row offset 1403050 onward (counted after
# dropping rows with missing values), appending to the large training file.
# NOTE(review): the hard-coded offset looks like the resume point of an
# earlier, interrupted run -- confirm. Run on its own against a fresh output
# file, this cell would not produce features for the first 1403050 rows.
bert_featurize_df(train_tweets.dropna()[1403050:], model, tokenizer, batch_size=25,
                 outfile="data/transformer-binary/train_bert_large.csv")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=55060.0), HTML(value='')))




In [280]:
# Entire dev split.
bert_featurize_df(
    df=dev_tweets,
    model=model,
    tokenizer=tokenizer,
    batch_size=50,
    outfile="data/transformer-binary/dev_bert_large.csv",
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=11913.0), HTML(value='')))




In [281]:
# Entire test split.
bert_featurize_df(
    df=test_tweets,
    model=model,
    tokenizer=tokenizer,
    batch_size=100,
    outfile="data/transformer-binary/test_bert_large.csv",
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=5957.0), HTML(value='')))




### "Bag of Words" Features

In [None]:
# Reload the training split so this section can be run without the cells above.
train_tweets = pd.read_csv(
    "data/preprocessed-text/train_tweets.csv",
    dtype=dict(content="string", account_category="string", troll="boolean"),
).dropna()

In [269]:
class TextFeaturizer():
    """Binary bag-of-words featurizer for tweets with optional SVD reduction.

    State:
        word_counts: mapping word -> number of occurrences seen by count_words.
        vocab: mapping word -> column index, built by create_vocab.
        SVD: fitted TruncatedSVD instance, set by fit_svd.

    The three attributes are persisted together as the pickled list
    ``[word_counts, vocab, SVD]`` (see save_model / the load_path argument).
    """

    def __init__(self, load_path=None):
        """Create an empty featurizer, or restore one saved with save_model.

        Args:
            load_path: optional path to a pickle written by save_model.
        """
        self.word_counts = None
        self.vocab = None
        self.SVD = None
        if load_path is not None:
            # SECURITY: unpickling can execute arbitrary code; only load files
            # produced by this notebook or another trusted source.
            with open(load_path, "rb") as model_file:
                self.word_counts, self.vocab, self.SVD = pickle.load(model_file)

    @staticmethod
    def _tokenize(tweet):
        """Strip everything except word characters, whitespace and '#',
        lowercase, and split on single spaces.

        The split is deliberately on " " (not arbitrary whitespace) so the
        tokens match vocabularies that were built and saved with this rule;
        consecutive spaces therefore yield empty-string tokens.
        """
        return re.sub(r'[^\w\s#]', '', tweet).lower().strip().split(" ")

    def count_words(self, tweets):
        """Count token occurrences over an iterable of tweets.

        Replaces any previous counts stored in ``self.word_counts``.
        """
        self.word_counts = defaultdict(int)
        for tweet in tqdm(tweets):
            for word in self._tokenize(tweet):
                self.word_counts[word] += 1

    def create_vocab(self, min_occ=0, max_occ=float("inf")):
        """Build ``self.vocab`` from the counted words.

        A word is kept when ``min_occ <= count < max_occ``. Kept words are
        numbered in the iteration order of ``self.word_counts``.

        Raises:
            ValueError: if no word counts are available yet.
        """
        if self.word_counts is None:
            raise ValueError("count_words() must be run (or a model loaded) before create_vocab()")
        kept = [word for word, count in self.word_counts.items() if min_occ <= count < max_occ]
        self.vocab = {word: i for i, word in enumerate(kept)}
        print("Vocab Length:", len(self.vocab))

    def tweet_to_arr(self, tweet):
        """Return a 1-D float array of length ``len(self.vocab)`` holding 1 at
        the index of every vocabulary word present in the tweet, 0 elsewhere.
        """
        idxs = [self.vocab[word] for word in self._tokenize(tweet) if word in self.vocab]
        arr = np.zeros((len(self.vocab),))
        if idxs:
            arr[idxs] = 1
        return arr

    def _featurize(self, tweets):
        """Stack the presence vectors of the given tweets into a 2-D array."""
        return np.vstack([self.tweet_to_arr(tweet) for tweet in tweets])

    # For efficiency, provide a sample of the dataframe, not the entire thing:
    # the full dense (rows x vocab) matrix is materialized in memory.
    def fit_svd(self, n_components, df, random_state=None):
        """Fit a TruncatedSVD on the bag-of-words features of ``df.content``.

        Args:
            n_components: number of SVD components to keep.
            df: DataFrame with a "content" column.
            random_state: optional seed forwarded to TruncatedSVD so the fit
                can be reproduced; None keeps the unseeded behaviour.
        """
        self.SVD = TruncatedSVD(n_components=n_components, random_state=random_state)
        features = self._featurize(df.content)
        print(features.shape)
        self.SVD.fit(features)

    def bag_of_words_featurize(self, df, batch_size, outfile, svd=False):
        """Write ``df`` plus its bag-of-words features to ``outfile`` in batches.

        The first batch creates (or overwrites) the file and writes the header;
        later batches are appended. Overwriting on the first batch keeps a
        re-run from appending a second copy of the data and a second header
        row into the middle of an existing file.

        Args:
            df: DataFrame with a "content" column.
            batch_size: number of rows featurized and written per step.
            outfile: destination CSV path.
            svd: when True, project the features with the fitted SVD first.

        Raises:
            ValueError: if ``svd`` is True but no SVD has been fitted or loaded.
        """
        if svd and self.SVD is None:
            raise ValueError("svd=True requires a fitted SVD; call fit_svd() or load a saved model")
        for idx in tqdm(range(0, len(df), batch_size)):
            chunk = df.iloc[idx:idx + batch_size, :]
            features = self._featurize(chunk.content)
            if svd:
                features = self.SVD.transform(features)
            is_first = idx == 0
            # Reset the chunk index so the join lines rows up by position.
            featurized = chunk.reset_index(drop=True).join(pd.DataFrame(features))
            featurized.to_csv(outfile, mode='w' if is_first else 'a', index=False, header=is_first)

    def save_model(self, outfile):
        """Pickle ``[word_counts, vocab, SVD]`` to ``outfile``."""
        data = [self.word_counts, self.vocab, self.SVD]
        with open(outfile, "wb") as model_file:
            pickle.dump(data, model_file)

In [270]:
# Restore a previously saved featurizer (word counts, vocabulary, SVD).
# NOTE(review): the pickle is only written further down in this notebook and
# no visible cell calls count_words()/create_vocab(); on a fresh checkout this
# load would fail -- confirm how the vocabulary was originally built.
tf = TextFeaturizer("text-featurizer.pickle")

In [256]:
# Fit the SVD on a 200,000-row sample of the training split. The sample is
# seeded (same value as the split seed, 229) so the rows chosen are the same
# on every run; an unseeded sample made the fitted SVD irreproducible.
sample = train_tweets.sample(200000, random_state=229)
tf.fit_svd(500, sample)

(200000, 3808)


In [259]:
# Persist the fitted featurizer through its own save method, which writes the
# same [word_counts, vocab, SVD] list, instead of duplicating that logic here
# with a file handle that is never closed.
tf.save_model("text-featurizer.pickle")

In [272]:
# Raw (no SVD) bag-of-words features for the training split.
tf.bag_of_words_featurize(
    df=train_tweets,
    batch_size=5000,
    outfile="./data/bag-of-words/train_bow_large.csv",
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=556.0), HTML(value='')))




In [273]:
# Reload the dev split, then write its raw (no SVD) bag-of-words features.
dev_tweets = pd.read_csv(
    "data/preprocessed-text/dev_tweets.csv",
    dtype=dict(content="string", account_category="string", troll="boolean"),
).dropna()
tf.bag_of_words_featurize(
    df=dev_tweets,
    batch_size=5000,
    outfile="./data/bag-of-words/dev_bow_large.csv",
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=120.0), HTML(value='')))




In [274]:
# Reload the test split, then write its raw (no SVD) bag-of-words features.
test_tweets = pd.read_csv(
    "data/preprocessed-text/test_tweets.csv",
    dtype=dict(content="string", account_category="string", troll="boolean"),
).dropna()
tf.bag_of_words_featurize(
    df=test_tweets,
    batch_size=5000,
    outfile="./data/bag-of-words/test_bow_large.csv",
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=120.0), HTML(value='')))




In [283]:
# SVD-reduced bag-of-words features for the training split.
tf.bag_of_words_featurize(
    df=train_tweets,
    batch_size=5000,
    outfile="./data/bag-of-words-binary/train_bow_large.csv",
    svd=True,
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=556.0), HTML(value='')))




In [284]:
# SVD-reduced bag-of-words features for the dev split.
tf.bag_of_words_featurize(
    df=dev_tweets,
    batch_size=5000,
    outfile="./data/bag-of-words-binary/dev_bow_large.csv",
    svd=True,
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=120.0), HTML(value='')))




In [285]:
# SVD-reduced bag-of-words features for the test split.
tf.bag_of_words_featurize(
    df=test_tweets,
    batch_size=5000,
    outfile="./data/bag-of-words-binary/test_bow_large.csv",
    svd=True,
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=120.0), HTML(value='')))


