In [0]:
import glob
import time
from datetime import datetime

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
from nltk import word_tokenize
from scipy.special import softmax
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, roc_curve
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset, random_split
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import BertTokenizer as tokenizer




In [0]:
def load_troll_tweets(path):
  """
  Loads troll tweets from folder containing csv files

  Every ``*.csv`` file directly inside ``path`` is read with ``pd.read_csv``
  and the results are stacked row-wise under a fresh 0..n-1 index.

  Args:
    path (string): path to folder containing data files

  Returns:
    frame (dataframe): dataframe of all troll tweets

  Raises:
    ValueError: if ``path`` contains no csv files
  """
  # Sort so the row order (and therefore every positional index derived from
  # it later) does not depend on the filesystem's listing order.
  all_files = sorted(glob.glob(path + "/*.csv"))
  if not all_files:
    raise ValueError("No csv files found in '{}'".format(path))

  collect = [pd.read_csv(filename) for filename in all_files]

  frame = pd.concat(collect, axis=0, ignore_index=True)

  return frame

In [0]:
def create_dataset(troll_frame, control_file_path):
  """
  Builds the combined troll/control tweet dataset

  English troll tweets are kept, metadata columns are dropped from both
  sources, troll tweets are labelled 1 and control tweets 0, and
  'publish_date' is converted to whole days since the earliest tweet in the
  combined data.

  Args:
    troll_frame (dataframe): Dataframe of troll tweets
    control_file_path (string): Path to pickle file containing control tweets

  Returns:
    full_dataset (dataframe): Combined dataframe of troll and control tweets
    tweet_data (list): List of all tweet content
    labels (Series): Labels {0,1} for every row, in the same order as tweet_data
  """
  control_drop = ['external_author_id', 'author', 'country', 'city_state', 'language', 'post_type',
                  'account_creation_date', 'tweet_id', 'tco1_step1', 'tco2_step1', 'tco3_step1']
  troll_drop = ['external_author_id', 'author', 'region', 'language', 'harvested_date', 'followers',
                'post_type', 'account_type', 'account_category', 'new_june_2018', 'alt_external_id',
                'tweet_id', 'article_url', 'tco1_step1', 'tco2_step1', 'tco3_step1']

  # .copy() so the column assignments below act on an independent frame rather
  # than on a slice of the caller's dataframe.
  troll = troll_frame[troll_frame['language'] == 'English'].copy()
  # NOTE(review): unpickling can execute arbitrary code - only load control
  # files that come from a trusted source.
  control = pd.read_pickle(control_file_path)

  control = control.drop(columns=control_drop)
  troll = troll.drop(columns=troll_drop)
  troll['publish_date'] = pd.to_datetime(troll['publish_date'], format = '%m/%d/%Y %H:%M')

  troll['label'] = 1
  control['label'] = 0

  # NOTE(review): assumes control['publish_date'] is already datetime-like so
  # it can be subtracted alongside the parsed troll dates - TODO confirm.
  full_dataset = pd.concat([troll, control])
  min_date = full_dataset['publish_date'].min()
  full_dataset['publish_date'] = (full_dataset['publish_date'] - min_date).dt.days

  tweet_data = full_dataset['content'].tolist()
  labels = full_dataset['label']

  return full_dataset, tweet_data, labels

In [0]:
def split_dataset(tweet_data, labels):
  """
  Splits dataset into disjoint train, val and test sets

  30% of the data is held out for test; 20% of the remaining data is then
  held out for validation. Both splits use random_state 30.

  Args:
    tweet_data (list): List of all tweets
    labels (list): List of all labels

  Returns:
    X_train (list): Tweets for train data
    y_train (list): Labels for train data
    X_val (list): Tweets for validation data
    y_val (list): Labels for validation data
    X_test(list): Tweets for test data
    y_test (list): Labels for test data
    train_indices (list): Indices of full dataset used for training
    val_indices (list): Indices of full dataset used for validation
    test_indices (list): Indices of full dataset used for test
  """
  indices = list(range(len(tweet_data)))

  X_train, X_test, train_indices, test_indices, y_train, y_test = train_test_split(
      tweet_data, indices, labels, test_size = 0.3, random_state = 30)

  # Carve the validation set out of the training portion only. Re-splitting
  # the full data here would let validation/train rows overlap the test set.
  X_train, X_val, train_indices, val_indices, y_train, y_val = train_test_split(
      X_train, train_indices, y_train, test_size = 0.2, random_state = 30)

  return X_train, y_train, X_val, y_val, X_test, y_test, train_indices, val_indices, test_indices

In [0]:
def get_max_length(tweet_data, tokenizer):
  """
  Computes the encoded length of the longest tweet

  Each tweet is passed through ``tokenizer.encode`` with special tokens
  included, and the largest resulting sequence length is reported.

  Args:
    tweet_data (list): List of all tweet content
    tokenizer (class) : Appropriate tokenizer for language model being trained

  Returns:
    max_length (int): Length of longest encoded tweet, 0 when tweet_data is empty
  """
  encoded_lengths = (
      len(tokenizer.encode(text, add_special_tokens=True))
      for text in tweet_data
  )
  return max(encoded_lengths, default=0)

In [0]:
def tokenize_prep_data(tokenizer, X_data, y_data, max_len, batch_size, shuffle=True):
  """
  BERT-tokenizes tweet data and wraps it in a dataloader

  Args:
    tokenizer (class or instance): BERT tokenizer. A class is loaded with the
      'bert-base-cased' vocabulary; an already-loaded instance is used as is
    X_data (list): Tweet content data
    y_data (list or Series): Labels
    max_len (int): Length every tweet is padded to (length of longest tweet)
    batch_size (int): Batch size for data, BERT paper recommends 16 or 32
    shuffle (bool): Draw batches in random order (default, as before). Pass
      False when the output order must match the order of X_data

  Returns:
    dataloader (Pytorch Dataloader): Iterable over data that contains a sampler

  Raises:
    ValueError: if X_data is empty
  """
  if len(X_data) == 0:
    raise ValueError('X_data is empty; nothing to tokenize')

  # Only load the pretrained vocabulary when handed the class itself; a ready
  # instance is reused instead of being reloaded on every call.
  if isinstance(tokenizer, type):
    tokenizer = tokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

  inputs = []
  attention_masks = []
  for counter, tweet in enumerate(X_data, start=1):
      encoded_tweets = tokenizer.encode_plus(
                          tweet,
                          add_special_tokens = True,
                          max_length = max_len,
                          pad_to_max_length = True,
                          return_attention_mask = True,
                          return_tensors = 'pt',
                    )

      inputs.append(encoded_tweets['input_ids'])
      attention_masks.append(encoded_tweets['attention_mask'])
      # Lightweight progress report for very large datasets.
      if counter % 100000 == 0:
        print('completed {}'.format(counter))

  # Each encoding is appended as one tensor per tweet; concatenate along dim 0
  # to get one tensor for the whole dataset.
  input_ids = torch.cat(inputs, dim=0)
  attention_masks = torch.cat(attention_masks, dim=0)
  # np.asarray accepts both plain lists and pandas Series.
  labels = torch.tensor(np.asarray(y_data))

  full_dataset = TensorDataset(input_ids, attention_masks, labels)
  sampler = RandomSampler(full_dataset) if shuffle else SequentialSampler(full_dataset)
  dataloader = DataLoader(
              full_dataset,
              sampler = sampler,
              batch_size = batch_size
          )

  return dataloader

In [0]:
# Optimizer hyperparameters. These names were used below without ever being
# defined; the values here are common BERT fine-tuning starting points -
# NOTE(review): confirm/tune for this dataset.
lr = 2e-5
eps = 1e-8

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Pretrained cased BERT with a 2-way classification head (troll vs control).
model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels = 2,
    output_attentions = False, 
    output_hidden_states = False, 
)
model.to(device)
optimizer = AdamW(model.parameters(),
                  lr = lr,
                  eps = eps
                )
epochs = 1                                      

In [0]:
def train_model(epochs, model, train_dataloader, device, validation_dataloader, optimizer=None):
  """
  Trains model on train data and evaluates it on validation data

  After every epoch the mean training loss, the validation accuracy and the
  mean validation loss are printed.

  Args:
    epochs (int): Number of epochs to train model
    model (Pytorch model): Model to train
    train_dataloader (Pytorch Dataloader): Iterable of train data
    device (Pytorch Device): If GPU available then GPU, otherwise CPU
    validation_dataloader (Pytorch Dataloader): Iterable of validation data
    optimizer (Pytorch Optimizer): Optimizer that updates the model. When
      omitted, the module-level `optimizer` variable is used

  Returns:
    model (Pytorch Model): Trained model

  Raises:
    ValueError: if no optimizer is passed and no module-level one exists
  """
  if optimizer is None:
    # Backward compatibility: callers that do not pass an optimizer rely on
    # one having been created at module level.
    optimizer = globals().get('optimizer')
  if optimizer is None:
    raise ValueError('train_model needs an optimizer: pass optimizer=... '
                     'or define a module-level `optimizer`')

  start = time.time()
  for epoch in range(epochs):
    epoch_loss = 0
    model.train()

    for batch in train_dataloader:
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)

      model.zero_grad()

      # Index the outputs instead of tuple-unpacking them: indexing works
      # whether the model returns a plain tuple or a dict-like output object.
      outputs = model(b_input_ids,
                      token_type_ids=None,
                      attention_mask=b_input_mask,
                      labels=b_labels)
      loss = outputs[0]

      epoch_loss += loss.item()

      loss.backward()
      # Clip the gradient norm to 1.0 before the optimizer step.
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
      optimizer.step()

    # max(..., 1) avoids a ZeroDivisionError on an empty dataloader.
    train_loss = epoch_loss / max(len(train_dataloader), 1)

    # The reported time is cumulative since the start of training.
    print('train loss for epoch {} is {} and training took {}'.format(epoch+1, train_loss, time.time()-start))
    model.eval()

    val_gt = []
    val_preds = []
    val_loss = 0

    for batch in validation_dataloader:
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)

      with torch.no_grad():
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
      loss, logits = outputs[0], outputs[1]

      val_loss += loss.item()

      logits = logits.detach().cpu().numpy()
      label_ids = b_labels.cpu().numpy()

      val_gt.extend(label_ids)
      # Predicted class = index of the largest logit.
      val_preds.extend(np.argmax(logits, axis=1))

    val_accuracy = accuracy_score(val_gt, val_preds)

    avg_val_loss = val_loss / max(len(validation_dataloader), 1)

    print('validation accuracy for epoch {} is {}'.format(epoch+1, val_accuracy))
    print('validation loss for epoch {} is {}'.format(epoch+1, avg_val_loss))

  return model

In [0]:
def predict_test(model, test_dataloader, device=None):
  """
  Predict class logits for test dataset based on trained model

  Args:
    model (Pytorch Model): Trained model
    test_dataloader (Pytorch dataloader): Iterable over test data
    device (Pytorch Device): Device to run on. Defaults to the device the
      model's parameters are on

  Returns:
    test_preds (list): One array of raw (un-normalised) class logits per test
      datapoint, in the order the dataloader yields them
    test_gt (list): True label for each test datapoint, in the same order
  """
  model.eval()
  if device is None:
    # Run the batches on whatever device the model weights already live on.
    device = next(model.parameters()).device

  test_preds = []
  test_gt = []

  for batch in test_dataloader:
    b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask)

    # No labels are passed, so the first output is taken as the logits.
    logits = outputs[0].detach().cpu().numpy()
    label_ids = b_labels.cpu().numpy()

    # extend (not append) so the lists hold one entry per datapoint rather
    # than one entry per batch.
    test_preds.extend(logits)
    test_gt.extend(label_ids)

  return test_preds, test_gt

Predicting labels for 30,000 test sentences...
    DONE.


In [0]:
def format_probs(test_preds, full_dataset, test_indices):
  """
  Format logits from BERT into form usable for final classifier

  The logits are turned into class probabilities with a softmax, and the
  probability of class 1 (troll) is attached to the test rows of the dataset.

  Args:
    test_preds (list): Logits for the test set. Entries may be per-datapoint
      vectors or per-batch 2-D arrays; they must be in the same order as
      test_indices
    full_dataset (dataframe): Dataframe of all data
    test_indices (list): Positional indices of full dataset used for test data

  Returns:
    final_output (dataframe): Copy of the test rows with an added
      'Probability Troll' column

  Raises:
    ValueError: if the number of predictions differs from len(test_indices)
  """
  if len(test_preds) == 0:
    prob_troll = np.array([])
  else:
    # atleast_2d + vstack gives one row per datapoint for both per-sample
    # vectors and per-batch arrays.
    logits = np.vstack([np.atleast_2d(np.asarray(p)) for p in test_preds])
    prob_troll = softmax(logits, axis=1)[:, 1]

  if len(prob_troll) != len(test_indices):
    raise ValueError('Got {} predictions for {} test indices'.format(
        len(prob_troll), len(test_indices)))

  # .copy() so the new column is written to an independent frame instead of a
  # slice of full_dataset.
  final_output = full_dataset.iloc[test_indices].copy()
  final_output['Probability Troll'] = prob_troll
  return final_output

30000

In [0]:
def train_test_classifier(final_output, random_state=30):
  """
  Train and test binary classifier

  A logistic regression is fit on 80% of the rows and evaluated on the
  remaining 20%. Every column except 'label' and 'content' is used as a
  feature.

  Args:
    final_output (dataframe): Dataframe containing all data needed for classifier
    random_state (int): Seed for the train/test split so results are reproducible

  Returns:
    score (float): Mean accuracy
    cm (array): Confusion matrix
    fpr (array): False positive rate
    tpr (array): True positive rate
  """
  y = final_output['label']
  # NOTE(review): assumes every remaining column is numeric and free of
  # missing values - TODO confirm against the dataset.
  X = final_output.drop(columns=['label', 'content'])

  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size = 0.2, random_state = random_state)
  logreg = LogisticRegression()
  logreg.fit(X_train, y_train)
  preds = logreg.predict(X_test)
  score = logreg.score(X_test, y_test)
  cm = confusion_matrix(y_test, preds)
  # Build the ROC curve from the predicted probability of the positive class;
  # hard 0/1 predictions would give a curve with only a single threshold.
  troll_scores = logreg.predict_proba(X_test)[:, 1]
  fpr, tpr, _ = roc_curve(y_test, troll_scores)

  return score, cm, fpr, tpr

In [0]:
def output_metrics(score, cm, fpr, tpr):
  """
  Prints metrics and plots ROC curve

  Args:
    score (float): Mean accuracy
    cm (array): Confusion matrix
    fpr (array): False positive rate
    tpr (array): True positive rate

  """
  print('Accuracy of Log Reg model {}'.format(score))
  print('Confusion Matrix {}'.format(cm))

  plt.plot(fpr, tpr)
  # Call the label functions; assigning to plt.xlabel / plt.ylabel would
  # replace them with strings and leave the axes unlabelled.
  plt.xlabel('False Positive Rate')
  # The y values are tpr, so the axis is the true positive rate.
  plt.ylabel('True Positive Rate')
  plt.title('ROC Curve')
  plt.show()

In [0]:
# Load and Process Data
path = r'data/russian-troll-tweets-master'
troll_frame = load_troll_tweets(path)
control_file_path = 'data/control_data.pkl'
full_dataset, tweet_data, labels = create_dataset(troll_frame, control_file_path)
X_train, y_train, X_val, y_val, X_test, y_test, train_indices, val_indices, test_indices = split_dataset(tweet_data, labels)

# Tokenize and pad data and set up dataloaders
batch_size = 32
# Note: this rebinds `tokenizer` from the imported BertTokenizer class to a
# loaded tokenizer instance.
tokenizer = tokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
max_len = get_max_length(tweet_data, tokenizer)
train_dataloader = tokenize_prep_data(tokenizer, X_train, y_train, max_len, batch_size=batch_size)
validation_dataloader = tokenize_prep_data(tokenizer, X_val, y_val, max_len, batch_size=batch_size)
test_dataloader = tokenize_prep_data(tokenizer, X_test, y_test, max_len, batch_size=batch_size)
# Re-wrap the test set in a sequential loader: the predictions are later
# attached to full_dataset.iloc[test_indices] by position, so they must come
# out in the same order as X_test rather than in a random order.
test_dataloader = DataLoader(
    test_dataloader.dataset,
    sampler = SequentialSampler(test_dataloader.dataset),
    batch_size = batch_size
)

# Fine-tune BERT model
# Optimizer hyperparameters: common BERT fine-tuning starting points -
# NOTE(review): confirm/tune for this dataset.
lr = 2e-5
eps = 1e-8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels = 2,
    output_attentions = False, 
    output_hidden_states = False, 
)
model.to(device)
# train_model reads this module-level `optimizer`.
optimizer = AdamW(model.parameters(), lr = lr, eps = eps)
epochs = 1                                      
trained_model = train_model(epochs, model, train_dataloader, device, validation_dataloader)

# Predict on test set and format outputs for binary classifier
test_preds, test_gt = predict_test(trained_model, test_dataloader)
final_output = format_probs(test_preds, full_dataset, test_indices)

# Train binary classifier, and output classification metrics on test set
score, cm, fpr, tpr = train_test_classifier(final_output)
output_metrics(score, cm, fpr, tpr)