In [None]:
pip install ekphrasis

In [None]:
pip install transformers

In [None]:
import pandas as pd
import os
import numpy as np
import torch
import random
import functools
import operator
import cv2
import seaborn as sns
import matplotlib.pyplot as plt


from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader, Dataset, SequentialSampler
from transformers import get_linear_schedule_with_warmup, RobertaModel, RobertaConfig, RobertaTokenizer, AutoTokenizer, AutoModel, AutoConfig
from sklearn.metrics import matthews_corrcoef, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm, trange
# from ekphrasis.classes.preprocessor import TextPreProcessor
# from ekphrasis.classes.tokenizer import SocialTokenizer
# from ekphrasis.dicts.emoticons import emoticons
from keras.models import load_model, Model

Using TensorFlow backend.


### Helper Functions

In [None]:
def clean_text(data, normalize_list, annotate_list):
    """
    Preprocess a series of raw texts with the Ekphrasis library.

    data: Pandas series object containing strings of text

    normalize_list: list of data features to clean (passed as ``normalize``)

    annotate_list: list of data features to annotate (passed as ``annotate``)

    Returns a series in which every text is replaced by its processed tokens
    joined with single spaces.
    """
    # Imported locally: the notebook's import cell has these three imports
    # commented out, so without this the function fails with a NameError.
    from ekphrasis.classes.preprocessor import TextPreProcessor
    from ekphrasis.classes.tokenizer import SocialTokenizer
    from ekphrasis.dicts.emoticons import emoticons

    text_processor = TextPreProcessor(
        normalize=normalize_list,
        annotate=annotate_list,
        fix_html=True,
        segmenter="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=True,
        tokenizer=SocialTokenizer(lowercase=True).tokenize,
        dicts=[emoticons]
    )

    return data.map(lambda text: " ".join(text_processor.pre_process_doc(text)))


def early_stopping(val_loss_values, early_stop_vals):
    """
    Decide whether training should stop, based on the validation-loss history.

    val_loss_values: list of validation losses, one per finished epoch

    early_stop_vals: dict with a "patience" entry (number of earlier epochs to
    average over) and a "delta" entry (required improvement over that average)

    Returns True when training should stop and False when it should continue.
    """
    patience = early_stop_vals["patience"]

    # Too little history to judge yet: keep training.
    if len(val_loss_values) <= patience:
        return False

    # Average of the `patience` losses that precede the most recent one.
    reference = np.mean(np.array(val_loss_values[-1 - patience:-1]))
    improved = val_loss_values[-1] <= reference - early_stop_vals["delta"]

    return False if improved else True


def metrics(labels, preds, argmax_needed: bool = False):
    """
    Scores predictions against the correct labels.

    labels: list of correct labels

    preds: list of model predictions (or per-class scores when argmax_needed is set)

    argmax_needed (boolean): converts logits to predictions. Defaulted to false.

    Returns a tuple (results, labels, preds). results is a dictionary holding the
    Matthews correlation coefficient, accuracy, confusion matrix and the weighted
    precision, recall and f1 score; preds is the (possibly argmax-ed) predictions.
    """

    if argmax_needed == True:
        # One row of scores per sample -> index of the highest score
        preds = np.argmax(preds, axis=1).flatten()

    results = {
        "mcc": matthews_corrcoef(labels, preds),
        "acc": accuracy_score(labels, preds),
        "confusion_matrix": confusion_matrix(labels, preds),
    }

    # The three class-averaged scores all use the same "weighted" averaging
    weighted_scorers = (("precision", precision_score), ("recall", recall_score), ("f1", f1_score))
    for key, scorer in weighted_scorers:
        results[key] = scorer(labels, preds, average= "weighted")

    return results, labels, preds


def combine_text(df):
    """
    Combines tweet and image text into one string per row

    df: Dataframe which holds the data; must have a "tweet_text" column and an
    "img_text" column

    Returns a list with one entry per row, in row order. When the image text is
    a string it is appended to the tweet text, separated by a single space so the
    last tweet word and the first image word are not fused together; otherwise
    (e.g. a missing value) the tweet text is used on its own.
    """
    combined_text = []

    # Iterate over the column values directly so rows are visited by position;
    # this also works when the dataframe does not have a default 0..n-1 index.
    for tweet_text, image_text in zip(df["tweet_text"], df["img_text"]):
        if isinstance(image_text, str):
            combined_text.append(tweet_text + " " + image_text)
        else:
            combined_text.append(tweet_text)

    return combined_text


def training_plot(train_loss_values, val_loss_values):
    """
    Draws the training and validation loss curves, one point per epoch

    train_loss_values: list of floats; training loss after each epoch

    val_loss_values: list of floats; validation loss after each epoch

    Returns the result of plt.show().
    """
    sns.set(style='darkgrid')
    plt.rcParams["figure.figsize"] = (12,6)

    # (values, line format, legend label) for each curve, drawn in this order
    curves = (
        (train_loss_values, 'b-o', "train"),
        (val_loss_values, 'g-o', "valid"),
    )
    for values, line_format, legend_label in curves:
        plt.plot(values, line_format, label=legend_label)

    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()

    return plt.show()


def model_saver(model, model_type, model_implementation, output_directory, training_dict, labels, preds, ids, results, tokenizer= None):
    """
    Saves Model and other outputs into <output_directory>/<model_type>

    model: Model to be saved

    model_type (string): Name of model; used as sub-folder name and file prefix

    model_implementation: "Keras" or "Pytorch"; any other value saves no model

    output_directory: Directory to folder to save file in

    training_dict: Dictionary of training and validation values

    labels: List of labels for test set

    preds: List of model predictions after passed through argmax()

    ids: List of sample ids, in the same order as labels and preds

    results: Dictionary of metrics

    tokenizer: Currently unused; accepted for interface compatibility. Defaulted to None.

    Returns None. Unlike changing into the output folder, every file is written
    through its full path, so the process working directory is left untouched.
    """

    output_directory = os.path.join(output_directory, model_type)
    os.makedirs(output_directory, exist_ok=True)

    def out_path(file_name):
        # Full path of a file inside the model's output folder
        return os.path.join(output_directory, file_name)

    np.save(out_path(model_type+"_dogwhistle_train_results.npy"), training_dict) #save training dict
    np.save(out_path(model_type+"_dogwhistle_test_results.npy"), results) #save test metrics

    test_predictions = pd.DataFrame([ids, labels, preds]) #save predictions and labels
    test_predictions = test_predictions.T
    test_predictions = test_predictions.rename(columns={0: 'Ids', 1: 'Labels', 2: 'Predictions'})
    test_predictions.to_csv(out_path(model_type+"_dogwhistle_predictions.csv"))

    #save models
    if model_implementation == "Pytorch":
        torch.save(model.state_dict(), out_path(model_type+"_model"))

    if model_implementation == "Keras":
        # File name kept as-is so existing load_model() calls keep working
        model.save(out_path("image_model.h5"))

    print("Saving complete.")
    return None

### Text Feature Extraction

In [None]:
class Transformer_features(nn.Module):
    """Sentence-level text feature extractor built on the fine-tuned RoBERTa checkpoint."""

    # Folder holding the fine-tuned RoBERTa checkpoint used by both methods
    MODEL_PATH = '/content/drive/My Drive/Dog_Whistle_Code/Fine_Tuned_Models/Text/RoBERTa'

    def __init__(self, method_type):
        """
        method_type: Extracts features from Bert either using the method in Devlin et al ("Devlin") or Sabat el al ("Sabat")

        Raises ValueError for any other method_type.
        """
        super(Transformer_features, self).__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        if method_type == "Devlin":
            self.config = AutoConfig.from_pretrained(self.MODEL_PATH, output_hidden_states = True)
            # from_pretrained loads the fine-tuned weights; from_config would only
            # build the architecture with randomly initialised weights.
            self.model = AutoModel.from_pretrained(self.MODEL_PATH, config=self.config).to(self.device)
        elif method_type == "Sabat":
            self.model = RobertaModel.from_pretrained(self.MODEL_PATH).to(self.device)
        else:
            raise ValueError("method_type must be 'Devlin' or 'Sabat', got {!r}".format(method_type))

    def forward(self, dataloader, method_type):
        """
        This function receives tokenized tensors and the sentence pair IDs and returns a sentence embedding for each input sequence

        dataloader: iterable of batches (input_ids, attention_mask, ids)

        method_type: "Devlin" averages the token embeddings of each of the last four
        hidden layers and concatenates the four averages; "Sabat" averages the token
        embeddings of the last layer only

        Returns a float64 tensor with one row per input: column 0 is the id, the
        remaining columns are the sentence embedding. Everything is kept as float64
        so the embedding values are not truncated to integers when joined with the ids.
        """
        if method_type not in ("Devlin", "Sabat"):
            raise ValueError("method_type must be 'Devlin' or 'Sabat', got {!r}".format(method_type))

        self.model.eval()

        feature_batches, id_batches = [], []

        for batch in dataloader:
            input_ids = batch[0].to(self.device)
            # The mask keeps padding positions from being attended to (both methods)
            attention_mask = batch[1].to(self.device)

            with torch.no_grad():
                outputs = self.model(input_ids, attention_mask=attention_mask)

            # Integer indexing works for tuple returns and for ModelOutput objects:
            # [0] = last hidden state, [2] = tuple of all hidden states.
            if method_type == "Devlin":
                hidden_states = outputs[2]
                sentence_features = torch.cat([torch.mean(layer, dim=1) for layer in hidden_states[-4:]], dim=1)
            else:
                sentence_features = torch.mean(outputs[0], dim=1)

            feature_batches.append(sentence_features)
            id_batches.append(torch.as_tensor(batch[2]).reshape(-1))

        if not feature_batches:
            # Empty dataloader: return a 0-row matrix of the usual width
            hidden_size = self.model.config.hidden_size
            width = 4 * hidden_size if method_type == "Devlin" else hidden_size
            return torch.zeros((0, 1 + width), dtype=torch.float64, device=self.device)

        # Concatenate once at the end instead of growing a tensor batch by batch
        features = torch.cat(feature_batches, dim=0).to(torch.float64)
        ids = torch.cat(id_batches, dim=0).to(self.device, torch.float64)

        out_matrix = torch.cat((ids.unsqueeze(dim=1), features), dim=1)

        return out_matrix

In [None]:
#Text Hyperparameters
# Passed to clean_text as the Ekphrasis `normalize` option
NORMALIZE_LIST = ['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date', 'number']
# Passed to clean_text as the Ekphrasis `annotate` option
ANNOTATE_LIST = ['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored']
# Tokenizer loaded from the same checkpoint folder as the feature-extraction model
TOKENIZER = RobertaTokenizer.from_pretrained('/content/drive/My Drive/Dog_Whistle_Code/Fine_Tuned_Models/Text/RoBERTa')
# NOTE(review): the text datasets below are built without passing this value, so they
# rely on DogWhistleDatasetText's own default pad_length (also 100).
PAD_LENGTH = 100

In [None]:
class DogWhistleDatasetText(Dataset):
    """Dataset yielding (input_ids, attention_mask, image_number) for each row of df."""

    def __init__(self, df, tokenizer, pad_length: int=100):
        """
        df: Dataframe with a "combined_text" column and an "image_number" column

        tokenizer: tokenizer exposing encode_plus

        pad_length (int): length every sequence is padded to. Defaulted to 100.
        """
        self.data = df
        self.tokenizer = tokenizer
        self.pad_length = pad_length

    def __len__(self):
        return (self.data.shape[0])

    def __getitem__(self, i):
        # Positional lookup: i comes from a sampler as 0..len-1, which only matches
        # the index labels when the dataframe has a default index.
        text = self.data["combined_text"].iloc[i]
        encoded_dict = self.tokenizer.encode_plus(text, add_special_tokens = True, max_length = self.pad_length, pad_to_max_length = True, return_attention_mask = True, return_tensors = 'pt')

        image_number = self.data["image_number"].iloc[i]

        # Drop the leading batch dimension of size 1 (1 x pad_length -> pad_length)
        return (encoded_dict['input_ids'].squeeze(0), encoded_dict['attention_mask'].squeeze(0), image_number)


In [None]:
# Prepare data
# Builds the text datasets and dataloaders for the train / dev / test splits.
# NOTE(review): `train`, `dev` and `test` are rebound several times in this cell
# (raw CSV -> with combined_text -> two-column subset), so it is not idempotent
# without re-reading the CSVs.

#Load data
train = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/dog_whistle_train.csv", encoding='utf-8')
dev = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/dog_whistle_dev.csv", encoding='utf-8')
test = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/dog_whistle_test.csv", encoding='utf-8')


#Clean data
# For each split: join tweet and image text, then run the Ekphrasis preprocessing
train["combined_text"] = combine_text(train)
train["combined_text"] = clean_text(train["combined_text"], NORMALIZE_LIST, ANNOTATE_LIST)
dev["combined_text"] = combine_text(dev)
dev["combined_text"] = clean_text(dev["combined_text"], NORMALIZE_LIST, ANNOTATE_LIST)
test["combined_text"] = combine_text(test)
test["combined_text"] = clean_text(test["combined_text"], NORMALIZE_LIST, ANNOTATE_LIST)


#Subset necessary data
# Only the id column and the cleaned text are needed by DogWhistleDatasetText
train = train[["image_number", "combined_text"]]
dev = dev[["image_number", "combined_text"]] 
test = test[["image_number", "combined_text"]] 


#Create Dataset
train_dataset = DogWhistleDatasetText(train, TOKENIZER)
dev_dataset = DogWhistleDatasetText(dev, TOKENIZER)
test_dataset = DogWhistleDatasetText(test, TOKENIZER)


#Create dataloader
# No shuffling: extracted feature rows stay in the CSV's row order
train_dataloader = DataLoader(train_dataset, batch_size=32)
dev_dataloader = DataLoader(dev_dataset, batch_size=32) 
test_dataloader = DataLoader(test_dataset, batch_size=32)


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading english - 1grams ...
Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading english - 1grams ...
Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading english - 1grams ...


In [None]:
# Extract text features with the "Devlin" method for all three splits.
# NOTE(review): the "Sabat" extraction cell assigns the same three variable names,
# so save these results before running that cell.
TextExtractor = Transformer_features("Devlin")

train_text_features = TextExtractor(train_dataloader, "Devlin")
print("Done")
dev_text_features = TextExtractor(dev_dataloader, "Devlin")
print("Done")
test_text_features = TextExtractor(test_dataloader, "Devlin")
print("Done")

Done
Done
Done


In [None]:
# Extract text features with the "Sabat" method for all three splits.
# NOTE(review): this rebinds TextExtractor and the three *_text_features names that
# the "Devlin" extraction cell also assigns, discarding any unsaved Devlin results.
TextExtractor = Transformer_features("Sabat")

train_text_features = TextExtractor(train_dataloader, "Sabat")
print("Done")
dev_text_features = TextExtractor(dev_dataloader, "Sabat")
print("Done")
test_text_features = TextExtractor(test_dataloader, "Sabat")
print("Done")

Done
Done
Done


In [None]:
#Save Devlin
# NOTE(review): saves whatever *_text_features currently holds. Run this right after
# the "Devlin" extraction cell; if the "Sabat" extraction cell ran in between, Sabat
# features end up in these files. The names are rebound to NumPy arrays here.
train_text_features = train_text_features.cpu().numpy()
np.save("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/text_features.npy", train_text_features)

dev_text_features = dev_text_features.cpu().numpy()
np.save("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/text_features.npy", dev_text_features)

test_text_features = test_text_features.cpu().numpy()
np.save("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/text_features.npy", test_text_features)


In [None]:
#Save Sabat
# NOTE(review): `.cpu()` needs torch tensors, so this must run right after the "Sabat"
# extraction cell. The "Save Devlin" cell rebinds the same names to NumPy arrays, and
# running this cell after it fails.
train_text_features= train_text_features.cpu().numpy()
np.save("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/text_features_sabat.npy", train_text_features)

dev_text_features = dev_text_features.cpu().numpy()
np.save("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/text_features_sabat.npy", dev_text_features)

test_text_features = test_text_features.cpu().numpy()
np.save("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/text_features_sabat.npy", test_text_features)

### Image Feature Extraction

In [None]:
def Image_features(trained_model, dataloader):
    """ Extracts image features from images

    trained_model: pre-trained Keras image model; features are read from the
    output of its second-to-last layer

    dataloader: dataloader object yielding batches of (images, ids)

    Returns a 2-D array with one row per image: column 0 holds the id and the
    remaining columns hold the feature vector.
    """
    # Build the truncated network once; it is identical for every batch
    feature_model = Model(trained_model.input, trained_model.layers[-2].output)

    feature_batches, id_batches = [], []
    num_batches = len(dataloader)

    for num, batch in enumerate(dataloader):
        if num % 25 == 0:
            print("Processing batch {} of {}".format(num, num_batches))
        feature_batches.append(feature_model.predict(batch[0]))
        id_batches.append(np.asarray(batch[1]).reshape(-1))

    if not feature_batches:
        # Empty dataloader: 0 rows, id column plus the model's feature width
        return np.zeros((0, 1 + int(feature_model.output_shape[-1])))

    # Concatenate once at the end instead of growing an array batch by batch
    combined_output = np.concatenate(feature_batches, axis=0).astype(np.float64)
    id_list = np.concatenate(id_batches, axis=0)

    out_matrix = np.concatenate((np.expand_dims(id_list, axis=1), combined_output), axis=1)

    return out_matrix


In [None]:
class DogWhistleDatasetImage(Dataset):
    """Dataset yielding (resized image, image_number) for each row of df."""

    def __init__(self, df, base_path, image_size: int=299):
        """
        df: Dataframe with an "image_number" column

        base_path: folder containing the images, stored as <image_number>.jpg

        image_size (int): images are resized to image_size x image_size. Defaulted to 299.
        """
        self.data = df
        self.base_path = base_path
        self.image_size = image_size

    def __len__(self):
        return (self.data.shape[0])

    def __getitem__(self, i):
        # Positional lookup, so the dataframe does not need a default index
        image_number = self.data["image_number"].iloc[i]
        path = self.base_path + "/" + str(image_number) + ".jpg"

        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals a missing or unreadable file by returning None;
            # fail with the offending path instead of an opaque resize error.
            raise FileNotFoundError("Could not read image file: {}".format(path))
        image = cv2.resize(image, (self.image_size, self.image_size))

        sample = (image, image_number)

        return sample

In [None]:
#Load data
# NOTE(review): rebinds `train`/`dev`/`test` and the dataset/dataloader names used by
# the text-feature cells above; those cells must not be re-run after this one without
# re-running their own data preparation.
train = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/dog_whistle_train.csv", encoding='utf-8')
dev = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/dog_whistle_dev.csv", encoding='utf-8')
test = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/dog_whistle_test.csv", encoding='utf-8')


#Subset necessary data
# Only the image id is needed to locate each image file
train = train[["image_number"]]
dev = dev[["image_number"]] 
test = test[["image_number"]] 


#Create Dataset
# All three splits read their images from the same folder
train_dataset = DogWhistleDatasetImage(train, "/content/drive/My Drive/Dog_Whistle_Code/Data/Images")
dev_dataset = DogWhistleDatasetImage(dev, "/content/drive/My Drive/Dog_Whistle_Code/Data/Images")
test_dataset = DogWhistleDatasetImage(test, "/content/drive/My Drive/Dog_Whistle_Code/Data/Images")


#Create dataloader
# No shuffling: extracted feature rows stay in the CSV's row order
train_dataloader = DataLoader(train_dataset, batch_size=32)
dev_dataloader = DataLoader(dev_dataset, batch_size=32) 
test_dataloader = DataLoader(test_dataset, batch_size=32)

In [None]:
# Load the saved Keras image model and extract features for each split
ImageExtractor = load_model('/content/drive/My Drive/Dog_Whistle_Code/Fine_Tuned_Models/Image/Xception/image_model.h5') #using pre-trained Xception

train_image_features = Image_features(ImageExtractor, train_dataloader)
dev_image_features = Image_features(ImageExtractor, dev_dataloader)
test_image_features = Image_features(ImageExtractor, test_dataloader)

Processing batch 0 of 125
Processing batch 25 of 125
Processing batch 50 of 125
Processing batch 75 of 125
Processing batch 100 of 125
Processing batch 0 of 16
Processing batch 0 of 16


In [None]:
# Save
# One .npy file per split; each array has the image id in column 0 followed by the features
np.save("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/image_features.npy", train_image_features)
np.save("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/image_features.npy", dev_image_features)
np.save("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/image_features.npy", test_image_features)

### Combine Feature Data

In [None]:
# Load Text data
# The "Devlin" files are commented out; the "Sabat" files are the active choice.
# Switch the load lines here together with the save lines at the bottom of this cell.
# train_text = np.load("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/text_features.npy", allow_pickle=True)
# dev_text = np.load("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/text_features.npy", allow_pickle=True)
# test_text = np.load("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/text_features.npy", allow_pickle=True)
train_text = np.load("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/text_features_sabat.npy", allow_pickle=True)
dev_text = np.load("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/text_features_sabat.npy", allow_pickle=True)
test_text = np.load("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/text_features_sabat.npy", allow_pickle=True)


# Load Image data
train_image = np.load("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/image_features.npy", allow_pickle=True)
dev_image = np.load("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/image_features.npy", allow_pickle=True)
test_image = np.load("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/image_features.npy", allow_pickle=True)

# Load Other data
# The original CSVs are re-read only to obtain the label column
train2 = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/dog_whistle_train.csv", encoding='utf-8')
dev2 = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/dog_whistle_dev.csv", encoding='utf-8')
test2 = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/dog_whistle_test.csv", encoding='utf-8')

# Merge
# Column 0 of each feature array is the id, so [:, 1:] keeps only the features.
# Text and image features are placed side by side by row position, and labels are
# attached through the default integer index; no join on the id is performed.
# NOTE(review): this assumes text rows, image rows and CSV rows are in the same order
# (the extraction dataloaders above are created without shuffling) - confirm.
# NOTE(review): both feature frames have integer column labels starting at 0, so the
# concatenated frame contains duplicate column names.
train = pd.concat((pd.DataFrame(train_text[:, 1:]), pd.DataFrame(train_image[:, 1:])), axis = 1)
train["ids"] = train_text[:, :1]
train["labels"] = train2["Primary_numeric_gt"]
dev = pd.concat((pd.DataFrame(dev_text[:, 1:]), pd.DataFrame(dev_image[:, 1:])), axis = 1)
dev["ids"] = dev_text[:, :1]
dev["labels"] = dev2["Primary_numeric_gt"]
test = pd.concat((pd.DataFrame(test_text[:, 1:]), pd.DataFrame(test_image[:, 1:])), axis = 1)
test["ids"] = test_text[:, :1]
test["labels"] = test2["Primary_numeric_gt"]

# Save
# The index is written too (to_csv default), so the saved files gain a leading
# unnamed column; downstream code skips it by starting its feature slice at column 1.
# train.to_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/combined_features.csv")
# dev.to_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/combined_features.csv")
# test.to_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/combined_features.csv")
train.to_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/combined_features_sabat.csv")
dev.to_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/combined_features_sabat.csv")
test.to_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/combined_features_sabat.csv")

### Pytorch Implementation

In [None]:
class MultimodalClassifier(nn.Module):
    """MLP classifier over concatenated text and image feature vectors."""

    def __init__(self, MLP_type, hidden_size: int=50, dropout: float=0.2, num_labels: int=4, input_len: int = 5120):
        """Initializes the network structure
        MLP_type: Which paper's MLP structure to use, "Sabat" or "Gomez"

        hidden_size (int): Number of nodes in the hidden layers of the "Sabat" network. Defaulted to 50.

        dropout (float): Rate at which nodes are deactivated ("Sabat" network). Defaulted to 0.2.

        num_labels (int): Number of labels to predict. Defaulted to 4.

        input_len (int): Length of input vector. Defaulted to 5120 (text feature length (4096) + image feature length (1024)).

        Raises ValueError for an unknown MLP_type.
        """
        super(MultimodalClassifier, self).__init__()
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Both heads end in a plain Linear layer: nn.CrossEntropyLoss (used in trainer)
        # applies log-softmax itself, so the network must output raw logits.
        if MLP_type == "Sabat":
            self.classifier = nn.Sequential(
                nn.Linear(input_len, hidden_size),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_size, hidden_size),
                nn.ReLU(),
                nn.Linear(hidden_size, num_labels),
            )
        elif MLP_type == "Gomez":
            self.classifier = nn.Sequential(
                nn.Linear(input_len, input_len),
                nn.BatchNorm1d(input_len),
                nn.ReLU(),
                nn.Linear(input_len, 1024),
                nn.BatchNorm1d(1024),
                nn.ReLU(),
                nn.Linear(1024, 512),
                nn.BatchNorm1d(512),
                nn.ReLU(),
                nn.Linear(512, num_labels),
            )
        else:
            raise ValueError("MLP_type must be 'Sabat' or 'Gomez', got {!r}".format(MLP_type))

    def forward(self, features):
        """Initiates forward pass through network

        features: Matrix of size number of tweets x input_len containing concatenated text and image features

        Returns the raw (un-normalised) class scores, one row per tweet.
        """
        return self.classifier(features.to(torch.float))

    def trainer(self, input_model, train_data, dev_data, early_stop_vals: dict, epochs: int = 25, learning_rate: float = 1e-5, weight_decay: float = 0.1, warmup: float = 0.06):
        """
        Trains multimodal model

        input_model: Instantiation of model

        train_data: Dataloader object yielding (features, labels, ids) batches of train data

        dev_data: Dataloader object yielding (features, labels, ids) batches of dev data

        early_stop_vals: Dictionary containing patience value (int) and delta value (float). The patience determines the number of epochs to wait to achieve the given delta

        epochs (int): Number of times to run through all batches. Default value is 25.

        learning_rate (float): Default value is 1e-5.

        weight decay (float): Default value is 0.1

        warmup (float): Default value is 0.06; percentage of training steps in warmup.

        Returns a dictionary with the per-epoch train/validation accuracy and loss,
        whether training ran for all epochs or was stopped early.
        """
        model = input_model.to(self.device)
        self.optimizer = optim.AdamW(model.classifier.parameters(), lr = learning_rate, weight_decay = weight_decay)

        # The scheduler expects the TOTAL number of steps (warmup included), as integers
        total_steps = len(train_data) * epochs
        self.scheduler = get_linear_schedule_with_warmup(self.optimizer, num_warmup_steps = int(warmup * total_steps), num_training_steps = total_steps)
        criterion = nn.CrossEntropyLoss().to(self.device)

        train_loss_values, val_loss_values, train_acc_values, val_acc_values = [], [], [], []

        for epoch in trange(epochs, desc= "Epoch"):
            if early_stopping(val_loss_values, early_stop_vals):
                print("Stopping early...")
                break

            print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
            print('Training...')

            train_total_loss, train_total_len, train_num_correct = 0.0, 0, 0

            model.train()

            for step, batch in enumerate(train_data):
                if step % 50 == 0:
                    print("Processing batch...{} of {}".format(step, len(train_data)))

                self.optimizer.zero_grad()
                batch_features, batch_labels, _ = tuple(t.to(self.device) for t in batch)
                train_total_len += batch_features.shape[0]

                logits = model(batch_features)

                loss = criterion(logits, batch_labels)
                # .item() keeps a plain float, so the autograd graph of every batch is
                # not kept alive and the loss history can be plotted directly
                train_total_loss += loss.item()

                loss.backward()
                # Clip after backward and before the step, otherwise it has no effect
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                self.optimizer.step()
                self.scheduler.step()

                train_num_correct += (logits.argmax(dim=1) == batch_labels).sum().item()

            train_acc = train_num_correct / train_total_len
            train_acc_values.append(train_acc)
            avg_train_loss = train_total_loss / len(train_data)
            train_loss_values.append(avg_train_loss)

            print()
            print("Running Validation...")
            print()

            val_total_loss, val_total_len, val_num_correct = 0.0, 0, 0

            model.eval()

            for batch in dev_data:
                batch_features, batch_labels, _ = tuple(t.to(self.device) for t in batch)
                val_total_len += batch_features.shape[0]

                with torch.no_grad():
                    logits = model(batch_features)
                    loss = criterion(logits, batch_labels)

                val_total_loss += loss.item()
                val_num_correct += (logits.argmax(dim=1) == batch_labels).sum().item()

            val_acc = val_num_correct / val_total_len
            val_acc_values.append(val_acc)
            avg_val_loss = val_total_loss / len(dev_data)
            val_loss_values.append(avg_val_loss)

            print("Epoch | Train Accuracy | Validation Accuracy | Training Loss | Validation Loss")
            print(f"{epoch+1:3d} |    {train_acc:.3f}    |       {val_acc:.3f}       |    {avg_train_loss:.3f}    |     {avg_val_loss:.3f}")
            print()

        # Reached after the last epoch and after an early stop alike
        training_plot(train_loss_values, val_loss_values)
        training_dict = {"Train Accuracy": train_acc_values, "Train Loss": train_loss_values, "Val Accuracy": val_acc_values, "Val Loss": val_loss_values}
        print("Training complete!")
        return training_dict

    def test(self, input_model, test_data):
        """
        Tests the model's performance based on several metrics

        input_model: Instantiation of model

        test_data: Dataloader object yielding (features, labels, ids) batches of test data

        Returns ((results, labels, predictions), ids): the output of metrics() and
        the list of sample ids in the order they were processed.
        """

        # len() of a dataloader is its number of batches
        print('Predicting labels for {} batches...'.format(len(test_data)))

        model = input_model.to(self.device)
        model.eval()

        predictions, true_labels, ids = [], [], []

        for batch in test_data:
            batch_features, batch_labels, batch_ids = tuple(t.to(self.device) for t in batch)

            with torch.no_grad():
                logits = model(batch_features)

            # extend() flattens the batches into one list with one entry per sample
            predictions.extend(logits.detach().cpu().numpy())
            true_labels.extend(batch_labels.to('cpu').numpy())
            ids.extend(batch_ids.cpu().numpy())

        print('    DONE.')

        return metrics(true_labels, predictions, argmax_needed= True), ids


In [None]:
#Hyperparameters for the PyTorch classifier
DROPOUT = 0.2
HIDDEN_SIZE = 100
BATCH_SIZE = 8
NUM_LABELS = 4
# NOTE(review): not used by the training cell below, which passes the epoch count as a literal
NUM_EPOCHS = 100
# patience: number of earlier epochs averaged; delta: required improvement over that average
EARLY_STOPPING = {"patience": 5, "delta": 0.005}
# Candidate learning rates; the training cell below uses the first entry
LEARNING_RATES = [0.0001, 0.001, 0.01, 0.1]
WEIGHT_DECAY = 0.1 
# Fraction of training steps spent in learning-rate warmup
WARMUP = 0.06 
OUTPUT_DIR = "/content/drive/My Drive/Dog_Whistle_Code/Fine_Tuned_Models/Multimodal/Feature Concatenation"


In [None]:
class DogWhistleDataset(Dataset):
    """Dataset yielding (features, label, id) for each row of a combined-features dataframe."""

    def __init__(self, df):
        """
        df: Dataframe read from a combined-features CSV. Expected column layout:
        saved index first, then the feature columns, then the last two columns;
        the label and id are read from the "labels" and "ids" columns.
        """
        self.data = df

    def __len__(self):
        return (self.data.shape[0])

    def __getitem__(self, i):
        # Row i, not row 0: every sample must get its own feature vector.
        # Columns start at 1 because of the Unnamed:0 header that gets added, and
        # stop before the last two columns.
        features = np.asarray(self.data.iloc[i, 1:-2], dtype=np.float64)
        # Positional lookup, so the dataframe does not need a default index
        labels = self.data["labels"].iloc[i]
        ids = self.data["ids"].iloc[i]
        sample = (features, labels, ids)

        return sample

In [None]:
# Load data
# The "Devlin" combined files are commented out; the "Sabat" files are the active choice.
# train = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/combined_features.csv")
# dev = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/combined_features.csv")
# test = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/combined_features.csv")

train = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/combined_features_sabat.csv")
dev = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/combined_features_sabat.csv")
test = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/combined_features_sabat.csv")

# Create dataset object
train_dataset = DogWhistleDataset(train)
dev_dataset = DogWhistleDataset(dev)
test_dataset = DogWhistleDataset(test)

# Create dataloader
# NOTE(review): dev and test are shuffled as well; each batch carries its ids, so
# predictions stay attributable, but the row order differs from the CSV on every run.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=True) 
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [None]:
# Build the "Sabat" MLP for 2048-long inputs, train it for 5 epochs with the first
# learning rate, then evaluate on the test set.
# NOTE(review): 2048 presumably is 1024 text + 1024 image features for the "Sabat"
# files loaded above - confirm against the saved feature widths.
Classifier = MultimodalClassifier("Sabat", HIDDEN_SIZE, DROPOUT, NUM_LABELS, 2048)
#logits, batch_labels = Classifier.trainer(Classifier, train_dataloader, dev_dataloader, EARLY_STOPPING, 10, 0.1, WEIGHT_DECAY, WARMUP)
train_dict = Classifier.trainer(Classifier, train_dataloader, dev_dataloader, EARLY_STOPPING, 5, LEARNING_RATES[0], WEIGHT_DECAY, WARMUP)
(metric_vals, labels, preds), ids = Classifier.test(Classifier, test_dataloader)
# NOTE(review): the commented call below does not match model_saver's parameter order
# (model_implementation is missing, and `metrics`/`ids` are swapped relative to
# ids/results); fix the arguments before re-enabling it.
#model_saver(Classifier, "Multimodal", OUTPUT_DIR, train_dict, labels, preds, metrics, ids)

print(metric_vals)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Random forest baseline on the same combined features.
# Uses the `train` and `test` dataframes loaded by the PyTorch section above.
sc = StandardScaler()
# Feature columns only: skip the saved index (column 0) and the last two columns.
# The scaler is fitted on train and re-used unchanged on test.
X_train = sc.fit_transform(train.iloc[:, 1:-2])
X_test = sc.transform(test.iloc[:, 1:-2])
y_train = train.loc[:, "labels"].values
y_test = test.loc[:, "labels"].values

# random_state fixed so the forest is reproducible
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# y_pred already holds class labels, so no argmax is requested
metric_vals, _, _ = metrics(y_test, y_pred)
metric_vals

  _warn_prf(average, modifier, msg_start, len(result))


{'acc': 0.6394422310756972, 'confusion_matrix': array([[232,  36,   0,   0],
        [ 96,  89,   0,   0],
        [  7,   5,   0,   0],
        [ 35,   2,   0,   0]]), 'f1': 0.5951976436096736, 'mcc': 0.31622713710618033, 'precision': 0.5832232902950033, 'recall': 0.5951976436096736}

### Keras Implementation

In [None]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import LearningRateScheduler, EarlyStopping

# Hyperparameters for the Keras experiments.
# These names are also used by earlier cells, so running this cell rebinds them for everything that follows.
DROPOUT = 0.2
HIDDEN_SIZE = 100
BATCH_SIZES = [8, 16, 32]  # batch sizes iterated over by the grid-search cells
NUM_LABELS = 4
NUM_EPOCHS = 100
EARLY_STOPPING = {"patience": 3, "delta": 0.005}  # passed to model_trainer, which reads the "patience" and "delta" keys
LEARNING_RATES = [0.0001, 0.001, 0.01, 0.1]  # learning rates iterated over by the grid-search cells
WEIGHT_DECAY = 0.1 
WARMUP = 0.06 
# Root directory handed to model_saver and used for the saved grid-search results.
OUTPUT_DIR = "/content/drive/My Drive/Dog_Whistle_Code/Fine_Tuned_Models/Multimodal/Feature Concatenation"

def decay(epoch, lr):
    """
    Learning-rate schedule: scales ``lr`` by 0.94 raised to the power ``(epoch + 1) / 5``.

    epoch (int): Index of the epoch the rate is being computed for.

    lr (float): Learning rate to scale.

    Returns the scaled learning rate.

    NOTE(review): if the scheduler callback passes the *current* (already decayed) rate as ``lr``,
    the reduction compounds from epoch to epoch - confirm that this is intended.
    """
    drop_every = 5
    base = 0.94
    exponent = (1 + epoch) / drop_every
    return lr * (base ** exponent)

# Learning-rate scheduler callback built from decay(epoch, lr); handed to model_trainer, which passes it to fit().
SCHEDULER = LearningRateScheduler(decay)


# Load data
# Alternative feature files (currently disabled):
# train = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/combined_features.csv")
# dev = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/combined_features.csv")
# test = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/combined_features.csv")

# Active feature files; these rebind the notebook-level train/dev/test dataframes.
train = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Train/combined_features_sabat.csv")
dev = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/combined_features_sabat.csv")
test = pd.read_csv("/content/drive/My Drive/Dog_Whistle_Code/Data/Test/combined_features_sabat.csv")

# Divide labels and features
# iloc[:, 1:-2] keeps every column except the first one and the last two.
# NOTE(review): presumably the dropped columns are an index column plus "labels" and "ids" — TODO confirm against the CSV layout.
x_train = train.iloc[:, 1:-2]
# Train/dev labels are one-hot encoded to match the categorical cross-entropy loss used in model_trainer.
# NOTE(review): get_dummies runs separately per split, so this assumes both splits contain every label value — TODO confirm.
y_train = pd.get_dummies(train.loc[:, "labels"])
x_dev = dev.iloc[:, 1:-2]
y_dev = pd.get_dummies(dev.loc[:, "labels"])
x_test = test.iloc[:, 1:-2]
# Test labels stay a plain list of label values; model_tester hands them to metrics(..., argmax_needed=True).
y_test = test.loc[:, "labels"].values.tolist() 


In [None]:
def construct_model(MLP_type, hidden_size: int=50, dropout: float=0.2, num_labels: int=4, input_len: int = 5120):
    """Builds the (uncompiled) Keras Sequential MLP used to classify concatenated feature vectors.

    MLP_type (str): Architecture to build. One of:
        "Sabat": Dense(hidden_size) -> Dropout(dropout) -> Dense(hidden_size) -> Dense(num_labels, softmax)
        "Gomez": Dense(input_len) -> BatchNorm -> Dense(1024) -> BatchNorm -> Dense(512) -> BatchNorm -> Dense(num_labels, softmax)

    hidden_size (int): Number of nodes in each hidden layer of the "Sabat" network. Defaulted to 50. Not used by "Gomez".

    dropout (float): Rate at which nodes are deactivated in the "Sabat" network. Defaulted to 0.2. Not used by "Gomez".

    num_labels (int): Number of labels to predict. Defaulted to 4.

    input_len (int): Length of input vector. Defaulted to 5120 (Text feature length (4096) + image feature length (1024)).

    Returns the constructed model.

    Raises ValueError if MLP_type is not one of the supported architectures.
    """

    # Validate up front: previously an unknown type fell through both branches and
    # failed with an UnboundLocalError on the return statement.
    if MLP_type not in ("Sabat", "Gomez"):
        raise ValueError('Unknown MLP_type {!r}; expected "Sabat" or "Gomez"'.format(MLP_type))

    model = Sequential()

    # Only the first layer needs input_dim; later layers take their input size
    # from the layer before them.
    if MLP_type == "Sabat":
        model.add(Dense(units=hidden_size, activation='relu', input_dim=input_len))
        # Honour the caller-supplied rate (it used to be hard-coded to 0.2).
        model.add(Dropout(dropout))
        model.add(Dense(units=hidden_size, activation='relu'))
        model.add(Dense(units=num_labels, activation='softmax'))
    else:  # "Gomez"
        model.add(Dense(units=input_len, activation='relu', input_dim=input_len))
        model.add(BatchNormalization())
        model.add(Dense(units=1024, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(units=512, activation='relu'))
        model.add(BatchNormalization())
        model.add(Dense(units=num_labels, activation='softmax'))

    return model


def model_trainer(input_model, x_train, x_test, x_dev, y_dev, early_stop_vals: dict, scheduler, epochs: int = 25, learning_rate: float = 1e-5, batch_size: int=8, y_train=None):
    """
    Compiles and trains a Keras model with early stopping and a learning-rate scheduler

    input_model: Instantiation of model (uncompiled)

    x_train: Dataframe containing train features

    x_test: Unused. Kept only so that existing positional calls keep working.

    x_dev: Dataframe containing validation features

    y_dev: One-hot encoded validation labels

    early_stop_vals: Dictionary containing patience value (int) and delta value (float). The patience determines the number of epochs to wait to achieve the given delta

    scheduler: Keras callback passed to fit() alongside the early-stopping callback

    epochs (int): Maximum number of times to run through all batches. Default value is 25.

    learning_rate (float): Default value is 1e-5.

    batch_size (int): Number of examples to be passed through the model at a given time. Defaulted to 8.

    y_train: One-hot encoded train labels. When omitted, the notebook-level variable `y_train` is used,
    which is what this function silently relied on before the parameter existed.

    Returns the trained model and a dictionary with the per-epoch train/validation accuracy and loss.
    """

    if y_train is None:
        # Backward-compatible fallback to the hidden global the original body read.
        # Prefer passing y_train explicitly so the labels cannot be swapped by another cell.
        try:
            y_train = globals()["y_train"]
        except KeyError:
            raise NameError("name 'y_train' is not defined; pass y_train= to model_trainer") from None

    Early_Stop = EarlyStopping(monitor='val_loss', min_delta=early_stop_vals["delta"], patience=early_stop_vals["patience"], verbose=1, mode='auto')
    opt = Adam(learning_rate=learning_rate)
    input_model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=["accuracy"])
    history = input_model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, verbose=1, validation_data=(x_dev, y_dev), callbacks=[Early_Stop, scheduler])

    train_dict = {"Train Accuracy": history.history['accuracy'], "Train Loss": history.history['loss'], "Val Accuracy": history.history['val_accuracy'], "Val Loss": history.history['val_loss'] }

    return input_model, train_dict


def model_tester(input_model, x_test, y_test):
    """
    Runs the model on the test features and scores its predictions

    input_model: Trained model exposing a predict() method

    x_test: Test features

    y_test: Test labels

    Returns the three values produced by metrics(): the results, the labels and the predictions.
    """
    n_examples = len(x_test)
    print('Predicting labels for {} sentences...'.format(n_examples))

    # Raw model outputs; metrics() is asked to argmax them into class predictions.
    raw_outputs = input_model.predict(x_test)
    scores, gold, predicted = metrics(y_test, raw_outputs, argmax_needed=True)

    return scores, gold, predicted



In [None]:
# Run Gomez
# Single training/evaluation run of the "Gomez" network at the smallest learning rate.
Keras_Classifier = construct_model(
    "Gomez",
    hidden_size=HIDDEN_SIZE,
    dropout=DROPOUT,
    num_labels=NUM_LABELS,
    input_len=2048,
)
trained_model, train_dict = model_trainer(
    Keras_Classifier,
    x_train,
    x_test,
    x_dev,
    y_dev,
    early_stop_vals=EARLY_STOPPING,
    scheduler=SCHEDULER,
    epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATES[0],
    batch_size=8,
)
results, labels, predictions = model_tester(trained_model, x_test, y_test)
print(results)

Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 00009: early stopping
Predicting labels for 502 sentences...
{'mcc': 0.452972250453079, 'acc': 0.6912350597609562, 'confusion_matrix': array([[197,  51,   3,  17],
       [ 40, 142,   0,   3],
       [  4,   4,   4,   0],
       [ 25,   7,   1,   4]]), 'precision': 0.6761401981635599, 'recall': 0.6912350597609562, 'f1': 0.6821803866213831}


In [None]:
# Run Sabat
# Single training/evaluation run of the "Sabat" network at the smallest learning rate.
Keras_Classifier = construct_model(
    "Sabat",
    hidden_size=HIDDEN_SIZE,
    dropout=DROPOUT,
    num_labels=NUM_LABELS,
    input_len=2048,
)
trained_model, train_dict = model_trainer(
    Keras_Classifier,
    x_train,
    x_test,
    x_dev,
    y_dev,
    early_stop_vals=EARLY_STOPPING,
    scheduler=SCHEDULER,
    epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATES[0],  #Note: lr from paper was LEARNING_RATES[-1]
    batch_size=25,
)
results, labels, predictions = model_tester(trained_model, x_test, y_test)
print(results)

Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 00012: early stopping
Predicting labels for 502 sentences...
{'mcc': 0.4915644608084453, 'acc': 0.7250996015936255, 'confusion_matrix': array([[220,  47,   0,   1],
       [ 42, 143,   0,   0],
       [  6,   6,   0,   0],
       [ 32,   4,   0,   1]]), 'precision': 0.6918492695883134, 'recall': 0.7250996015936255, 'f1': 0.6910988867108053}


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Grid search over batch size x learning rate for the "Sabat" network.
# results_dict maps batch size -> {learning rate -> metric dictionary}.
# NOTE(review): the "best" model is chosen by F1 on the test split; selecting on the
# validation split would avoid tuning against the test data - confirm this is intended.
results_dict = {}
max_f1_value = 0

for i in BATCH_SIZES:
    learning_rate_dict = {}
    for j in LEARNING_RATES:
        # A fresh model per configuration so no weights carry over between runs.
        Keras_Classifier = construct_model("Sabat", HIDDEN_SIZE, DROPOUT, NUM_LABELS, 2048)
        trained_model, train_dict = model_trainer(Keras_Classifier, x_train, x_test, x_dev, y_dev, EARLY_STOPPING, SCHEDULER, NUM_EPOCHS, j, i)
        learning_rate_dict[j], labels, predictions = model_tester(trained_model, x_test, y_test)

        # This check must run for every (batch size, learning rate) pair. It used to sit
        # outside the inner loop, so only the last learning rate of each batch size was
        # ever compared and saved.
        if learning_rate_dict[j]["f1"] >= max_f1_value: #only save best model
            max_f1_value = learning_rate_dict[j]["f1"]
            print("The new top F1 score is: {}. Saving model...".format(max_f1_value))
            model_saver(trained_model, "Sabat", "Keras", OUTPUT_DIR, train_dict, labels, predictions, test.loc[:, "ids"].values.tolist(), learning_rate_dict[j])

    results_dict[i] = learning_rate_dict

#save complete training results
# np.save stores the dict as a pickled object array; reload with np.load(path, allow_pickle=True).item().
np.save(os.path.join(OUTPUT_DIR, "Sabat", "dogwhistle_total_training_results_sabat.npy"), results_dict)

Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: early stopping
Predicting labels for 502 sentences...


  _warn_prf(average, modifier, msg_start, len(result))


Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 00004: early stopping
Predicting labels for 502 sentences...
Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 00010: early stopping
Predicting labels for 502 sentences...
Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 00006: early stopping
Predicting labels for 502 sentences...
The new top F1 score is: 0.37607108933005484. Saving model...
Saving complete.
Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 00013: early stopping
Predicting labels for 502 sentences...
Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch

  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 00013: early stopping
Predicting labels for 502 sentences...
Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
Predicting labels for 502 sentences...
Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 00013: early stopping
Predicting labels for 502 sentences...
Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 00004: early stopping
Predicting labels for 502 sentences...


In [None]:
# Grid search over batch size x learning rate for the "Gomez" network.
# results_dict maps batch size -> {learning rate -> metric dictionary}.
# NOTE(review): the "best" model is chosen by F1 on the test split; selecting on the
# validation split would avoid tuning against the test data - confirm this is intended.
results_dict = {}
max_f1_value = 0

for i in BATCH_SIZES:
    learning_rate_dict = {}
    for j in LEARNING_RATES:
        # A fresh model per configuration so no weights carry over between runs.
        Keras_Classifier = construct_model("Gomez", HIDDEN_SIZE, DROPOUT, NUM_LABELS, 2048)
        trained_model, train_dict = model_trainer(Keras_Classifier, x_train, x_test, x_dev, y_dev, EARLY_STOPPING, SCHEDULER, NUM_EPOCHS, j, i)
        learning_rate_dict[j], labels, predictions = model_tester(trained_model, x_test, y_test)

        # This check must run for every (batch size, learning rate) pair. It used to sit
        # outside the inner loop, so only the last learning rate of each batch size was
        # ever compared and saved.
        if learning_rate_dict[j]["f1"] >= max_f1_value: #only save best model
            max_f1_value = learning_rate_dict[j]["f1"]
            print("The new top F1 score is: {}. Saving model...".format(max_f1_value))
            model_saver(trained_model, "Gomez", "Keras", OUTPUT_DIR, train_dict, labels, predictions, test.loc[:, "ids"].values.tolist(), learning_rate_dict[j])

    results_dict[i] = learning_rate_dict

#save complete training results
# np.save stores the dict as a pickled object array; reload with np.load(path, allow_pickle=True).item().
np.save(os.path.join(OUTPUT_DIR, "Gomez", "dogwhistle_total_training_results_gomez.npy"), results_dict)

Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 00008: early stopping
Predicting labels for 502 sentences...
Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
Predicting labels for 502 sentences...
Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
Predicting labels for 502 sentences...


  _warn_prf(average, modifier, msg_start, len(result))


Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 00004: early stopping
Predicting labels for 502 sentences...
The new top F1 score is: 0.42894325116573956. Saving model...
Saving complete.
Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: early stopping
Predicting labels for 502 sentences...
Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
Predicting labels for 502 sentences...
Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping
Predicting labels for 502 sentences...
Train on 3998 samples, validate on 500 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 00005: early stopping
Predicting labels for 502 sentences...
The new top F1 