In [None]:
#pip install simpletransformers

In [None]:
pip install git+https://github.com/botelhoa/simpletransformersbotelho.git

In [None]:
pip install ekphrasis

In [None]:
pip install transformers

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import matthews_corrcoef, confusion_matrix, precision_score, recall_score, f1_score, accuracy_score
from simpletransformers.classification.multi_modal_classification_model import MultiModalClassificationModel
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons

In [None]:
# Helpers that build a DataFrame with three columns: labels, text, images

def combine_text(df):
    """
    Combines tweet and image text into one column

    For every row, the "tweet_text" value and the "img_text" value are joined
    with a single space, so the last word of the tweet and the first word of
    the image text are not fused into one token. Values that are not strings
    (for example the NaN pandas produces for an empty CSV cell) are skipped,
    so a row with no usable text yields an empty string.

    df: Dataframe which holds the data; must contain the columns
        "tweet_text" and "img_text". Rows are read by position, so the index
        does not have to be the default 0..n-1 range.

    Returns a list with one combined string per row, in row order.
    """
    combined_text = []

    # zip over the column values walks the rows positionally; label-based
    # lookups with range(len(df)) would fail on a filtered or re-indexed frame.
    for tweet_text, image_text in zip(df["tweet_text"], df["img_text"]):
        present = [part for part in (tweet_text, image_text) if isinstance(part, str)]
        combined_text.append(" ".join(present))

    return combined_text

def clean_text(data, normalize_list, annotate_list):
    """
    Runs every entry of `data` through an Ekphrasis TextPreProcessor and
    joins the pieces it returns back together with single spaces.

    data: Pandas series object containing strings of text

    normalize_list: list of data features to clean; passed to the
        preprocessor as its `normalize` option

    annotate_list: list of data features to annotate; passed to the
        preprocessor as its `annotate` option

    Returns the result of mapping the cleaning step over `data`.
    """
    # Tokenizer handed to the preprocessor, created with lowercase=True.
    social_tokenize = SocialTokenizer(lowercase=True).tokenize

    preprocessor_options = dict(
        normalize=normalize_list,
        annotate=annotate_list,
        fix_html=True,
        segmenter="twitter",
        unpack_hashtags=True,
        unpack_contractions=True,
        spell_correct_elong=True,
        tokenizer=social_tokenize,
        dicts=[emoticons],
    )
    preprocessor = TextPreProcessor(**preprocessor_options)

    def _to_clean_string(text):
        # Glue the pieces returned by pre_process_doc into one string.
        return " ".join(preprocessor.pre_process_doc(text))

    return data.map(_to_clean_string)


def df_preparer(file_path):
    """
    Reads a CSV file and reduces it to the columns "labels", "text", "images".

    file_path: path of the CSV file to read (decoded as UTF-8). Besides the
        columns `combine_text` needs, the file must provide
        "Primary_numeric_gt" (source of the labels) and "image_number"
        (source of the image file names).

    Returns a DataFrame holding only "labels", "text" and "images", where
    labels are strings and images are "<image_number>.jpg".
    """
    normalize_features = ['url', 'email', 'percent', 'money', 'phone', 'user', 'time', 'date', 'number']
    annotate_features = ['hashtag', 'allcaps', 'elongated', 'repeated', 'emphasis', 'censored']

    def _image_file_name(image_number):
        return str(image_number) + ".jpg"

    frame = pd.read_csv(file_path, encoding='utf-8')

    # Merge tweet/image text first, then clean the merged text.
    frame["text"] = combine_text(frame)
    frame["text"] = clean_text(frame["text"], normalize_features, annotate_features)

    frame["labels"] = frame["Primary_numeric_gt"].map(str)  # labels need to be strings
    frame["images"] = frame["image_number"].map(_image_file_name)

    return frame[["labels", "text", "images"]]


def metrics(labels, preds, argmax_needed: bool = False):
    """
    Returns the Matthews correlation coefficient, accuracy, confusion matrix,
    and the weighted-average precision, recall and f1 score.

    labels: list of correct labels

    preds: list of model predictions, or (when argmax_needed is True) an
        array of per-class scores with one row per example

    argmax_needed (boolean): converts logits to predictions by taking the
        argmax along axis 1. Defaulted to false.

    Returns a tuple (results, labels, preds):
        results: dict with the keys "mcc", "acc", "confusion_matrix",
            "precision", "recall" and "f1"
        labels: the labels exactly as passed in
        preds: the predictions that were scored (after the optional argmax)
    """
    if argmax_needed:
        preds = np.argmax(preds, axis=1).flatten()

    results = {
        "mcc": matthews_corrcoef(labels, preds),
        "acc": accuracy_score(labels, preds),
        "confusion_matrix": confusion_matrix(labels, preds),
        "precision": precision_score(labels, preds, average="weighted"),
        # recall_score, not f1_score: using f1_score here made the reported
        # "recall" a copy of "f1".
        "recall": recall_score(labels, preds, average="weighted"),
        "f1": f1_score(labels, preds, average="weighted"),
    }

    return results, labels, preds

# Build the train / validation / test frames, in that order, from their CSVs.
train, dev, test = (
    df_preparer(csv_path)
    for csv_path in (
        "/content/drive/My Drive/Dog_Whistle_Code/Data/Train/dog_whistle_train.csv",
        "/content/drive/My Drive/Dog_Whistle_Code/Data/Validation/dog_whistle_dev.csv",
        "/content/drive/My Drive/Dog_Whistle_Code/Data/Test/dog_whistle_test.csv",
    )
)

# df_preparer stores labels as strings; keep an integer copy on the test split.
test["int_labels"] = test["labels"].map(int)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading english - 1grams ...
Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading english - 1grams ...
Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading english - 1grams ...


In [None]:
# Paths shared by the training and evaluation calls below.
IMAGE_DIR = "/content/drive/My Drive/Dog_Whistle_Code/Data/Images/"
MODEL_DIR = "/content/drive/My Drive/Dog_Whistle_Code/Fine_Tuned_Models/Multimodal/MMBT"

# Alternative hyper-parameters, NOT passed to the model. They used to be
# assigned to `args` and were overwritten by the next statement before any
# use, so they live under their own name to make that explicit.
alt_args = {
    "fp16": False,
    "train_batch_size": 16,
    "eval_batch_size": 16,
    "learning_rate": 1e-4,
    "num_train_epochs": 10,
    "max_seq_length": 100,
    "weight_decay": 0.1,
    "warmup_ratio": 0.06,
    "best_model_dir": MODEL_DIR,
    "evaluate_during_training": True,
    "use_early_stopping": True,
    "early_stopping_patience": 2,
    "early_stopping_delta": 0,
    "early_stopping_metric": "eval_loss",
    "overwrite_output_dir": True,
}

# Hyper-parameters actually passed to the model.
# NOTE(review): early stopping is requested here, but unlike `alt_args` this
# dict does not set "evaluate_during_training" - confirm whether early
# stopping can trigger without it.
args = {
    "fp16": False,
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "learning_rate": 1e-5,
    "num_train_epochs": 4,
    "max_seq_length": 100,
    "weight_decay": 0.1,
    "warmup_ratio": 0.06,
    "use_early_stopping": True,
    "early_stopping_patience": 2,
    "early_stopping_delta": 0,
    "early_stopping_metric": "eval_loss",
    "overwrite_output_dir": True,
}

model = MultiModalClassificationModel("bert", "bert-large-uncased", label_list=["0", "1", "2", "3"], args=args, use_cuda=True)
print("Loading finished")

# Fine-tune on the training split, with the dev split passed as eval_data.
model.train_model(train, eval_data=dev, image_path=IMAGE_DIR, output_dir=MODEL_DIR)
print("Training finished")

# Score the held-out test split; the second return value is turned into class
# predictions by the argmax inside `metrics`.
results, logits = model.eval_model(test, image_path=IMAGE_DIR)
metric_vals, labels, preds = metrics(test["int_labels"], logits, argmax_needed=True)
print(metric_vals)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=434.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1344997306.0, style=ProgressStyle(descr…




Downloading: "https://download.pytorch.org/models/resnet152-b121ed2d.pth" to /root/.cache/torch/checkpoints/resnet152-b121ed2d.pth


HBox(children=(FloatProgress(value=0.0, max=241530880.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…


Loading finished


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=500.0, style=ProgressStyle(descri…

Running loss: 1.565488



Running loss: 0.402592

Token indices sequence length is longer than the specified maximum sequence length for this model (558 > 512). Running this sequence through the model will result in indexing errors


Running loss: 0.263214


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=500.0, style=ProgressStyle(descri…

Running loss: 0.137276

Token indices sequence length is longer than the specified maximum sequence length for this model (558 > 512). Running this sequence through the model will result in indexing errors


Running loss: 0.865586


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=500.0, style=ProgressStyle(descri…

Running loss: 0.281374

Token indices sequence length is longer than the specified maximum sequence length for this model (558 > 512). Running this sequence through the model will result in indexing errors


Running loss: 0.156487


HBox(children=(FloatProgress(value=0.0, description='Current iteration', max=500.0, style=ProgressStyle(descri…

Running loss: 0.361027

Token indices sequence length is longer than the specified maximum sequence length for this model (558 > 512). Running this sequence through the model will result in indexing errors


Running loss: 0.505748

Training finished


HBox(children=(FloatProgress(value=0.0, max=63.0), HTML(value='')))

Token indices sequence length is longer than the specified maximum sequence length for this model (1076 > 512). Running this sequence through the model will result in indexing errors



{'mcc': 0.61981806961996, 'acc': 0.7848605577689243, 'confusion_matrix': array([[219,  37,   0,  12],
       [ 22, 162,   0,   1],
       [  3,   8,   0,   1],
       [ 16,   8,   0,  13]]), 'precision': 0.762845866913318, 'recall': 0.7713136091995654, 'f1': 0.7713136091995654}


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Save the evaluation results into the fine-tuned model's folder.

# 1) The metrics dictionary, written with np.save.
np.save(
    "/content/drive/My Drive/Dog_Whistle_Code/Fine_Tuned_Models/Multimodal/MMBT/results.npy",
    metric_vals,
)

# 2) The labels and predictions, side by side, as a CSV.
# NOTE(review): the rename assumes the transposed frame's columns are called
# "int_labels" and "Unnamed 0" - presumably the name of the labels Series and
# the placeholder pandas gives the unnamed prediction array; confirm.
df = (
    pd.DataFrame([labels, preds])
    .T
    .rename(columns={"int_labels": "Labels", "Unnamed 0": "Predictions"})
)
df.to_csv("/content/drive/My Drive/Dog_Whistle_Code/Fine_Tuned_Models/Multimodal/MMBT/predictions.csv")