# Setup

In [None]:
import os
import random
import re
import string
import warnings

import contractions
import gensim.downloader
import matplotlib.pyplot as plt
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.sentence as nas
import nlpaug.augmenter.word as naw
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from simpletransformers.classification import (ClassificationArgs,
                                               ClassificationModel)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm
from transformers import logging

from dont_patronize_me import DontPatronizeMe

In [None]:
# Release cached GPU memory that a previous run of the notebook may have left behind.
# NOTE(review): the no_grad() context is presumably not required just to empty
# the cache — confirm before removing it.
with torch.no_grad():
    torch.cuda.empty_cache()

# Silence Python warnings and restrict `transformers` logging to errors.
warnings.filterwarnings(action='ignore')
logging.set_verbosity_error()
# Use seaborn's default theme for every matplotlib figure below.
sns.set_theme()
# Register tqdm with pandas so that `progress_apply` (used throughout) is available.
tqdm.pandas()

# Environment switches read by third-party libraries (tokenizers, Weights & Biases).
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['WANDB_NOTEBOOK_NAME'] = 'coursework.ipynb'

# NLTK resources needed later by the stopword filter, WordNetLemmatizer and word_tokenize.
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("punkt", quiet=True)

# Single seed shared by every cell and by the train/validation split.
GLOBAL_SEED = 0


def seed_everything(seed=GLOBAL_SEED):
    """Seed Python's, NumPy's and PyTorch's random number generators.

    The seed is also exported through the ``PL_GLOBAL_SEED`` environment
    variable, and a confirmation line is printed.

    Args:
        seed: Integer seed; defaults to ``GLOBAL_SEED``.
    """
    os.environ["PL_GLOBAL_SEED"] = str(seed)
    # Same order as before: stdlib random, NumPy, then PyTorch.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    print(f"Global seed set to {seed}")


seed_everything()

# Data Analysis

In [None]:
seed_everything()

# Load the raw tab-separated PCL dataset. The first four lines of the file are
# skipped and the column names are supplied explicitly.
df = pd.read_csv(
    "data/dontpatronizeme_pcl.tsv",
    sep='\t',
    names=['par_id', 'art_id', 'community', 'country', 'text', 'labels'], skiprows=4)
# Display the frame as the cell output.
df

In [None]:
seed_everything()

# Collapse the annotation into two classes: values 0 and 1 become "No PCL",
# every other value becomes "PCL".
# NOTE(review): this cell is not idempotent — re-running it on the already
# mapped strings relabels every row as "PCL", because neither string is in [0, 1].
df['labels'] = df["labels"].progress_apply(lambda x: "No PCL" if x in [0, 1] else "PCL")

In [None]:
seed_everything()

# Bar chart of how many rows carry each label (counts kept in first-seen order).
df["labels"].value_counts(sort=False).plot(kind="bar")

plt.xlabel(None)
plt.ylabel("Frequency")
plt.xticks(rotation=0)
# Fit the layout before writing the figure to disk, then display it.
plt.tight_layout()
plt.savefig("graphs/pcl_labels.png")
plt.show()

In [None]:
seed_everything()

# Stacked horizontal bars: number of rows per community, split by label.
fig, ax = plt.subplots()
keyword_df = df.filter(items=['labels', 'community'])
keyword_df["community"] = keyword_df["community"].str.title()
# Count rows per (community, label) pair and pivot the labels into columns;
# combinations that never occur are filled with 0.
keyword_df = keyword_df.groupby(['community', 'labels']).size().unstack(fill_value=0)
keyword_df.plot(ax=ax, kind='barh', stacked=True, legend=True)
ax.legend(["No PCL", "PCL"])

# Write the count (the segment width) in white text onto every bar segment.
for p in ax.patches:
    width = p.get_width()
    ax.annotate(f'{width:.0f}', (p.get_x() + width / 2, p.get_y()-0.13), ha='center', va='center', xytext=(0, 10), textcoords='offset points', fontsize=10, color='white')

plt.ylabel('Community')
plt.xlabel('Frequency')
plt.tight_layout()
plt.savefig("graphs/community.png")
plt.show()

In [None]:
seed_everything()

# Stacked horizontal bars: number of rows per country, split by label.
fig, ax = plt.subplots()
country_df = df.filter(items=['labels', 'country'])
country_df["country"] = country_df["country"].str.upper()
# Count rows per (country, label) pair and pivot the labels into columns;
# combinations that never occur are filled with 0.
country_df = country_df.groupby(['country', 'labels']).size().unstack(fill_value=0)
country_df.plot(ax=ax, kind='barh', stacked=True, legend=True)
ax.legend(["No PCL", "PCL"], loc='upper left')

plt.ylabel('Country')
plt.xlabel('Frequency')
plt.tight_layout()
plt.savefig("graphs/country.png")
plt.show()

In [None]:
seed_everything()

# Replace each text by its length in characters, working on a copy so that
# `df` itself keeps the original text.
length_df = df.copy(deep=True)
length_df["text"] = length_df["text"].astype(str).progress_apply(len)
length_df = length_df.filter(items=["labels", "text"])

# One histogram per label on a shared x axis, 20 bins over lengths 0-1500.
# The returned axes (`hist`) are not used afterwards.
hist = length_df.plot(
    kind="hist",
    by="labels",
    bins=20,
    range=(0, 1500),
    subplots=True,
    sharex=True,
    xlabel= "Text length",
    ylabel= "Frequency",
    legend=False,
)

plt.tight_layout()
plt.savefig("graphs/text_length.png")
plt.show()

In [None]:
# Median text length (in characters) for each class.
length_df = df.copy(deep=True)
length_df["text"] = length_df["text"].astype(str).progress_apply(len)
length_df = length_df[["text", "labels"]]
# `labels` holds the strings "No PCL" / "PCL" here (the numeric annotation was
# mapped in an earlier cell), so select on those strings. Comparing with 0 / 1
# matches no rows and prints NaN medians.
print("No PCL median: ", length_df[length_df["labels"] == "No PCL"]["text"].median())
print("PCL median: ", length_df[length_df["labels"] == "PCL"]["text"].median())

# Modeling

### Data Splitting

In [None]:
seed_everything()

def _rows_for_par_ids(data, par_ids):
    """Return one row of ``data`` per entry of ``par_ids``, in the given order.

    For every id the first row of ``data`` with that ``par_id`` is used. The
    result has the columns ``community`` (taken from ``keyword``), ``country``,
    ``text``, ``labels`` and ``orig_label`` and a fresh 0..n-1 index.

    Raises:
        KeyError: if an id does not occur in ``data.par_id``.
    """
    # Index once by par_id instead of scanning the whole frame for every field
    # of every id; drop_duplicates keeps the first occurrence of each par_id.
    first_match = data.drop_duplicates(subset="par_id").set_index("par_id")
    wanted = ["keyword", "country", "text", "labels", "orig_label"]
    rows = first_match.loc[list(par_ids), wanted]
    return rows.rename(columns={"keyword": "community"}).reset_index(drop=True)


def load_and_split_data():
    """Load the task-1 data and split it into the official train and dev parts.

    The paragraph ids of each part are read from
    ``data/train_semeval_parids-labels.csv`` and
    ``data/dev_semeval_parids-labels.csv`` and looked up in the frame produced
    by ``DontPatronizeMe.load_task1``.

    Returns:
        A ``(train, test)`` tuple of DataFrames with the columns ``community``,
        ``country``, ``text``, ``labels`` and ``orig_label``; ``test`` holds
        the rows listed in the dev id file.
    """
    dpm = DontPatronizeMe("data", "data")
    dpm.load_task1()

    trids = pd.read_csv("data/train_semeval_parids-labels.csv")
    teids = pd.read_csv("data/dev_semeval_parids-labels.csv")

    data = dpm.train_task1_df

    # Ids are compared as strings, exactly as before.
    train = _rows_for_par_ids(data, trids.par_id.astype(str))
    test = _rows_for_par_ids(data, teids.par_id.astype(str))

    return train, test

def labels2file(p, outf_path):
    """Write every row of ``p`` to ``outf_path`` as one comma-separated line.

    Args:
        p: Iterable of iterables; each inner item is converted with ``str``.
        outf_path: Path of the file to (over)write.
    """
    with open(outf_path, "w") as outf:
        outf.writelines(",".join(map(str, row)) + "\n" for row in p)

### Data Augmentation

In [None]:
seed_everything()

def augment_and_rebalance(data, aug, filename=None):
    """Oversample the positive class with augmented copies of its texts.

    Rows containing NaN are dropped first. With ``n`` the integer ratio of
    label-0 rows to label-1 rows, ``n`` augmented copies of the label-1 rows
    are appended to the data, so the positives appear ``n + 1`` times overall.

    Args:
        data: DataFrame with at least the columns ``text`` and ``labels``.
        aug: Augmenter exposing ``augment(text)``; the first element of its
            return value replaces the text.
        filename: Optional stem; when truthy the result is also written to
            ``data/<filename>.csv``.

    Returns:
        The concatenated DataFrame with a fresh index. If there are no label-1
        rows, nothing is augmented and the NaN-free data is returned.
    """
    data = data.dropna()
    data_0 = data[data["labels"] == 0]
    data_1 = data[data["labels"] == 1]
    # Without positive rows there is nothing to augment, and the ratio below
    # would otherwise raise ZeroDivisionError.
    n = len(data_0) // len(data_1) if len(data_1) else 0
    augmented = [data]
    print(f"Augmenting {n} times")
    for i in range(n):
        data_1_copy = data_1.copy(deep=True)
        data_1_copy["text"] = data_1_copy["text"].progress_apply(lambda x: aug.augment(x)[0])
        augmented.append(data_1_copy)
        print(f"Augmentation {i+1} complete")
    print("All augmentations complete")
    final = pd.concat(augmented, ignore_index=True, axis=0)
    if filename:
        final.to_csv(f"data/{filename}.csv", index=False)
    return final

### Data Preprocessing

In [None]:
seed_everything()

def preprocessor(text, processes):
    """Clean and lemmatise a single text.

    Always applied: removal of ``<h>`` markers and double quotes, rewriting of
    ``n't`` to `` not``, contraction expansion, whitespace normalisation,
    tokenisation and WordNet lemmatisation.

    Args:
        text: The raw string.
        processes: Container of optional step names. Recognised names are
            ``"lowercase"``, ``"punctuation"`` (strip ASCII punctuation),
            ``"stopwords"`` (drop English stopwords, case-sensitive match) and
            ``"tokenize"`` (return the token list instead of a string).

    Returns:
        The list of tokens if ``"tokenize"`` is requested, otherwise the
        tokens joined with single spaces.
    """
    text = re.sub("<h>", "", text)  # Remove <h> tags
    text = re.sub(r"n\'t", " not", text)  # Replace n't with not
    text = re.sub(r"\"", "", text)  # Remove quotation marks
    # Delete every apostrophe that follows a space, together with that space.
    text = text.replace(" '", "")
    text = contractions.fix(text)  # Expand contractions
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra whitespace

    if "lowercase" in processes:
        text = text.lower()

    if "punctuation" in processes:
        text = text.translate(str.maketrans("", "", string.punctuation))

    tokens = word_tokenize(text)

    if "stopwords" in processes:
        # Build the stopword set once per call; fetching the list again for
        # every token made this filter needlessly slow.
        stop_words = set(stopwords.words("english"))
        tokens = [token for token in tokens if token not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    if "tokenize" in processes:
        return tokens

    return " ".join(tokens)

def preprocess(data, processes, filename=None):
    """Run ``preprocessor`` over the ``text`` column of a DataFrame.

    Args:
        data: DataFrame with a ``text`` column; rows containing any NaN are
            dropped before processing.
        processes: Optional step names forwarded to ``preprocessor``.
        filename: Optional stem; when truthy the result is also written to
            ``data/<filename>.csv``.

    Returns:
        The NaN-free DataFrame with its ``text`` column replaced.
    """

    def clean(text):
        return preprocessor(text, processes)

    data = data.dropna()
    data["text"] = data["text"].progress_apply(clean)
    if not filename:
        return data
    data.to_csv(f"data/{filename}.csv", index=False)
    return data


### Save Augmented Data

In [None]:
seed_everything()

def save_augmentations():
    """Create the 80/20 train/validation split and one augmented set per augmenter.

    The split is written to ``data/train_0.8.csv`` and ``data/val_0.2.csv``.
    For every augmenter a rebalanced copy of the training part is written to
    ``data/train_0.8_<augmenter name, lower-cased>.csv``.
    """
    official_train, _ = load_and_split_data()
    train, val = train_test_split(
        official_train, test_size=0.2, shuffle=True, random_state=GLOBAL_SEED
    )

    for split, path in ((train, "data/train_0.8.csv"), (val, "data/val_0.2.csv")):
        split.to_csv(path, index=False)

    # All augmenters are constructed up front, as before.
    augmenters = [
        nac.KeyboardAug(),
        nac.RandomCharAug(action="swap"),
        naw.SynonymAug(),
        naw.AntonymAug(),
        naw.ContextualWordEmbsAug(),
        nas.ContextualWordEmbsForSentenceAug(),
    ]
    for augmenter in augmenters:
        # Each augmenter gets its own copy of the un-augmented training data.
        augment_and_rebalance(
            train.copy(deep=True),
            aug=augmenter,
            filename=f"train_0.8_{augmenter.name.lower()}",
        )

# save_augmentations()

### Define Fine Tuning Function

In [None]:
seed_everything()

def fine_tune(train, val, epochs, batch_size, lr, schedule):
    """Fine-tune ``roberta-base`` as a two-class classifier and report on ``val``.

    Args:
        train: Training data passed to ``ClassificationModel.train_model``.
        val: Evaluation data; its ``labels`` column is used for the report.
        epochs: Number of training epochs.
        batch_size: Batch size used for both training and evaluation.
        lr: Learning rate.
        schedule: Name of the learning-rate scheduler.

    Returns:
        The trained ``ClassificationModel``. A classification report for
        ``val`` is printed as a side effect.
    """
    # NOTE(review): early stopping is enabled but no evaluation during training
    # is configured — confirm that it has any effect in this setup.
    settings = dict(
        output_dir="outputs",
        train_batch_size=batch_size,
        eval_batch_size=batch_size,
        num_train_epochs=epochs,
        learning_rate=lr,
        scheduler=schedule,
        use_early_stopping=True,
        overwrite_output_dir=True,
        use_multiprocessing=False,
        use_multiprocessing_for_evaluation=False,
    )
    args = ClassificationArgs(**settings)
    model = ClassificationModel(
        "roberta",
        "roberta-base",
        args=args,
        use_cuda=torch.cuda.is_available(),
        num_labels=2,
    )

    model.train_model(train)

    # Only the raw model outputs are needed; the predicted class is the argmax
    # over the last axis.
    _, outputs, _ = model.eval_model(val)
    predicted = np.argmax(outputs, axis=-1)
    print(classification_report(val["labels"], predicted))

    return model


### Test Augmentations

In [None]:
seed_everything()

def augmentation_flow():
    """Fine-tune once per saved augmentation and print the resulting reports.

    Every run trains on one augmented training file, is evaluated on the
    validation split inside ``fine_tune``, and is then scored on the
    un-augmented training split.
    """

    def load_split(stem):
        # Read data/<stem>.csv, apply the mandatory cleaning only, and keep
        # the two columns the model needs.
        frame = preprocess(pd.read_csv(f"data/{stem}.csv"), processes={})
        return frame[["text", "labels"]]

    val = load_split("val_0.2")
    train = load_split("train_0.8")

    augmented_files = (
        "train_0.8_keyboard_aug",
        "train_0.8_randomchar_aug",
        "train_0.8_synonym_aug",
        "train_0.8_antonym_aug",
        "train_0.8_contextualwordembs_aug",
        "train_0.8_contextualwordembsforsentence_aug",
    )
    for train_aug_file in augmented_files:
        train_aug = load_split(train_aug_file)

        print(f"Augmentation: {train_aug_file}")

        model = fine_tune(
            train_aug,
            val,
            epochs=5,
            batch_size=128,
            lr=1e-5,
            schedule="linear_schedule_with_warmup",
        )
        predictions, _ = model.predict(train["text"].values.tolist())
        print(classification_report(train["labels"], predictions))


# augmentation_flow()

### Test Preprocessing

In [None]:
seed_everything()

def preprocessing_flow():
    """Fine-tune once per preprocessing configuration and print the reports.

    Every combination of the optional steps ``lowercase``, ``punctuation`` and
    ``stopwords`` is applied to the train and validation splits. The model is
    evaluated on the validation split inside ``fine_tune`` and afterwards
    scored on the (preprocessed) training split.
    """
    # The raw splits do not depend on the configuration, so read them once.
    # preprocess() works on the frame returned by dropna(), leaving these
    # frames untouched between iterations.
    raw_train = pd.read_csv("data/train_0.8.csv")
    raw_val = pd.read_csv("data/val_0.2.csv")

    for processes in [
        {},
        {"lowercase"},
        {"punctuation"},
        {"stopwords"},
        {"stopwords", "punctuation"},
        {"stopwords", "lowercase"},
        {"punctuation", "lowercase"},
        {"stopwords", "punctuation", "lowercase"},
    ]:
        train = preprocess(raw_train, processes=processes)
        train = train[["text", "labels"]]

        val = preprocess(raw_val, processes=processes)
        val = val[["text", "labels"]]

        # Sets have no stable iteration order for strings; sort so that the
        # printed label is the same on every run.
        ps = ", ".join(sorted(processes))
        print(f"Preprocessing: {ps}")

        model = fine_tune(
            train,
            val,
            epochs=5,
            batch_size=128,
            lr=1e-5,
            schedule="linear_schedule_with_warmup",
        )
        predictions, _ = model.predict(train["text"].values.tolist())
        print(classification_report(train["labels"], predictions))


# preprocessing_flow()

### Baselines

In [None]:
seed_everything()

def get_embeddings(data):
    """Turn a preprocessed frame into numeric features for the baseline models.

    Adds a ``length`` column, integer-encodes ``country`` and ``community`` and
    replaces ``text`` by the mean word2vec vector of its items, spread over
    ``vec_size`` columns. Rows containing NaN are dropped at the end and all
    column names are converted to strings.

    Note that ``data`` is modified in place before the final ``drop`` /
    ``dropna`` create new frames.

    NOTE(review): the embedding model is loaded on every call — consider
    loading it once if this is called repeatedly.
    """
    # word2vec_model = gensim.downloader.load("glove-twitter-200")
    # vec_size = 200

    word2vec_model = gensim.downloader.load("word2vec-google-news-300")
    vec_size = 300

    def word2vec(text, model):
        # Average the vectors of all items of `text`; items unknown to the
        # model contribute a zero vector.
        # Presumably `text` is a list of tokens (the caller preprocesses with
        # "tokenize"); a plain string would be iterated character by character.
        vecs = []
        for word in text:
            if word in model:
                vecs.append(model[word])
            else:
                vecs.append(np.zeros(vec_size))
        return np.mean(vecs, axis=0)

    # Integer codes follow the order of first appearance in *this* frame.
    # NOTE(review): because the codes are derived per call, two frames (e.g.
    # train and test) may assign different codes to the same category — verify.
    for x in ["country", "community"]:
        data[x] = pd.Categorical(data[x], categories=data[x].unique()).codes

    # Number of items in each text (tokens, for list input).
    data["length"] = data["text"].progress_apply(len)
    data["text"] = data["text"].progress_apply(lambda x: word2vec(x, word2vec_model))
    # Expand every mean vector into one column per dimension (columns 0..vec_size-1).
    data[np.arange(vec_size)] = data["text"].progress_apply(lambda x: pd.Series(x))
    data = data.drop(columns=["text"])
    # Presumably this also removes rows whose text was empty (mean of no
    # vectors) — TODO confirm.
    data = data.dropna()

    # Column names are converted to strings (the integer dimension columns too).
    data.columns = data.columns.astype(str)

    return data


def baselines_flow():
    """Train and report the embedding-based baseline classifiers.

    The official train/dev data is tokenised, converted to features with
    ``get_embeddings``, min-max scaled (fitted on train only) and fed to
    ``LogisticRegression`` and ``ComplementNB``. For each model the train and
    test classification reports and the misclassified training texts are
    printed.
    """
    train, test = load_and_split_data()
    # Untouched copy, used to show the original text of misclassified rows.
    train_original = train.copy(deep=True)

    train = preprocess(train, processes={"tokenize"})
    test = preprocess(test, processes={"tokenize"})

    train = get_embeddings(train)
    test = get_embeddings(test)

    trainY = train["labels"]
    testY = test["labels"]

    # Drop the target and, when present, the fine-grained `orig_label`.
    # The previous check looked at the global `df` instead of these frames, so
    # `orig_label` was never removed and leaked the label into the features.
    trainX = train.drop(columns=["labels", "orig_label"], errors="ignore")
    testX = test.drop(columns=["labels", "orig_label"], errors="ignore")

    scaler = MinMaxScaler()
    trainX = scaler.fit_transform(trainX)
    testX = scaler.transform(testX)

    for model in [LogisticRegression(max_iter=1000), ComplementNB()]:
        model.fit(trainX, trainY)
        model_name = type(model).__name__

        # Predict once per split and reuse the result.
        train_pred = model.predict(trainX)
        test_pred = model.predict(testX)

        print(f"{model_name} train: \n", classification_report(trainY, train_pred))
        print(f"{model_name} test: \n", classification_report(testY, test_pred))

        # Look the texts up by index label: rows may have been dropped since
        # `train_original` was copied, so a boolean mask over the remaining
        # rows cannot be applied to it directly.
        missed = trainY.index[(trainY != train_pred).to_numpy()]
        print(f"{model_name} misclassifications: \n", train_original.loc[missed, "text"])


# baselines_flow()

### Hyperparameter Tuning

In [None]:
seed_everything()

def hyperparameter_tuning_flow():
    """Grid-search scheduler, learning rate and batch size for the fine-tuning.

    Every configuration is trained on the keyboard-augmented training file,
    evaluated on the validation split inside ``fine_tune`` and then scored on
    the un-augmented training split. All data is lower-cased.
    """

    def load_split(stem):
        # Read data/<stem>.csv, lower-case it and keep the model's two columns.
        frame = preprocess(pd.read_csv(f"data/{stem}.csv"), processes={"lowercase"})
        return frame[["text", "labels"]]

    train = load_split("train_0.8")
    train_aug = load_split("train_0.8_keyboard_aug")
    val = load_split("val_0.2")

    epochs = 5
    lrs = [1e-4, 1e-5]
    batch_sizes = [128, 256]
    schedules = ["linear_schedule_with_warmup", "constant_schedule_with_warmup"]

    # Same visiting order as three nested loops: schedule, then lr, then batch size.
    grid = [
        (schedule, lr, batch_size)
        for schedule in schedules
        for lr in lrs
        for batch_size in batch_sizes
    ]

    for schedule, lr, batch_size in grid:
        model = fine_tune(
            train_aug,
            val,
            epochs=epochs,
            batch_size=batch_size,
            lr=lr,
            schedule=schedule,
        )
        predictions, _ = model.predict(train["text"].values.tolist())
        print(classification_report(train["labels"], predictions))
        print(
            f"Schedule: {schedule}, learning rate: {lr}, batch size: {batch_size}"
        )


# hyperparameter_tuning_flow()

### Final Model

In [None]:
seed_everything()

def final_model_flow():
    """Train the final model and write the dev and test predictions.

    The official training part is lower-cased, rebalanced with keyboard
    augmentation and used to fine-tune the model with the selected
    hyperparameters. The official dev part serves as evaluation data inside
    ``fine_tune``. Predictions are written to ``dev.txt`` and ``test.txt``,
    one label per line.

    Returns:
        A ``(model, dev)`` tuple: the trained model and the preprocessed dev
        frame reduced to the columns ``text`` and ``labels``.
    """
    train, dev = load_and_split_data()

    train = preprocess(
        train,
        processes={"lowercase"},
    )
    train = train[["text", "labels"]]

    # Augment a copy so that `train` stays un-augmented for the report below.
    train_aug = train.copy(deep=True)
    train_aug = augment_and_rebalance(train_aug, nac.KeyboardAug())
    train_aug = preprocess(
        train_aug,
        processes={"lowercase"},
    )
    train_aug = train_aug[["text", "labels"]]

    dev = preprocess(
        dev,
        processes={"lowercase"},
    )
    dev = dev[["text", "labels"]]

    test = pd.read_csv("data/task4_test.tsv",sep='\t', names=['par_id', 'art_id', 'community', 'country', 'text'])
    # preprocess() drops every row that contains a NaN in any column. Only the
    # text is needed here, so pass that column alone with missing texts
    # blanked out; this keeps exactly one prediction line per test row.
    test = preprocess(test[["text"]].fillna(""), processes={"lowercase"})

    optimal = {"lr": 1e-4, "batch_size": 256, "schedule": "linear_schedule_with_warmup"}
    epochs = 10

    model = fine_tune(
        train_aug,
        dev,
        epochs=epochs,
        batch_size=optimal["batch_size"],
        lr=optimal["lr"],
        schedule=optimal["schedule"],
    )

    # Score on the un-augmented training data.
    predictions, _ = model.predict(train["text"].values.tolist())
    print(classification_report(train["labels"], predictions))

    predictions, _ = model.predict(dev["text"].values.tolist())
    labels2file([[p] for p in predictions], 'dev.txt')

    predictions, _ = model.predict(test["text"].values.tolist())
    labels2file([[p] for p in predictions], 'test.txt')

    return model, dev


model, dev = final_model_flow()


# Analysis

In [None]:
# Reload the dev split: the frame returned by final_model_flow() only keeps
# `text` and `labels`, while the analyses below also read `orig_label` and
# `community`.
_, dev = load_and_split_data()
dev = preprocess(
    dev,
    processes={"lowercase"},
)

### To what extent is the model better at predicting examples with a higher level of patronizing content?

In [None]:
def patronization_level_analysis(model, dev):
    """Plot the F1 score of ``model`` per original annotation level.

    For each of the levels "2", "3" and "4" the rows of ``dev`` whose
    ``orig_label`` equals that level are predicted and scored. The scores are
    printed, shown as a bar chart and saved to
    ``graphs/pcl_level_analysis.png``.
    """
    levels = ["2", "3", "4"]

    def score(level):
        subset = dev[dev["orig_label"] == level]
        predictions, _ = model.predict(subset["text"].values.tolist())
        print(f"Level: {level}")
        value = f1_score(subset["labels"], predictions)
        print(value)
        return value

    f1_scores = [score(level) for level in levels]

    plt.bar(levels, f1_scores)
    plt.xlabel("PCL level")
    plt.ylabel("F1 score")
    plt.tight_layout()
    plt.savefig("graphs/pcl_level_analysis.png")
    plt.show()

patronization_level_analysis(model, dev)

### How does the length of the input sequence impact the model performance?

In [None]:
def seq_len_analysis(model, dev):
    """Plot the F1 score of ``model`` for differently truncated inputs.

    For each limit the texts of ``dev`` are cut to their first ``seq_len``
    items by slicing (characters, for string texts) before prediction. The
    scores are printed, shown as a bar chart and saved to
    ``graphs/seq_lens_analysis.png``. ``dev`` itself is not modified.
    """
    seq_lens = [64, 128, 256, 512]
    f1_scores = []
    for seq_len in seq_lens:
        # Truncate into a new Series; the labels are unaffected by truncation.
        truncated = dev["text"].progress_apply(lambda x: x[:seq_len])
        predictions, _ = model.predict(truncated.values.tolist())
        print(f"Max sequence length: {seq_len}")
        score = f1_score(dev["labels"], predictions)
        print(score)
        f1_scores.append(score)

    tick_labels = list(map(str, seq_lens))
    plt.bar(tick_labels, f1_scores)
    plt.xlabel("Maximum sequence length")
    plt.ylabel("F1 score")
    plt.tight_layout()
    plt.savefig("graphs/seq_lens_analysis.png")
    plt.show()

seq_len_analysis(model, dev)

### To what extent does model performance depend on the data categories?

In [None]:
def category_anaysis(model, dev):
    """Plot the F1 score of ``model`` per community.

    For every distinct value of ``dev["community"]`` the matching rows are
    predicted and scored. The scores are printed, shown as a bar chart and
    saved to ``graphs/community_analysis.png``.
    """
    communities = dev["community"].unique()
    f1_scores = []
    for community in communities:
        dev_community = dev[dev["community"] == community]
        predictions, _ = model.predict(dev_community["text"].values.tolist())
        print(f"Community: {community}")
        f1 = f1_score(dev_community["labels"], predictions)
        print(f1)
        f1_scores.append(f1)

    plt.bar([s.title() for s in communities], f1_scores)
    plt.xticks(rotation=45, ha='right')
    plt.xlabel("Community")
    plt.ylabel("F1 score")
    # Fit the layout only after the rotated tick labels and the axis labels
    # exist, so that they are taken into account in the saved figure.
    plt.tight_layout()
    plt.savefig("graphs/community_analysis.png")
    plt.show()

category_anaysis(model, dev)