In [19]:
import os
import time
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from pathlib import Path
from keras.models import load_model
from keras.metrics import Precision, Recall
from keras.callbacks import CSVLogger, EarlyStopping
from keras.utils import np_utils
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
!pip install transformers
!pip install sentencepiece
from transformers import AutoTokenizer, TFT5ForConditionalGeneration, PreTrainedTokenizerFast, PreTrainedTokenizerBase, \
    TFAutoModel

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [20]:
import nltk
import matplotlib.pyplot as plt

from re import sub
from pandas import DataFrame
from tabulate import tabulate
from string import punctuation
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from typing import Union, List, Tuple
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from pandas.io.parsers import TextFileReader
from collections import Counter

nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('omw-1.4')
nltk.download('punkt')
__STOPWORDS = stopwords.words("english")
STEMMER = PorterStemmer()
LEMMATIZER = WordNetLemmatizer()


def process_data(text: str, do_stemming: bool = False, do_lemmas: bool = False, do_lowercase: bool = False) -> str:
    """
    @param text: The text to process. It will remove the money amounts, retweets, links,
                 hashtags, punctuation and it will lowercase all the words
    @param do_stemming: Steam words to have less in vocabulary if set to true
    @param do_lowercase: Lowercase input text if set to true
    @param do_lemmas: Lemmatize word to be at a dictionary representation if true
    @return: The new processed text as a list of words
    """
    text = sub(r" +", " ", text)
    text = sub(r"\S@\S\s?", "", text)
    text = sub(r"[0-9]+(?:.[0-9]+){3}", "", text)
    text = sub(r"\$\w*", "", text)
    text = sub(r"(RT)+", "", text)
    text = sub(r"(lt)+", "", text)
    text = sub(r"(gt)+", "", text)
    text = sub(r"@[a-zA-Z0-9\_]+", "", text)
    text = sub(r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)", "", text)
    text = sub(r"#", "", text)
    text = sub(r"\d+", "", text)
    text = sub("[^A-Za-z0-9]+", " ", text)  # also removes special characters since they are not alphanumeric
    if do_lowercase:
        text = text.lower()

    # tokenized_data = __TOKENIZER.tokenize(text)
    tokenized_data = word_tokenize(text)
    processed_data = []

    for word in tokenized_data:
        if word not in __STOPWORDS and word not in punctuation:
            if do_stemming and not do_lemmas:
                word = STEMMER.stem(word)
            elif do_lemmas and not do_stemming:
                word = LEMMATIZER.lemmatize(word)
            elif do_lemmas and do_stemming:
                raise ValueError(
                    f"Can't do both lemmatizing and stemming. Values for do_lemmas={do_lemmas} "
                    f"and do_stemming={do_stemming} cannot be true for both."
                )
            processed_data.append(word)

    return " ".join(processed_data)


def plot_train_data(csv_data: Union[TextFileReader, DataFrame], train_metric: str, validation_metric: str) -> None:
    plt.figure()
    plt.plot(csv_data[train_metric], color="blueviolet")
    plt.plot(csv_data[validation_metric], color="green")
    plt.title(f"{train_metric.capitalize()} over epochs")
    plt.legend(["train", "validation"])
    plt.xlabel("epoch")
    plt.ylabel(train_metric)
    plt.grid(visible=True)


def display_train_report_and_f1_score(csv_data: Union[TextFileReader, DataFrame]) -> None:
    headers = ["epoch", "accuracy", "loss", "precision", "recall"]
    train_rep = csv_data[csv_data["epoch"] == len(csv_data) - 1][headers]
    precision = train_rep["precision"].iloc[0]
    recall = train_rep["recall"].iloc[0]
    f1_score = 2 * precision * recall / (precision + recall)
    train_rep["f1_score"] = f1_score
    rep_data = train_rep.values.tolist()
    print('\033[92m')
    print("╒═════════════════╕")
    print("│ Training Report │")
    print(tabulate(rep_data, headers=[header.capitalize() for header in headers] + ["F1 Score"], tablefmt="fancy_grid"))


def display_readable_time(start_time: float, end_time: float) -> None:
    minutes = (end_time - start_time) / 60
    ss = (end_time - start_time) % 60
    hh = minutes / 60
    mm = minutes % 60
    print('\033[94m')
    print(f"Training time: %02d:%02d:%02d | {round(end_time - start_time, 2)} seconds" % (hh, mm, ss))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [22]:
!mkdir train_local
# !wget /content/drive/MyDrive/dataset/abusive-language/vidgen-wassem-davidson-founta-zampieri.zip
!unzip /content/drive/MyDrive/dataset/abusive-language/vidgen-wassem-davidson-founta-zampieri.zip -d /content/train_local

mkdir: cannot create directory ‘train_local’: File exists
Archive:  /content/drive/MyDrive/dataset/abusive-language/vidgen-wassem-davidson-founta-zampieri.zip
replace /content/train_local/dynamically-hate-vidgen/Dynamically Generated Hate Dataset - annotation guidelines.pdf? [y]es, [n]o, [A]ll, [N]one, [r]ename: None


In [23]:
# TODO: Update these with the google stuff
FOUNTA_DIR = os.path.join("/content/train_local", "large-founta")
TRAIN_SET_PATH = os.path.join(FOUNTA_DIR, "train.tsv")
VAL_SET_PATH = os.path.join(FOUNTA_DIR, "dev.tsv")
TEST_SET_PATH = os.path.join(FOUNTA_DIR, "test.tsv")
FOUNTA_MODEL_LOGS_PATH = os.path.join("/content/drive/MyDrive/nlp-models/abusive-language/logs", "founta")
FOUNTA_MODEL_PATH = os.path.join("/content/drive/MyDrive/nlp-models/abusive-language/models", "founta")

MODEL_FILE_NAME = "byt5_large_founta"
BYT5_TYPE = "Narrativa/byt5-base-tweet-hate-detection"  # Narrativa/byt5-base-tweet-hate-detection

# Clean: 48 | No lowercase: 36 | Lowercase: 30 | Lowercase & Stemming: 30 | Lowercase & Lemmas: 30
MAX_PADDING_LENGTH = 30
LEARNING_RATE = 2e-5
BATCH_SIZE = 32
EPOCHS = 10

In [51]:
def encode_tweet(tweet: str, byt5_tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizerBase]):
    return byt5_tokenizer(
        # tweet,
        # process_data(tweet),
        process_data(tweet, do_stemming=False, do_lemmas=False, do_lowercase=True),
        max_length=MAX_PADDING_LENGTH,
        truncation=True,
        padding="max_length",
        return_tensors='pt'
    )


def encode_tweets(tweets_text: list, tweets_labels, byt5_tokenizer: Union[PreTrainedTokenizerFast, PreTrainedTokenizerBase]):
    if tweets_labels is not None:
        assert len(tweets_text) == len(tweets_labels), f"Features and labels must have the same lengths. " \
                                                       f"Your input ({len(tweets_text)}, {len(tweets_labels)})"

    input_ids = []
    attention_masks = []

    for tweet in tweets_text:
        tweet_for_byt5 = encode_tweet(tweet, byt5_tokenizer)
        input_ids.append(np.array(tweet_for_byt5["input_ids"]))
        attention_masks.append(np.array(tweet_for_byt5["attention_mask"]))

    if tweets_labels is not None:
        assert len(input_ids) == len(attention_masks) == len(tweets_labels), \
            "Arrays must have the same length."
        return np.array(input_ids), np.array(attention_masks), np.array(tweets_labels)

    return np.array(input_ids), np.array(attention_masks)


def generate_byt5_dict(input_ids, attention_mask) -> dict:
    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask
    }


def byt5_tuning(byt5_type: str = BYT5_TYPE):
    input_ids = tf.keras.Input(shape=(MAX_PADDING_LENGTH,), name="input_ids", dtype="int32")
    attention_masks = tf.keras.Input(shape=(MAX_PADDING_LENGTH,), name="attention_mask", dtype="int32")


    byt5_model = TFAutoModel.from_pretrained(byt5_type, from_pt=True)
    print("REEEEEEEEEEEEEEEEEEEEE")
    encodings = byt5_model(input_ids=input_ids, attention_mask=attention_masks)[0]
    # encodings = byt5_model.get_input_embeddings()[0]
    print(encodings)
    last_encoding = tf.squeeze(encodings[:, -1:, :], axis=1)
    # last_encoding = tf.keras.layers.Dropout(0.1)(last_encoding)

    outputs = tf.keras.layers.Dense(4, activation="softmax", name="outputs")(last_encoding)

    temp_model = tf.keras.Model(inputs=[input_ids, attention_masks], outputs=outputs)
    temp_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss="categorical_crossentropy",
        metrics=["accuracy", Precision(), Recall()]
    )

    return temp_model


def convert_labels_to_numerical(labels: list):
    # Transform labels to numerical value
    for index, label in enumerate(labels):
        if label == "normal":
            labels[index] = 0
        elif label == "spam":
            labels[index] = 1
        elif label == "abusive":
            labels[index] = 2
        elif label == "hateful":
            labels[index] = 3
        else:
            raise ValueError("Class column must have only 'normal', 'spam', 'abusive' or 'hateful' values")

    return labels


In [28]:
tokenizer = AutoTokenizer.from_pretrained(BYT5_TYPE)

train_df = pd.read_csv(TRAIN_SET_PATH, sep="\t", header=0)
val_df = pd.read_csv(VAL_SET_PATH, sep="\t", header=0)
test_df = pd.read_csv(TEST_SET_PATH, sep="\t", header=0)

train_texts = train_df["sentence"].tolist()
val_texts = val_df["sentence"].tolist()
test_texts = test_df["sentence"].tolist()

train_labels = convert_labels_to_numerical(train_df["class"].tolist())
val_labels = convert_labels_to_numerical(val_df["class"].tolist())
test_labels = convert_labels_to_numerical(test_df["class"].tolist())

train_labels = np_utils.to_categorical(train_labels)
val_labels = np_utils.to_categorical(val_labels)
test_labels = np_utils.to_categorical(test_labels)

train_ids, train_masks, train_labels = encode_tweets(
    tweets_text=train_texts,
    tweets_labels=train_labels,
    byt5_tokenizer=tokenizer
)
validation_ids, validation_masks, val_labels = encode_tweets(
    tweets_text=val_texts,
    tweets_labels=val_labels,
    byt5_tokenizer=tokenizer
)
test_ids, test_masks, test_labels = encode_tweets(
    tweets_text=test_texts,
    tweets_labels=test_labels,
    byt5_tokenizer=tokenizer
)

train_data = generate_byt5_dict(train_ids, train_masks)
validation_data = (generate_byt5_dict(validation_ids, validation_masks), val_labels)
test_data = generate_byt5_dict(test_ids, test_masks)


In [52]:
model = byt5_tuning()
print(model.summary())
early_stop = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=2, restore_best_weights=True)
csv_logger = CSVLogger(os.path.join(FOUNTA_MODEL_LOGS_PATH, f"{MODEL_FILE_NAME}.log"), separator=",",
                        append=False)
start_time = time.time()
hist = model.fit(train_data, train_labels, validation_data=validation_data, epochs=EPOCHS, batch_size=BATCH_SIZE,
                  callbacks=[csv_logger, early_stop])
end_time = time.time()
model.save(os.path.join(FOUNTA_MODEL_PATH, f"{MODEL_FILE_NAME}.h5"))
display_readable_time(start_time=start_time, end_time=end_time)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFT5Model: ['lm_head.weight', 'encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
- This IS expected if you are initializing TFT5Model from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFT5Model from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFT5Model were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5Model for predictions without further training.


REEEEEEEEEEEEEEEEEEEEE


TypeError: ignored

In [None]:
log_data = pd.read_csv(os.path.join(FOUNTA_MODEL_LOGS_PATH, f"{MODEL_FILE_NAME}.log"), sep=",", engine="python")
display_train_report_and_f1_score(log_data)
plot_train_data(log_data, train_metric="accuracy", validation_metric="val_accuracy")
plot_train_data(log_data, train_metric="loss", validation_metric="val_loss")
plot_train_data(log_data, train_metric="precision", validation_metric="val_precision")
plot_train_data(log_data, train_metric="recall", validation_metric="val_recall")
plt.show()

In [None]:
predictions = model.predict(test_data)

for prediction in predictions:
    for index, pred_class in enumerate(prediction):
        if pred_class == max(prediction):
            prediction[index] = 1
        else:
            prediction[index] = 0

print(predictions)
print(test_labels)
print(len(predictions), len(test_labels))
print(type(test_labels), type(predictions))

print(f"\n{classification_report(test_labels, predictions)}")