# BERT models cross-validation pipeline

### Library import and auxiliary function for cross-validation approach

In [None]:
!pip install transformers
!pip install -U sentence-transformers
!pip install datasets
#install nltk emoji library to be used with normalizeTweet()
!pip install nltk emoji==0.6.0
!pip install evaluate

In [None]:
import pandas as pd
import numpy as np
import torch

import transformers
from datasets import Dataset
from datasets.table import Table
import pyarrow as pa
import pyarrow.dataset as ds
from datasets import Dataset , DatasetDict

from evaluate import load
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support, f1_score
from sklearn.model_selection import StratifiedKFold, ParameterGrid,  GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

metric = load('glue','sst2')

import logging
logging.basicConfig(level=logging.ERROR)

from TweetNormalize import normalizeTweet

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

from nltk.corpus import stopwords, wordnet
from nltk import word_tokenize, pos_tag
from nltk.stem.wordnet import WordNetLemmatizer

import spacy
import re
nlp = spacy.load('en_core_web_sm')

stopword_list = stopwords.words('english')

# Module-level count of documents passed through spacy_nlp_tokenizer.
doc_counter = 0

def reset_counter():
  """Set the global document counter back to zero."""
  global doc_counter
  doc_counter = 0

def increase_counter():
  """Bump the global document counter and print it on every multiple of 100."""
  global doc_counter
  doc_counter = doc_counter + 1
  if not doc_counter % 100:
    print(doc_counter)

def spacy_nlp_tokenizer(text):
    """Turn a text into a flat list of prefixed feature tokens.

    The text is whitespace-normalised, parsed with the module-level spaCy
    pipeline ``nlp`` and converted into, in this order:

    * ``LEMMA_<lemma>`` for every non-stopword token,
    * ``BI_<a>_<b>`` for every bigram of those lemma features,
    * ``TRI_<a>_<b>_<c>`` for every trigram of those lemma features,
    * ``NER_<type>`` for every token carrying an entity type.

    Each call also advances the global progress counter.

    Args:
        text: the raw string to tokenize.

    Returns:
        list[str]: the concatenated feature tokens.
    """
    increase_counter()

    # substituting all space characters with a single space
    # (raw string: '\s' in a plain string literal is an invalid escape sequence)
    text = re.sub(r'\s+', ' ', text)

    # we use spacy for main nlp tasks
    doc = nlp(text)
    # lemmatized tokens, skipping stopwords
    lemmas = ['LEMMA_'+token.lemma_ for token in doc if not token.is_stop]
    # entity_types
    entity_types = ['NER_'+token.ent_type_ for token in doc if token.ent_type_]

    # in case an entity linker is available, we can use it do put actual entities as
    # features, e.g. Queen Elizabeth, Elizabeth II, Her Majesty -> KB2912
    # see https://spacy.io/usage/training#entity-linker
    # entities = ['ENT_'+token.ent_kb_id_ for token in doc if token.ent_kb_id_]

    # we use a simple nltk function to create ngrams; note that the n-grams are
    # built over the already prefixed lemma features
    lemma_bigrams = ['BI_'+p1+'_'+p2 for p1,p2 in nltk.ngrams(lemmas,2)]
    lemma_trigrams = ['TRI_'+p1+'_'+p2+'_'+p3 for p1,p2,p3 in nltk.ngrams(lemmas,3)]

    return lemmas + lemma_bigrams + lemma_trigrams + entity_types

from EDA_AUG import eda_4

# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print('Is GPU available for usage?', torch.cuda.is_available())
print("How many devices available for 'cuda'?", torch.cuda.device_count())

In [None]:
#import datasets
df_en_train = pd.read_csv("/content/data_sets/corrected_df.csv", sep=',', header=0)
df_en_test = pd.read_csv("/content/data_sets/en_testing_labeled.tsv", sep='\t', header=0)

print(df_en_train.misogyny_category.value_counts())

# Binary task: every tweet, with the 'misogynous' flag as target.
x_train_binary = list(map(normalizeTweet, df_en_train['text']))
y_train_binary = list(df_en_train['misogynous'])
x_test_binary = list(map(normalizeTweet, df_en_test['text']))
y_test_binary = list(df_en_test['misogynous'])

# Multi-class task: discard the rows whose category is the string '0'.
df_en_train = df_en_train[~(df_en_train['misogyny_category'] == '0')]
df_en_test = df_en_test[~(df_en_test['misogyny_category'] == '0')]

x_train_multi = list(map(normalizeTweet, df_en_train['text']))
y_train_multi = list(df_en_train['misogyny_category'])
x_test_multi = list(map(normalizeTweet, df_en_test['text']))
y_test_multi = list(df_en_test['misogyny_category'])




In [None]:
#define a label encoding for multi_class categories:
#category names sorted alphabetically are numbered 0, 1, 2, ...
ordered_labels = sorted(set(y_train_multi))
label_dict = {category: code for code, category in enumerate(ordered_labels)}

# re-encode the multi-class targets as integers
y_train_multi = [label_dict[category] for category in df_en_train['misogyny_category']]
y_test_multi = [label_dict[category] for category in df_en_test['misogyny_category']]

In [None]:
label_dict

In [None]:
#create dataframes from the normalized tweets

# one (Text, label) frame per task and split
df_train_binary = pd.DataFrame({'Text': x_train_binary, 'label': y_train_binary})
df_test_binary = pd.DataFrame({'Text': x_test_binary, 'label': y_test_binary})
df_train_multi = pd.DataFrame({'Text': x_train_multi, 'label': y_train_multi})
df_test_multi = pd.DataFrame({'Text': x_test_multi, 'label': y_test_multi})

for preview in (df_train_binary, df_train_multi):
  display(preview.head())

Unnamed: 0,Text,label
0,Please tell me why the bitch next to me in the...,1
1,@USER @USER Bitch shut the fuck up,1
2,"@USER Dear cunt , please shut the fuck up .",1
3,RT @USER : Pls shut the fuck up bitch,1
4,"RT @USER : "" when u gonna get your license "" S...",1


Unnamed: 0,Text,label
0,Please tell me why the bitch next to me in the...,2
1,@USER @USER Bitch shut the fuck up,2
2,"@USER Dear cunt , please shut the fuck up .",2
3,RT @USER : Pls shut the fuck up bitch,2
4,"RT @USER : "" when u gonna get your license "" S...",2


In [None]:
#function used to perform BERTweet tokenization

def support_tokenizer(df, tokenizer, max_length = 256):
  """Tokenize the 'Text' column of ``df`` for a BERT-style model.

  Args:
    df: DataFrame with a 'Text' column (sentences) and a 'label' column.
    tokenizer: object exposing ``encode_plus`` (e.g. a HuggingFace tokenizer).
    max_length: length every sequence is padded/truncated to. Previously this
      argument was ignored and 256 was always used; the default is unchanged.

  Returns:
    DataFrame with the original 'Text' and 'label' columns plus 'input_ids',
    'token_type_ids' and 'attention_mask', one list per sentence.
  """
  input_ids = []
  token_type_ids = []
  attention_mask = []

  for sentence in df['Text']:
    #for each sentence, perform a tokenization compatible with bert models
    #getting input_ids, token_type_ids and attention_mask.
    #padding="max_length" already pads to max_length, so the deprecated
    #pad_to_max_length flag is not needed.
    sent_tok = tokenizer.encode_plus(sentence,
                                     padding="max_length",
                                     truncation=True,
                                     add_special_tokens=True,
                                     max_length=max_length,
                                     return_token_type_ids=True)

    input_ids.append(sent_tok['input_ids'])
    token_type_ids.append(sent_tok['token_type_ids'])
    attention_mask.append(sent_tok['attention_mask'])

  #create a dataframe with the original information + the new tokenized data
  tok_df = pd.DataFrame({'Text': df['Text'], 'label': df['label'],
                         'input_ids': input_ids,'token_type_ids':token_type_ids,
                         'attention_mask':attention_mask})
  return tok_df


In [None]:
#convert a pandas dataframe to a HuggingFace compatible one
def convert_hg_dataset(df):
  """Wrap a pandas DataFrame in a HuggingFace ``Dataset`` via a pyarrow table."""
  arrow_table = pa.Table.from_pandas(df)
  return Dataset(arrow_table)

In [None]:
from transformers import RobertaTokenizer, AutoTokenizer

In [None]:
#define two different tokenizers, one for roberta and the other for bertweet

#roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base', truncation=True, do_lower_case=True)
bertweet_tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, truncation = True, do_lower_case = True)

Downloading (…)lve/main/config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, RobertaModel


In [None]:
#download pre-trained models

#model_roberta = AutoModelForSequenceClassification.from_pretrained("roberta-base")
model_bertweet = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base")

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [None]:
#exploit sklearn libraries in order to have a classification report
#these functions will be used combined with HugginFace training and testing transformers libraries

def compute_metrics_binary(eval_pred):
    """Score binary predictions and print a classification report.

    Args:
        eval_pred: pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        dict with 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    report = {
        "accuracy": accuracy_score(y_true=gold, y_pred=predicted),
        "precision": precision_score(y_true=gold, y_pred=predicted),
        "recall": recall_score(y_true=gold, y_pred=predicted),
        "f1": f1_score(y_true=gold, y_pred=predicted),
    }

    print(classification_report(gold, predicted))

    return report

def compute_metrics_multi(eval_pred):
    """Score multi-class predictions and print a classification report.

    Precision, recall and F1 are computed with ``average=None``, i.e. they are
    returned per class rather than as a single number.

    Args:
        eval_pred: pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        dict with 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    report = {
        "accuracy": accuracy_score(y_true=gold, y_pred=predicted),
        "precision": precision_score(y_true=gold, y_pred=predicted, average=None),
        "recall": recall_score(y_true=gold, y_pred=predicted, average=None),
        "f1": f1_score(y_true=gold, y_pred=predicted, average=None),
    }

    print(classification_report(gold, predicted))

    return report

In [None]:
#define a function that given a model_name from transformer library, performs a  k fold cross validation
#for binary and multi class problems, given a parameters setting in input

#the function always uses stratified cross validation for the classification problems

def transformer_crossval(model_name, params, dataframe, tokenizer, k=5, problem = 'binary', random_state = 42):
  """Stratified k-fold cross-validation of a sequence-classification transformer.

  For every fold a fresh model is loaded from ``model_name``, trained on the
  training split and evaluated on the held-out split.

  Args:
    model_name: name/path passed to ``AutoModelForSequenceClassification.from_pretrained``.
    params: dict with the keys 'epoch', 'per_device_train_batch_size',
      'per_device_validation_batch_size', 'warmup_steps', 'weight_decay'
      and 'learning_rate'.
    dataframe: DataFrame with 'Text' and 'label' columns.
    tokenizer: tokenizer forwarded to ``support_tokenizer``.
    k: number of folds.
    problem: 'binary' or 'multi'; selects the metric function.
    random_state: seed for the fold shuffling.

  Returns:
    dict with 'avg_loss', 'std_loss', 'avg_accuracy', 'std_accuracy',
    'avg_f1' and 'std_f1' computed across folds.

  Raises:
    ValueError: if ``problem`` is neither 'binary' nor 'multi'.
  """
  if problem == 'binary':
    compute_metrics = compute_metrics_binary
  elif problem == 'multi':
    compute_metrics = compute_metrics_multi
  else:
    # fail before any tokenization/training work instead of hitting an
    # unbound variable when the Trainer is built
    raise ValueError("problem must be 'binary' or 'multi', got %r" % (problem,))

  #defining a precise random state allows us to keep the same folds for different instantiations of the same function
  skf = StratifiedKFold(n_splits=k, shuffle=True, random_state = random_state)

  X = dataframe['Text'] #consider the texts
  y = dataframe['label'] #consider the labels

  num_labels = len(set(y))

  ordered_train_sets = []
  ordered_validation_sets = []

  for train_index, test_index in skf.split(X, y):

    #for each split done by skf.split(X, y), build the fold dataframes,
    #tokenize them and convert them into hg_datasets,
    #then save them in the appropriate lists.

    train_df = pd.DataFrame(data = {'Text': X.iloc[train_index], 'label': y.iloc[train_index]})
    validation_df = pd.DataFrame(data = {'Text': X.iloc[test_index], 'label': y.iloc[test_index]})

    train_df = support_tokenizer(train_df, tokenizer, max_length = 256)
    validation_df = support_tokenizer(validation_df, tokenizer, max_length = 256)

    ordered_train_sets.append(convert_hg_dataset(train_df))
    ordered_validation_sets.append(convert_hg_dataset(validation_df))

  loss_metrics_validation = []
  accuracy_metrics_validation = []
  f1_metrics_validation = []

  for training_set, validation_set in zip(ordered_train_sets, ordered_validation_sets):
    # show the datasets of the fold being processed (not always the first one)
    print(training_set)
    print(validation_set)

    #for each training and validation set, we perform a training and a validation
    #with our model, saving the loss, accuracy and f1 score

    training_args = TrainingArguments(
      output_dir='./results',          # output directory
      num_train_epochs= params['epoch'],             # total number of training epochs
      per_device_train_batch_size= params['per_device_train_batch_size'],  # batch size per device during training
      per_device_eval_batch_size= params['per_device_validation_batch_size'],   # batch size for evaluation
      warmup_steps= params['warmup_steps'],                # number of warmup steps for learning rate scheduler
      weight_decay= params['weight_decay'],               # strength of weight decay
      logging_dir='./logs',            # directory for storing logs
      logging_steps=20,
      learning_rate = params['learning_rate'],
    )

    # a fresh model per fold, so no weights leak between folds
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels= num_labels).to(device)

    trainer = Trainer(
      model= model,
      args= training_args,
      train_dataset=training_set,
      eval_dataset=validation_set,
      compute_metrics=compute_metrics,
    )

    trainer.train()

    evaluations = trainer.evaluate()

    print(evaluations)

    loss_metrics_validation.append(evaluations['eval_loss'])
    accuracy_metrics_validation.append(evaluations['eval_accuracy'])
    # eval_f1 is a per-class array in the multi case but may be a plain Python
    # float in the binary case (no .mean() method); np.mean handles both
    f1_metrics_validation.append(np.mean(evaluations['eval_f1']))

  result_dict = {
    'avg_loss' : np.array(loss_metrics_validation).mean(),
    'std_loss' : np.array(loss_metrics_validation).std(),
    'avg_accuracy' : np.array(accuracy_metrics_validation).mean(),
    'std_accuracy' : np.array(accuracy_metrics_validation).std(),
    'avg_f1' : np.array(f1_metrics_validation).mean(),
    'std_f1' : np.array(f1_metrics_validation).std(),
  }

  return result_dict

In [None]:
#define a function which fit a transformer model to a dataframe and reports results given the test

def transformer_fit_predict(model_name, params, dataframe_training, dataframe_test, tokenizer, problem = 'binary', random_state = 42):
  """Fine-tune a transformer on the training frame and evaluate it on the test frame.

  Args:
    model_name: name/path passed to ``AutoModelForSequenceClassification.from_pretrained``.
    params: dict with the keys 'epoch', 'per_device_train_batch_size',
      'per_device_validation_batch_size', 'warmup_steps', 'weight_decay'
      and 'learning_rate'.
    dataframe_training: DataFrame with 'Text' and 'label' columns used for training;
      the number of output labels is taken from its distinct 'label' values.
    dataframe_test: DataFrame with 'Text' and 'label' columns used for evaluation.
    tokenizer: tokenizer forwarded to ``support_tokenizer``.
    problem: 'binary' or 'multi'; selects the metric function.
    random_state: currently unused; kept so existing callers keep working.

  Returns:
    tuple ``(model, evaluations)``: the trained model and the dict returned by
    ``Trainer.evaluate()``.

  Raises:
    ValueError: if ``problem`` is neither 'binary' nor 'multi'.
  """
  if problem == 'binary':
    compute_metrics = compute_metrics_binary
  elif problem == 'multi':
    compute_metrics = compute_metrics_multi
  else:
    # fail before any tokenization/training work instead of hitting an
    # unbound variable when the Trainer is built
    raise ValueError("problem must be 'binary' or 'multi', got %r" % (problem,))

  y = dataframe_training['label'] #consider the labels

  num_labels = len(set(y))

  train_df = support_tokenizer(dataframe_training, tokenizer, max_length = 256)
  test_df = support_tokenizer(dataframe_test, tokenizer, max_length = 256)

  train_df = convert_hg_dataset(train_df)
  test_df = convert_hg_dataset(test_df)

  training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs= params['epoch'],             # total number of training epochs
    per_device_train_batch_size= params['per_device_train_batch_size'],  # batch size per device during training
    per_device_eval_batch_size= params['per_device_validation_batch_size'],   # batch size for evaluation
    warmup_steps= params['warmup_steps'],                # number of warmup steps for learning rate scheduler
    weight_decay= params['weight_decay'],               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=20,
    learning_rate = params['learning_rate'])

  model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels= num_labels).to(device)

  trainer = Trainer(
    model= model,
    args= training_args,
    train_dataset=train_df,
    eval_dataset=test_df,
    compute_metrics=compute_metrics,
  )
  trainer.train()

  evaluations = trainer.evaluate()

  print(evaluations)

  return model, evaluations

## Stratified cross-validation with BERTweet for binary class

In [None]:
param_grid = {
    'epoch': [5],
    'per_device_train_batch_size': [16],
    'per_device_validation_batch_size': [8, 16],
    'warmup_steps': [500],
    'learning_rate': [1e-05, 3e-5],
    'weight_decay': [0.0001, 0.01],
}

grid = ParameterGrid(param_grid)

# keep only the combinations whose training batch is strictly larger than the
# validation batch
updated_grid = [
    config for config in grid
    if config['per_device_train_batch_size'] > config['per_device_validation_batch_size']
]

#specify the different possible parameters configuration to test
for config in updated_grid:
  print(config)

{'epoch': 5, 'learning_rate': 1e-05, 'per_device_train_batch_size': 16, 'per_device_validation_batch_size': 8, 'warmup_steps': 500, 'weight_decay': 0.0001}
{'epoch': 5, 'learning_rate': 1e-05, 'per_device_train_batch_size': 16, 'per_device_validation_batch_size': 8, 'warmup_steps': 500, 'weight_decay': 0.01}
{'epoch': 5, 'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'per_device_validation_batch_size': 8, 'warmup_steps': 500, 'weight_decay': 0.0001}
{'epoch': 5, 'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'per_device_validation_batch_size': 8, 'warmup_steps': 500, 'weight_decay': 0.01}


### NOTE: the outputs of the full cross-validation procedure are not reported because it is difficult to run in a single Colab session; the best configuration for each case is reported in the cell that follows each call to transformer_crossval()

In [None]:
#test different configurations for the binary case

results = []

for config in updated_grid:

  print('Testing with configuration: \n')
  print(config, end = '\n\n')

  # dict of avg/std loss, accuracy and f1 across the folds
  tuple_result = transformer_crossval("vinai/bertweet-base", config, df_train_binary, bertweet_tokenizer, k=5, problem='binary')
  results.append((config, tuple_result))
  print(tuple_result)


In [None]:
best_config = {'weight_decay': 0.0001, 'warmup_steps': 500, 'per_device_validation_batch_size': 8, 'per_device_train_batch_size': 16, 'learning_rate': 3e-05, 'epoch': 5}


In [None]:
transformer_fit_predict("vinai/bertweet-base" , best_config, df_train_binary, df_test_binary, bertweet_tokenizer, problem = 'binary')

## Stratified cross-validation with BERTweet for multi-class

---



In [None]:
#test different configurations for the multi-class case

results = []

for config in updated_grid:

  print('Testing with configuration: \n')
  print(config, end = '\n\n')

  # dict of avg/std loss, accuracy and f1 across the folds
  tuple_result = transformer_crossval("vinai/bertweet-base", config, df_train_multi, bertweet_tokenizer, k=5, problem='multi')
  results.append((config, tuple_result))
  print(tuple_result)


In [None]:
#crossvalidation results for multi-class case BERTweet [NO AUGMENTATION]
{'epoch': 5, 'learning_rate': 1e-05, 'per_device_train_batch_size': 16, 'per_device_validation_batch_size': 8, 'warmup_steps': 500, 'weight_decay': 0.0001}
{'avg_loss': 0.9555049061775207, 'std_loss': 0.01990089166461212, 'avg_accuracy': 0.6688045572589427, 'std_accuracy': 0.020259804124640655, 'avg_f1': 0.41354925905999673, 'std_f1': 0.03231382637080065}

{'epoch': 5, 'learning_rate': 1e-05, 'per_device_train_batch_size': 16, 'per_device_validation_batch_size': 8, 'warmup_steps': 500, 'weight_decay': 0.01}
{'avg_loss': 0.954667842388153, 'std_loss': 0.01865113807747386, 'avg_accuracy': 0.674584904079752, 'std_accuracy': 0.009966775463470062, 'avg_f1': 0.4224969606350145, 'std_f1': 0.02941109302009539}


{'epoch': 5, 'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'per_device_validation_batch_size': 8, 'warmup_steps': 500, 'weight_decay': 0.0001}
{'avg_loss': 0.949773371219635, 'std_loss': 0.03147020396051596, 'avg_accuracy': 0.682685766943118, 'std_accuracy': 0.015633047387544417, 'avg_f1': 0.5463946231350254, 'std_f1': 0.015116808746339593}


{'epoch': 5, 'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'per_device_validation_batch_size': 8, 'warmup_steps': 500, 'weight_decay': 0.01}
{'avg_loss': 0.9542999863624573, 'std_loss': 0.03613993268974634, 'avg_accuracy': 0.686747088883304, 'std_accuracy': 0.015550506742450145, 'avg_f1': 0.540460741372458, 'std_f1': 0.01830868482157618}



In [None]:
#test result with the best parameter configuration

best_configuration = {'epoch': 5, 'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'per_device_validation_batch_size': 8, 'warmup_steps': 500, 'weight_decay': 0.0001}

transformer_fit_predict("vinai/bertweet-base", best_configuration, df_train_multi, df_test_multi, bertweet_tokenizer, problem = 'multi')




### Data augmentation approach

In [None]:
#perform cross-validation with data-augmentation and see if it performs any better

#the function extends the dataset given a certain label and a limit

def enlarge_df(df, label, limit : float):
  """Return ``df`` extended with EDA-augmented copies of the rows of one label.

  Args:
    df: DataFrame with 'Text' and 'label' columns; it is not modified.
    label: label value whose texts are augmented.
    limit: fraction of the current number of ``label`` rows to add, e.g. 0.5
      adds at most ``int(0.5 * count)`` new rows.

  Returns:
    A new DataFrame (index reset) with the augmented rows appended, or ``df``
    itself when nothing was added. When the limit is reached the number of
    added rows is printed.
  """
  source_texts = df[df['label'] == label]['Text']
  limit = int(len(source_texts) * limit)

  to_add = []
  limit_reached = False

  for string in source_texts:
    k = eda_4(sentence = string, alpha_sr = 0.2, alpha_ri = 0.0, alpha_rs=0.2, num_aug = 10) #generate 10 instances
    for j in k:
      if len(to_add) == limit:
        limit_reached = True
        break
      to_add.append(j)
    if limit_reached:
      break

  if limit_reached:
    print(len(to_add))

  if not to_add:
    return df

  # DataFrame.append was removed in pandas 2.0 and copied the whole frame on
  # every call; build the new rows once and concatenate a single time
  new_rows = pd.DataFrame({'Text': to_add, 'label': label})
  return pd.concat([df, new_rows], ignore_index=True)


df_train_multi_five = df_en_train
df_test_multi_five = df_en_test



In [None]:
#rename multi-class dataset columns for consistency with enlarge_df

# Build the (Text, label) frames straight from the filtered source frames so
# that re-running this cell gives the same result.
_text_label_columns = {"text": "Text", "misogyny_category": "label"}
df_train_multi_five = df_en_train[['text','misogyny_category']].rename(columns=_text_label_columns)
df_test_multi_five = df_en_test[['text','misogyny_category']].rename(columns=_text_label_columns)

# Encode the category names as integers on the label column only; a
# whole-frame replace would also rewrite any tweet whose full text happens to
# equal a category name.
df_train_multi_five['label'] = df_train_multi_five['label'].replace(label_dict)
df_test_multi_five['label'] = df_test_multi_five['label'].replace(label_dict)


In [None]:
#extend every label except discredit, the majority class

for label_name, label_id in label_dict.items():
    if label_name != 'discredit':
        df_train_multi_five = enlarge_df(df_train_multi_five, label_id, 0.5)

df_train_multi_five['label'].value_counts()

45
72
167
88


1    982
3    501
4    264
2    217
0    135
Name: label, dtype: int64

In [None]:
#perform again a model selection, this time on the augmented five-class data

results = []

for config in updated_grid:

  print('Testing with configuration: \n')
  print(config, end = '\n\n')

  # dict of avg/std loss, accuracy and f1 across the folds
  tuple_result = transformer_crossval("vinai/bertweet-base", config, df_train_multi_five, bertweet_tokenizer, k=5, problem='multi')
  results.append((config, tuple_result))
  print(tuple_result)


In [None]:
#cross-validation results for BERTweet after EDA AUGMENTATION APPROACH


{'epoch': 5, 'learning_rate': 1e-05, 'per_device_train_batch_size': 16, 'per_device_validation_batch_size': 8, 'warmup_steps': 500, 'weight_decay': 0.0001}

{'avg_loss': 0.836749529838562, 'std_loss': 0.02075077741974913, 'avg_accuracy': 0.7359447004608295, 'std_accuracy': 0.008935815405375743, 'avg_f1': 0.6608662137094996, 'std_f1': 0.017969838953794084}

{'epoch': 5, 'learning_rate': 1e-05, 'per_device_train_batch_size': 16, 'per_device_validation_batch_size': 8, 'warmup_steps': 500, 'weight_decay': 0.01}

{'avg_loss': 0.8311222553253174, 'std_loss': 0.019607455106381306, 'avg_accuracy': 0.7345622119815667, 'std_accuracy': 0.009147204258654005, 'avg_f1': 0.6586178401516201, 'std_f1': 0.01348793240955729}

{'epoch': 5, 'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'per_device_validation_batch_size': 8, 'warmup_steps': 500, 'weight_decay': 0.01}


{'avg_loss': 0.8364081501960754, 'std_loss': 0.04591267743316787, 'avg_accuracy': 0.7497695852534563, 'std_accuracy': 0.017218010776285174, 'avg_f1': 0.7020609259300116, 'std_f1': 0.023128162178396007}



{'epoch': 5, 'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'per_device_validation_batch_size': 8, 'warmup_steps': 500, 'weight_decay': 0.0001}



{'avg_loss': 0.8197516322135925, 'std_loss': 0.051124447542434746, 'avg_accuracy': 0.7571428571428571, 'std_accuracy': 0.013870891191974566, 'avg_f1': 0.7080092538817573, 'std_f1': 0.0265260270643998}



In [None]:
#test result with the best parameter configuration

best_configuration = {'epoch': 5, 'learning_rate': 3e-05, 'per_device_train_batch_size': 16, 'per_device_validation_batch_size': 8, 'warmup_steps': 500, 'weight_decay': 0.0001}

transformer_fit_predict("vinai/bertweet-base", best_configuration, df_train_multi_five, df_test_multi_five, bertweet_tokenizer, problem = 'multi')



In [None]:
#testing the augmented dataset with MLP

parameters_mlp = {'hidden_layer_sizes': [(8,16,32),(16,32,64,128),(64,128,256),],
                 'activation':['relu','logistic','tanh'],
                 'solver': ['lbfgs','sgd','adam'],
                 'max_iter' : [9000],
                  'early_stopping' : [True]}


from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

#find lingustic features relevant for classification

def find_how_many(string_list,reference):
  """Count the items of ``string_list`` that compare equal to ``reference``."""
  return sum(1 for item in string_list if item == reference)

def extract_adjs(string):
  """Return how many tokens of ``string`` the spaCy pipeline tags as 'ADJ'."""
  parsed = nlp(string)
  return sum(1 for token in parsed if token.pos_ == 'ADJ')

def linguistic_extraction(dataset):
  """Compute four surface features for every text in ``dataset``.

  Each text yields ``[character length, 'HTTPURL' count, '@USER' count,
  adjective count]``; the counts are taken over whitespace-split tokens.

  Returns:
    list of those 4-element lists, in input order.
  """
  def _features(text):
    tokens = text.split()
    return [
        len(text),
        find_how_many(tokens, 'HTTPURL'),
        find_how_many(tokens, '@USER'),
        extract_adjs(text),
    ]

  return [_features(text) for text in dataset]

def perform_gridsearch(classifier, param_grid, cv, X_train, y_train, X_test, y_test, scoring = None):
  """Grid-search ``classifier`` on the training data and report on the test data.

  Fits a ``GridSearchCV`` (all cores, given ``cv`` and ``scoring``), then prints
  the best parameters, the best mean validation score with its standard
  deviation across folds, the test-set classification report and confusion
  matrix, and the mean validation score of every configuration.

  Returns:
    the best estimator found by the search.
  """
  search = GridSearchCV(estimator = classifier, param_grid = param_grid, n_jobs = -1, cv = cv, scoring = scoring)
  search.fit(X_train, y_train)

  best_model = search.best_estimator_
  test_predictions = best_model.predict(X_test)
  best_fold_std = search.cv_results_['std_test_score'][search.best_index_]
  separator = '******************'

  print('Best configuration', search.best_params_)
  print('Best mean score for the validation', search.best_score_)
  print('Std for the best mean score across folds', best_fold_std)
  print(separator)
  print('Classification report:')
  print(classification_report(y_test, test_predictions, digits = 7))
  print('Confusion matrix:')
  print(confusion_matrix(y_test, test_predictions))
  print(separator)
  print(search.cv_results_['mean_test_score'])
  return best_model

skf = StratifiedKFold(shuffle=True, random_state = 42, n_splits = 5)



In [None]:
# rebuild the multi-class text/label lists from the augmented five-class frames
x_train_multi = list(map(normalizeTweet, df_train_multi_five['Text']))
y_train_multi = list(df_train_multi_five['label'])
x_test_multi = list(map(normalizeTweet, df_test_multi_five['Text']))
y_test_multi = list(df_test_multi_five['label'])

In [None]:
# hand-crafted surface features (length, URL/user mentions, adjectives)
linguistic_features_train_multi = linguistic_extraction(x_train_multi)
linguistic_features_test_multi = linguistic_extraction(x_test_multi)

# token counts over the spaCy-derived features, fitted on the training texts only
vect = CountVectorizer(analyzer=spacy_nlp_tokenizer, min_df=5)

reset_counter()
X_train_tok_multi = vect.fit_transform(x_train_multi)

reset_counter()
X_test_tok_multi = vect.transform(x_test_multi)

# tf-idf weighting, again fitted on the training counts only
tfidf = TfidfTransformer()  # weighting
tfidf.fit(X_train_tok_multi)
X_train_vec_multi = tfidf.transform(X_train_tok_multi)
X_test_vec_multi = tfidf.transform(X_test_tok_multi)

X_train_vec_multi_arr = X_train_vec_multi.toarray()
X_test_vec_multi_arr = X_test_vec_multi.toarray()

# sentence embeddings of the same texts
embeddings_train_multi = model.encode(x_train_multi)
embeddings_test_multi = model.encode(x_test_multi)

# one feature vector per text: tf-idf row + embedding + linguistic features
X_train_full_multi = [
    np.concatenate((tfidf_row, embedding_row, linguistic_row), axis=0)
    for tfidf_row, embedding_row, linguistic_row
    in zip(X_train_vec_multi_arr, embeddings_train_multi, linguistic_features_train_multi)
]

X_test_full_multi = [
    np.concatenate((tfidf_row, embedding_row, linguistic_row), axis=0)
    for tfidf_row, embedding_row, linguistic_row
    in zip(X_test_vec_multi_arr, embeddings_test_multi, linguistic_features_test_multi)
]



In [None]:
#RESULTS WITH sparse matrix + embedding + linguistic features

perform_gridsearch(MLPClassifier(), parameters_mlp, skf, X_train_full_multi, y_train_multi, X_test_full_multi, y_test_multi, scoring = 'f1_macro')


Best configuration {'activation': 'tanh', 'early_stopping': True, 'hidden_layer_sizes': (64, 128, 256), 'max_iter': 9000, 'solver': 'adam'}
Best mean score for the validation 0.6607431437334649
Std for the best mean score across folds 0.026382555174533463
******************
Classification report:
              precision    recall  f1-score   support

           0  0.2142857 0.2727273 0.2400000        11
           1  0.4363636 0.8510638 0.5769231       141
           2  0.9333333 0.1129032 0.2014388       124
           3  0.3039216 0.7045455 0.4246575        44
           4  0.9629630 0.3714286 0.5360825       140

    accuracy                      0.4782609       460
   macro avg  0.5701734 0.4625337 0.3958204       460
weighted avg  0.7126198 0.4782609 0.4406545       460

Confusion matrix:
[[  3   4   0   4   0]
 [  4 120   0  16   1]
 [  4  63  14  42   1]
 [  0  13   0  31   0]
 [  3  75   1   9  52]]
******************
[0.46080958 0.12901667 0.34981831 0.55917537 0.12847652 0.61

MLPClassifier(activation='tanh', early_stopping=True,
              hidden_layer_sizes=(64, 128, 256), max_iter=9000)

# From 5 labels to 3 labels

In [None]:
df_train_multi

Unnamed: 0,Text,label
0,Please tell me why the bitch next to me in the...,2
1,@USER @USER Bitch shut the fuck up,2
2,"@USER Dear cunt , please shut the fuck up .",2
3,RT @USER : Pls shut the fuck up bitch,2
4,"RT @USER : "" when u gonna get your license "" S...",2
...,...,...
1722,@USER @USER @USER @USER This ugly bitch has a ...,1
1723,@USER Your lady probably is a bitch though . M...,1
1724,"Women are bitches . 1 time , this bitch in BAL...",1
1725,Any woman that likes me is a bitch . All women...,1


In [None]:
#What happens when dropping the most under-represented class and merging two of the others?

# Drop 'derailing'. The explicit .copy() makes the filtered frames independent
# of df_train_multi / df_test_multi, so the relabelling below neither warns
# (SettingWithCopy) nor silently does nothing under pandas copy-on-write.
df_train_multi_three = df_train_multi[df_train_multi['label'] != label_dict['derailing']].copy()
df_test_multi_three = df_test_multi[df_test_multi['label'] != label_dict['derailing']].copy()

# Merge 'dominance' into 'stereotype' by assignment instead of a chained
# in-place replace on a column view.
df_train_multi_three['label'] = df_train_multi_three['label'].replace(label_dict['dominance'], label_dict['stereotype'])
df_test_multi_three['label'] = df_test_multi_three['label'].replace(label_dict['dominance'], label_dict['stereotype'])

df_train_multi_three = df_train_multi_three.reset_index()
df_test_multi_three = df_test_multi_three.reset_index()



In [None]:
set(df_train_multi_three['label'])

{1, 3, 4}

In [None]:
label_dict
#0: discredit, 1: sexual_harassment, 2: stereotype_dominance

{'derailing': 0,
 'discredit': 1,
 'dominance': 2,
 'sexual_harassment': 3,
 'stereotype': 4}

In [None]:
# Re-number the three remaining classes as 0, 1, 2. Assign the result back
# instead of using a chained in-place replace, which warns in recent pandas and
# leaves the frame unchanged under copy-on-write.
# NOTE: this cell is not idempotent; re-running it remaps the new codes again.
_three_class_codes = {1: 0, 3: 1, 4: 2}
df_train_multi_three['label'] = df_train_multi_three['label'].replace(_three_class_codes)
df_test_multi_three['label'] = df_test_multi_three['label'].replace(_three_class_codes)

In [None]:
df_train_multi_three['label'].value_counts()

0    982
1    334
2    321
Name: label, dtype: int64

In [None]:
# Extend every label except discredit (label 0 after the remap), the majority class.
#
# Iterate over the labels actually present in the three-label frame rather than
# over label_dict: its keys are strings (so a `key == 0` guard can never fire)
# and its values are the original five ids, not the remapped 0..2 ones.
majority_label = 0
minority_labels = [
    int(label)
    for label in sorted(df_train_multi_three['label'].unique())
    if label != majority_label
]

for label in minority_labels:
    # NOTE(review): enlarge_df is defined elsewhere in the notebook; it is
    # presumably returning the frame with extra rows for `label` (0.5 looks
    # like the growth fraction) -- confirm against its definition.
    df_train_multi_three = enlarge_df(df_train_multi_three, label, 0.5)

df_train_multi_three['label'].value_counts()

491
167
160


0    1473
1     501
2     481
Name: label, dtype: int64

In [None]:
# Run the model selection again, this time on the three-label training frame.

results = []

for config in updated_grid:

  print('Testing with configuration: \n')
  print(config, end = '\n\n')

  # Per the original note: tuple_result = (avg_loss, std_loss, avg_accuracy, std_accuracy,)
  tuple_result = transformer_crossval(
      "vinai/bertweet-base",
      config,
      df_train_multi_three,
      bertweet_tokenizer,
      k=5,
      problem = 'multi',
  )

  # Keep each configuration paired with its cross-validation outcome.
  results.append((config, tuple_result))
  print(tuple_result)


In [None]:
# Test-set result with the best parameter configuration.
# NOTE(review): the values are hard-coded here, presumably copied from the
# model-selection run -- confirm they match the best entry of `results`.
best_configuration = {
    'epoch': 5,
    'learning_rate': 3e-05,
    'per_device_train_batch_size': 16,
    'per_device_validation_batch_size': 8,
    'warmup_steps': 500,
    'weight_decay': 0.0001,
}

# Bind the return value instead of leaving the call as the cell's last
# expression: the recorded output shows it is a tuple starting with the
# fine-tuned RobertaForSequenceClassification, whose full repr would otherwise
# be dumped into the cell output. Keeping it also makes the fitted model
# available to later cells.
multi_three_fit = transformer_fit_predict(
    "vinai/bertweet-base",
    best_configuration,
    df_train_multi_three,
    df_test_multi_three,
    bertweet_tokenizer,
    problem = 'multi',
)




PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--vinai--bertweet-base/snapshots/118ab1d567653bec16bbb081eafb6f8942f72108/config.json
Model config RobertaConfig {
  "_name_or_path": "vinai/bertweet-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1":

Step,Training Loss
20,1.0924
40,1.0671
60,1.0042
80,0.9602
100,0.9564
120,0.9011
140,0.8706
160,0.8269
180,0.6919
200,0.6774


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Text. If Text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 449
  Batch size = 8


Trainer is attempting to log a value of "[0.59798995 0.57407407 0.91326531]" of type <class 'numpy.ndarray'> for key "eval/precision" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.84397163 0.70454545 0.6780303 ]" of type <class 'numpy.ndarray'> for key "eval/recall" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "[0.7        0.63265306 0.77826087]" of type <class 'numpy.ndarray'> for key "eval/f1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


              precision    recall  f1-score   support

           0       0.60      0.84      0.70       141
           1       0.57      0.70      0.63        44
           2       0.91      0.68      0.78       264

    accuracy                           0.73       449
   macro avg       0.70      0.74      0.70       449
weighted avg       0.78      0.73      0.74       449

{'eval_loss': 0.9769389629364014, 'eval_accuracy': 0.732739420935412, 'eval_precision': array([0.59798995, 0.57407407, 0.91326531]), 'eval_recall': array([0.84397163, 0.70454545, 0.6780303 ]), 'eval_f1': array([0.7       , 0.63265306, 0.77826087]), 'eval_runtime': 6.7204, 'eval_samples_per_second': 66.811, 'eval_steps_per_second': 8.482, 'epoch': 5.0}


(RobertaForSequenceClassification(
   (roberta): RobertaModel(
     (embeddings): RobertaEmbeddings(
       (word_embeddings): Embedding(64001, 768, padding_idx=1)
       (position_embeddings): Embedding(130, 768, padding_idx=1)
       (token_type_embeddings): Embedding(1, 768)
       (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
       (dropout): Dropout(p=0.1, inplace=False)
     )
     (encoder): RobertaEncoder(
       (layer): ModuleList(
         (0): RobertaLayer(
           (attention): RobertaAttention(
             (self): RobertaSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.1, inplace=False)
             )
             (output): RobertaSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
