# Setup

Here we import the libraries we're going to use and install the required packages.


In [None]:
import tensorflow as tf
import torch
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

!pip install transformers
!pip install bert-tensorflow

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 8.1MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 30.0MB/s 
[?25hCollecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 54.6MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB

In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive.
drive.mount('/content/gdrive') 
# File name and full path (on the mounted drive) of the checkpoint; `path` is
# the target of the torch.save call made at the end of every training epoch.
model_save_name = 'bert_ft_epoch.pt'
path = F"/content/gdrive/My Drive/{model_save_name}" 

Mounted at /content/gdrive


# Load Data

We then proceed by reading the data from the .csv file and peek at the first elements of our dataset.

After this we'll have a deeper look at the misogynous and aggressiveness classes.

In [None]:
# Read the training CSV (looked up relative to the current working directory)
# and preview its first rows.
data = pd.read_csv('AMI2020_training_raw.csv')
data.head()

Unnamed: 0,id,text,misogynous,aggressiveness
0,1,@KassemAmin4 @Laylasexgdr Fatti trovare te lo...,1,1
1,2,@meb Tu dovresti ricominciare dai semafori a f...,1,1
2,3,"Amore,sei presentabile? Xchè così via Skype ti...",1,1
3,4,"@Il_nulla Salvo poi mandare la culona a Mosca,...",1,0
4,5,@GiorgiaMeloni @FratellidItaIia Vediamo Gentil...,1,1


In [None]:
def label_col(row):
  """Collapse the two binary annotations of a row into a single class id.

  (misogynous, aggressiveness) -> class id:
    (0, 0) -> 0
    (1, 0) -> 1
    (1, 1) -> 2
  Any other combination yields None.
  """
  misogynous = row['misogynous']
  if misogynous == 0:
    # Not misogynous: only the non-aggressive case has a class.
    return 0 if row['aggressiveness'] == 0 else None
  if misogynous == 1:
    aggressive = row['aggressiveness']
    if aggressive == 0:
      return 1
    if aggressive == 1:
      return 2
  return None

In [None]:
# Derive the combined class id for every row (axis=1 -> one call per row)
# and preview the frame with the new column.
data['labels'] = data.apply(label_col, axis=1)
data.head()

Unnamed: 0,id,text,misogynous,aggressiveness,labels
0,1,@KassemAmin4 @Laylasexgdr Fatti trovare te lo...,1,1,2
1,2,@meb Tu dovresti ricominciare dai semafori a f...,1,1,2
2,3,"Amore,sei presentabile? Xchè così via Skype ti...",1,1,2
3,4,"@Il_nulla Salvo poi mandare la culona a Mosca,...",1,0,1
4,5,@GiorgiaMeloni @FratellidItaIia Vediamo Gentil...,1,1,2


In [None]:
# Number of rows per combined class id.
data.labels.value_counts()

0    2663
2    1783
1     554
Name: labels, dtype: int64

In [None]:
# Maps each combined class id back to its [misogynous, aggressiveness] pair,
# i.e. the inverse of the encoding performed by label_col.
label_dict = {0: [0,0], 1: [1,0], 2: [1,1]}
label_dict

{0: [0, 0], 1: [1, 0], 2: [1, 1]}

# Data Preparation

We will first split the dataset into two different sets:


1.   the "training" set, which will be used to actually train the model
2.   the "validation" (and "test", here) set, which we are going to use to evaluate the model



In [None]:
from sklearn.model_selection import train_test_split

# Split dataset into training and validation (test) sets.
# The row index values are split (not the rows themselves), so X_train / X_val
# hold index labels of `data`; the split is stratified on the combined label
# and uses a fixed random_state.
X_train, X_val, Y_train, Y_val = train_test_split(
    data.index.values,
    data.labels.values,
    test_size=0.06,
    random_state=17,
    stratify=data.labels.values
)

In [None]:
# Tag every row with the split it was assigned to, then count the rows per
# (misogynous, split) pair to inspect the composition of both sets.
data['data_type'] = 'not_set'
for split_name, split_index in (('train', X_train), ('val', X_val)):
    data.loc[split_index, 'data_type'] = split_name
data.groupby(['misogynous', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,id,text,aggressiveness,labels
misogynous,data_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,train,2503,2503,2503,2503
0,val,160,160,160,160
1,train,2197,2197,2197,2197
1,val,140,140,140,140


In [None]:
from transformers import BertTokenizer
from torch.utils.data import TensorDataset

# Load pre-trained BERT Tokenizer
# NOTE(review): 'bert-base-uncased' is presumably the English-only checkpoint,
# while the tweets previewed in this notebook look Italian — confirm that this
# vocabulary is the intended one.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
# Encode training dataset using the tokenizer.
# padding='max_length' replaces the deprecated pad_to_max_length flag, and
# truncation=True makes explicit the truncation that was previously applied
# only implicitly (with a warning): every sequence is padded or cut to exactly
# max_length token ids.
encoded_data_train = tokenizer.batch_encode_plus(
    data[data.data_type == 'train'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,  # so we know when a sentence is finished
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

# Encode validation dataset using the tokenizer (same settings as above)
encoded_data_val = tokenizer.batch_encode_plus(
    data[data.data_type == 'val'].text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Extract IDs, attention masks and labels from training dataset
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
# The labels are selected with the same data_type == 'train' filter that was
# used for the encoded texts, so both come from the same rows in the same order.
labels_train = torch.tensor(data[data.data_type == 'train'].labels.values)
labels_train

tensor([2, 2, 2,  ..., 0, 0, 0])

In [None]:
# Extract IDs, attention masks and labels from validation dataset
input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
# Same data_type == 'val' filter as used when encoding the validation texts.
labels_val = torch.tensor(data[data.data_type == 'val'].labels.values)

In [None]:
# Create train and validation dataset from extracted features.
# Each dataset item is an (input_ids, attention_mask, label) triple, which is
# the order the training and evaluation loops index into (batch[0..2]).
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
print("Train dataset length: {}\nValidation dataset length: {}".format(len(dataset_train), len(dataset_val)))

Train dataset length: 4700
Validation dataset length: 300


In [None]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

# Define the size of each batch
batch_size = 32

# Training loader: samples are drawn in random order.
dataloader_train = DataLoader(
    dataset_train,
    sampler=RandomSampler(dataset_train),
    batch_size=batch_size)

# Validation loader: shuffling brings nothing for evaluation, so iterate in
# dataset order. This keeps the returned predictions aligned with the rows of
# the validation set and makes repeated evaluations visit the data identically.
dataloader_val = DataLoader(
    dataset_val,
    sampler=SequentialSampler(dataset_val),
    batch_size=batch_size)

In [None]:
from transformers import BertForSequenceClassification
# Load pre-trained BERT model
# num_labels=3 matches the three combined class ids (0, 1, 2) stored in the
# `labels` column; attention weights and hidden states are not requested.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      num_labels=3,
                                                      output_attentions=False,
                                                      output_hidden_states=False
                                                      )

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

# Define model optimizer -> Adam
optimizer = AdamW(
    model.parameters(),
    lr = 1e-5, 
    eps=1e-8
)
# Define model scheduler
# The total number of steps is (batches per epoch) * epochs because the
# training loop calls scheduler.step() once per batch; no warm-up steps are used.
epochs = 8
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [None]:
import random

# Define random seeds for Python's `random`, NumPy and PyTorch (CPU and all GPUs).
# NOTE(review): in notebook order this cell comes after the model was
# instantiated, so the weights that from_pretrained reports as newly
# initialised are presumably not covered by these seeds — consider seeding
# before the model is loaded.
seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [None]:
# Define processor type for torch: use CUDA when available, otherwise the CPU,
# and move the model to that device. Batches are moved to the same device
# inside the training / evaluation / prediction loops.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
device

device(type='cuda')

In [None]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    """Return the weighted F1 score of the arg-max predictions.

    The predicted class of each row of `preds` is the index of its largest
    value along axis 1; `labels` holds the true class ids and is flattened
    before scoring.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    return f1_score(y_true=true_classes, y_pred=predicted_classes, average='weighted')

def accuracy_per_class(preds, labels, label_names=None):
    """Print, for every class present in `labels`, how many samples were predicted correctly.

    Args:
        preds: 2-D array of per-class scores, one row per sample; the
            predicted class is the arg-max along axis 1.
        labels: array of true class ids.
        label_names: optional mapping from class id to the value printed as
            the class name. Defaults to the notebook-level `label_dict`; ids
            missing from the mapping are printed as-is.

    The previous implementation built an inverted copy of `label_dict`
    ({v: k ...}). Its values are lists, which are unhashable, so the function
    raised TypeError on every call. The class id is already the key of
    `label_dict`, so it is looked up directly instead.
    """
    if label_names is None:
        label_names = label_dict

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        is_label = labels_flat == label
        n_correct = int(np.sum(preds_flat[is_label] == label))
        n_total = int(np.sum(is_label))
        print(f'Class: {label_names.get(label, label)}')
        print(f'Accuracy:{n_correct}/{n_total}\n')

In [None]:
def evaluate(dataloader_val):
  """Run the model over a labelled dataloader without computing gradients.

  Every batch is indexed as (input_ids, attention_mask, labels).

  Returns a tuple (average loss per batch, logits of all batches stacked
  along axis 0, true labels of all batches stacked along axis 0).
  """
  model.eval()
  total_loss = 0
  logits_per_batch, labels_per_batch = [], []

  for batch in dataloader_val:
    on_device = tuple(tensor.to(device) for tensor in batch)
    labels = on_device[2]

    with torch.no_grad():
      outputs = model(input_ids=on_device[0],
                      attention_mask=on_device[1],
                      labels=labels)

    # With labels supplied, the first output is used as the loss and the
    # second as the logits.
    total_loss += outputs[0].item()
    logits_per_batch.append(outputs[1].detach().cpu().numpy())
    labels_per_batch.append(labels.cpu().numpy())

  mean_loss = total_loss / len(dataloader_val)

  return (mean_loss,
          np.concatenate(logits_per_batch, axis=0),
          np.concatenate(labels_per_batch, axis=0))

In [None]:
# Training the model on the training set and checking the results on the validation
for epoch in tqdm(range(1, epochs + 1)):

    # Back to training mode: evaluate() leaves the model in eval mode.
    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()  # to backpropagate

        # Rescale the gradients when their overall norm exceeds 1.0
        # (guards against exploding gradients).
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        # Show the batch loss as it is. It used to be divided by len(batch),
        # but `batch` is the tuple of tensors (length 3), not the samples, so
        # the division only distorted the displayed value.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item())})

    # Checkpoint after every epoch; `path` is the same each time, so the file
    # always holds the most recent epoch.
    torch.save(model, path)
    tqdm.write(f'\nEpoch {epoch}/{epochs}')

    loss_train_avg = loss_train_total / len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')  # make sure that model is still training

    val_loss, predictions, true_vals = evaluate(dataloader_val)  # to check overtraining (or overfitting)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score(weighted) : {val_f1}')

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, description='Epoch 1', max=147.0, style=ProgressStyle(description_widt…


Epoch 1/8
Training loss: 0.731764883208437
Validation loss: 0.5541198045015335
F1 Score(weighted) : 0.7315387305472986


HBox(children=(FloatProgress(value=0.0, description='Epoch 2', max=147.0, style=ProgressStyle(description_widt…


Epoch 2/8
Training loss: 0.5598283365470211
Validation loss: 0.47157498002052306
F1 Score(weighted) : 0.7556617545568952


HBox(children=(FloatProgress(value=0.0, description='Epoch 3', max=147.0, style=ProgressStyle(description_widt…


Epoch 3/8
Training loss: 0.4877256167178251
Validation loss: 0.4605808973312378
F1 Score(weighted) : 0.8027365089022177


HBox(children=(FloatProgress(value=0.0, description='Epoch 4', max=147.0, style=ProgressStyle(description_widt…


Epoch 4/8
Training loss: 0.42985436746052336
Validation loss: 0.49327516853809356
F1 Score(weighted) : 0.7964171213466987


HBox(children=(FloatProgress(value=0.0, description='Epoch 5', max=147.0, style=ProgressStyle(description_widt…


Epoch 5/8
Training loss: 0.383690375961414
Validation loss: 0.4553853988647461
F1 Score(weighted) : 0.7998342578650992


HBox(children=(FloatProgress(value=0.0, description='Epoch 6', max=147.0, style=ProgressStyle(description_widt…


Epoch 6/8
Training loss: 0.3384692285235236
Validation loss: 0.45757636427879333
F1 Score(weighted) : 0.7997643627381745


HBox(children=(FloatProgress(value=0.0, description='Epoch 7', max=147.0, style=ProgressStyle(description_widt…


Epoch 7/8
Training loss: 0.31054679917640426
Validation loss: 0.4599923640489578
F1 Score(weighted) : 0.8035771670190276


HBox(children=(FloatProgress(value=0.0, description='Epoch 8', max=147.0, style=ProgressStyle(description_widt…


Epoch 8/8
Training loss: 0.29550142353083814
Validation loss: 0.47993555963039397
F1 Score(weighted) : 0.814263588832988



In [None]:
# Check the results obtained on the validation set: evaluate once more with
# the current weights (the loss is discarded) and show the weighted F1 score.
_, predictions, true_vals = evaluate(dataloader_val)
f1_score_func(predictions, true_vals)

0.814263588832988

# Test

In [None]:
# Read the test CSV (looked up relative to the current working directory)
# and preview its first rows.
data_test = pd.read_csv('AMI2020_test_raw.csv')
data_test.head()

Unnamed: 0,id,text
0,5001,Aveva voglia di gridare tutta la mia rabbia ma...
1,5002,Lei è acida perché non ha figli penso che dare...
2,5003,Ma quanto è brutto sentirsi dire dal proprio f...
3,5004,Per chi ci facciamo venire il groppo in gola s...
4,5005,@sprankthatbooty Ti sborro io


In [None]:
# Encode test dataset using the tokenizer.
# padding='max_length' replaces the deprecated pad_to_max_length flag, and
# truncation=True makes explicit the truncation that was previously applied
# only implicitly (with a warning): every sequence is padded or cut to exactly
# max_length token ids.
encoded_data_test = tokenizer.batch_encode_plus(
    data_test.text.values,
    add_special_tokens=True,
    return_attention_mask=True,
    padding='max_length',
    truncation=True,
    max_length=256,
    return_tensors='pt'
)

Truncation was not explicitely activated but `max_length` is provided a specific value, please use `truncation=True` to explicitely truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Extract IDs and attention masks from the test dataset (no labels are used here)
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']

# Each dataset item is an (input_ids, attention_mask) pair.
dataset_test = TensorDataset(input_ids_test, attention_masks_test)
print("Test dataset length: {}".format(len(dataset_test)))

Test dataset length: 1000


In [None]:
from torch.utils.data import DataLoader
# No batch_size or sampler is passed, so DataLoader's defaults apply.
# NOTE(review): the later `np.argmax(prediction, axis=1)[0]` keeps only the
# first row of each batch, so it relies on batches holding a single sample —
# keep that in mind before setting a batch_size here.
dataloader_test = DataLoader(dataset_test)

In [None]:
def predict(dataset_test):
    """Compute the model's logits for unlabelled data.

    Args:
        dataset_test: iterable yielding (input_ids, attention_mask) batches of
            tensors. Despite the name, the notebook passes a DataLoader.

    Returns:
        A list with one numpy array of logits per batch.
    """
    # Switch to evaluation mode explicitly instead of relying on evaluate()
    # having been the last function to touch the model.
    model.eval()
    predictions = []

    for row in dataset_test:
        row = tuple(r.to(device) for r in row)
        inputs = {'input_ids': row[0],
                  'attention_mask': row[1]}

        with torch.no_grad():
            outputs = model(**inputs)

        # No labels are passed, so the first output is taken as the logits.
        logits = outputs[0].detach().cpu().numpy()
        predictions.append(logits)

    return predictions

# Predict values for test dataset.
# predict() is given the DataLoader, so it iterates over batches and returns
# one logits array per batch.
predictions = predict(dataloader_test)

In [None]:
print(len(predictions))
# Each entry of `predictions` holds the logits of one batch; keep the arg-max
# class of its first row.
results = [np.argmax(prediction, axis=1)[0] for prediction in predictions]

print(results)

# Number of test rows assigned to each class id
for class_id in (0, 1, 2):
  print(results.count(class_id))

1000
[2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 0, 2, 1, 2, 2, 0, 0, 0, 2, 1, 2, 2, 0, 2, 2, 2, 2, 2, 1, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 1, 2, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 1, 1, 0, 2, 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 2, 0, 2, 0, 1, 0, 2, 0, 2, 0, 2, 0, 2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 2, 2, 0, 2, 0, 0, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 1, 2, 0, 1, 0, 0, 2, 2, 0, 0, 2, 2, 2, 2, 1, 2, 1, 2, 0, 1, 2, 2, 2, 2, 2, 2, 0, 2, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 2, 0, 0, 1, 2, 0, 0, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0, 2, 0, 1, 2, 0, 2, 1, 0, 0, 2, 0, 2, 2, 2, 2, 0, 1, 1, 2, 0, 0, 2, 0, 2, 0, 0, 2, 2, 2, 2, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 0, 1, 2, 1, 0, 2, 2, 0, 2, 2, 2, 2, 0, 0, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 0, 0, 2, 2, 2, 2, 2, 0, 2, 2, 0, 2, 0, 2, 2, 2, 2, 1, 2, 2, 1, 2, 0, 2, 1, 2, 2, 2, 0, 2, 0, 0, 0, 0, 2, 1, 2, 2, 1, 0, 0, 2, 1, 2, 2, 2, 0, 2, 2, 0, 2, 0, 1

In [None]:
# Class id -> [misogynous, aggressiveness]
label_dict = {0: [0,0], 1: [1,0], 2: [1,1]}

# Decode every predicted class id into its pair, then split the pairs into
# the two output columns.
pairs = [label_dict[class_id] for class_id in results]
misogynous = [pair[0] for pair in pairs]
aggressiveness = [pair[1] for pair in pairs]

print(f"Misogynous length: {len(misogynous)}\nAggressiveness length: {len(aggressiveness)}")

Misogynous length: 1000
Aggressiveness length: 1000


In [None]:
# Attach the decoded predictions to the test frame (one value per row, in the
# order the rows were encoded) and preview the result.
data_test['misogynous'] = misogynous
data_test['aggressiveness'] = aggressiveness
data_test.head(20)

Unnamed: 0,id,text,misogynous,aggressiveness
0,5001,Aveva voglia di gridare tutta la mia rabbia ma...,1,1
1,5002,Lei è acida perché non ha figli penso che dare...,0,0
2,5003,Ma quanto è brutto sentirsi dire dal proprio f...,1,1
3,5004,Per chi ci facciamo venire il groppo in gola s...,1,1
4,5005,@sprankthatbooty Ti sborro io,1,1
5,5006,@Fedoc5 in ginocchio con te che comandi il rit...,1,1
6,5007,La cena è pronta. Lo squalo balena affonda nei...,1,1
7,5008,ho appena letto quello che ha pubblicato Hazza...,0,0
8,5009,Ma ci sono sempre stati tutti sti tuit di gent...,0,0
9,5010,@NoraNature1 Che troia che sei &#x2764;️&#x1f60d;,1,1


In [None]:
# Distribution of the predicted `misogynous` flag on the test set.
data_test.misogynous.value_counts()

1    662
0    338
Name: misogynous, dtype: int64

In [None]:
# Distribution of the predicted `aggressiveness` flag on the test set.
data_test.aggressiveness.value_counts()

1    560
0    440
Name: aggressiveness, dtype: int64

In [None]:
# Keep only the id and the two predicted columns; drop() returns a new frame,
# so data_test itself still contains the text.
final_df = data_test.drop('text', axis=1)
final_df

Unnamed: 0,id,misogynous,aggressiveness
0,5001,1,1
1,5002,0,0
2,5003,1,1
3,5004,1,1
4,5005,1,1
...,...,...,...
995,5996,1,1
996,5997,1,1
997,5998,1,1
998,5999,1,0


In [None]:
# Write the predictions to a CSV file in the current directory.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# presumably written as an extra unnamed first column — confirm that this is
# what the consumer of the file expects.
final_df.to_csv('./AriannaMuti.a.r.c.run3.csv')

In [None]:
# Save the final model to Google Drive. The model object itself is passed to
# torch.save (not a state_dict).
model_path = "/content/gdrive/My Drive/TheOne" 
torch.save(model, model_path)