# BERT: binary classification

In [None]:
pip install transformers

In [None]:
import torch
from torch.utils.data import (TensorDataset, DataLoader,
                              RandomSampler, SequentialSampler)

from transformers import BertTokenizer, BertConfig
from transformers import BertForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup

from distutils.version import LooseVersion as LV

from sklearn.model_selection import train_test_split

import io

import pandas as pd
import numpy as np

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [None]:
# Select the compute device: CUDA when a GPU is visible to PyTorch, else the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
# Bracketed name of GPU 0 for the log line; empty string when running on the CPU.
devicename = '['+torch.cuda.get_device_name(0)+']' if use_cuda else ""

print('Using PyTorch version:', torch.__version__,
      'Device:', device, devicename)
# Fail fast when the installed PyTorch is older than 1.0.0.
assert(LV(torch.__version__) >= LV("1.0.0"))

## Loading dataset

In [None]:
import pandas as pd
df = pd.read_csv('.../mean_tfIdf.csv', delimiter=',', skiprows=0, lineterminator='\n', low_memory=False)
# Turn the textual labels into numeric ones: bull -> 1, bear -> 0.
# The result is assigned back to the column instead of calling
# `df['value'].replace(..., inplace=True)`: that form mutates a temporary Series
# (chained assignment) and leaves `df` unchanged when pandas Copy-on-Write is on.
# NOTE(review): both patterns require at least one character after the prefix
# (presumably a trailing '\r', since rows are split on '\n' only) — confirm
# against the CSV.
df['value'] = df['value'].replace(to_replace="bull.+", value=1, regex=True)
df['value'] = df['value'].replace(to_replace="bear.+", value=0, regex=True)

In [None]:
# Train/test split: 30% of the rows are held out for testing.
from sklearn.model_selection import train_test_split

# Inputs are the cleaned tweet texts, targets are the numeric labels.
tweets, y = df['clean'].values, df['value'].values

x_train1, x_test1, y_train1, y_test1 = train_test_split(
    tweets, y,
    test_size=0.3,
    random_state=1000,
)

In [None]:
# Copy the split arrays into plain Python lists, keeping element order.
x_train, y_train = list(x_train1), list(y_train1)
x_test, y_test = list(x_test1), list(y_test1)


In [None]:
# Sanity check: number of texts and labels in the training and test splits.
len(x_train),len(y_train),len(x_test),len(y_test)

In [None]:
# Distinct label values present in the training split.
set(y_train)

In [None]:
# Index of one training example to inspect; its text is shown as the cell output.
sample_idx = 10
x_train[sample_idx]

In [None]:
# Label of the training example selected by `sample_idx`.
y_train[sample_idx]

# Binary classification

Bear vs Bull

In [None]:
import numpy as np

# Boolean targets: True for the "bull" class (encoded as 1 when the dataset was
# loaded), False for everything else. The comparison uses this fixed class id
# rather than the label of whichever tweet sits at `sample_idx`, so the meaning
# of the positive class no longer depends on how the random split ordered the data.
POSITIVE_LABEL = 1
y_train_bin = np.asarray(y_train) == POSITIVE_LABEL
y_test_bin = np.asarray(y_test) == POSITIVE_LABEL
y_train_bin,y_test_bin

# Prepariamo il testo per BERT.

In [None]:
# Prefix every tweet with the "[CLS]" marker before tokenization.
CLS_PREFIX = "[CLS] "
train_sentences = [CLS_PREFIX + text for text in x_train]
test_sentences = [CLS_PREFIX + text for text in x_test]
# Convert the boolean targets into integer class ids (truthy -> 1, falsy -> 0).
train_labels = [int(bool(flag)) for flag in y_train_bin]
test_labels = [int(bool(flag)) for flag in y_test_bin]

Adesso convertiamo i tweets in tokens, usando il tokenizer di BERT.

In [None]:
# Name of the pretrained checkpoint; the same name is used later to load the model.
BERTMODEL = "bert-base-uncased"

# Load the tokenizer for that checkpoint, lower-casing the input text.
tokenizer = BertTokenizer.from_pretrained(BERTMODEL,
                                          do_lower_case=True)
# Show the tokenizer object as the cell output.
tokenizer

In [None]:
from tqdm import tqdm


def _tokenize_all(sentences):
    """Tokenize every sentence with the BERT tokenizer, showing a progress bar."""
    return [tokenizer.tokenize(sentence) for sentence in tqdm(sentences)]


train_tokenized = _tokenize_all(train_sentences)
test_tokenized = _tokenize_all(test_sentences)

In [None]:
# Show the tokens of the first training sentence, before any truncation is applied.
print ("The full tokenized first training sentence:")
print (train_tokenized[0])

In [None]:
# Maximum sequence lengths (in tokens, end marker included) for each split.
TRAIN_MAX_LEN, TEST_MAX_LEN = 128, 512

# Keep at most MAX_LEN-1 tokens and append the end-of-sequence marker, so every
# sequence holds at most MAX_LEN tokens. The marker has to be the literal special
# token '[SEP]' (brackets included, like the "[CLS]" prefix added earlier); a
# bare 'SEP' is an ordinary string and is not the separator token.
train_tokenized = [t[:(TRAIN_MAX_LEN-1)]+['[SEP]'] for t in train_tokenized]
test_tokenized  = [t[:(TEST_MAX_LEN-1)]+['[SEP]'] for t in test_tokenized]

print ("The truncated tokenized first training sentence:")
print (train_tokenized[0])

In [None]:
# Map tokens to vocabulary ids, then right-pad every sequence with zeros
# (np.pad's default constant) up to the split's maximum length, producing one
# rectangular array per split.
train_ids = [tokenizer.convert_tokens_to_ids(t) for t in train_tokenized]
train_ids = np.array([np.pad(i, (0, TRAIN_MAX_LEN-len(i)),
                             mode='constant') for i in train_ids])

test_ids = [tokenizer.convert_tokens_to_ids(t) for t in test_tokenized]
test_ids = np.array([np.pad(i, (0, TEST_MAX_LEN-len(i)),
                            mode='constant') for i in test_ids])

print ("The indices of the first training sentence:")
# The array is named `train_ids`; the former `ids_train` was never defined and
# raised a NameError.
print (train_ids[0])

In [None]:
# Attention masks: 1.0 where the token id is positive, 0.0 where it is not
# (i.e. on the zero padding).
amasks_train = [[float(token_id > 0) for token_id in ids] for ids in train_ids]
amasks_test = [[float(token_id > 0) for token_id in ids] for ids in test_ids]

Usiamo train_test_split() di Scikit-Learn per usare il 10% del training dataset come validation set, e poi convertiamo tutti i dati in torch.tensor.

In [None]:
# Hold out 10% of the training examples for validation. Ids, attention masks and
# labels go through one call, so all three are partitioned by the same shuffle.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(train_ids, amasks_train,
                                                     train_labels,
                                                     random_state=42,
                                                     test_size=0.1)

# Convert every split to torch tensors.
train_inputs, train_labels, train_masks = (
    torch.tensor(train_inputs),
    torch.tensor(train_labels),
    torch.tensor(train_masks),
)
validation_inputs, validation_labels, validation_masks = (
    torch.tensor(validation_inputs),
    torch.tensor(validation_labels),
    torch.tensor(validation_masks),
)
test_inputs, test_labels, test_masks = (
    torch.tensor(test_ids),
    torch.tensor(test_labels),
    torch.tensor(amasks_test),
)

Poi creiamo i *DataLoader* di PyTorch per tutti i set di dati.

Abbiamo provato sia 16 che 32 per il batch size. Il risultato era leggermente migliore con 32.

In [None]:
BATCH_SIZE = 32


def _build_loader(inputs, masks, labels, sampler_cls):
    """Wrap three aligned tensors in a TensorDataset and a batched DataLoader.

    Returns the dataset, the sampler instance and the loader, in that order.
    """
    data = TensorDataset(inputs, masks, labels)
    sampler = sampler_cls(data)
    loader = DataLoader(data, sampler=sampler, batch_size=BATCH_SIZE)
    return data, sampler, loader


print('Datasets:')

# Training examples are drawn in random order.
print('Train: ', end="")
train_data, train_sampler, train_dataloader = _build_loader(
    train_inputs, train_masks, train_labels, RandomSampler)
print(len(train_data), 'tweet')

# Validation and test examples are visited sequentially.
print('Validation: ', end="")
validation_data, validation_sampler, validation_dataloader = _build_loader(
    validation_inputs, validation_masks, validation_labels, SequentialSampler)
print(len(validation_data), 'tweet')

print('Test: ', end="")
test_data, test_sampler, test_dataloader = _build_loader(
    test_inputs, test_masks, test_labels, SequentialSampler)
print(len(test_data), 'tweet')

# BERT MODEL INITIALIZATION

In [None]:
pip install pytorch-pretrained-bert

In [None]:
# Load the pretrained checkpoint with a two-class classification head.
model = BertForSequenceClassification.from_pretrained(BERTMODEL,
                                                      num_labels=2)
# Move the model to the device chosen at the top of the notebook. `model.cuda()`
# raised on CPU-only machines even though `device` already falls back to the CPU
# and the batches are moved with `.to(device)`.
model.to(device)
print('Pretrained BERT model "{}" loaded'.format(BERTMODEL))

In [None]:
# Print the module structure of the loaded model.
print(model)

In [None]:
# Training hyperparameters.
EPOCHS = 4
WEIGHT_DECAY = 0.01
LR = 2e-5
# Warm-up length: 20% of the number of batches in one epoch.
WARMUP_STEPS =int(0.2*len(train_dataloader))
total_training_steps = len(train_dataloader)*EPOCHS

# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.weight']
decayed_params, undecayed_params = [], []
for param_name, param in model.named_parameters():
    if any(nd in param_name for nd in no_decay):
        undecayed_params.append(param)
    else:
        decayed_params.append(param)

optimizer_grouped_parameters = [
    {'params': decayed_params, 'weight_decay': WEIGHT_DECAY},
    {'params': undecayed_params, 'weight_decay': 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=LR, eps=1e-8)
# Linear warm-up followed by linear decay over the whole training run.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_training_steps,
)

## Training

In [None]:
def train(epoch, loss_vector=None, log_interval=200):
  """Run one pass over `train_dataloader`, updating the global model.

  Args:
    epoch: Epoch number, used only in the progress message.
    loss_vector: Optional list; when given, the loss of every batch is
      appended to it as a Python float.
    log_interval: A progress line is printed for every step that is a
      multiple of this value (step 0 included).

  Uses the module-level `model`, `optimizer`, `scheduler`, `device` and
  `train_dataloader`. Returns nothing.
  """
  model.train()

  n_batches = len(train_dataloader)
  n_examples = len(train_dataloader.dataset)

  for step, batch in enumerate(train_dataloader):
    # Move the batch to the target device and split it into its three tensors.
    b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)

    # Clear gradients left over from the previous step.
    optimizer.zero_grad()

    # With labels supplied, the first element of the model output is the loss.
    loss = model(b_input_ids, token_type_ids=None,
                 attention_mask=b_input_mask, labels=b_labels)[0]
    if loss_vector is not None:
        loss_vector.append(loss.item())

    # Backpropagate, then advance the optimizer and the learning-rate schedule.
    loss.backward()
    optimizer.step()
    scheduler.step()

    if step % log_interval != 0:
        continue
    print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, step * len(b_input_ids),
            n_examples,
            100. * step / n_batches, loss))

def evaluate(loader):
  """Print the classification accuracy of the global model over `loader`.

  Args:
    loader: Iterable of (input_ids, attention_mask, labels) tensor batches.

  Switches the module-level `model` to evaluation mode, predicts the argmax
  class for every example and prints 'Accuracy: [correct/total] ratio'.
  Returns nothing.
  """
  model.eval()

  hits = 0
  total = 0

  # Inference only: no gradients are needed anywhere in this loop.
  with torch.no_grad():
    for batch in loader:
      input_ids, input_mask, gold = (t.to(device) for t in batch)

      # Without labels, the first element of the model output holds the logits.
      logits = model(input_ids, token_type_ids=None,
                     attention_mask=input_mask)[0]

      # Compare predicted and reference classes on the CPU as numpy arrays.
      predicted = np.argmax(logits.detach().cpu().numpy(), axis=1)
      reference = gold.to('cpu').numpy()

      hits += np.sum(predicted == reference)
      total += len(reference)

  print('Accuracy: [{}/{}] {:.4f}'.format(hits, total,
                                          hits/total))

In [None]:
# Per-batch training losses, accumulated across all epochs for the plot below.
train_lossv = []
for epoch in range(1, EPOCHS + 1):
    print()
    # One pass over the training set; each batch loss is appended to train_lossv.
    train(epoch, train_lossv)
    print('\nValidation set:')
    # Report accuracy on the held-out validation split after every epoch.
    evaluate(validation_dataloader)

In [None]:
%matplotlib inline
plt.figure(figsize=(15,8))
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.plot(train_lossv, label='original')
plt.plot(np.convolve(train_lossv, np.ones(101), 'same') / 101,
         label='averaged')
plt.legend(loc='best')
plt.show()

## Evaluation

In [None]:
# Final check: accuracy of the fine-tuned model on the held-out test split.
print('Test set:')
evaluate(test_dataloader)