# Test Bert

In [1]:
# load packages 
import numpy as np 
import pandas as pd 

from transformers import BertTokenizer, BertForSequenceClassification

import torch

from torch.utils.data import TensorDataset, DataLoader

from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split


In [2]:
# load file 
# from src.data_ingestion import * 


In [3]:
# load data 
train_data = pd.read_csv('advanced_trainset.csv')
train_data.head()

Unnamed: 0,Sentence,Sentiment
0,According to the Finnish-Russian Chamber of Co...,neutral
1,The Swedish buyout firm has sold its remaining...,neutral
2,$SPY wouldn't be surprised to see a green close,positive
3,Shell's $70 Billion BG Deal Meets Shareholder ...,negative
4,SSH COMMUNICATIONS SECURITY CORP STOCK EXCHANG...,negative


In [4]:
# train test split
X, y = train_data['Sentence'].values, train_data['Sentiment'].values
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, train_size=3300)
print(f'shape of train data is {x_train.shape}')
print(f'shape of test data is {x_test.shape}')


shape of train data is (3300,)
shape of test data is (1082,)


In [5]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True
)


In [6]:
encoded_data_train = tokenizer.batch_encode_plus(
    x_train,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=40,
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    x_test,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=40,
    return_tensors='pt'
)

def assign_label(y): 
    if y == 'positive':
        return 2 
    elif y == 'neutral':
        return 1
    else:
        return 0
    
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor([assign_label(y) for y in y_train])

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor([assign_label(y) for y in y_test])


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
dataset_train = TensorDataset(input_ids_train,
                              attention_masks_train,
                              labels_train)

dataset_val = TensorDataset(input_ids_val,
                            attention_masks_val,
                            labels_val)


In [8]:
len(dataset_train)


3300

In [9]:
# model architecture 
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
#     num_hidden_layers=20,
#     hidden_dropout_prob=0.2
)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
# dataloaders
batch_size = 20

# make sure to SHUFFLE your data
train_loader = DataLoader(dataset_train, shuffle=True, batch_size=batch_size, drop_last=True)
valid_loader = DataLoader(dataset_val, shuffle=True, batch_size=batch_size, drop_last=True)


In [11]:
from transformers import AdamW, get_linear_schedule_with_warmup


In [12]:
optimizer = AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-8
)




In [13]:
epochs = 10

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_loader)*epochs
)


In [14]:
from sklearn.metrics import f1_score


In [15]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def acc_score_func(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return (pred_flat == labels_flat).mean()

In [16]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {0: 'negative', 1: 'neutral', 2: 'positive'}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat == label]
        y_true = labels_flat[labels_flat == label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy:{len(y_preds[y_preds==label])}/{len(y_true)}\n')


In [17]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)


cuda


In [18]:
def evaluate(dataloader_val):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in tqdm(dataloader_val):

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                  }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals


In [19]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(train_loader,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix(
            {'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

    #torch.save(model.state_dict(), f'Models/BERT_ft_Epoch{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')
    
    # Train Eval
    loss_train_avg = loss_train_total/len(train_loader)
    train_loss, train_predictions, true_trains = evaluate(train_loader)
    train_acc = acc_score_func(train_predictions, true_trains)
    train_f1 = f1_score_func(train_predictions, true_trains)
    
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Training Acc: {train_acc}, F1 (Weighted): {train_f1}')
    
    # validation Eval
    val_loss, predictions, true_vals = evaluate(valid_loader)
    val_acc = acc_score_func(predictions, true_vals)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'Validation Acc: {val_acc}, F1 (weighted): {val_f1}')


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/165 [00:00<?, ?it/s]


Epoch 1


  0%|          | 0/165 [00:00<?, ?it/s]

Training loss: 0.8472007457054023
Training Acc: 0.76, F1 (Weighted): 0.7526836054028258


  0%|          | 0/54 [00:00<?, ?it/s]

Validation loss: 0.6203811085886426
Validation Acc: 0.7462962962962963, F1 (weighted): 0.7388314568152405


Epoch 2:   0%|          | 0/165 [00:00<?, ?it/s]


Epoch 2


  0%|          | 0/165 [00:00<?, ?it/s]

Training loss: 0.5167929336880193
Training Acc: 0.8521212121212122, F1 (Weighted): 0.8396342022834087


  0%|          | 0/54 [00:00<?, ?it/s]

Validation loss: 0.4540005596699538
Validation Acc: 0.7981481481481482, F1 (weighted): 0.787861070633527


Epoch 3:   0%|          | 0/165 [00:00<?, ?it/s]


Epoch 3


  0%|          | 0/165 [00:00<?, ?it/s]

Training loss: 0.37755420438267967
Training Acc: 0.8912121212121212, F1 (Weighted): 0.8828557446530019


  0%|          | 0/54 [00:00<?, ?it/s]

Validation loss: 0.44987498582513247
Validation Acc: 0.7972222222222223, F1 (weighted): 0.7890600358528134


Epoch 4:   0%|          | 0/165 [00:00<?, ?it/s]


Epoch 4


  0%|          | 0/165 [00:00<?, ?it/s]

Training loss: 0.280313198200681
Training Acc: 0.9087878787878788, F1 (Weighted): 0.9126911759684526


  0%|          | 0/54 [00:00<?, ?it/s]

Validation loss: 0.4847241761507811
Validation Acc: 0.7907407407407407, F1 (weighted): 0.795718025515693


Epoch 5:   0%|          | 0/165 [00:00<?, ?it/s]


Epoch 5


  0%|          | 0/165 [00:00<?, ?it/s]

Training loss: 0.2307829371455944
Training Acc: 0.9196969696969697, F1 (Weighted): 0.9235611746773975


  0%|          | 0/54 [00:00<?, ?it/s]

Validation loss: 0.5113193741827099
Validation Acc: 0.787962962962963, F1 (weighted): 0.7928271770727587


Epoch 6:   0%|          | 0/165 [00:00<?, ?it/s]


Epoch 6


  0%|          | 0/165 [00:00<?, ?it/s]

Training loss: 0.19492382192702004
Training Acc: 0.9278787878787879, F1 (Weighted): 0.9301430497016907


  0%|          | 0/54 [00:00<?, ?it/s]

Validation loss: 0.5704504516389635
Validation Acc: 0.7861111111111111, F1 (weighted): 0.7892495261688484


Epoch 7:   0%|          | 0/165 [00:00<?, ?it/s]


Epoch 7


  0%|          | 0/165 [00:00<?, ?it/s]

Training loss: 0.1722386781634255
Training Acc: 0.9296969696969697, F1 (Weighted): 0.9286123783019323


  0%|          | 0/54 [00:00<?, ?it/s]

Validation loss: 0.5750411288053902
Validation Acc: 0.7861111111111111, F1 (weighted): 0.7837101428634621


Epoch 8:   0%|          | 0/165 [00:00<?, ?it/s]


Epoch 8


  0%|          | 0/165 [00:00<?, ?it/s]

Training loss: 0.15496783158318564
Training Acc: 0.9348484848484848, F1 (Weighted): 0.9352254704202725


  0%|          | 0/54 [00:00<?, ?it/s]

Validation loss: 0.6204002586503824
Validation Acc: 0.7768518518518519, F1 (weighted): 0.7786986948605047


Epoch 9:   0%|          | 0/165 [00:00<?, ?it/s]


Epoch 9


  0%|          | 0/165 [00:00<?, ?it/s]

Training loss: 0.13933475954514563
Training Acc: 0.9363636363636364, F1 (Weighted): 0.9364423749987134


  0%|          | 0/54 [00:00<?, ?it/s]

Validation loss: 0.6357314368640935
Validation Acc: 0.7703703703703704, F1 (weighted): 0.7706135236155097


Epoch 10:   0%|          | 0/165 [00:00<?, ?it/s]


Epoch 10


  0%|          | 0/165 [00:00<?, ?it/s]

Training loss: 0.13018295027767168
Training Acc: 0.9363636363636364, F1 (Weighted): 0.937000147645123


  0%|          | 0/54 [00:00<?, ?it/s]

Validation loss: 0.6531561453033377
Validation Acc: 0.7712962962962963, F1 (weighted): 0.7724190955222683


In [23]:
# inference
test_data = pd.read_csv('advanced_testset.csv')
test_data.head()

Unnamed: 0,Sentence
0,Earnings per share ( EPS ) dropped to EUR 0.21...
1,$SONC Amazing run since middle of March - obvi...
2,"Ruukki Romania , the local arm of Finnish meta..."
3,Self-service and automation are in a bigger ro...
4,Alma Media 's operating profit amounted to EUR...


In [24]:
x_test = test_data['Sentence']
encoded_data_test = tokenizer.batch_encode_plus(
    x_test,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=40,
    return_tensors='pt'
)



In [26]:
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
inputs = {
    'input_ids': input_ids_test,
    'attention_masks': attention_masks_test
}

In [42]:
dataset_test = TensorDataset(input_ids_test,
                              attention_masks_test
                        )
test_loader = DataLoader(dataset_test, shuffle=False, batch_size=1, drop_last=False)

In [60]:
output_pred_list = []

for batch in tqdm(test_loader):

    batch = tuple(b.to(device) for b in batch)

    inputs = {'input_ids':      batch[0],
#               'attention_mask': batch[1]
              }
#     print(inputs)
    with torch.no_grad():
        outputs = model(**inputs).logits
        outputs_predictions = torch.argmax(outputs, axis=1).item()
        output_pred_list.append(outputs_predictions)


  0%|          | 0/1460 [00:00<?, ?it/s]

In [61]:
back_convert_list = []
for pred in output_pred_list:
    if pred == 1:
        back_convert_list.append('neutral')
    elif pred == 2:
        back_convert_list.append('positive')
    else:
        back_convert_list.append('negative')
back_convert_list

['neutral',
 'positive',
 'positive',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'negative',
 'positive',
 'positive',
 'negative',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'negative',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'positive',
 'positive',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'positive',
 'neutral',
 'positive',
 'positive',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'negative',
 'neutral',
 'negative',
 'positive',
 'positive',
 'positive',
 'neutral',
 'neutral',
 'positive',
 'neutral',
 'positive',
 'neutral',
 'positive',
 'positive',
 'neutral',
 'neutral',
 'negative',
 'neutral',
 'neutral',
 'neutral',
 'neutr

In [62]:
# pd.Series(back_convert_list, name='Sentiment').to_csv('prediction.csv')

In [69]:
# total train 
encoded_data_total_train = tokenizer.batch_encode_plus(
    train_data['Sentence'],
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=40,
    return_tensors='pt'
)

    
input_ids_total_train = encoded_data_total_train['input_ids']
attention_masks_total_train = encoded_data_total_train['attention_mask']
labels_total_train = torch.tensor([assign_label(y) for y in train_data['Sentiment']])

dataset_total_train = TensorDataset(input_ids_total_train,
                              attention_masks_total_train,
                              labels_total_train)

In [70]:
# dataloaders
batch_size = 20

# make sure to SHUFFLE your data
total_train_loader = DataLoader(dataset_total_train, shuffle=True, batch_size=batch_size, drop_last=True)
# valid_loader = DataLoader(dataset_val, shuffle=True, batch_size=batch_size, drop_last=True)

In [71]:
for epoch in tqdm(range(1, epochs+1)):
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(total_train_loader,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)

    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        outputs = model(**inputs)
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix(
            {'training_loss': '{:.3f}'.format(loss.item()/len(batch))})

    #torch.save(model.state_dict(), f'Models/BERT_ft_Epoch{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')
    
    # Train Eval
    loss_train_avg = loss_train_total/len(total_train_loader)
    train_loss, train_predictions, true_trains = evaluate(total_train_loader)
    train_acc = acc_score_func(train_predictions, true_trains)
    train_f1 = f1_score_func(train_predictions, true_trains)
    
    tqdm.write(f'Training loss: {loss_train_avg}')
    tqdm.write(f'Training Acc: {train_acc}, F1 (Weighted): {train_f1}')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 1


  0%|          | 0/219 [00:00<?, ?it/s]

Training loss: 0.26601154320409787
Training Acc: 0.895662100456621, F1 (Weighted): 0.8965095636075229


Epoch 2:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 2


  0%|          | 0/219 [00:00<?, ?it/s]

Training loss: 0.26364594546133935
Training Acc: 0.895662100456621, F1 (Weighted): 0.8965113210019465


Epoch 3:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 3


  0%|          | 0/219 [00:00<?, ?it/s]

Training loss: 0.26163204417236524
Training Acc: 0.895662100456621, F1 (Weighted): 0.8965095724193118


Epoch 4:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 4


  0%|          | 0/219 [00:00<?, ?it/s]

Training loss: 0.2733523631255785
Training Acc: 0.895662100456621, F1 (Weighted): 0.8965095724193118


Epoch 5:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 5


  0%|          | 0/219 [00:00<?, ?it/s]

Training loss: 0.26160437561418504
Training Acc: 0.895662100456621, F1 (Weighted): 0.8965095636075229


Epoch 6:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 6


  0%|          | 0/219 [00:00<?, ?it/s]

Training loss: 0.26040565034110935
Training Acc: 0.895662100456621, F1 (Weighted): 0.8965095724193118


Epoch 7:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 7


  0%|          | 0/219 [00:00<?, ?it/s]

Training loss: 0.26272081968599936
Training Acc: 0.8958904109589041, F1 (Weighted): 0.8967350678613405


Epoch 8:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 8


  0%|          | 0/219 [00:00<?, ?it/s]

Training loss: 0.26164659983202065
Training Acc: 0.895662100456621, F1 (Weighted): 0.8965095636075229


Epoch 9:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 9


  0%|          | 0/219 [00:00<?, ?it/s]

Training loss: 0.2585843992058069
Training Acc: 0.895662100456621, F1 (Weighted): 0.8965113210019465


Epoch 10:   0%|          | 0/219 [00:00<?, ?it/s]


Epoch 10


  0%|          | 0/219 [00:00<?, ?it/s]

Training loss: 0.26493152678353055
Training Acc: 0.895662100456621, F1 (Weighted): 0.8965095636075229
