<a href="https://colab.research.google.com/github/aksanaboo/persona_predict/blob/master/Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [2]:
import pandas as pd
import numpy as np

import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification
from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
mbti_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/MBTI 500.csv')

In [5]:
mbti_data.head()

Unnamed: 0,posts,type
0,know intj tool use interaction people excuse a...,INTJ
1,rap music ehh opp yeah know valid well know fa...,INTJ
2,preferably p hd low except wew lad video p min...,INTJ
3,drink like wish could drink red wine give head...,INTJ
4,space program ah bad deal meing freelance max ...,INTJ


In [6]:
mbti_data.type.value_counts()

INTP    24961
INTJ    22427
INFJ    14963
INFP    12134
ENTP    11725
ENFP     6167
ISTP     3424
ENTJ     2955
ESTP     1986
ENFJ     1534
ISTJ     1243
ISFP      875
ISFJ      650
ESTJ      482
ESFP      360
ESFJ      181
Name: type, dtype: int64

In [7]:
possible_labels = mbti_data.type.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'INTJ': 0,
 'INTP': 1,
 'ISFJ': 2,
 'ISFP': 3,
 'ISTJ': 4,
 'ISTP': 5,
 'ENFJ': 6,
 'ENFP': 7,
 'ENTJ': 8,
 'ENTP': 9,
 'ESFJ': 10,
 'ESFP': 11,
 'ESTJ': 12,
 'ESTP': 13,
 'INFJ': 14,
 'INFP': 15}

In [8]:
mbti_data['label'] = mbti_data.type.replace(label_dict)

In [9]:
mbti_data.head()

Unnamed: 0,posts,type,label
0,know intj tool use interaction people excuse a...,INTJ,0
1,rap music ehh opp yeah know valid well know fa...,INTJ,0
2,preferably p hd low except wew lad video p min...,INTJ,0
3,drink like wish could drink red wine give head...,INTJ,0
4,space program ah bad deal meing freelance max ...,INTJ,0


In [10]:
mbti = mbti_data.sample(frac=1).reset_index(drop=True)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(mbti.index.values,
                                                  mbti.label.values,
                                                  test_size=0.3,
                                                  random_state=42,
                                                  stratify=mbti.label.values)

In [12]:
# imbalanced dataset

In [13]:
mbti['data_type'] = ['not_set']*mbti.shape[0]

mbti.loc[X_train, 'data_type'] = 'train'
mbti.loc[X_test, 'data_type'] = 'test'

In [14]:
mbti.groupby(['type', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,posts
type,label,data_type,Unnamed: 3_level_1
ENFJ,6,test,460
ENFJ,6,train,1074
ENFP,7,test,1850
ENFP,7,train,4317
ENTJ,8,test,887
ENTJ,8,train,2068
ENTP,9,test,3518
ENTP,9,train,8207
ESFJ,10,test,54
ESFJ,10,train,127


In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

In [16]:
encoded_data_train = tokenizer.batch_encode_plus(
    mbti[mbti.data_type=='train'].posts.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=350,
    return_tensors='pt'
)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:
encoded_data_test = tokenizer.batch_encode_plus(
    mbti[mbti.data_type=='test'].posts.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=350,
    return_tensors='pt'
)

In [18]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(mbti[mbti.data_type=='train'].label.values)

In [19]:
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(mbti[mbti.data_type=='test'].label.values)

In [20]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [21]:
len(dataset_train), len(dataset_test)

(74246, 31821)

In [22]:
!pip install transformers --upgrade



In [23]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


In [25]:
batch_size = 16

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_test = DataLoader(dataset_test,
                                   sampler=SequentialSampler(dataset_test),
                                   batch_size=batch_size)

In [26]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [27]:
optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-8)



In [28]:
epochs = 8

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [29]:
# performance metrics : f1 score and accuracy score

In [30]:
from sklearn.metrics import f1_score, accuracy_score

In [31]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [32]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [33]:
import random

seed_test = 17
random.seed(seed_test)
np.random.seed(seed_test)
torch.manual_seed(seed_test)
torch.cuda.manual_seed_all(seed_test)

In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [35]:
def evaluate(dataloader_test):

    model.eval()

    loss_test_total = 0
    predictions, true_test = [], []

    for batch in dataloader_test:

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_test_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_test.append(label_ids)

    loss_test_avg = loss_test_total/len(dataloader_test)

    predictions = np.concatenate(predictions, axis=0)
    true_test = np.concatenate(true_test, axis=0)

    return loss_test_avg, predictions, true_test

In [36]:
best_val_loss = float('inf')  # Initialize best validation loss with a high value
patience = 3  # Number of epochs to wait before stopping
epochs_without_improvement = 0

In [39]:
import os
import torch

# Define the directory path
directory = 'data_volume'

# Ensure the directory exists, if not, create it
if not os.path.exists(directory):
    os.makedirs(directory)

# Save the model's state dictionary
torch.save(model.state_dict(), os.path.join(directory, f'finetuned_BERT_epoch_{epoch}.model'))

In [40]:
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})


    torch.save(model.state_dict(), f'data_volume/finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_test = evaluate(dataloader_test)
    val_f1 = f1_score_func(predictions, true_test)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

    # Check if validation loss improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
      epochs_without_improvement += 1
    # Early stopping check
    if epochs_without_improvement >= patience:
      print(f'Early stopping at {epoch}')
      break


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/4641 [00:00<?, ?it/s]


Epoch 1
Training loss: 1.0959128612936173
Validation loss: 0.8656419762763742
F1 Score (Weighted): 0.7299803434627727


Epoch 2:   0%|          | 0/4641 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.7882462639977285
Validation loss: 0.7851685045681268
F1 Score (Weighted): 0.7574908542418513


Epoch 3:   0%|          | 0/4641 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.6608703725667664
Validation loss: 0.7675126717131362
F1 Score (Weighted): 0.7671026017825413


Epoch 4:   0%|          | 0/4641 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.5561108368231584
Validation loss: 0.779687130890221
F1 Score (Weighted): 0.769446218676761


Epoch 5:   0%|          | 0/4641 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.46486879163838085
Validation loss: 0.8078716372066239
F1 Score (Weighted): 0.7741408694955754


Epoch 6:   0%|          | 0/4641 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.3833104286809825
Validation loss: 0.9045354473437831
F1 Score (Weighted): 0.7702757089005137
Early stopping at 6


In [41]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)


model.to(device)

model.load_state_dict(torch.load('data_volume/finetuned_BERT_epoch_1.model', map_location=torch.device('cpu')))

_, predictions, true_vals = evaluate(dataloader_test)
accuracy_per_class(predictions, true_vals)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class: INTJ
Accuracy: 4926/6728

Class: INTP
Accuracy: 6097/7489

Class: ISFJ
Accuracy: 100/195

Class: ISFP
Accuracy: 142/262

Class: ISTJ
Accuracy: 193/373

Class: ISTP
Accuracy: 677/1027

Class: ENFJ
Accuracy: 256/460

Class: ENFP
Accuracy: 1251/1850

Class: ENTJ
Accuracy: 588/887

Class: ENTP
Accuracy: 2304/3518

Class: ESFJ
Accuracy: 0/54

Class: ESFP
Accuracy: 3/108

Class: ESTJ
Accuracy: 90/145

Class: ESTP
Accuracy: 453/596

Class: INFJ
Accuracy: 3240/4489

Class: INFP
Accuracy: 2958/3640



In [42]:
def get_predictions(model, dataloader):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(b.to(device) for b in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2],
            }
            outputs = model(**inputs)
            _, predicted_labels = torch.max(outputs[1], dim=1)  # Get predicted labels from logits
            predictions.extend(predicted_labels.cpu().numpy())
            true_labels.extend(batch[2].cpu().numpy())  # Assuming batch[2] contains true labels

    return predictions, true_labels

# Get predictions for validation dataset
val_predictions, val_true_labels = get_predictions(model, dataloader_test)

# Calculate accuracy
accuracy = accuracy_score(val_true_labels, val_predictions)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.73
