<a href="https://colab.research.google.com/github/aksanaboo/persona_predict/blob/master/Bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m32.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m84.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m82.0 MB/s[0m eta [36m0:00:0

In [2]:
import pandas as pd
import numpy as np

import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification
from sklearn.model_selection import train_test_split

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
mbti_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/MBTI 500.csv')

In [5]:
mbti_data.head()

Unnamed: 0,posts,type
0,know intj tool use interaction people excuse a...,INTJ
1,rap music ehh opp yeah know valid well know fa...,INTJ
2,preferably p hd low except wew lad video p min...,INTJ
3,drink like wish could drink red wine give head...,INTJ
4,space program ah bad deal meing freelance max ...,INTJ


In [6]:
mbti_data.type.value_counts()

INTP    24961
INTJ    22427
INFJ    14963
INFP    12134
ENTP    11725
ENFP     6167
ISTP     3424
ENTJ     2955
ESTP     1986
ENFJ     1534
ISTJ     1243
ISFP      875
ISFJ      650
ESTJ      482
ESFP      360
ESFJ      181
Name: type, dtype: int64

In [None]:
possible_labels = mbti_data.type.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

In [8]:
mbti_data['label'] = mbti_data.type.replace(label_dict)

In [9]:
mbti_data.head()

Unnamed: 0,posts,type,label
0,know intj tool use interaction people excuse a...,INTJ,0
1,rap music ehh opp yeah know valid well know fa...,INTJ,0
2,preferably p hd low except wew lad video p min...,INTJ,0
3,drink like wish could drink red wine give head...,INTJ,0
4,space program ah bad deal meing freelance max ...,INTJ,0


In [10]:
mbti = mbti_data.sample(frac=1).reset_index(drop=True)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(mbti.index.values,
                                                  mbti.label.values,
                                                  test_size=0.3,
                                                  random_state=42,
                                                  stratify=mbti.label.values)

In [12]:
# imbalanced dataset

In [13]:
mbti['data_type'] = ['not_set']*mbti.shape[0]

mbti.loc[X_train, 'data_type'] = 'train'
mbti.loc[X_test, 'data_type'] = 'test'

In [14]:
mbti.groupby(['type', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,posts
type,label,data_type,Unnamed: 3_level_1
ENFJ,6,test,460
ENFJ,6,train,1074
ENFP,7,test,1850
ENFP,7,train,4317
ENTJ,8,test,887
ENTJ,8,train,2068
ENTP,9,test,3518
ENTP,9,train,8207
ESFJ,10,test,54
ESFJ,10,train,127


In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
encoded_data_train = tokenizer.batch_encode_plus(
    mbti[mbti.data_type=='train'].posts.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=450,
    return_tensors='pt'
)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:
encoded_data_test = tokenizer.batch_encode_plus(
    mbti[mbti.data_type=='test'].posts.values,
    add_special_tokens=True,
    return_attention_mask=True,
    pad_to_max_length=True,
    max_length=450,
    return_tensors='pt'
)

In [18]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(mbti[mbti.data_type=='train'].label.values)

In [19]:
input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(mbti[mbti.data_type=='test'].label.values)

In [20]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

In [21]:
len(dataset_train), len(dataset_test)

(74246, 31821)

In [22]:
!pip install transformers --upgrade



In [23]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)


Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler


In [25]:
batch_size = 16

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)

dataloader_test = DataLoader(dataset_test,
                                   sampler=SequentialSampler(dataset_test),
                                   batch_size=batch_size)

In [26]:
from transformers import AdamW, get_linear_schedule_with_warmup

In [27]:
optimizer = AdamW(model.parameters(),
                  lr=1e-5,
                  eps=1e-8)



In [28]:
epochs = 8

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)

In [29]:
# performance metrics : f1 score and accuracy score

In [30]:
from sklearn.metrics import f1_score, accuracy_score

In [31]:
def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

In [32]:
def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [33]:
import random

seed_test = 17
random.seed(seed_test)
np.random.seed(seed_test)
torch.manual_seed(seed_test)
torch.cuda.manual_seed_all(seed_test)

In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print(device)

cuda


In [35]:
def evaluate(dataloader_test):

    model.eval()

    loss_test_total = 0
    predictions, true_test = [], []

    for batch in dataloader_test:

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_test_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_test.append(label_ids)

    loss_test_avg = loss_test_total/len(dataloader_test)

    predictions = np.concatenate(predictions, axis=0)
    true_test = np.concatenate(true_test, axis=0)

    return loss_test_avg, predictions, true_test

In [36]:
best_val_loss = float('inf')  # Initialize best validation loss with a high value
patience = 3  # Number of epochs to wait before stopping
epochs_without_improvement = 0

In [40]:
import os
import torch

# Define the directory path
directory = 'data_volume'

# Ensure the directory exists, if not, create it
if not os.path.exists(directory):
    os.makedirs(directory)

# Save the model's state dictionary
torch.save(model.state_dict(), os.path.join(directory, f'finetuned_BERT_epoch_{epoch}.model'))

In [41]:
for epoch in tqdm(range(1, epochs+1)):

    model.train()

    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})


    torch.save(model.state_dict(), f'data_volume/finetuned_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_test = evaluate(dataloader_test)
    val_f1 = f1_score_func(predictions, true_test)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

    # Check if validation loss improved
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        epochs_without_improvement = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
      epochs_without_improvement += 1
    # Early stopping check
    if epochs_without_improvement >= patience:
      print(f'Early stopping at {epoch}')
      break


  0%|          | 0/8 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/4641 [00:00<?, ?it/s]


Epoch 1
Training loss: 0.988284615943065
Validation loss: 0.74344688448671
F1 Score (Weighted): 0.7735106226533216


Epoch 2:   0%|          | 0/4641 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.6413266606616052
Validation loss: 0.6162814078969594
F1 Score (Weighted): 0.8115380316603557


Epoch 3:   0%|          | 0/4641 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.5183626343719894
Validation loss: 0.6339808778810165
F1 Score (Weighted): 0.8121970373971313


Epoch 4:   0%|          | 0/4641 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.4216467354766121
Validation loss: 0.6013620403525501
F1 Score (Weighted): 0.8239943587446362


Epoch 5:   0%|          | 0/4641 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.3423800701494201
Validation loss: 0.6403717207078541
F1 Score (Weighted): 0.8238333296425947


Epoch 6:   0%|          | 0/4641 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.2789416168931775
Validation loss: 0.7035014673249488
F1 Score (Weighted): 0.8215679739638755


Epoch 7:   0%|          | 0/4641 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.2324979597390777
Validation loss: 0.754215016777549
F1 Score (Weighted): 0.8225425782694721
Early stopping at 7


In [48]:
model.load_state_dict(torch.load('data_volume/finetuned_BERT_epoch_4.model', map_location=torch.device('cpu')))

_, predictions, true_vals = evaluate(dataloader_test)
accuracy_per_class(predictions, true_vals)

Class: INTJ
Accuracy: 5651/6728

Class: INTP
Accuracy: 6374/7489

Class: ISFJ
Accuracy: 129/195

Class: ISFP
Accuracy: 158/262

Class: ISTJ
Accuracy: 231/373

Class: ISTP
Accuracy: 848/1027

Class: ENFJ
Accuracy: 301/460

Class: ENFP
Accuracy: 1516/1850

Class: ENTJ
Accuracy: 701/887

Class: ENTP
Accuracy: 3016/3518

Class: ESFJ
Accuracy: 28/54

Class: ESFP
Accuracy: 64/108

Class: ESTJ
Accuracy: 112/145

Class: ESTP
Accuracy: 559/596

Class: INFJ
Accuracy: 3632/4489

Class: INFP
Accuracy: 2901/3640



In [47]:
def get_predictions(model, dataloader):
    model.eval()
    predictions = []
    true_labels = []

    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(b.to(device) for b in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2],
            }
            outputs = model(**inputs)
            _, predicted_labels = torch.max(outputs[1], dim=1)  # Get predicted labels from logits
            predictions.extend(predicted_labels.cpu().numpy())
            true_labels.extend(batch[2].cpu().numpy())  # Assuming batch[2] contains true labels

    return predictions, true_labels

# Get predictions for validation dataset
val_predictions, val_true_labels = get_predictions(model, dataloader_test)

# Calculate accuracy
accuracy = accuracy_score(val_true_labels, val_predictions)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.82
