In [2]:
!pip install lightning



In [3]:
from transformers import AutoTokenizer
from transformers import DistilBertModel, DistilBertTokenizer
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
import lightning as L
import torch.nn as nn
import csv

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
!unzip "/content/archive_27.zip"

unzip:  cannot find or open /content/archive_27.zip, /content/archive_27.zip.zip or /content/archive_27.zip.ZIP.


In [6]:
data = pd.read_csv("/kaggle/input/mbtidata/mbti_1.csv")

In [7]:
data

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...
...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...


In [8]:
encode_dict = {}

def encode_cat(x):
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]

data['ENCODE_CAT'] = data['type'].apply(lambda x: encode_cat(x))

In [9]:
data

Unnamed: 0,type,posts,ENCODE_CAT
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,0
1,ENTP,'I'm finding the lack of me in these posts ver...,1
2,INTP,'Good one _____ https://www.youtube.com/wat...,2
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...",3
4,ENTJ,'You're fired.|||That's another silly misconce...,4
...,...,...,...
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,8
8671,ENFP,'So...if this thread already exists someplace ...,7
8672,INTP,'So many questions when i do these things. I ...,2
8673,INFP,'I am very conflicted right now when it comes ...,6


In [10]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [11]:
from sklearn.preprocessing import LabelEncoder

LabelEncoder().fit_transform(data['type'])

array([ 8,  3, 11, ..., 11,  9,  9])

In [12]:
from torch.utils.data import random_split

In [13]:
class MBTIdataset(Dataset):
    def __init__(self, data, tokenizer, max_len):
        self.data = data
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        post = self.data.posts[index]
        inputs = self.tokenizer.encode_plus(text=post,max_length=self.max_len,truncation=True, pad_to_max_length=True, return_token_type_ids=True)
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids,dtype=torch.long),
            'mask': torch.tensor(mask,dtype=torch.long),
            'targets': torch.tensor(self.data.ENCODE_CAT[index], dtype=torch.long)

        }

    def __len__(self):
        return len(self.data)

train_size = int(0.8 * len(data))
val_size = len(data) - train_size

dataset = MBTIdataset(data,tokenizer,512)

train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False,num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False,num_workers=0)

In [14]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [15]:
from sklearn.model_selection import train_test_split
import torchmetrics

In [16]:
class Model(L.LightningModule):
    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = nn.Linear(768,768)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(768,16)
        self.accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=16)
        self.f1_score = torchmetrics.F1Score(task="multiclass", num_classes=16)

    def forward(self,input_ids, attention_mask):
        out_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = out_1[0]
        pooler = hidden_state[:,0]
        pooler = self.pre_classifier(pooler)
        pooler = nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

    def training_step(self, batch, batch_idx):
        item = batch
        ids = item['ids']
        mask = item['mask']
        targets = item['targets']
        outputs = self(ids, mask)
        loss = torch.nn.CrossEntropyLoss()(outputs, targets)
        accuracy = self.accuracy(outputs, targets)
        f1_score = self.f1_score(outputs, targets)
        self.log_dict({'train_loss':loss,'train_accuracy':accuracy,'train_f1_score':f1_score},on_epoch=True,prog_bar=True)

        return loss

    def validation_step(self,batch,batch_idx):
      item = batch
      ids = item['ids']
      mask = item['mask']
      targets = item['targets']
      outputs = self(ids, mask)
      loss = torch.nn.CrossEntropyLoss()(outputs, targets)
      self.log('val_loss',loss)

      return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-5)


In [17]:
next(iter(train_loader))



{'ids': tensor([[  101,   112,  4514,  ...,  1547,  1136,   102],
         [  101,   112,   146,  ..., 21155, 11776,   102],
         [  101,   112,  2066,  ...,   119,   119,   102],
         ...,
         [  101,  2009,  1132,  ...,   119,   119,   102],
         [  101,   112,  7277,  ...,  9020, 15969,   102],
         [  101,   112,   113,  ...,   117,  1105,   102]]),
 'mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         ...,
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 1, 1, 1]]),
 'targets': tensor([ 7,  0,  2,  0,  6,  5,  6,  2,  8,  0,  6,  0,  6, 13,  2,  6,  7,  3,
          3, 10,  2, 11,  2,  2,  2,  0,  0,  7,  6,  8,  5,  0])}

In [18]:
import wandb
from pytorch_lightning.loggers import WandbLogger

In [19]:
wandb_logger = WandbLogger(project='my-awesome-project')

In [20]:
model = Model()
trainer = L.Trainer(logger=wandb_logger, max_epochs=10)
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

[34m[1mwandb[0m: Currently logged in as: [33mameen-91[0m. Use [1m`wandb login --relogin`[0m to force relogin


INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name           | Type               | Params
------------------------------------------------------
0 | bert           | DistilBertModel    | 66.4 M
1 | pre_classifier | Linear             | 590 K 
2 | dropout        | Dropout            | 0     
3 | classifier     | Linear             | 12.3 K
4 | accuracy       | MulticlassAccuracy | 0     
5 | f1_score       | MulticlassF1Score  | 0     
------------------------------------------------------
67.0 M    Trainable params
0         Non-trainable params
67.0 M    Total params
267.863   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/opt/conda/lib/python3.10/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

In [21]:
wandb.finish()

VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇████
train_accuracy_epoch,▁▂▄▄▅▆▇▇██
train_accuracy_step,▂▂▁▂▂▃▂▄▄▄▃▄▆▄▅▅▅▅▄▄▆▆▆▆▆█▆▇▇▇▇▆▇▇▇▇███▇
train_f1_score_epoch,▁▂▄▄▅▆▇▇██
train_f1_score_step,▂▂▁▂▂▃▂▄▄▄▃▄▆▄▅▅▅▅▄▄▆▆▆▆▆█▆▇▇▇▇▆▇▇▇▇███▇
train_loss_epoch,█▇▆▅▄▄▃▂▂▁
train_loss_step,███▇█▇▇▇▆▆▆▅▄▆▆▅▅▄▅▆▄▅▄▅▄▂▄▃▃▂▂▄▂▂▂▃▂▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,█▄▁▁▁▁▂▃▅▆

0,1
epoch,9.0
train_accuracy_epoch,0.84899
train_accuracy_step,0.84375
train_f1_score_epoch,0.84899
train_f1_score_step,0.84375
train_loss_epoch,0.62346
train_loss_step,0.49634
trainer/global_step,2169.0
val_loss,2.11936


In [22]:
torch.cuda.empty_cache()

In [23]:
# Saving the files for re-use

output_model_file = './pytorch_distilbert_mbti.bin'
output_vocab_file = './vocab_distilbert_mbti.bin'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')
print('This tutorial is completed')

All files saved
This tutorial is completed


In [24]:
%cd /kaggle/working

/kaggle/working


In [28]:
from IPython.display import FileLink 
FileLink(r'pytorch_distilbert_mbti.bin')