In [1]:
import pandas as pd
import numpy as np
import torch

import warnings
warnings.filterwarnings("ignore")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
!pip install -q transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/인공지능 프로젝트/project/mbti_1.csv',
                   engine='python', error_bad_lines=False)
print(len(data))
print(data.head(5))

8675
   type                                              posts
0  INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1  ENTP  'I'm finding the lack of me in these posts ver...
2  INTP  'Good one  _____   https://www.youtube.com/wat...
3  INTJ  'Dear INTP,   I enjoyed our conversation the o...
4  ENTJ  'You're fired.|||That's another silly misconce...


In [5]:
mbti_type = data['type'].unique()
type2idx = {type: i for i, type in enumerate(mbti_type)}

data['type_idx'] = data['type'].apply(lambda x: type2idx[x])
data.tail(5)

Unnamed: 0,type,posts,type_idx
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...,8
8671,ENFP,'So...if this thread already exists someplace ...,7
8672,INTP,'So many questions when i do these things. I ...,2
8673,INFP,'I am very conflicted right now when it comes ...,6
8674,INFP,'It has been too long since I have been on per...,6


In [6]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

tokenizer = None

class MBTI_Dataset(Dataset):
    def __init__(self, df):
        self.labels = torch.tensor(df['type_idx'].to_numpy())
        self.posts = [tokenizer(post, padding='max_length', max_length = 512,
                                truncation=True, return_tensors="pt") 
                        for post in df['posts']]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        post = self.posts[idx]
        label = self.labels[idx]
        return post, label

In [7]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size=0.2, random_state=321)

## SequenceClassification

In [8]:
from transformers import AutoModelForSequenceClassification

model_name = 'distilbert-base-multilingual-cased'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=16, problem_type="single_label_classification"
        ).to(device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['pre_classifier.weight', 'pre_cla

In [9]:
train_set = MBTI_Dataset(train_data.reset_index(drop=True))
train_loader = DataLoader(train_set, batch_size=2, shuffle=True, num_workers=2)
test_set = MBTI_Dataset(test_data.reset_index(drop=True))
test_loader = DataLoader(test_set, batch_size=2, shuffle=False, num_workers=2)

In [10]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

num_epoch = 1

model.train()
for epoch in range(1, num_epoch + 1):
    stat_loss = 0

    for i, (posts, labels) in enumerate(train_loader, 1):
        optimizer.zero_grad()

        posts, labels = posts.to(device), labels.to(device)
        input_id = posts['input_ids'].squeeze(1)
        mask = posts['attention_mask']

        output = model(input_id, mask, labels=labels)
        loss = output.loss
                
        stat_loss += loss.item()
        if i % 500 == 0:
            print(f'[{epoch:-02}, {i:-4}] loss: {stat_loss / i:.4f}')

        loss.backward()
        optimizer.step()

[01,  500] loss: 2.4054
[01, 1000] loss: 2.3379
[01, 1500] loss: 2.2268
[01, 2000] loss: 2.1499
[01, 2500] loss: 2.0833
[01, 3000] loss: 2.0276


In [11]:
acc = 0
model.eval()

with torch.no_grad():
    for i, (posts, labels) in enumerate(test_loader, 1):
        posts, labels = posts.to(device), labels.to(device)
        input_id, mask = posts['input_ids'].squeeze(1), posts['attention_mask']

        output = model(input_id, mask)
        acc += (output.logits.argmax(dim=1) == labels).sum()
        
    print(f'test accuracy: {acc / (len(test_loader) * 2):.2%}')

test accuracy: 47.93%


## Custom

In [12]:
from transformers import AutoModel

model_name = 'distilbert-base-multilingual-cased'

class CustomClassifier(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = AutoModel.from_pretrained(model_name)
        self.layer2 = torch.nn.Sequential(
            torch.nn.Linear(768, 768),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
        )
        self.layer3 = torch.nn.Linear(768, 16)

    def forward(self, input_ids, attention_mask):
        y = self.layer1(input_ids, attention_mask).last_hidden_state[:, 0]
        y = self.layer2(y)
        y = self.layer3(y)
        return y

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = CustomClassifier().to(device)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
train_set = MBTI_Dataset(train_data.reset_index(drop=True))
train_loader = DataLoader(train_set, batch_size=2, shuffle=True, num_workers=2)
test_set = MBTI_Dataset(test_data.reset_index(drop=True))
test_loader = DataLoader(test_set, batch_size=2, shuffle=False, num_workers=2)

In [14]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

num_epoch = 1

model.train()
for epoch in range(1, num_epoch + 1):
    stat_loss = 0

    for i, (posts, labels) in enumerate(train_loader, 1):
        optimizer.zero_grad()

        posts, labels = posts.to(device), labels.to(device)
        input_id, mask = posts['input_ids'].squeeze(1), posts['attention_mask']

        output = model(input_id, mask)
        loss = criterion(output, labels)
                
        stat_loss += loss.item()
        if i % 500 == 0:
            print(f'[{epoch:-02}, {i:-4}] loss: {stat_loss / 500:.4f}')
            stat_loss = 0

        loss.backward()
        optimizer.step()

[01,  500] loss: 2.3495
[01, 1000] loss: 2.2816
[01, 1500] loss: 2.0932
[01, 2000] loss: 1.9732
[01, 2500] loss: 1.8202
[01, 3000] loss: 1.7548


In [15]:
acc = 0
model.eval()

with torch.no_grad():
    for i, (posts, labels) in enumerate(test_loader, 1):
        posts, labels = posts.to(device), labels.to(device)
        input_id, mask = posts['input_ids'].squeeze(1), posts['attention_mask']

        output = model(input_id, mask)
        acc += (output.argmax(dim=1) == labels).sum()
        
    print(f'test accuracy: {acc / (len(test_loader) * 2):.2%}')

test accuracy: 44.70%
