In [None]:
import pandas as pd
import numpy as np
import torch

import warnings
# Suppress every Python warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Use the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Install `transformers` and `emoji` quietly (-q) for this kernel.
# NOTE(review): versions are not pinned, so a re-run may pick up different releases.
%pip install -q transformers emoji

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.9/240.9 kB[0m [31m23.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m20.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m37.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for emoji (setup.py) ... [?25l[?25hdone


In [None]:
# Mount Google Drive at /content/drive; the CSV files read later in this
# notebook are addressed through paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Model

In [None]:
from transformers import AutoModel, AutoTokenizer

class MBTI_Classifier(torch.nn.Module):
    """Pretrained transformer encoder with a small MLP head that emits MBTI logits.

    The head takes the encoder's hidden state at sequence position 0, applies
    Linear -> ReLU -> Dropout, and projects to ``num_labels`` raw logits.
    No sigmoid is applied here.

    Args:
        model_name: checkpoint name passed to ``AutoModel.from_pretrained``.
        num_labels: number of output logits (default 4).
        dropout: dropout probability used inside the head (default 0.1).
    """

    def __init__(self, model_name='distilbert-base-multilingual-cased',
                 num_labels=4, dropout=0.1):
        super().__init__()
        self.layer1 = AutoModel.from_pretrained(model_name)
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so checkpoints with a different hidden width also work.
        # Falls back to the original 768 if the config exposes no hidden_size.
        hidden_size = getattr(self.layer1.config, 'hidden_size', 768)
        self.layer2 = torch.nn.Sequential(
            torch.nn.Linear(hidden_size, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Dropout(dropout),
        )
        self.layer3 = torch.nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the raw logits for a batch of token ids and attention masks."""
        # Keyword arguments keep the call correct even for encoders whose
        # second positional parameter is not the attention mask.
        encoded = self.layer1(input_ids=input_ids, attention_mask=attention_mask)
        first_token_state = encoded.last_hidden_state[:, 0]
        hidden = self.layer2(first_token_state)
        return self.layer3(hidden)

In [None]:
def train(model, optimizer, criterion, train_loader, num_epoch, log_every=500):
    """Fine-tune ``model`` for ``num_epoch`` passes over ``train_loader``.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        optimizer: optimizer over the model's parameters.
        criterion: loss called as ``criterion(output, labels)``; labels are
            cast to float before the call.
        train_loader: sized iterable of ``(inputs, labels)`` batches, where
            ``inputs`` supports ``.to(device)`` and indexing by
            ``'input_ids'`` and ``'attention_mask'``.
        num_epoch: number of passes over ``train_loader``.
        log_every: print the running mean loss every this many batches
            (default 500); a falsy value disables logging.
    """
    # Move batches to wherever the model lives instead of relying on a
    # notebook-global device variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    steps_per_epoch = len(train_loader)
    total_steps = steps_per_epoch * num_epoch

    model.train()
    for epoch in range(1, num_epoch + 1):
        stat_loss = 0

        for i, (inputs, labels) in enumerate(train_loader, 1):
            optimizer.zero_grad()

            inputs, labels = inputs.to(run_device), labels.to(run_device, torch.float)
            # Both tensors get the same squeeze so ids and mask share a shape.
            ids = inputs['input_ids'].squeeze(1)
            mask = inputs['attention_mask'].squeeze(1)

            output = model(ids, mask)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()

            stat_loss += loss.item()
            if log_every and i % log_every == 0:
                # Overall progress counts the batches of all finished epochs
                # plus the batches done in the current one.
                done = (epoch - 1) * steps_per_epoch + i
                print(f'[{epoch:-02}, {i:-4}] loss: {stat_loss / log_every:.4f} ({done / total_steps:.2%})')
                stat_loss = 0

In [None]:
from tqdm import tqdm

def test(model, criterion, test_loader, batch_size):
    """Evaluate ``model`` on ``test_loader`` and print accuracy figures and mean loss.

    A logit counts as a positive prediction when its sigmoid exceeds 0.5.
    Three numbers are printed: the fraction of individual label positions
    predicted correctly, the fraction of samples whose positions are all
    correct, and the mean of the per-batch losses.

    Args:
        model: module called as ``model(input_ids, attention_mask)``.
        criterion: loss called as ``criterion(output, labels)``.
        test_loader: iterable of ``(inputs, labels)`` batches, where ``inputs``
            supports ``.to(device)`` and indexing by ``'input_ids'`` and
            ``'attention_mask'``.
        batch_size: kept for backward compatibility and no longer used.
            Sample counts are taken from the batches themselves, so a short
            final batch does not skew the accuracies.
    """
    # Move batches to wherever the model lives instead of relying on a
    # notebook-global device variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    correct_positions = 0
    total_positions = 0
    perfect_matches = 0
    total_samples = 0
    loss_sum = 0.0
    num_batches = 0
    model.eval()

    with torch.no_grad():
        for inputs, labels in tqdm(test_loader):
            inputs, labels = inputs.to(run_device), labels.to(run_device, torch.float)
            # Both tensors get the same squeeze so ids and mask share a shape.
            ids = inputs['input_ids'].squeeze(1)
            mask = inputs['attention_mask'].squeeze(1)

            output = model(ids, mask)
            hits = (torch.sigmoid(output) > 0.5) == labels

            correct_positions += hits.sum().item()
            total_positions += labels.numel()
            perfect_matches += torch.all(hits, 1).sum().item()
            total_samples += labels.shape[0]
            # .item() keeps the accumulator a plain float rather than a tensor.
            loss_sum += criterion(output, labels).item()
            num_batches += 1

    print()
    if total_samples == 0:
        # Nothing was evaluated; avoid dividing by zero below.
        print('test set is empty; nothing to evaluate')
        return

    print(f'test accuracy: {correct_positions / total_positions:.2%}')
    print(f'test perfect match accuracy: {perfect_matches / total_samples:.2%}')
    print(f'loss: {loss_sum / num_batches}')

In [None]:
from torch.utils.data import Dataset, DataLoader

class MBTI_Dataset(Dataset):
    """Map-style dataset yielding ``(tokenized_text, label_tensor)`` pairs.

    Args:
        df: DataFrame with a ``'text'`` column and a ``'vector_label'`` column.
        tokenizer: callable applied to one text; it receives the padding,
            truncation and ``return_tensors="pt"`` options.
        max_length: length every sample is padded or truncated to (default 512).
    """

    def __init__(self, df, tokenizer, max_length=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Positional lookup: a sampler hands out positions 0..len-1, which only
        # coincide with index labels when the frame has a fresh RangeIndex.
        raw_text = self.df['text'].iloc[idx]
        raw_label = self.df['vector_label'].iloc[idx]

        text = self.tokenizer(raw_text, padding='max_length',
                              max_length=self.max_length, add_special_tokens=True,
                              truncation=True, return_tensors="pt")
        label = torch.tensor(raw_label)
        return text, label

## Twitter MBTI Dataset

In [None]:
# Load the Twitter MBTI CSV from the mounted Drive folder (absolute Colab path).
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/인공지능 프로젝트/project/twitter_MBTI.csv')

In [None]:
# Discard the 'Unnamed: 0' column, then upper-case the MBTI labels.
df = df.drop(columns='Unnamed: 0')
upper_labels = df['label'].str.upper()
df['label'] = upper_labels

def vectorize_label(row):
    """Encode a four-letter MBTI string as a list of four 0/1 flags.

    Flag k is 1 when character k of ``row`` equals the k-th letter of 'INTJ'
    and 0 otherwise. The comparison is case-sensitive.
    """
    return [int(row[position] == letter) for position, letter in enumerate('INTJ')]
# Attach the 0/1 vector for every label, then preview the first five rows.
df['vector_label'] = df['label'].map(vectorize_label)

print(df.head())

                                                text label  vector_label
0  @Pericles216 @HierBeforeTheAC @Sachinettiyil T...  INTJ  [1, 1, 1, 1]
1  @Hispanthicckk Being you makes you look cute||...  INTJ  [1, 1, 1, 1]
2  @Alshymi Les balles sont réelles et sont tirée...  INTJ  [1, 1, 1, 1]
3  I'm like entp but idiotic|||Hey boy, do you wa...  INTJ  [1, 1, 1, 1]
4  @kaeshurr1 Give it to @ZargarShanif ... He has...  INTJ  [1, 1, 1, 1]


In [None]:
import re
from emoji import demojize

# Pattern for http/https URLs. Inside the third character class, `$-_` is a
# range (from '$' to '_'), so that class matches more than the listed symbols.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# Every character with code point 1 through 31.
escapes = ''.join(map(chr, range(1, 32)))
# Translation table for str.translate that deletes each of those characters.
translator = {ord(ch): None for ch in escapes}

def preprocess(row):
    """Clean one '|||'-separated string of tweets and re-join them with '[SEP]'.

    In order: pass the text through ``demojize``, delete CR/LF characters,
    delete matches of ``url_regex``, delete the characters listed in
    ``translator``, then split on '|||', strip each piece, drop empty pieces
    and keep at most the first 25.
    """
    cleaned = demojize(row)
    for pattern in (r'[\r\n]', url_regex):
        cleaned = re.sub(pattern, '', cleaned)
    cleaned = cleaned.translate(translator)

    posts = []
    for piece in cleaned.split('|||'):
        piece = piece.strip()
        if piece != '':
            posts.append(piece)
    return '[SEP]'.join(posts[:25])

# Clean every row's text in place, then expose the frame as `twitter_data`.
# `twitter_data` is the same object as `df`, not a copy.
df['text'] = df['text'].map(preprocess)
twitter_data = df
print(twitter_data.head())

                                                text label  vector_label
0  @Pericles216 @HierBeforeTheAC @Sachinettiyil T...  INTJ  [1, 1, 1, 1]
1  @Hispanthicckk Being you makes you look cute[S...  INTJ  [1, 1, 1, 1]
2  @Alshymi Les balles sont réelles et sont tirée...  INTJ  [1, 1, 1, 1]
3  I'm like entp but idiotic[SEP]Hey boy, do you ...  INTJ  [1, 1, 1, 1]
4  @kaeshurr1 Give it to @ZargarShanif ... He has...  INTJ  [1, 1, 1, 1]


## Forum MBTI Dataset
https://www.kaggle.com/datasets/datasnaek/mbti-type

In [None]:
# Load the forum MBTI CSV from the mounted Drive folder. Malformed rows are
# dropped with a warning instead of aborting the read.
# `on_bad_lines='warn'` is the replacement for `error_bad_lines=False`, which
# was deprecated in pandas 1.3 and removed in pandas 2.0.
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/인공지능 프로젝트/project/mbti_1.csv',
                   engine='python', on_bad_lines='warn')
print(len(data))
print(data.head(5))

8675
   type                                              posts
0  INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1  ENTP  'I'm finding the lack of me in these posts ver...
2  INTP  'Good one  _____   https://www.youtube.com/wat...
3  INTJ  'Dear INTP,   I enjoyed our conversation the o...
4  ENTJ  'You're fired.|||That's another silly misconce...


In [None]:
# Letter pair for each of the four label positions. The first letter of each
# pair is the one `vectorize_label` encodes as 1.
idx2type = [first + second for first, second in zip('INTJ', 'ESFP')]

def vectorize_label(row):
    """Turn a four-letter MBTI string into four 0/1 flags.

    Position k of the result is 1 exactly when ``row[k]`` is the k-th of
    'I', 'N', 'T', 'J'. The comparison is case-sensitive.
    """
    positives = ('I', 'N', 'T', 'J')
    flags = []
    for axis in range(4):
        flags.append(int(row[axis] == positives[axis]))
    return flags
# Attach the 0/1 vector for every forum type, then preview the first five rows.
data['vector_label'] = data['type'].map(vectorize_label)

print(data.head())

   type                                              posts  vector_label
0  INFJ  'http://www.youtube.com/watch?v=qsXHcwe3krw|||...  [1, 1, 0, 1]
1  ENTP  'I'm finding the lack of me in these posts ver...  [0, 1, 1, 0]
2  INTP  'Good one  _____   https://www.youtube.com/wat...  [1, 1, 1, 0]
3  INTJ  'Dear INTP,   I enjoyed our conversation the o...  [1, 1, 1, 1]
4  ENTJ  'You're fired.|||That's another silly misconce...  [0, 1, 1, 1]


In [None]:
import re
from emoji import demojize

# Pattern for http/https URLs. Inside the third character class, `$-_` is a
# range (from '$' to '_'), so that class matches more than the listed symbols.
url_regex = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

def preprocess(row):
    """Clean one '|||'-separated string of forum posts and re-join them with '[SEP]'.

    In order: pass the text through ``demojize``, delete CR/LF characters,
    delete matches of ``url_regex``, trim the quote wrapping, then split on
    '|||', strip each piece, drop empty pieces and keep at most the first 25.
    """
    text = re.sub(url_regex, '', re.sub(r'[\r\n]', '', demojize(row)))
    # Remove a leading run of non-word characters ending in a single quote,
    # and a single quote followed only by non-word characters at the end.
    text = re.sub(r'^\W*\'|\'\W*$', '', text)

    kept = [piece for piece in map(str.strip, text.split('|||')) if piece != '']
    return '[SEP]'.join(kept[:25])

# Clean every row's posts in place, then preview the first five rows.
data['posts'] = data['posts'].map(preprocess)
print(data.head())

   type                                              posts  vector_label
0  INFJ  enfp and intj moments    sportscenter not top ...  [1, 1, 0, 1]
1  ENTP  I'm finding the lack of me in these posts very...  [0, 1, 1, 0]
2  INTP  Good one  _____[SEP]Of course, to which I say ...  [1, 1, 1, 0]
3  INTJ  Dear INTP,   I enjoyed our conversation the ot...  [1, 1, 1, 1]
4  ENTJ  You're fired.[SEP]That's another silly misconc...  [0, 1, 1, 1]


In [None]:
# Give the forum frame the same 'text' column name the Twitter frame uses.
data = data.rename({'posts': 'text'}, axis='columns')

## Forum Train & Forum Test

In [None]:
# Forum -> forum experiment: build a new classifier from the pretrained
# checkpoint, move it to `device`, and load the tokenizer for the same checkpoint.
model_name = 'distilbert-base-multilingual-cased'
model = MBTI_Classifier(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the forum data with a fixed seed. Each part gets a fresh
# 0..n-1 index so rows can be addressed by position.
train_data, test_data = (
    part.reset_index(drop=True)
    for part in train_test_split(data, test_size=0.2, random_state=321)
)

In [None]:
batch_size = 2

# Wrap both splits as datasets first, then build the loaders. Only the
# training loader shuffles.
train_set, test_set = MBTI_Dataset(train_data, tokenizer), MBTI_Dataset(test_data, tokenizer)
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, num_workers=2, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=batch_size, num_workers=2, shuffle=False)

In [None]:
# Adam with lr=1e-5 over all model parameters, BCEWithLogitsLoss on the raw
# logits, and a single epoch over the forum training split.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()
train(model, optimizer, criterion, train_loader, 1)

[01,  500] loss: 0.5887 (14.41%)
[01, 1000] loss: 0.5656 (28.82%)
[01, 1500] loss: 0.4973 (43.23%)
[01, 2000] loss: 0.4937 (57.64%)
[01, 2500] loss: 0.4477 (72.05%)
[01, 3000] loss: 0.4507 (86.46%)


In [None]:
# Score the forum-trained model on the held-out 20% of the forum data.
test(model, criterion, test_loader, batch_size)

100%|██████████| 868/868 [00:29<00:00, 29.38it/s]


test accuracy: 79.49%
test perfect match accuracy: 48.04%
loss: 0.44923773407936096





## Twitter Train & Twitter Test

In [None]:
# Twitter -> Twitter experiment: build a new classifier from the pretrained
# checkpoint, move it to `device`, and load the tokenizer for the same checkpoint.
model_name = 'distilbert-base-multilingual-cased'
model = MBTI_Classifier(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split of the Twitter data with a fixed seed. Each part gets a fresh
# 0..n-1 index so rows can be addressed by position.
train_data, test_data = (
    part.reset_index(drop=True)
    for part in train_test_split(twitter_data, test_size=0.2, random_state=321)
)

In [None]:
batch_size = 2

# Wrap both splits as datasets first, then build the loaders. Only the
# training loader shuffles.
train_set, test_set = MBTI_Dataset(train_data, tokenizer), MBTI_Dataset(test_data, tokenizer)
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, num_workers=2, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=batch_size, num_workers=2, shuffle=False)

In [None]:
# Adam with lr=1e-5 over all model parameters, BCEWithLogitsLoss on the raw
# logits, and a single epoch over the Twitter training split.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()
train(model, optimizer, criterion, train_loader, 1)

[01,  500] loss: 0.6434 (16.01%)
[01, 1000] loss: 0.6399 (32.01%)
[01, 1500] loss: 0.6277 (48.02%)
[01, 2000] loss: 0.6237 (64.02%)
[01, 2500] loss: 0.6307 (80.03%)
[01, 3000] loss: 0.6311 (96.03%)


In [None]:
# Score the Twitter-trained model on the held-out 20% of the Twitter data.
test(model, criterion, test_loader, batch_size)

100%|██████████| 782/782 [00:26<00:00, 28.97it/s]


test accuracy: 66.10%
test perfect match accuracy: 18.09%
loss: 0.621288537979126





## Forum Train & Twitter Test

In [None]:
# Forum -> Twitter experiment: build a new classifier from the pretrained
# checkpoint, move it to `device`, and load the tokenizer for the same checkpoint.
model_name = 'distilbert-base-multilingual-cased'
model = MBTI_Classifier(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
# Cross-domain setup: train on the whole forum frame and evaluate on the whole
# Twitter frame. No split is taken here.
train_data, test_data = data, twitter_data

In [None]:
batch_size = 2

# Wrap both frames as datasets first, then build the loaders. Only the
# training loader shuffles.
train_set, test_set = MBTI_Dataset(train_data, tokenizer), MBTI_Dataset(test_data, tokenizer)
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, num_workers=2, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=batch_size, num_workers=2, shuffle=False)

In [None]:
# Adam with lr=1e-5 over all model parameters, BCEWithLogitsLoss on the raw
# logits, and a single epoch over the full forum data.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()
train(model, optimizer, criterion, train_loader, 1)

[01,  500] loss: 0.5756 (11.53%)
[01, 1000] loss: 0.5523 (23.05%)
[01, 1500] loss: 0.5303 (34.58%)
[01, 2000] loss: 0.4807 (46.10%)
[01, 2500] loss: 0.4789 (57.63%)
[01, 3000] loss: 0.4638 (69.16%)
[01, 3500] loss: 0.4629 (80.68%)
[01, 4000] loss: 0.4401 (92.21%)


In [None]:
# Cross-domain check: model trained on the forum data, scored on the full Twitter data.
test(model, criterion, test_loader, batch_size)

100%|██████████| 3906/3906 [02:16<00:00, 28.62it/s]


test accuracy: 67.68%
test perfect match accuracy: 23.31%
loss: 0.6003632545471191





## Twitter Train & Forum Test

In [None]:
# Twitter -> forum experiment: build a new classifier from the pretrained
# checkpoint, move it to `device`, and load the tokenizer for the same checkpoint.
model_name = 'distilbert-base-multilingual-cased'
model = MBTI_Classifier(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-multilingual-cased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Cross-domain setup: train on the whole Twitter frame and evaluate on the
# whole forum frame. No split is taken here.
train_data, test_data = twitter_data, data

In [None]:
batch_size = 2

# Wrap both frames as datasets first, then build the loaders. Only the
# training loader shuffles.
train_set, test_set = MBTI_Dataset(train_data, tokenizer), MBTI_Dataset(test_data, tokenizer)
train_loader = DataLoader(dataset=train_set, batch_size=batch_size, num_workers=2, shuffle=True)
test_loader = DataLoader(dataset=test_set, batch_size=batch_size, num_workers=2, shuffle=False)

In [None]:
# Adam with lr=1e-5 over all model parameters, BCEWithLogitsLoss on the raw
# logits, and a single epoch over the full Twitter data.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.BCEWithLogitsLoss()
train(model, optimizer, criterion, train_loader, 1)

[01,  500] loss: 0.6495 (12.80%)
[01, 1000] loss: 0.6300 (25.60%)
[01, 1500] loss: 0.6299 (38.40%)
[01, 2000] loss: 0.6249 (51.20%)
[01, 2500] loss: 0.6326 (64.00%)
[01, 3000] loss: 0.6223 (76.80%)
[01, 3500] loss: 0.6252 (89.61%)


In [None]:
# Cross-domain check: model trained on the Twitter data, scored on the full forum data.
test(model, criterion, test_loader, batch_size)

100%|██████████| 4338/4338 [02:35<00:00, 27.97it/s]


test accuracy: 64.77%
test perfect match accuracy: 17.70%
loss: 0.6243722438812256



