In [1]:
import os
os.environ['TOKENIZERS_PARALLELISM'] = 'true'

import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import mlflow

import torch
from torch.utils.data import Dataset, DataLoader

from transformers import AutoModel, AutoTokenizer

2023-04-27 18:49:28.718494: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-27 18:49:29.755712: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/include:/usr/local/cuda/lib64::/usr/local/cuda/extras/CUPTI/lib64
2023-04-27 18:49:29.755825: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/include:/usr/local/cuda/lib64::/

In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# file_name = "./dataset/cleaned_data.csv"
file_name = "./dataset/preprocessed_data.csv"

df = pd.read_csv(file_name)
LABEL = 'label'

# df > X, y
X, y = df.drop(LABEL, axis=1), df.loc[:, LABEL]

# encoding label
label_encoder = LabelEncoder()
label_encoder.fit(y)
df['label'] = y = label_encoder.transform(y) # range(7) > ['anger', 'disgust', 'fear', 'happiness', 'neutralism', 'sadness', 'surprise']

n_classes = len(label_encoder.classes_)

# split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y)

trainset, testset = train_test_split(df, test_size=0.3, random_state=42, shuffle=True, stratify=df['label'])

In [4]:
label_encoder.inverse_transform(range(7))

array(['anger', 'disgust', 'fear', 'happiness', 'neutralism', 'sadness',
       'surprise'], dtype=object)

---

In [5]:
trainset['text'].apply(len).describe()

count    26955.000000
mean        21.755741
std         13.089562
min          1.000000
25%         13.000000
50%         19.000000
75%         27.000000
max        294.000000
Name: text, dtype: float64

In [6]:
testset['text'].apply(len).describe()

count    11553.000000
mean        21.373583
std         12.587119
min          1.000000
25%         13.000000
50%         19.000000
75%         27.000000
max        299.000000
Name: text, dtype: float64

In [7]:
# https://huggingface.co/klue/bert-base

# model = AutoModel.from_pretrained(CHECKPOINT)
# tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
CHECKPOINT = "klue/bert-base"

In [8]:
class TokenDataset(Dataset):
    def __init__(self, dataframe, tokenizer_pretrained=CHECKPOINT, token_max_length=512):
        self.data = dataframe        
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_pretrained)
        self.token_max_len = token_max_length
  
    def __len__(self):
        return len(self.data)
  
    def __getitem__(self, idx):
        sentence = self.data.iloc[idx]['text']
        label = self.data.iloc[idx]['label']

        # 토큰화 처리
        tokens = self.tokenizer(
            sentence,                
            return_tensors='pt',     
            truncation=True,         
            padding='max_length',    
            add_special_tokens=True,
            max_length=self.token_max_len # max_len
        )

        input_ids = tokens['input_ids'].squeeze(0)           # 2D -> 1D
        attention_mask = tokens['attention_mask'].squeeze(0) # 2D -> 1D
        token_type_ids = torch.zeros_like(attention_mask)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask, 
            'token_type_ids': token_type_ids,
        }, torch.tensor(label)

In [9]:
model_params = dict(
    token_len = 64,
    batch_size = 16,
    drop_out = 0.5,
)

train_data = TokenDataset(trainset, CHECKPOINT, model_params['token_len'])
test_data = TokenDataset(testset, CHECKPOINT, model_params['token_len'])

train_loader = DataLoader(train_data, batch_size=model_params['batch_size'], shuffle=True, num_workers=8)
test_loader = DataLoader(test_data, batch_size=model_params['batch_size'], shuffle=True, num_workers=8)

In [10]:
class Bert(torch.nn.Module):
    def __init__(self, bert_pretrained=CHECKPOINT, dropout_rate=0.3):
        super(Bert, self).__init__()
        self.bert = AutoModel.from_pretrained(bert_pretrained)
        self.dr = torch.nn.Dropout(p=dropout_rate)
        self.fc = torch.nn.Linear(768, n_classes)
    
    def forward(self, input_ids, attention_mask, token_type_ids):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        last_hidden_state = output['last_hidden_state']
        x = self.dr(last_hidden_state[:, 0, :])
        x = self.fc(x)
        return x

In [11]:
bert = Bert(bert_pretrained=CHECKPOINT, dropout_rate=model_params['drop_out'])
bert.to(device)

Some weights of the model checkpoint at klue/bert-base were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Bert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
       

In [12]:
train_params = dict(
    lr = 2e-5,
    weight_decay = 1e-2,
)

loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(bert.parameters(), lr=train_params['lr'], weight_decay=train_params['weight_decay'])

In [13]:
from tqdm import tqdm 

def model_train(model, data_loader, loss_fn, optimizer, device):
    model.train()
    
    running_loss = 0
    corr = 0
    counts = 0
    
    prograss_bar = tqdm(data_loader, unit='batch', total=len(data_loader), mininterval=1)
    
    for idx, (inputs, labels) in enumerate(prograss_bar):
        inputs = {k:v.to(device) for k, v in inputs.items()}
        labels = labels.to(device)
        
        optimizer.zero_grad()
        output = model(**inputs)
        loss = loss_fn(output, labels)
        
        loss.backward()
        optimizer.step()
        
        _, pred = output.max(dim=1)
        
        corr += pred.eq(labels).sum().item()
        counts += len(labels)
        
        running_loss += loss.item() * labels.size(0)
        
        prograss_bar.set_description(f"training loss: {running_loss/(idx+1):.5f}, training accuracy: {corr / counts:.5f}")
        
    acc = corr / len(data_loader.dataset)
    
    return running_loss / len(data_loader.dataset), acc

def model_evaluate(model, data_loader, loss_fn, device):
    model.eval()
    
    with torch.no_grad():
        corr = 0
        running_loss = 0
        
        for inputs, labels in data_loader:
            inputs = {k:v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            
            output = model(**inputs)
            
            _, pred = output.max(dim=1)
            
            corr += torch.sum(pred.eq(labels)).item()
            
            running_loss += loss_fn(output, labels).item() * labels.size(0)
        
        acc = corr / len(data_loader.dataset)
        
        return running_loss / len(data_loader.dataset), acc

In [14]:
import gc
gc.collect()
torch.cuda.empty_cache()

In [15]:
# import mlflow

# mlflow.set_tracking_uri('mlruns')
# mlflow.set_experiment("Emotion_Classification")

In [16]:
# 에폭별 모델 저장하도록 코드 수정

model_base_path = "clean_norm_repeat/"
if not os.path.exists(model_base_path):
    os.mkdir(model_base_path)
    
model_name_base = 'klue-bert-' + "-".join([k+"_"+str(v) for k, v in model_params.items()] +[k+"_"+str(v) for k, v in train_params.items()])

num_epochs = 10
min_loss = np.inf
best_epoch = -1

for epoch in range(num_epochs):
    train_loss, train_acc = model_train(bert, train_loader, loss_fn, optimizer, device)
    val_loss, val_acc = model_evaluate(bert, test_loader, loss_fn, device)
    
    if val_loss < min_loss:
        print(f'[INFO] val_loss has been improved from {min_loss:.5f} to {val_loss:.5f}. Saving Model!')
        min_loss = val_loss
        best_epoch = epoch
    
    torch.save(bert.state_dict(), f'{model_base_path + model_name_base} + {epoch}.pth')
    
    print(f'epoch {epoch+1:02d}, loss: {train_loss:.5f}, acc: {train_acc:.5f}, val_loss: {val_loss:.5f}, val_accuracy: {val_acc:.5f}')


os.rename(f'{model_base_path + model_name_base} + {best_epoch}.pth', f'{model_base_path + model_name_base} + {best_epoch}_best.pth')
print(f'best val loss at {best_epoch}')

training loss: 21.44521, training accuracy: 0.48240: 100%|██████████| 1685/1685 [06:42<00:00,  4.18batch/s]


[INFO] val_loss has been improved from inf to 1.20229. Saving Model!
epoch 01, loss: 1.34057, acc: 0.48240, val_loss: 1.20229, val_accuracy: 0.53744


training loss: 17.00966, training accuracy: 0.59158: 100%|██████████| 1685/1685 [06:46<00:00,  4.14batch/s]


[INFO] val_loss has been improved from 1.20229 to 1.19485. Saving Model!
epoch 02, loss: 1.06330, acc: 0.59158, val_loss: 1.19485, val_accuracy: 0.54903


training loss: 13.06594, training accuracy: 0.68945: 100%|██████████| 1685/1685 [06:53<00:00,  4.07batch/s]


epoch 03, loss: 0.81677, acc: 0.68945, val_loss: 1.32940, val_accuracy: 0.53605


training loss: 8.83595, training accuracy: 0.79518: 100%|██████████| 1685/1685 [06:50<00:00,  4.11batch/s]


epoch 04, loss: 0.55235, acc: 0.79518, val_loss: 1.61087, val_accuracy: 0.53415


training loss: 5.73000, training accuracy: 0.86741: 100%|██████████| 1685/1685 [06:52<00:00,  4.09batch/s]


epoch 05, loss: 0.35819, acc: 0.86741, val_loss: 1.85172, val_accuracy: 0.52489


training loss: 3.65039, training accuracy: 0.91927: 100%|██████████| 1685/1685 [06:55<00:00,  4.06batch/s]


epoch 06, loss: 0.22819, acc: 0.91927, val_loss: 2.19302, val_accuracy: 0.51554


training loss: 2.53345, training accuracy: 0.94691: 100%|██████████| 1685/1685 [06:49<00:00,  4.11batch/s]


epoch 07, loss: 0.15837, acc: 0.94691, val_loss: 2.26429, val_accuracy: 0.52454


training loss: 1.99906, training accuracy: 0.95748: 100%|██████████| 1685/1685 [06:55<00:00,  4.05batch/s]


epoch 08, loss: 0.12496, acc: 0.95748, val_loss: 2.51907, val_accuracy: 0.52012


training loss: 1.61069, training accuracy: 0.96554: 100%|██████████| 1685/1685 [07:09<00:00,  3.92batch/s]


epoch 09, loss: 0.10069, acc: 0.96554, val_loss: 2.66548, val_accuracy: 0.51536


training loss: 1.49204, training accuracy: 0.96817: 100%|██████████| 1685/1685 [07:09<00:00,  3.92batch/s]


epoch 10, loss: 0.09327, acc: 0.96817, val_loss: 2.61131, val_accuracy: 0.51787
best val loss at 1


# Prediction

In [61]:
class EmotionClassifier():
    def __init__(self, model, tokenizer, labels: dict):
        model.to(device)
        self.model = model
        self.tokenizer = tokenizer
        self.labels = labels
        
    def predict(self, sentence):
        tokens = self.tokenizer(
            sentence,                # 1개 문장 
            return_tensors='pt',     # 텐서로 반환
            truncation=True,         # 잘라내기 적용
            padding='max_length',    # 패딩 적용
            add_special_tokens=True, # 스페셜 토큰 적용
            max_length = 64,
        )
        tokens.to(device)
        prediction = self.model(**tokens)
        prediction = torch.nn.functional.softmax(prediction, dim=1)
        output = prediction.argmax(dim=1).item()
        prob, result = prediction.max(dim=1)[0].item(), self.labels[output]
        print(f'[{result}]\n확률은: {prob*100:.3f}% 입니다.')

In [62]:
model = Bert()
model.load_state_dict(torch.load('/home/wonhong/workspace/Emotion_Detection/DL/dataset/klue-bert-token_len_64-batch_size_16-lr_5e-05-weight_decay_0.01.pth'))

# ['anger', 'disgust', 'fear', 'happiness', 'neutralism', 'sadness', 'surprise']
label_dict = {idx:label for idx, label in enumerate(label_encoder.inverse_transform(range(n_classes)))}

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

ecf = EmotionClassifier(model, tokenizer, label_dict)

In [73]:
for predict_times in range(10):
    sentence = df['text'].sample(1).values[0]
    print("SENTENCE : ", sentence)
    %time ecf.predict(sentence)
    print("-"*30)

SENTENCE :  어떻게 상처받은 눅희의 마음을 달래줄까요ㅜㅜ0
[sadness]
확률은: 67.385% 입니다.
CPU times: user 51.8 ms, sys: 1.06 ms, total: 52.8 ms
Wall time: 52.6 ms
------------------------------
SENTENCE :  ㅡㅡ그냥얜웃기지도않고혐오스러워 얘개그는ㅡㅡ
[anger]
확률은: 62.665% 입니다.
CPU times: user 40.3 ms, sys: 164 µs, total: 40.5 ms
Wall time: 40.4 ms
------------------------------
SENTENCE :  300만유로면, 약 37억 7800만원정도 됨...
[surprise]
확률은: 79.438% 입니다.
CPU times: user 45.6 ms, sys: 0 ns, total: 45.6 ms
Wall time: 45.5 ms
------------------------------
SENTENCE :  방동민 선수 화이팅!
[happiness]
확률은: 91.218% 입니다.
CPU times: user 24.2 ms, sys: 0 ns, total: 24.2 ms
Wall time: 24.2 ms
------------------------------
SENTENCE :  13명이돈 쪼개서 나눠 가지는데..
[sadness]
확률은: 34.543% 입니다.
CPU times: user 14.2 ms, sys: 0 ns, total: 14.2 ms
Wall time: 14.2 ms
------------------------------
SENTENCE :  거짓은 거짓을 낳기 마련이고 진실은 백일하에 반드시 드러난다.
[anger]
확률은: 33.025% 입니다.
CPU times: user 7.82 ms, sys: 8.68 ms, total: 16.5 ms
Wall time: 16.2 ms
------------------------------
S

---