In [1]:
import pandas as pd

In [2]:
df1 = pd.read_csv('./0530학습용_종목품명.csv')

In [3]:
account_df = df1[['상대계정', '상대계정_new']].drop_duplicates()
num_account = {}
num_account_inverse = {}
for account, account1 in zip(account_df['상대계정'].tolist(), account_df['상대계정_new'].tolist()):
    num_account[account1] = account
    num_account_inverse[account] = account1

In [4]:
print(len(num_account)) # 44개의 classification class
print(len(num_account_inverse))

44
44


In [5]:
import os
import random
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, GPT2Config
from torch.optim import Adam
import torch.nn.functional as F

In [6]:
class CustomDataset(Dataset):
    def __init__(self, df):
        self.df = df
    
    def __getitem__(self, idx):
        text = str(self.df.iloc[idx, 4])
        label = self.df.iloc[idx, 5]
        return text, label
    
    def __len__(self):
        return len(self.df)

In [7]:
dataset_size = len(df1)
validation_split = .2
shuffle_dataset = True
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))

In [8]:
print(dataset_size)

23519


In [10]:
if shuffle_dataset:    
    np.random.shuffle(indices)

In [11]:
def seed_everything(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore
    torch.backends.cudnn.benchmark = True  # type: ignore
seed_everything()

In [12]:
dataset_train = CustomDataset(df1)
dataset_test = CustomDataset(df1)

train_indices, val_indices = indices[split:], indices[:split]
train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

train_loader = torch.utils.data.DataLoader(dataset_train, batch_size=16, sampler = train_sampler, num_workers=1)
val_loader = torch.utils.data.DataLoader(dataset_test, batch_size=8, sampler = val_sampler, num_workers=1)

1176


In [11]:
device = torch.device("cuda")
model_config = GPT2Config.from_pretrained('gpt2', num_labels=44)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2ForSequenceClassification.from_pretrained('gpt2', config = model_config)

# resize model embedding to match new tokenizer
model.resize_token_embeddings(len(tokenizer))

# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id

model.to(device)

model.load_state_dict(torch.load("./gpt2_classcode_44_230102_5e5_e4.pth"))
model.eval()

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid

In [12]:
from tqdm import tqdm

In [13]:
# dataset = CustomDataset(df1)
# loader = DataLoader(dataset, batch_size=8, shuffle=False, num_workers=2)

result = np.empty((0, 44))
labels = np.empty((0, 1))
for text , label in tqdm(val_loader):
    encoded_list = [tokenizer.encode(t, add_special_tokens=True) for t in text]
    padded_list =  [e + [0] * (512-len(e)) for e in encoded_list]
    sample = torch.tensor(padded_list)
    sample = sample.to(device)
    outputs = model(sample)
    output = outputs[0].to('cpu').detach().numpy() 
    result = np.vstack((result, output))
    
    try:
        label_ = label.view(8, -1).to('cpu').detach().numpy()
        labels = np.vstack((labels, label_))
    except:
        label_ = label.view(7, -1).to('cpu').detach().numpy()
        labels = np.vstack((labels, label_))
    

100%|█████████████████████████████████████████| 588/588 [00:33<00:00, 17.30it/s]


In [14]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x, axis=1).reshape(-1,1))
    return e_x / e_x.sum(axis=1).reshape(-1,1) # only difference

In [15]:
softmax(result).sum(axis=1)

array([1., 1., 1., ..., 1., 1., 1.])

In [16]:
y_pred = softmax(result)

### softmax output 중 최상위 3개 값 뽑아야함

In [17]:
y_pred.shape

(4703, 44)

In [18]:
print(labels)
labels.shape

[[ 8.]
 [ 6.]
 [19.]
 ...
 [18.]
 [18.]
 [19.]]


(4703, 1)

### performance check

In [19]:
score1 = 0; score2 = 0; wrong1 = 0; wrong2 = 0;
for pred , label in zip(y_pred, labels):
    if pred.max() < 0.7: # correct2를 count
        # softmax 예측값 중 가장 큰 3개를 가져옴
        predictions = np.argpartition(-pred, 3)[:3]
        if (predictions[0] == int(label) or predictions[1] == int(label) or predictions[2] == int(label)):
            score2 = score2 + 1
        else:
            wrong2 = wrong2 + 1
    else: # correct1을 count
        if (np.argmax(pred) == int(label)):
            score1 = score1 + 1
        else:
            wrong1 = wrong1 + 1
        

In [20]:
print(len(y_pred))
print(score1 + score2 + wrong1 + wrong2)

4703
4703


In [21]:
print(wrong1, wrong2)

127 26


In [22]:
print("사업계획서에 명시된 자연어이해 기술 성능 지표: (correct1 + correct2) / (데이터의 개수)")
print("------------------------------------------------------------------------")
print("score: ", (score1 + score2) / len(y_pred)) # 사업계획서에 명시된 자연어이해 기술 성능 지표

사업계획서에 명시된 자연어이해 기술 성능 지표: (correct1 + correct2) / (데이터의 개수)
------------------------------------------------------------------------
score:  0.9674675738890071
