In [62]:
import torch
import numpy as np
import torch.nn as nn
from tqdm.notebook import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import get_cosine_schedule_with_warmup

In [54]:
# Pre-tokenised tensors produced by the preprocessing step.
train_input = torch.load('./processed_data/train_input.pt')
train_output = torch.load('./processed_data/train_label.pt')
test_input = torch.load('./processed_data/test_input.pt')

# Model / loader hyper-parameters.
vocab_number = 80096  # number of rows in the embedding table
label_number = 7      # number of output classes
batch_size = 32
# Must equal the LSTM's hidden_size (125 in TextLSTM): this value sizes the
# initial hidden/cell tensors, and nn.LSTM rejects states whose last
# dimension differs from its hidden_size.
n_hidden = 125

In [44]:
# Print the shape of the training and test input tensors, in that order.
for split_tensor in (train_input, test_input):
    print(split_tensor.shape)

torch.Size([65863, 121])
torch.Size([13491, 121])


In [45]:
class TextDataset(Dataset):
    """Map-style dataset over token sequences with optional labels.

    Args:
        tokens: Indexable, sized collection of inputs; ``len(tokens)`` is the
            dataset length.
        labels: Indexable collection aligned with ``tokens``. Required unless
            ``is_test`` is True.
        is_test: When True, items are returned as the bare input (no label).

    Raises:
        ValueError: In training mode, if ``labels`` is missing or its length
            differs from ``tokens``.
    """

    def __init__(self, tokens, labels=None, is_test=False):
        super().__init__()
        # Fail at construction time instead of with an opaque TypeError on the
        # first __getitem__ call deep inside a DataLoader worker.
        if not is_test:
            if labels is None:
                raise ValueError("labels are required when is_test is False")
            if len(labels) != len(tokens):
                raise ValueError(
                    f"tokens and labels differ in length: "
                    f"{len(tokens)} != {len(labels)}"
                )
        self.tokens = tokens
        self.labels = labels
        self.is_test = is_test

    def __len__(self):
        """Return the number of samples."""
        return len(self.tokens)

    def __getitem__(self, idx):
        """Return ``tokens[idx]`` in test mode, else ``(tokens[idx], labels[idx])``."""
        sample = self.tokens[idx]  # renamed from `input`, which shadowed the builtin
        if self.is_test:
            return sample
        return sample, self.labels[idx]

In [51]:
# Wrap the tensors in datasets: the training split carries labels, the test
# split is constructed in label-free test mode.
dataset_train = TextDataset(train_input, labels=train_output)
dataset_test = TextDataset(test_input, is_test=True)

# Training batches are reshuffled; test batches keep the dataset order.
loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
loader_test = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)

In [57]:
class TextLSTM(nn.Module):
  """Single-layer bidirectional LSTM classifier over token-id sequences.

  Args:
    vocab_size: Number of embedding rows. Defaults to the notebook-level
      ``vocab_number`` when omitted.
    num_labels: Number of output classes. Defaults to the notebook-level
      ``label_number`` when omitted.
    embed_dim: Embedding width (and LSTM input size).
    hidden_size: LSTM hidden size per direction.
  """

  def __init__(self, vocab_size=None, num_labels=None, embed_dim=125, hidden_size=125):
    super(TextLSTM, self).__init__()
    # Fall back to the notebook globals so a bare `TextLSTM()` keeps working.
    if vocab_size is None:
      vocab_size = vocab_number
    if num_labels is None:
      num_labels = label_number

    self.embedding = nn.Embedding(vocab_size, embed_dim)
    # nn.LSTM only applies dropout *between* stacked layers, so the former
    # `dropout=0.5` on this one-layer LSTM did nothing except emit a warning.
    self.lstm = nn.LSTM(
      input_size=embed_dim,
      hidden_size=hidden_size,
      num_layers=1,
      bidirectional=True,
    )
    self.fc = nn.Linear(2 * hidden_size, num_labels)

  def forward(self, hidden_and_cell, X):
    """Classify a batch of token-id sequences.

    Args:
      hidden_and_cell: ``(h_0, c_0)`` tuple, each of shape
        ``(2, batch, hidden_size)``, or ``None`` to let the LSTM start from
        zero states.
      X: Integer tensor of token ids, indexed ``(batch, seq_len)``.

    Returns:
      Tensor of shape ``(batch, num_labels)`` with unnormalised class scores.
    """
    X = self.embedding(X)
    X = X.transpose(0, 1)  # the LSTM is not batch_first: (seq_len, batch, embed_dim)

    # nn.LSTM returns (output, (h_n, c_n)). The second tuple is hidden state
    # and CELL state, not forward/backward. For one bidirectional layer,
    # h_n[0] is the final forward hidden state and h_n[1] the final backward one.
    _, (h_n, _c_n) = self.lstm(X, hidden_and_cell)

    # Concatenate the forward and backward final hidden states.
    features = torch.cat((h_n[0], h_n[1]), dim=1)
    return self.fc(features)

In [58]:
# Select the compute device: CUDA when it is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cpu


In [59]:
# Build the classifier, then move its parameters onto the selected device.
model = TextLSTM()
model = model.to(device)



In [64]:
# Loss function, optimiser and learning-rate schedule.
steps_per_epoch = len(loader_train)

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr=0.0006,
    weight_decay=0.001,
)
# Cosine schedule with warm-up: 3 epochs' worth of warm-up steps out of
# 20 epochs' worth of total steps (stepped once per batch by the caller).
scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=3 * steps_per_epoch,
    num_training_steps=20 * steps_per_epoch,
)

In [68]:
def train(model, loader_train, criterion, optimizer, scheduler=None, epochs=20,
          save_file='./models/bidirectional_classifier.pth'):
    """Train ``model`` and checkpoint the weights of the lowest-loss epoch.

    Args:
        model: Module called as ``model((hidden, cell), tokens)`` that exposes
            its recurrent layer as ``model.lstm``.
        loader_train: Iterable yielding ``(tokens, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimiser stepped once per batch.
        scheduler: Optional LR scheduler, stepped once per batch.
        epochs: Number of passes over ``loader_train``.
        save_file: Path the best ``state_dict`` is written to.

    Returns:
        Whatever ``torch.load(save_file)`` yields after training, i.e. the
        state dict saved for the epoch with the lowest mean training loss.
    """
    import os  # local import: not provided by the notebook's import cell

    # Create the checkpoint directory up front so the first save cannot fail
    # on a missing folder (the default path points into ./models/).
    save_dir = os.path.dirname(save_file)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # Follow the model's own placement rather than a notebook-level global.
    model_device = next(model.parameters()).device

    # Size the initial states from the LSTM itself so they cannot drift out
    # of sync with the architecture.
    lstm = model.lstm
    state_layers = lstm.num_layers * (2 if lstm.bidirectional else 1)
    n_batches = max(len(loader_train), 1)

    train_loss_min = np.inf
    # Repeat for the requested number of epochs.
    for epoch in range(epochs):
        print(f'에폭 [{epoch+1}/{epochs}] \n---------------------')

        # == [training] ==
        model.train()
        epoch_train_loss = 0  # running sum of batch losses for this epoch

        for tokens, labels in tqdm(loader_train):
            tokens = tokens.to(model_device)
            labels = labels.to(model_device)

            # Fresh zero states sized to THIS batch: the final batch of an
            # epoch is usually smaller than batch_size, and the LSTM rejects
            # states whose batch dimension does not match its input.
            hidden = torch.zeros(state_layers, tokens.size(0), lstm.hidden_size,
                                 device=model_device)
            cell = torch.zeros(state_layers, tokens.size(0), lstm.hidden_size,
                               device=model_device)

            optimizer.zero_grad()
            outputs = model((hidden, cell), tokens)

            # Outputs are deliberately not accumulated in a list: holding on
            # to them keeps every batch's autograd graph alive for the epoch.
            loss = criterion(outputs, labels)
            epoch_train_loss += loss.item()

            loss.backward()
            optimizer.step()

            if scheduler is not None:  # per-batch learning-rate update
                scheduler.step()

        # Report and compare the per-batch mean so both printed lines use the
        # same unit.
        epoch_train_loss /= n_batches
        print(f'\t 훈련 데이터 손실값 : {epoch_train_loss:.4f}')

        # == [best-weights tracking] ==
        # No validation split is used here: the checkpoint is chosen by
        # training loss, saved whenever it matches or beats the best so far.
        if epoch_train_loss <= train_loss_min:
            print(f'\t### 훈련 데이터 손실값 감소 ({train_loss_min:.4f} --> {epoch_train_loss:.4f}). 모델저장')
            torch.save(model.state_dict(), save_file)
            train_loss_min = epoch_train_loss

    return torch.load(save_file)

In [69]:
# Run training for 20 epochs, checkpointing into the working directory; keep
# the value `train` returns (loaded back from the checkpoint file).
train_kwargs = dict(
    model=model,
    loader_train=loader_train,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    epochs=20,
    save_file='./bidirectional_classifier.pth',
)
model_state_dict = train(**train_kwargs)

에폭 [1/20] 
---------------------


  0%|          | 0/2059 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Restore the checkpointed weights returned by `train`; without this the model
# would predict with whatever weights the last epoch left behind.
model.load_state_dict(model_state_dict)
model.eval()  # switch the model to evaluation mode
final_preds = []  # collected class predictions, in test-set order

# Size the initial states from the LSTM itself rather than from globals.
lstm_layer = model.lstm
state_layers = lstm_layer.num_layers * (2 if lstm_layer.bidirectional else 1)

with torch.no_grad():
    for tokens in loader_test:  # each item is one batch of token sequences
        tokens = tokens.to(device)

        # Zero states sized to THIS batch, so the (usually shorter) final
        # batch is handled by the same loop instead of a separate cell.
        hidden = torch.zeros(state_layers, tokens.size(0), lstm_layer.hidden_size,
                             device=device)
        cell = torch.zeros(state_layers, tokens.size(0), lstm_layer.hidden_size,
                           device=device)

        outputs = model((hidden, cell), tokens)
        pred = torch.max(outputs.cpu(), dim=1)[1].numpy()  # predicted class ids

        final_preds.extend(pred)

In [None]:
# Predict the final `batch_size` rows of the test set as one full batch.
# (numpy is already imported in the notebook's first cell.)
last_input = test_input[-batch_size:]
last_input = last_input.to(device)

# Zero states sized to this batch and to the LSTM's own hidden size.
lstm_layer = model.lstm
state_layers = lstm_layer.num_layers * (2 if lstm_layer.bidirectional else 1)
hidden = torch.zeros(state_layers, last_input.size(0), lstm_layer.hidden_size,
                     device=device)
cell = torch.zeros(state_layers, last_input.size(0), lstm_layer.hidden_size,
                   device=device)

model.eval()
# no_grad is required here: .numpy() refuses tensors that are part of an
# autograd graph.
with torch.no_grad():
    # Feed `last_input`; the previous code passed the stale loop variable
    # `tokens`, so it re-predicted the wrong rows.
    outputs = model((hidden, cell), last_input)
last_pred = torch.max(outputs.cpu(), dim=1)[1].numpy()  # predicted class ids
print(last_pred)

In [71]:
# Load the saved prediction array, then print its length followed by the
# number of test rows so the two can be compared.
final_output = np.load('./output/final_output.npy')
for sized in (final_output, test_input):
    print(len(sized))

13491
13491


In [74]:
# Convert to a plain Python list. Going through np.asarray keeps this cell
# idempotent: re-running it on an already-converted list no longer raises
# AttributeError ('list' object has no attribute 'tolist').
final_output = np.asarray(final_output).tolist()

In [83]:
import pandas as pd

# Attach the predictions to the test table as a `label` column and keep only
# the two columns written to the submission file.
test_df = pd.read_csv("input/dataset/test.csv")
submission_df = test_df.assign(label=final_output)[['ID', 'label']]

In [84]:
# Write the submission table to CSV without the DataFrame index column.
submission_df.to_csv(path_or_buf='./output/jhkim_1.csv', index=False)