# 딥러닝을 이용한 수능 영어 풀기

In [1]:
import dill
import time
import random
import numpy as np
from sklearn.metrics import roc_curve, auc

import nltk
nltk.download("punkt")
from nltk.tokenize import word_tokenize

import torch
import torch.nn as nn

from torchtext.legacy.data import Field
from torchtext.legacy.data import TabularDataset
from torchtext.legacy.data import BucketIterator, Iterator

RANDOM_SEED = 2020
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

DATA_PATH = "../data/processed/"

[nltk_data] Downloading package punkt to /home/ysher/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### 모델 클래스 정의하기

In [2]:
class LSTMClassifier(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, pad_idx):
        super().__init__()
        self.embed_layer = nn.Embedding(
            num_embeddings=num_embeddings,embedding_dim=embedding_dim,padding_idx=pad_idx)
        self.lstm_layer = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            dropout=0.5
        )
        self.last_layer = nn.Sequential(
            nn.Linear(hidden_size*2, hidden_size),
            nn.Dropout(0.5),
            nn.LeakyReLU(),
            nn.Linear(hidden_size, 1),
            nn.Sigmoid()
        )
        
    def forward(self,x):
        embed_x = self.embed_layer(x)
        output, (_,_) = self.lstm_layer(embed_x)
        last_output = output[:,-1,:]
        last_output = self.last_layer(last_output)
        return last_output

## 데이터셋 불러오기

In [3]:
TEXT = Field(
    sequential=True, # 문장이 들어온다
    use_vocab=True, # 따로 단어장을 만드는가?
    tokenize=word_tokenize, 
    lower=True,
    batch_first=True
)

LABEL = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True
)

### 데이터 불러오기

In [4]:
sat_train_data, sat_valid_data, sat_test_data = TabularDataset.splits(
                                                    path = "../data/",
                                                    train = "sat_train.tsv",
                                                    validation = "sat_valid.tsv",
                                                    test = "sat_test.tsv",
                                                    format = "tsv",
                                                    fields = [("text",TEXT),("label",LABEL)],
                                                    skip_header = 1,
                                                )
TEXT.build_vocab(sat_train_data, min_freq=2)

### Data Loader 정의

In [5]:
sat_train_iterator, sat_valid_iterator, sat_test_iterator = BucketIterator.splits(
    (sat_train_data, sat_valid_data, sat_test_data),
    batch_size=8,
    device=None,
    sort=False
)

## 학습

### 모델 학습 함수 정의

In [6]:
def train(model, train_loader, optimizer, criterion, device):
    model.train()
    epoch_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        text = batch.text
        if text.shape[0] >1 :
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)
            label = label.to(device)
            output = model(text).flatten()
            loss = criterion(output, label)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
    return epoch_loss/len(train_loader)

### 모델 평가 함수 정의

In [7]:
def evaluate(model, valid_loader, criterion, device):
    model.eval()
    epoch_loss = 0
    with torch.no_grad():
        for _, batch in enumerate(valid_loader):
            text = batch.text
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)
            label = label.to(device)
            output = model(text).flatten()
            loss = criterion(output, label)
            
            epoch_loss += loss.item()
        
    return epoch_loss/len(valid_loader)

### HyperParameter 선언

In [8]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20

lstm_classifier = LSTMClassifier(num_embeddings=len(TEXT.vocab),
                                embedding_dim=100,
                                hidden_size=200,
                                num_layers=4,
                                pad_idx=PAD_IDX)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
    
_ = lstm_classifier.to(device)
optimizer = torch.optim.Adam(lstm_classifier.parameters())
bce_loss_fn = nn.BCELoss()

### 모델 학습

In [9]:
for epoch in range(N_EPOCHS):
    train_loss = train(
        lstm_classifier,
        sat_train_iterator,
        optimizer,
        bce_loss_fn,
        device
    )
    valid_loss = evaluate(
        lstm_classifier,
        sat_valid_iterator,
        bce_loss_fn,
        device
    )
    print(f"Epoch: {epoch+1:02}")
    print(f"\tTrain Loss: {train_loss:.5f}")
    print(f"\tVal. Loss: {valid_loss:.5f}")

Epoch: 01
	Train Loss: 0.52060
	Val. Loss: 0.76532
Epoch: 02
	Train Loss: 0.51065
	Val. Loss: 0.55898
Epoch: 03
	Train Loss: 0.47559
	Val. Loss: 0.54429
Epoch: 04
	Train Loss: 0.43671
	Val. Loss: 0.53830
Epoch: 05
	Train Loss: 0.42772
	Val. Loss: 0.53806
Epoch: 06
	Train Loss: 0.40104
	Val. Loss: 0.54618
Epoch: 07
	Train Loss: 0.43988
	Val. Loss: 0.53907
Epoch: 08
	Train Loss: 0.42333
	Val. Loss: 0.53079
Epoch: 09
	Train Loss: 0.42076
	Val. Loss: 0.53789
Epoch: 10
	Train Loss: 0.43361
	Val. Loss: 0.53598
Epoch: 11
	Train Loss: 0.43992
	Val. Loss: 0.53834
Epoch: 12
	Train Loss: 0.46740
	Val. Loss: 0.53208
Epoch: 13
	Train Loss: 0.42639
	Val. Loss: 0.54740
Epoch: 14
	Train Loss: 0.42282
	Val. Loss: 0.58034
Epoch: 15
	Train Loss: 0.41925
	Val. Loss: 0.57024
Epoch: 16
	Train Loss: 0.42093
	Val. Loss: 0.55255
Epoch: 17
	Train Loss: 0.43579
	Val. Loss: 0.55127
Epoch: 18
	Train Loss: 0.43980
	Val. Loss: 0.56086
Epoch: 19
	Train Loss: 0.43909
	Val. Loss: 0.54740
Epoch: 20
	Train Loss: 0.42460


### 모델 저장

In [10]:
with open("baseline_model.dill", "wb") as f:
    model = {
        "TEXT": TEXT,
        "LABEL": LABEL,
        "classifier": lstm_classifier
    }
    dill.dump(model, f)

## TEST

### 테스트 함수 정의

In [11]:
def test(model, test_loader, device):
    model.eval()
    with torch.no_grad():
        y_real = []
        y_pred = []
        
        for batch in test_loader:
            text = batch.text
            label = batch.label.type(torch.FloatTensor)
            text = text.to(device)
            
            output = model(text).flatten().cpu()
            
            y_real += [label]
            y_pred += [output]
        
        y_real = torch.cat(y_real)
        y_pred = torch.cat(y_pred)
        
    fpr, tpr, _ = roc_curve(y_real, y_pred)
    auroc = auc(fpr, tpr)
    
    return auroc

### 모델 성능 확인

In [12]:
_ = lstm_classifier.cpu()
test_auroc = test(
    lstm_classifier,
    sat_test_iterator,
    "cpu"
)

print(f"SAT Dateset Test AUROC: {test_auroc:.5f}")

SAT Dateset Test AUROC: 0.84615


## 성능 높이기

### 추가 데이터 이용

- pre trained model을 만들어 사용

- train 데이터에 넣지 않고 사전 훈련 하는 이유 : train에 넣고 같이 진행할 경우 데이터가 큰 추가데이터에 오버피팅 될 수 있다.

### 추가 데이터 불러오기

In [13]:
TEXT = Field(
    sequential=True, # 문장이 들어온다
    use_vocab=True, # 따로 단어장을 만드는가?
    tokenize=word_tokenize, 
    lower=True,
    batch_first=True
)

LABEL = Field(
    sequential=False,
    use_vocab=False,
    batch_first=True
)
cola_train_data, cola_valid_data, cola_test_data = TabularDataset.splits(
                                                    path = "../data/",
                                                    train = "cola_train.tsv",
                                                    validation = "cola_valid.tsv",
                                                    test = "cola_test.tsv",
                                                    format = "tsv",
                                                    fields = [("text",TEXT),("label",LABEL)],
                                                    skip_header = 1,
                                                )
TEXT.build_vocab(cola_train_data, min_freq=2)

cola_train_iterator, cola_valid_iterator, cola_test_iterator = BucketIterator.splits(
    (cola_train_data, cola_valid_data, cola_test_data),
    batch_size=32,
    device=None,
    sort=False
)

### 기존 데이터 불러오기

In [14]:
sat_train_data, sat_valid_data, sat_test_data = TabularDataset.splits(
                                                    path = "../data/",
                                                    train = "sat_train.tsv",
                                                    validation = "sat_valid.tsv",
                                                    test = "sat_test.tsv",
                                                    format = "tsv",
                                                    fields = [("text",TEXT),("label",LABEL)],
                                                    skip_header = 1,
                                                )

sat_train_iterator, sat_valid_iterator, sat_test_iterator = BucketIterator.splits(
    (sat_train_data, sat_valid_data, sat_test_data),
    batch_size=8,
    device=None,
    sort=False
)

### 모델 사전 학습

In [15]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20

lstm_classifier = LSTMClassifier(
    num_embeddings=len(TEXT.vocab),
    embedding_dim=100,
    hidden_size=200,
    num_layers=4,
    pad_idx=PAD_IDX,
)

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

_ = lstm_classifier.to(device)

optimizer = torch.optim.Adam(lstm_classifier.parameters())
bce_loss_fn = nn.BCELoss()

for epoch in range(N_EPOCHS):
    train_loss = train(lstm_classifier,
                      cola_train_iterator,
                      optimizer,
                      bce_loss_fn,
                      device)
    
    val_loss = evaluate(lstm_classifier,
                       cola_valid_iterator,
                       bce_loss_fn,
                       device)
    
    print(f"Epoch: {epoch+1:02}")
    print(f"\tTrain Loss: {train_loss:.5f}")
    print(f"\tVal. Loss: {valid_loss:.5f}")
    

Epoch: 01
	Train Loss: 0.61577
	Val. Loss: 0.56479
Epoch: 02
	Train Loss: 0.61246
	Val. Loss: 0.56479
Epoch: 03
	Train Loss: 0.61109
	Val. Loss: 0.56479


KeyboardInterrupt: 

In [None]:
from copy import deepcopy
before_tuning_lstm_classifier = deepcopy(lstm_classifier)

### 모델 추가 학습

In [None]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20

for epoch in range(N_EPOCHS):
    train_loss = train(
        lstm_classifier,
        sat_train_iterator,
        optimizer,
        bce_loss_fn,
        device
    )
    valid_loss = evaluate(
        lstm_classifier,
        sat_valid_iterator,
        bce_loss_fn,
        device
    )
    print(f"Epoch: {epoch+1:02}")
    print(f"\tTrain Loss: {train_loss:.5f}")
    print(f"\tVal. Loss: {valid_loss:.5f}")

### 모델 성능 비교

In [None]:
_ = before_tuning_lstm_classifier.cpu()
lstm_sat_test_auroc = test(before_tuning_lstm_classifier, sat_test_iterator,"cpu")
_ = lstm_classifier.cpu()
lstm_tuned_test_auroc = test(lstm_classifier, sat_test_iterator, "cpu")

print(f"Before fine-tuning SAT Dataset Test AUROC: {lstm_sat_test_auroc:.5f}")
print(f"After fine-tuning SAT Dataset Test AUROC: {lstm_tuned_test_auroc:.5f}")

### 모델 저장

In [None]:
with open("before_tuning_model.dill","wb") as f:
    model = {
        "TEXT": TEXT,
        "LABEL": LABEL,
        "classifier": before_tuning_lstm_classifier
    }
    dill.dump(model, f)
    
_ = lstm_classifier.cpu()
with open("after_tuning_model.dill","wb") as f:
    model = {
        "TEXT": TEXT,
        "LABEL": LABEL,
        "classifier": lstm_classifier
    }
    dill.dump(model, f)

## 심화 모델

- LSTM의 전체 값을 max pooling 하여 사용한다. (문장길이, pading 문제)

### 모델 정의

In [30]:
class LSTMPoolingClassifier(nn.Module):
    def __init__(self, num_embeddings, embedding_dim, hidden_size, num_layers, pad_idx):
        super().__init__()
        self.embed_layer = nn.Embedding(num_embeddings=num_embeddings,
                                           embedding_dim=embedding_dim,
                                           padding_idx=pad_idx)
        
        self.lstm_layer = nn.LSTM(input_size=embedding_dim,
                                 hidden_size=hidden_size,
                                 num_layers=num_layers,
                                 bidirectional=True,
                                 dropout=0.5,
                                 batch_first=True)
        
        self.last_layer = nn.Sequential(nn.Linear(2*hidden_size,1),
                                       nn.Dropout(p=0.5),
                                       nn.Sigmoid())
        
    def forward(self, x):
        x = self.embed_layer(x)
        output, _ = self.lstm_layer(x)
        pool = nn.functional.max_pool1d(output.transpose(1,2),x.shape[1])
        pool = pool.transpose(1,2).squeeze()
        output = self.last_layer(pool)
        return output.squeeze()

## 모델 사전 학습

In [31]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 20

lstm_pool_classifier = LSTMPoolingClassifier(
    num_embeddings=len(TEXT.vocab),
    embedding_dim=100,
    hidden_size=200,
    num_layers=4,
    pad_idx=PAD_IDX,
)

if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

_ = lstm_pool_classifier.to(device)

optimizer = torch.optim.Adam(lstm_pool_classifier.parameters())
bce_loss_fn = nn.BCELoss()

for epoch in range(N_EPOCHS):
    train_loss = train(lstm_pool_classifier,
                      cola_train_iterator,
                      optimizer,
                      bce_loss_fn,
                      device)
    
    val_loss = evaluate(lstm_pool_classifier,
                       cola_valid_iterator,
                       bce_loss_fn,
                       device)
    
    print(f"Epoch: {epoch+1:02}")
    print(f"\tTrain Loss: {train_loss:.5f}")
    print(f"\tVal. Loss: {valid_loss:.5f}")

Epoch: 01
	Train Loss: 0.65498
	Val. Loss: 0.85817
Epoch: 02
	Train Loss: 0.64562
	Val. Loss: 0.85817
Epoch: 03
	Train Loss: 0.63974
	Val. Loss: 0.85817
Epoch: 04
	Train Loss: 0.62967
	Val. Loss: 0.85817
Epoch: 05
	Train Loss: 0.61860
	Val. Loss: 0.85817
Epoch: 06
	Train Loss: 0.63392
	Val. Loss: 0.85817
Epoch: 07
	Train Loss: 0.63499
	Val. Loss: 0.85817
Epoch: 08
	Train Loss: 0.61458
	Val. Loss: 0.85817
Epoch: 09
	Train Loss: 0.59801
	Val. Loss: 0.85817
Epoch: 10
	Train Loss: 0.58919
	Val. Loss: 0.85817
Epoch: 11
	Train Loss: 0.56972
	Val. Loss: 0.85817
Epoch: 12
	Train Loss: 0.56575
	Val. Loss: 0.85817
Epoch: 13
	Train Loss: 0.55819
	Val. Loss: 0.85817
Epoch: 14
	Train Loss: 0.53673
	Val. Loss: 0.85817
Epoch: 15
	Train Loss: 0.52101
	Val. Loss: 0.85817
Epoch: 16
	Train Loss: 0.50902
	Val. Loss: 0.85817
Epoch: 17
	Train Loss: 0.50033
	Val. Loss: 0.85817
Epoch: 18
	Train Loss: 0.48945
	Val. Loss: 0.85817
Epoch: 19
	Train Loss: 0.48062
	Val. Loss: 0.85817
Epoch: 20
	Train Loss: 0.46766


In [32]:
from copy import deepcopy
before_tuning_lstm_pool_classifier = deepcopy(lstm_pool_classifier)

### 모델 추가학습

In [38]:
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
N_EPOCHS = 15

for epoch in range(N_EPOCHS):
    train_loss = train(lstm_pool_classifier,
                      sat_train_iterator,
                      optimizer,
                      bce_loss_fn,
                      device)
    
    valid_loss = evaluate(lstm_pool_classifier,
                         sat_valid_iterator,
                         bce_loss_fn,
                         device)
    
    print(f"Epoch: {epoch+1:02}")
    print(f"\tTrain Loss: {train_loss:.5f}")
    print(f"\tVal. Loss: {valid_loss:.5f}")

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper__index_select)

### 성능비교

In [36]:
_ = before_tuning_lstm_pool_classifier.cpu()
_ = lstm_pool_classifier.cpu()

pool_sat_test_auroc = test(before_tuning_lstm_pool_classifier, sat_test_iterator, "cpu")
pool_tuned_test_auroc = test(lstm_pool_classifier, sat_test_iterator, "cpu")

print(f"Before fine-tuning SAT Dataset Test AUROC: {pool_sat_test_auroc:.5f}")
print(f"After fine-tuning SAT Dataset Test AUROC: {pool_tuned_test_auroc:.5f}")

Before fine-tuning SAT Dataset Test AUROC: 0.53846
After fine-tuning SAT Dataset Test AUROC: 0.80769


### 모델 저장

In [56]:
with open("../model/advanced_before_tuning_model.dill", "wb") as f:
    model = {"TEXT" : TEXT,
            "LABEL" : LABEL,
            "classifier" : before_tuning_lstm_pool_classifier}
    dill.dump(model, f)

with open("../model/advanced_after_tuning_model.dill", "wb") as f:
    model = {"TEXT" : TEXT,
            "LABEL" : LABEL,
            "classifier" : lstm_pool_classifier}
    dill.dump(model, f)

## 데모

### 성능비교 함수 정의

In [57]:
def test(model_path):
    with open(model_path, "rb") as f:
        model = dill.load(f)
        
    sat_test_data = TabularDataset(path=f"{DATA_PATH}sat_test.tsv",
                                  format = "tsv",
                                  fields=[("text", model["TEXT"]),
                                         ("label", model["LABEL"])],
                                  skip_header=1)
    
    sat_test_iterator = BucketIterator(sat_test_data,
                                      batch_size=8,
                                      device=None,
                                      sort=False,
                                      shuffle=False)
    
    classifier = model["classifier"].cpu()
    
    with torch.no_grad():
        y_real = []
        y_pred = []
        classifier.eval()
        for batch in sat_test_iterator:
            text = batch.text
            label = batch.label.type(torch.FloatTensor)
            
            output = classifier(text).flatten().cpu()
            
            y_real += [label]
            y_pred += [output]
            
        y_real = torch.cat(y_real)
        y_pred = torch.cat(y_pred)
    
    fpr, tpr, _ = roc_curve(y_real, y_pred)
    auroc = auc(fpr, tpr)
    
    return auroc.round(5)

### 성능 비교

In [58]:
model_list = ["baseline_model.dill",
             "before_tuning_model.dill",
             "after_tuning_model.dill",
             "advanced_before_tuning_model.dill",
             "advanced_after_tuning_model.dill"]

test_auroc = []
for file_name in model_list:
    model_name = file_name.replace(".dill","")
    auroc = test("../model/" + file_name)
    test_auroc += [(model_name,auroc)]
    
test_auroc = sorted(test_auroc, key=lambda x:x[1], reverse=True)
for rank, (model_name, auroc) in enumerate(test_auroc):
    print(f"Rank {rank+1} - {model_name:30} - Test AUROC: {auroc:.5f}")

Rank 1 - baseline_model                 - Test AUROC: 0.84615
Rank 2 - advanced_after_tuning_model    - Test AUROC: 0.80769
Rank 3 - advanced_before_tuning_model   - Test AUROC: 0.53846
Rank 4 - after_tuning_model             - Test AUROC: 0.46154
Rank 5 - before_tuning_model            - Test AUROC: 0.38462
