In [2]:
# 문장 분류 모델
from torch import nn

class SentenceClassifier(nn.Module):
    def __init__(
            self,
            n_vocab,
            hidden_dim,
            embedding_dim,
            n_layers,
            dropout=0.5,
            bidirectional=True,
            model_type='lstm'
            ):
        super().__init__()

        self.embedding=nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=embedding_dim,
            padding_idx=0
        )
        if model_type=='rnn':
            self.model=nn.RNN(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True
            )
        elif model_type=='lstm':
            self.model=nn.LSTM(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True   
            )
        
        if bidirectional:
            self.classifier=nn.Linear(hidden_dim*2,1)
        else:
            self.classifier=nn.Linear(hidden_dim,1)
        self.dropout=nn.Dropout(dropout)

    def forward(self,inputs):
        embeddings=self.embedding(inputs)
        output,_=self.model(embeddings)
        last_output=output[:,-1,:]
        last_output=self.dropout(last_output)
        logits=self.classifier(last_output)
        return logits

In [3]:
# 데이터셋 불러오기
import pandas as pd
from Korpora import Korpora

corpus=Korpora.load('nsmc')
corpus_df=pd.DataFrame(corpus.test)

train=corpus_df.sample(frac=0.9,random_state=42)
test=corpus_df.drop(train.index)

print(train.head(5).to_markdown())
print('Training Data Size :',len(train))
print('Testing Data Size :',len(test))


    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\KDP-30\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\KD

In [4]:
# 데이터 토큰화 및 단어사전 구축
from konlpy.tag import Okt
from collections import Counter

def build_vocab(corpus,n_vocab,special_tokens):
    counter=Counter()
    for tokens in corpus:
        counter.update(tokens)
    vocab=special_tokens
    for token,count in counter.most_common(n_vocab):
        vocab.append(token)
    return vocab

tokenizer=Okt()
train_tokens=[tokenizer.morphs(review) for review in train.text]
test_tokens=[tokenizer.morphs(review) for review in test.text]

vocab=build_vocab(corpus=train_tokens, n_vocab=5000,special_tokens=['<pad>','<unk>'])
token_to_id={token:idx for idx,token in enumerate(vocab)}
id_to_token={idx:token for idx,token in enumerate(vocab)}

print(vocab[:10])
print(len(vocab))

['<pad>', '<unk>', '.', '이', '영화', '의', '..', '가', '에', '...']
5002


In [5]:
# 정수 인코딩 및 패딩
import numpy as np

def pad_sequences(sequences,max_length,pad_value):
    result=list()
    for sequence in sequences:
        sequence=sequence[:max_length]
        pad_length=max_length - len(sequence)
        padded_sequence=sequence+[pad_value]*pad_length
        result.append(padded_sequence)
    return np.asarray(result)

unk_id=token_to_id['<unk>']
train_ids=[[token_to_id.get(token,unk_id) for token in review] for review in train_tokens]
test_ids=[[token_to_id.get(token,unk_id) for token in review] for review in test_tokens]

max_length=32
pad_id=token_to_id['<pad>']
train_ids=pad_sequences(train_ids,max_length,pad_id)
test_ids=pad_sequences(test_ids,max_length,pad_id)

print(train_ids[0])
print(test_ids[0])

[ 223 1716   10 4036 2095  193  755    4    2 2330 1031  220   26   13
 4839    1    1    1    2    0    0    0    0    0    0    0    0    0
    0    0    0    0]
[3307    5 1997  456    8    1 1013 3906    5    1    1   13  223   51
    3    1 4684    6    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]


In [6]:
# 데이터 로더 적용
import torch
from torch.utils.data import TensorDataset,DataLoader

train_ids=torch.tensor(train_ids)
test_ids=torch.tensor(test_ids)

train_labels=torch.tensor(train.label.values,dtype=torch.float32)
test_labels=torch.tensor(test.label.values,dtype=torch.float32)

train_dataset=TensorDataset(train_ids,train_labels)
test_dataset=TensorDataset(test_ids,test_labels)

train_loader=DataLoader(train_dataset,batch_size=16,shuffle=True)
test_loader=DataLoader(test_dataset,batch_size=16,shuffle=False)

In [7]:
# 손실함수와 최적화 함수 정의
from torch import optim

n_vocab=len(token_to_id)
hidden_dim=64
embedding_dim=128
n_layers=2

device='cuda' if torch.cuda.is_available() else 'cpu'
classifier=SentenceClassifier(n_vocab=n_vocab,hidden_dim=hidden_dim,embedding_dim=embedding_dim,n_layers=n_layers).to(device)
criterion=nn.BCEWithLogitsLoss().to(device)
optimizer=optim.RMSprop(classifier.parameters(),lr=0.001)

In [8]:
# 모델 학습 및 테스트
def train(model,datasets,criterion,optimizer,device,interval):
    model.train()
    losses=list()

    for step,(input_ids,labels) in enumerate(datasets):
        input_ids=input_ids.to(device)
        labels=labels.to(device).unsqueeze(1)

        logits=model(input_ids)
        loss=criterion(logits,labels)
        losses.append(loss.item())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if step%interval==0:
            print(f'Train Loss {step} : {np.mean(losses)}')

def test(model,datasets,criterion,device):
    model.eval()
    losses=list()
    corrects=list()

    for step,(input_ids,labels) in enumerate(datasets):
        input_ids=input_ids.to(device)
        labels=labels.to(device).unsqueeze(1)

        logits=model(input_ids)
        loss=criterion(logits,labels)
        losses.append(loss.item())
        yhat=torch.sigmoid(logits)>.5
        corrects.extend(
            torch.eq(yhat,labels).cpu().tolist()
        )

    print(f'Val Loss : {np.mean(losses)}, Val Accuracy : {np.mean(corrects)}')

epochs=5
interval=500

for epoch in range(epochs):
    train(classifier,train_loader,criterion,optimizer,device,interval)
    test(classifier,test_loader,criterion,device)


Train Loss 0 : 0.7056640386581421
Train Loss 500 : 0.693762791965774
Train Loss 1000 : 0.6826789934080202
Train Loss 1500 : 0.6739627492578724
Train Loss 2000 : 0.6652488444579475
Train Loss 2500 : 0.6575580659221526
Val Loss : 0.6006449655222055, Val Accuracy : 0.705
Train Loss 0 : 0.689236044883728
Train Loss 500 : 0.6013172161198423
Train Loss 1000 : 0.5943840187448602
Train Loss 1500 : 0.5943782634134693
Train Loss 2000 : 0.5864239292464096
Train Loss 2500 : 0.5783238524510735
Val Loss : 0.5395869973558969, Val Accuracy : 0.7342
Train Loss 0 : 0.6035038828849792
Train Loss 500 : 0.5283326465272142
Train Loss 1000 : 0.5158798949165897
Train Loss 1500 : 0.5110080776652998
Train Loss 2000 : 0.5011310465093972
Train Loss 2500 : 0.49235118276736395
Val Loss : 0.4467549383068999, Val Accuracy : 0.7932
Train Loss 0 : 0.5440927147865295
Train Loss 500 : 0.42414094986553913
Train Loss 1000 : 0.4210331855671151
Train Loss 1500 : 0.41656882060737627
Train Loss 2000 : 0.4135403377541538
Train 

In [9]:
# 학습된 모델로부터 임베딩 추출
token_to_embedding=dict()
embedding_matrix=classifier.embedding.weight.detach().cpu().numpy()

for word,emb in zip(vocab,embedding_matrix):
    token_to_embedding[word]=emb

token=vocab[1000]
print(token,token_to_embedding[token])

보고싶다 [-1.8683641  -1.3740939   1.7592905   0.70835006 -0.29292408  0.24323533
 -1.2669592   1.0632653   0.42124623  1.0604889  -0.8373948  -0.7002585
  0.35543796  0.35817844  1.6392184  -1.3335398   0.51421636 -0.22712448
  0.83719736  0.59881765 -0.54472256 -0.14003327 -1.089459   -0.6348388
 -1.42152    -1.0494287  -1.1649648   1.296978    0.9583308  -0.31751117
 -0.24820495 -0.26237667 -0.37054545  0.26183638  0.5348833   0.40665674
  0.535054   -0.26458335  0.92567116 -1.615079    0.39977413  1.1322653
 -0.7304918  -0.19022855  0.8435572  -0.3632555   1.2237774   1.711149
 -0.62056255  0.89880013 -0.77069217 -0.5066573  -0.22178869 -0.53092724
  2.4233046   0.01304005 -1.1162593   0.65695375  0.01025385 -1.2377442
 -0.4059662  -1.1750962   1.316754    0.7121545   0.79570824  1.6789912
  0.17559305 -0.78017986 -0.6577893  -0.27742603 -1.7648214   0.59902346
  0.6311329  -0.3315085   0.7283992  -1.4376836   0.5626868   2.5795205
 -1.4623389  -1.097462   -0.6777411   0.4786321   0.73

In [10]:
from gensim.models import Word2Vec
tokens=[tokenizer.morphs(review) for review in corpus_df.text]
word2vec=Word2Vec(sentences=tokens, vector_size=128, window=5, min_count=1, sg=1, epochs=3, max_final_vocab=10000)

In [12]:
import os

if not os.path.exists('../models/') :
    os.makedirs('../models/')
    word2vec.save('../models/word2vec.model')

In [13]:
# 사전 학습된 모델로 임베딩 계층 초기화

init_embeddings=np.zeros((n_vocab,embedding_dim))

for index,token in id_to_token.items():
    if token not in ["<pad>",'<unk>']:
        init_embeddings[index]=word2vec.wv[token]

embedding_layer=nn.Embedding.from_pretrained(torch.tensor(init_embeddings,dtype=torch.float32))

In [14]:
class SentenceClassifier(nn.Module):
    def __init__(
            self,
            n_vocab,
            hidden_dim,
            embedding_dim,
            n_layers,
            dropout=0.5,
            bidirectional=True,
            model_type='lstm',
            pretrained_embedding=None
            ):
        super().__init__()
        
        if pretrained_embedding is not None:
            self.embedding=nn.Embedding(
                num_embeddings=n_vocab,
                embedding_dim=embedding_dim,
                padding_idx=0
            )
        else:
            self.embedding=nn.Embedding(
                num_embeddings=n_vocab,
                embedding_dim=embedding_dim,
                padding_idx=0
            )
        if model_type=='rnn':
            self.model=nn.RNN(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True
            )
        elif model_type=='lstm':
            self.model=nn.LSTM(
                input_size=embedding_dim,
                hidden_size=hidden_dim,
                num_layers=n_layers,
                bidirectional=bidirectional,
                dropout=dropout,
                batch_first=True   
            )
        
        if bidirectional:
            self.classifier=nn.Linear(hidden_dim*2,1)
        else:
            self.classifier=nn.Linear(hidden_dim,1)
        self.dropout=nn.Dropout(dropout)

    def forward(self,inputs):
        embeddings=self.embedding(inputs)
        output,_=self.model(embeddings)
        last_output=output[:,-1,:]
        last_output=self.dropout(last_output)
        logits=self.classifier(last_output)
        return logits

In [15]:
classifier=SentenceClassifier(
    n_vocab=n_vocab,hidden_dim=hidden_dim,embedding_dim=embedding_dim,
    n_layers=n_layers,pretrained_embedding=init_embeddings
).to(device)
criterion=nn.BCEWithLogitsLoss().to(device)
optimizer=optim.RMSprop(classifier.parameters(),lr=0.001)

epochs=5
interval=500

for epoch in range(epochs):
    train(classifier,train_loader,criterion,optimizer,device,interval)
    test(classifier,test_loader,criterion,device)

Train Loss 0 : 0.7030622959136963
Train Loss 500 : 0.6936484206460432
Train Loss 1000 : 0.6917418511240156
Train Loss 1500 : 0.6782668178713694
Train Loss 2000 : 0.6748790943968124
Train Loss 2500 : 0.6659648386372037
Val Loss : 0.622719238455684, Val Accuracy : 0.6744
Train Loss 0 : 0.5098632574081421
Train Loss 500 : 0.6272295207796459
Train Loss 1000 : 0.6155383265935458
Train Loss 1500 : 0.6050998080300936
Train Loss 2000 : 0.5955927654810395
Train Loss 2500 : 0.5872510355956456
Val Loss : 0.524811508175664, Val Accuracy : 0.7572
Train Loss 0 : 0.5801953673362732
Train Loss 500 : 0.5152910835074331
Train Loss 1000 : 0.5033478119841346
Train Loss 1500 : 0.49367625631148776
Train Loss 2000 : 0.4817994584528164
Train Loss 2500 : 0.47329235025366895
Val Loss : 0.42293687512318545, Val Accuracy : 0.8056
Train Loss 0 : 0.361931174993515
Train Loss 500 : 0.37341168623960425
Train Loss 1000 : 0.3790219259577674
Train Loss 1500 : 0.38388115345816226
Train Loss 2000 : 0.38472909336608985
Tra