In [1]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import json
import pickle
import random
import time
import math
import numpy as np
from konlpy.tag import Mecab;tagger=Mecab()
from collections import Counter
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence
import numpy as np
torch.manual_seed(1)

<torch._C.Generator at 0x7fc60012af00>

In [2]:
# True when a CUDA device is available; the tensor and model construction code
# in this notebook branches on this flag to decide whether to call .cuda().
USE_CUDA = torch.cuda.is_available()

In [3]:
def prepare_sequence(seq, to_ix):
    """Convert a token sequence into a 1-D LongTensor of vocabulary indices.

    Tokens that are not keys of ``to_ix`` are replaced by the index stored
    under ``"<UNK>"``. The result is wrapped in a ``Variable`` and moved to
    the GPU when ``USE_CUDA`` is true.
    """
    indices = []
    for token in seq:
        key = token if token in to_ix else "<UNK>"
        indices.append(to_ix[key])

    wrapped = Variable(torch.LongTensor(indices))
    if USE_CUDA:
        wrapped = wrapped.cuda()
    return wrapped

# Data 

* 트레이닝 : 286,817
* 벨리데이션 : 13,368
* 테스트 : 11,487
* 인풋 : 800토큰, 아웃풋 : 100토큰 (단, 아래 코드는 현재 인풋 300토큰, 아웃풋 50토큰으로 패딩/절단함)
* 인풋 : 헤드라인, 바이라인(필자 등), 본문을 스페셜 토큰으로 조인

In [4]:
# Read the raw corpus as a list of lines. The context manager closes the file
# handle as soon as the lines are read instead of leaking it.
with open('insight_social.txt', 'r', encoding='utf-8') as corpus_file:
    raw = corpus_file.readlines()

In [5]:
# Group the corpus lines into records: a blank line ("\n") ends a record.
data = []
temp = []
for r in raw:
    if r != '\n':
        # Strip only the line terminator. Slicing off the last character would
        # eat real text when the final line has no trailing newline.
        temp.append(r.rstrip('\n'))
    else:
        data.append(temp)
        temp = []
# Keep the final record when the file does not end with a blank separator line.
if temp:
    data.append(temp)
    temp = []

In [6]:
# Accumulates (source_text, summary_text) pairs; the record-parsing loop appends to it.
train_set=[]

In [7]:
# Build (source_text, summary_text) pairs from the parsed records.
# A usable record carries the title on its first line, the summary on its second
# and the body on its third, each introduced by a fixed Korean label.
for i, d in enumerate(data):
    try:
        title = d[0].split("제목 : ")[1]
        summary = d[1].split("요약 : ")[1]
        body = d[2].split("본문 : ")[1].replace("\'", "")
    except IndexError:
        # The record is too short or a label is missing: skip it. Only
        # IndexError is caught so unrelated failures (and KeyboardInterrupt)
        # are no longer silently swallowed.
        continue
    # Title and body are joined with a sentinel token to form the model input.
    input_seq = "ssSEPARATEss".join([title, body])
    train_set.append((input_seq, summary))

In [8]:
# Tokenize every pair into morphemes and keep only the pairs whose source has
# fewer than 500 tokens. The source is tokenized once and reused for both the
# length check and the stored result (previously Mecab ran twice per kept pair).
# NOTE: this cell rebinds train_set from raw text to token lists, so it must
# not be re-run without re-running the cells that build the text pairs.
tokenized_pairs = []
for source_text, summary_text in train_set:
    source_tokens = tagger.morphs(source_text)
    if len(source_tokens) < 500:
        tokenized_pairs.append([source_tokens, tagger.morphs(summary_text)])
train_set = tokenized_pairs

In [9]:
from collections import Counter
def flatten(l):
    """Concatenate the items of every sub-sequence of ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [10]:
# Split the [source_tokens, summary_tokens] pairs into two parallel tuples:
# X holds the source token lists, y the summary token lists.
X,y = zip(*train_set)

In [11]:
# Source (input) and summary (output) vocabularies. Both start with the same
# four special tokens at indices 0-3; every other token gets the next free
# index in order of first appearance.
input2index = dict(zip(("<PAD>", "<SOS>", "<EOS>", "<UNK>"), range(4)))
output2index = dict(input2index)

for sentence in X:
    for token in sentence:
        input2index.setdefault(token, len(input2index))

# Reverse lookup: index -> source token.
index2input = dict((index, token) for token, index in input2index.items())

for sentence in y:
    for token in sentence:
        output2index.setdefault(token, len(output2index))

# Reverse lookup: index -> summary token.
index2output = dict((index, token) for token, index in output2index.items())

In [12]:
# Report the vocabulary sizes: source vocabulary first, then summary vocabulary.
for vocab in (input2index, output2index):
    print(len(vocab))

18007
6315


In [13]:
# Fixed-length index tensors for the sources (X_p) and the summaries (y_p);
# the padding loop appends one 1 x length tensor per example to each list.
X_p=[]
y_p=[]

In [14]:
# Bring every example to a fixed length (300 source tokens, 50 summary tokens)
# by truncating and then right-padding with "<PAD>", and store it as a
# 1 x length index tensor.
for source_tokens, summary_tokens in zip(X, y):
    temp_x = source_tokens[:300]
    temp_x = temp_x + ["<PAD>"] * (300 - len(temp_x))
    X_p.append(prepare_sequence(temp_x, input2index).view(1, -1))

    temp_y = summary_tokens[:50]
    temp_y = temp_y + ["<PAD>"] * (50 - len(temp_y))
    y_p.append(prepare_sequence(temp_y, output2index).view(1, -1))


In [15]:
# Pair each source tensor with its summary tensor; getBatch shuffles and slices this list.
train_data = list(zip(X_p,y_p))

In [16]:
import random

In [17]:
def getBatch(batch_size,train_data):
    """Shuffle ``train_data`` in place and yield consecutive full mini-batches.

    Args:
        batch_size: number of examples per batch; must be at least 1.
        train_data: list of examples. It is shuffled in place on every call.

    Yields:
        Lists of exactly ``batch_size`` examples. A trailing remainder smaller
        than ``batch_size`` is dropped, because the training loop reshapes
        tensors with a fixed batch size.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 (raised when iteration
            starts, since this is a generator).
    """
    if batch_size < 1:
        raise ValueError("batch_size must be >= 1")

    random.shuffle(train_data)

    # The last valid start index is len - batch_size. The previous
    # `while eindex < len(train_data)` test skipped the final full batch
    # whenever the data length was an exact multiple of batch_size.
    last_start = len(train_data) - batch_size
    for sindex in range(0, last_start + 1, batch_size):
        yield train_data[sindex:sindex + batch_size]

In [18]:
class EncoderRNN(nn.Module):
    """Bidirectional LSTM encoder over embedded source tokens."""

    def __init__(self, input_size,embedding_size, hidden_size ,n_layers=1):
        super(EncoderRNN, self).__init__()

        # Keep the hyper-parameters around for init_hidden and for inspection.
        self.input_size = input_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.lstm = nn.LSTM(
            embedding_size,
            hidden_size,
            n_layers,
            batch_first=True,
            bidirectional=True,
        )

    def init_hidden(self,input):
        """Return a random (hidden, cell) state pair sized for the batch in ``input``.

        Each state has shape (n_layers * 2, B, hidden_size); the factor 2
        accounts for the two LSTM directions.
        """
        state_shape = (self.n_layers * 2, input.size(0), self.hidden_size)
        states = []
        for _ in range(2):  # first the hidden state, then the cell state
            state = Variable(torch.randn(*state_shape))
            states.append(state.cuda() if USE_CUDA else state)
        return tuple(states)

    def forward(self, input,input_masking):
        """
        input : B,T (LongTensor)
        input_masking : B,T (ByteTensor mask, non-zero at <PAD> positions)

        Returns the per-step LSTM outputs together with a context built from
        the output at the last non-<PAD> step of every sequence (B,1,D, where
        D is the LSTM output feature size).
        """
        self.hidden = self.init_hidden(input)
        output, self.hidden = self.lstm(self.embedding(input), self.hidden)

        last_real_outputs = []
        for row, sequence_output in enumerate(output):  # iterate over B; each item is T,D
            mask_values = input_masking[row].data.tolist()
            # Number of unmasked (zero) entries = real length of the sequence.
            real_length = sum(1 for flag in mask_values if flag == 0)
            last_real_outputs.append(sequence_output[real_length - 1])

        context = torch.stack(last_real_outputs).view(input.size(0), -1).unsqueeze(1)
        return output, context

In [49]:
class DecoderRNN(nn.Module):
    """LSTM decoder with intra-temporal encoder attention and intra-decoder attention.

    The attention scheme appears to follow Paulus et al. (2017), "A Deep
    Reinforced Model for Abstractive Summarization":

    * encoder attention scores are exponentiated and, from the second decoding
      step on, divided by the sum of the scores the same encoder position
      received at earlier steps, then normalised over encoder positions;
    * decoder attention is a softmax over the decoder states produced so far.

    The copy/generate ``switch`` layer is created but not used yet (TODO).
    The code assumes ``n_layers == 1`` (hidden states are squeezed on dim 0).
    """

    def __init__(self,output_size,embedding_size,hidden_size,max_len=50,n_layers=1,dropout_p=0.1):
        super(DecoderRNN, self).__init__()

        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.max_len=max_len
        self.embedding_size = embedding_size
        self.attn_e = nn.Linear(self.hidden_size,self.hidden_size) # encoder attention projection
        self.attn_d = nn.Linear(self.hidden_size,self.hidden_size) # decoder attention projection
        self.sigmoid = nn.Sigmoid()
        # Exponentiated encoder attention scores (B,T) of the decoding steps
        # taken so far in the current forward pass; reset by forward().
        self.temp_attn_scores=[]

        # Define the layers
        self.embedding = nn.Embedding(self.output_size, self.embedding_size) # TODO: share with the encoder and freeze

        #self.dropout = nn.Dropout(self.dropout_p)
        self.lstm = nn.LSTM(self.embedding_size, hidden_size, n_layers, batch_first=True)

        self.switch = nn.Linear(self.hidden_size*3, 1) # generate-or-copy gate (not used yet)
        self.out = nn.Linear(self.hidden_size*3, self.output_size)

    def init_hidden(self,input):
        """Return a random (hidden, cell) state pair of shape (n_layers, B, hidden_size).

        The LSTM is unidirectional, so the leading dimension is ``n_layers``
        on every device (the CPU branch used to build ``n_layers * 2``, which
        the LSTM rejects). The states are allocated from the embedding weight,
        so they share the module's tensor type and device.

        NOTE(review): the initial state is random noise, as before; zeros or a
        projection of the encoder state may be preferable.
        """
        batch_size = input.size(0)
        like = self.embedding.weight.data
        hidden = Variable(like.new(self.n_layers, batch_size, self.hidden_size).normal_())
        context = Variable(like.new(self.n_layers, batch_size, self.hidden_size).normal_())
        return (hidden,context)

    def IntraEncAttention(self, hidden, encoder_outputs, encoder_masking):
        """Intra-temporal attention over the encoder outputs.

        hidden : 1,B,D
        encoder_outputs : B,T,D
        encoder_masking : B,T mask, non-zero at <PAD> positions

        Returns ``(enc_context, copy_attn_index)`` with shapes B,1,D and B,1.
        ``copy_attn_index`` is the encoder position with the highest attention.

        Each call appends this step's exponentiated scores to
        ``self.temp_attn_scores``; ``forward`` clears that history at the start
        of every sequence. When calling this method directly, reset the list
        between unrelated sequences.
        """
        query = hidden.squeeze(0).unsqueeze(2)  # (1,B,D) -> (B,D,1)

        encoder_outputs = encoder_outputs.contiguous()
        batch_size = encoder_outputs.size(0) # B
        encoder_len = encoder_outputs.size(1) # T
        feature_size = encoder_outputs.size(2) # D

        # Score every encoder position at once: (B,T,D) x (B,D,1) -> (B,T).
        projected = self.attn_e(encoder_outputs.view(-1, feature_size))
        projected = projected.view(batch_size, encoder_len, -1)
        scores = projected.bmm(query).squeeze(2)

        # `!= 0` yields whichever mask dtype this torch version expects.
        pad_mask = encoder_masking != 0
        # Neutralise <PAD> scores before exp() so the stored history stays
        # finite; the attention weight at <PAD> is forced to zero below.
        scores = scores.masked_fill(pad_mask, 0.0)
        exp_scores = scores.exp()

        if self.temp_attn_scores:
            # Penalise positions that were attended to at earlier steps.
            past_total = torch.stack(self.temp_attn_scores, 0).sum(0)  # B,T
            temporal_scores = exp_scores / past_total.clamp(min=1e-12)
        else:
            temporal_scores = exp_scores
        self.temp_attn_scores.append(exp_scores)

        # masked_fill is out-of-place: the result must be kept (the original
        # discarded it, so <PAD> positions were never masked).
        temporal_scores = temporal_scores.masked_fill(pad_mask, 0.0)
        normaliser = temporal_scores.sum(1, keepdim=True).clamp(min=1e-12)  # B,1
        attn = temporal_scores / normaliser  # B,T, each row sums to 1

        enc_context = attn.unsqueeze(1).bmm(encoder_outputs)  # (B,1,T) x (B,T,D) -> B,1,D
        _, copy_attn_index = torch.max(attn, 1)

        return enc_context, copy_attn_index.view(batch_size, 1)

    def IntraDecAttention(self,hidden, decoder_outputs):
        """Attention of the current decoder state over the decoder states so far.

        hidden : 1,B,D
        decoder_outputs : B,t,D

        Returns the decoder context, shape B,1,D.
        """
        query = hidden.squeeze(0).unsqueeze(2)  # (1,B,D) -> (B,D,1)

        decoder_outputs = decoder_outputs.contiguous()
        batch_size = decoder_outputs.size(0) # B
        steps = decoder_outputs.size(1) # t
        feature_size = decoder_outputs.size(2) # D

        projected = self.attn_d(decoder_outputs.view(-1, feature_size))
        projected = projected.view(batch_size, steps, -1)
        scores = projected.bmm(query).squeeze(2)  # B,t

        # exp(score) / sum(exp(score)) over the decoded steps.
        attn = F.softmax(scores, dim=1)

        return attn.unsqueeze(1).bmm(decoder_outputs)  # (B,1,t) x (B,t,D) -> B,1,D

    def forward(self, input,teacher_forcing,context,encoder_outputs,encoder_maskings,training=True):
        """Decode ``max_len`` steps and return the log-probabilities of every step.

        input : B,1 (start token)
        teacher_forcing : B,T ground-truth tokens with T >= max_len; when
            ``training`` is true, token i is fed as the input of step i+1
        context : B,1,D initial encoder context
        encoder_outputs : B,T,D
        encoder_maskings : B,T mask, non-zero at <PAD> positions
        training : use teacher forcing if true, otherwise feed back the argmax

        Returns a (B * max_len, output_size) tensor of log-probabilities whose
        rows are ordered batch-major (all steps of example 0, then example 1, ...).
        """
        # Get the embedding of the current input word
        embedded = self.embedding(input)
        self.hidden = self.init_hidden(input)
        #embedded = self.dropout(embedded)

        # Start a fresh temporal-attention history for this batch. Without the
        # reset the list grew across batches and kept old graphs alive.
        self.temp_attn_scores = []

        decode=[]

        enc_context = context
        dec_context = Variable(context.data.new(context.size()).zero_())
        decoder_outputs=[]

        for i in range(self.max_len):

            _, self.hidden = self.lstm(embedded, self.hidden)
            decoder_outputs.append(self.hidden[0])
            # [decoder state ; encoder context ; decoder context] -> 1,B,3D
            concated = torch.cat((self.hidden[0],enc_context.transpose(0,1),dec_context.transpose(0,1)),2)

            # TODO: switch = round(sigmoid(self.switch(concated))) to choose copy vs. generate
            score = self.out(concated.squeeze(0))
            softmaxed = F.log_softmax(score, dim=1)
            decode.append(softmaxed)
            _, dec_index = torch.max(softmaxed,1)

            # Encoder context (and copy index, for the future switch) for the next step
            enc_context, copy_attn_index = self.IntraEncAttention(self.hidden[0], encoder_outputs,encoder_maskings)

            # Decoder context over all decoder states produced so far
            dec_context = self.IntraDecAttention(self.hidden[0],torch.cat(decoder_outputs).transpose(0,1))

            if training:
                # Teacher forcing: feed the ground-truth token of this step
                input = teacher_forcing.transpose(0,1)[i].unsqueeze(1) # T,B [i] ==> B,1
                embedded = self.embedding(input)
            else:
                # Greedy decoding: feed back the most likely token.
                # TODO: once the switch is trained, choose between
                # copy_attn_index (copy) and dec_index (generate) per example.
                embedded = self.embedding(dec_index.unsqueeze(1))

        # Careful: the time steps are concatenated column-wise (B, max_len*V)
        # and only then reshaped to (B*max_len, V).
        scores = torch.cat(decode,1)

        return scores.view(input.size(0)*self.max_len,-1) # TODO: also return the switch output for training

In [50]:
# Hyper-parameters used by the model construction and training cells.
# Embedding width passed to both the encoder and the decoder.
EMBEDDING_SIZE = 100
# Encoder LSTM hidden size; the decoder is built with HIDDEN_SIZE*2.
HIDDEN_SIZE = 100
# Examples per mini-batch; the training loop reshapes tensors with this fixed size.
BATCH_SIZE=8
# NOTE(review): not referenced by the visible code; the padding loop hard-codes 300.
INPUT_LEN = 300
# Passed to the decoder as max_len (number of decoding steps).
OUTPUT_LEN = 50
# Learning rate of both Adam optimizers.
LEARNING_RATE = 0.001
# Number of passes of the outer training loop.
STEP_SIZE=1

In [None]:
# Build the models. The encoder LSTM is bidirectional, so the decoder is given
# a hidden size of HIDDEN_SIZE*2 and decodes OUTPUT_LEN steps.
encoder = EncoderRNN(len(input2index),EMBEDDING_SIZE,HIDDEN_SIZE)
decoder = DecoderRNN(len(output2index),EMBEDDING_SIZE,HIDDEN_SIZE*2,OUTPUT_LEN)

# Move the models to the GPU before the optimizers are created from their parameters.
if USE_CUDA:
    encoder = encoder.cuda()
    decoder = decoder.cuda()
    
# ignore_index=0 leaves targets equal to 0 (the "<PAD>" index) out of the loss.
loss_function = nn.CrossEntropyLoss(ignore_index=0)
# One Adam optimizer per module, both with the same learning rate.
enc_optim= optim.Adam(encoder.parameters(), lr=LEARNING_RATE)
dec_optim = optim.Adam(decoder.parameters(),lr=LEARNING_RATE)

In [None]:
# Teacher-forced training loop: STEP_SIZE passes over the shuffled data,
# one optimizer step per mini-batch.
for step in range(STEP_SIZE):
    losses=[]
    for i, batch in enumerate(getBatch(BATCH_SIZE,train_data)):
        # Use batch-local names so the notebook-level token tuples (X, y)
        # are not overwritten by tensors.
        batch_x, batch_y = zip(*batch)

        inputs = torch.cat(batch_x)   # B,T source indices
        targets = torch.cat(batch_y)  # B,T summary indices
        # <PAD> mask (non-zero where the source index is 0), computed in one
        # tensor operation on the same device as `inputs` instead of a
        # Python-level loop over every element.
        masks = Variable(inputs.data.eq(0)).view(BATCH_SIZE,-1)

        encoder.zero_grad()
        decoder.zero_grad()

        enc_output, context = encoder(inputs,masks)
        print("encoder go!")
        # B,1 column of <SOS> tokens that starts decoding.
        decoder_input = Variable(torch.LongTensor([[output2index['<SOS>']]*BATCH_SIZE])).transpose(1,0)
        if USE_CUDA:
            decoder_input = decoder_input.cuda()

        decoding_score = decoder(decoder_input,targets,context,enc_output,masks)
        print("decoder go!")
        loss = loss_function(decoding_score,targets.view(-1))
        # Store a plain float: keeping the loss tensor itself would keep every
        # batch's autograd graph alive and break np.mean below.
        losses.append(float(loss.data.sum()))
        loss.backward()

        # Clip gradients of both modules before the update.
        torch.nn.utils.clip_grad_norm(encoder.parameters(), 5.0)
        torch.nn.utils.clip_grad_norm(decoder.parameters(), 5.0)

        enc_optim.step()
        dec_optim.step()

        if i % 10==0:
            # Mean loss over the batches since the last report.
            print("Step",step," epoch",i," : ",np.mean(losses))
            losses=[]

encoder go!
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
decoder go!


# Problem 

* Intra-attention  forward, backward 연산량이 말도 안되게 많음... epoch 한번 당 5~10분 걸리는듯
* 혹은 잘못 짰거나 

# TODO

1. 우선 teacher forcing (MLE)부터 해서 어텐션 및 마스킹이 제대로 들어갔는지부터 검증
2. switch score의 ground Truth를 훈련시키기 위해 NER로 만드는거 고민
3. 디코딩 액션을 선택하기 위해 log-prob을 어떻게 만들지... (policy gradient) 
4. mixed objective 최적화 (RL+MLE)
5. softmax 연산의 부하를 줄이기 위한 방법 (negative sampling, sampled softmax 등)
6. 나중에 teacher forcing annealing?! 확률적으로 동작하도록... 
7. GloVe로 임베딩 매트릭스 초기화 및 고정, 인코더-디코더-프로젝션 웨이트 공유