
## Seq2Seq

  Example : chatbot, machine translation
  
  Process : Data -> Encoder -> context vector -> Decoder -> Output
  Cell : LSTM, GRU
  Symbol : <sos> ( start of sequence ), <eos> ( end of sequence )

  Test
  -> <eos>가 다음 단어로 예측될 때까지 반복

  teacher forcing : <sos> abs -> abs <eos>가 나오게끔 훈련
  context vector : encoder의 마지막 hidden vector라고 할 수 있음

  모든 단어들에 대해서 embedding vector로 변경해서 처리한다
  


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import pandas as pd
import numpy as np
import urllib3
import os
import zipfile
import shutil

In [None]:
"""
  Download the English - French translation data
"""
# Fetch the English-French sentence-pair archive into the current working
# directory and unpack it there.
http = urllib3.PoolManager()
url = 'http://www.manythings.org/anki/fra-eng.zip'
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilename = os.path.join(path, filename)

# preload_content=False streams the body, so the archive is copied to disk
# in chunks instead of being read into memory first.
with http.request('GET', url, preload_content=False) as r:
    # Fail fast on an HTTP error: otherwise the error page would be saved as
    # the "zip" and only surface later as an opaque BadZipFile.
    if r.status != 200:
        raise RuntimeError(
            'Download of ' + url + ' failed with HTTP status ' + str(r.status)
        )
    with open(zipfilename, 'wb') as out_file:
        shutil.copyfileobj(r, out_file)

with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
    zip_ref.extractall(path)

In [None]:
# Read the tab-separated corpus; the file has no header row, so column names
# are supplied here. The third column ('lic') is discarded right away.
lines = pd.read_csv('fra.txt', sep='\t', names=['src', 'tar', 'lic'])
lines.pop('lic')
# Report the total number of samples.
print('전체 샘플의 개수 :', lines.shape[0])

전체 샘플의 개수 : 197463


In [None]:
# Keep the 'src'..'tar' columns and only the first 60,000 rows.
lines = lines.loc[:, 'src':'tar'].iloc[:60000]
# Preview ten random rows.
lines.sample(10)

Unnamed: 0,src,tar
23349,He is running now.,"Maintenant, il court."
22994,Does Tom know why?,Tom sait-il pourquoi ?
17925,Give me a hammer.,Donne-moi un marteau.
2574,Wait a bit.,Attends une seconde !
36680,I'd like some water.,Je voudrais un peu d’eau.
38232,That's not your job.,Ce n'est pas ton boulot.
49705,I have no backup plan.,Je n'ai pas de plan de rechange.
8940,Tom rushed in.,Tom s'est empressé d'entrer.
28167,You're very brave.,Vous êtes fort courageuses.
41727,He looked very tired.,Il avait l'air très fatigué.


In [None]:
"""
  <sos>와 <eos>의 삽입
"""
def _add_sequence_markers(sentence):
    # '\t' is prepended as the start-of-sequence marker and '\n' appended as
    # the end-of-sequence marker, each separated from the text by a space.
    return '\t ' + sentence + ' \n'


lines['tar'] = lines['tar'].map(_add_sequence_markers)
# Preview ten random rows to check the markers.
lines.sample(10)

Unnamed: 0,src,tar
17457,Are you studying?,\t Tu étudies ? \n
33141,We were both wrong.,\t Nous avions tout les deux tort. \n
47973,Did you all know that?,\t Est-ce que vous connaissiez tous ça ? \n
4846,I can fix it.,\t Je peux le réparer. \n
12050,Those are mine.,\t Ce sont les miens. \n
19412,I'm heading home.,\t Je me rends chez moi. \n
55617,"Good morning, everyone.",\t Bonjour tout le monde. \n
56938,I like pizza very much.,\t J'adore la pizza. \n
33787,You get used to it.,\t Tu t'y habitueras. \n
4454,Are you hurt?,\t Es-tu blessé ? \n


In [None]:
"""
  tokenize
  src, tar의 각각의 charset 구축
  ->embedding space의 크기를 알기 위함
"""
# Collect the distinct characters of every source / target sentence.
# A set is used so each character is stored only once.
src_vocab = {char for line in lines.src for char in line}
tar_vocab = {char for line in lines.tar for char in line}

# NOTE(review): the extra slot presumably reserves index 0 (e.g. for padding);
# confirm against how the indices are consumed.
src_vocab_size = 1 + len(src_vocab)
tar_vocab_size = 1 + len(tar_vocab)
print('source 문장의 char 집합 :', src_vocab_size)
print('target 문장의 char 집합 :', tar_vocab_size)

source 문장의 char 집합 : 79
target 문장의 char 집합 : 105


In [None]:
# Turn each character set into a sorted list so the ordering is deterministic.
src_vocab = sorted(src_vocab)
tar_vocab = sorted(tar_vocab)

# Show the same slice of both vocabularies as a quick sanity check.
for _vocab in (src_vocab, tar_vocab):
    print(_vocab[45:75])

['W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w']


In [None]:
# Assign every character an integer id, counting from 1 in vocabulary order.
src_to_index = {char: idx for idx, char in enumerate(src_vocab, start=1)}
tar_to_index = {char: idx for idx, char in enumerate(tar_vocab, start=1)}

In [None]:
# Integer-encode the inputs: each sentence becomes the list of its
# characters' ids, using the source / target lookup tables respectively.
encoder_input = [
    [src_to_index[symbol] for symbol in sentence]
    for sentence in lines.src
]
decoder_input = [
    [tar_to_index[symbol] for symbol in sentence]
    for sentence in lines.tar
]

In [None]:
class Seq2Seq(nn.Module):
    """LSTM encoder-decoder followed by a linear projection.

    The encoder's final hidden state serves as the context vector: it is used
    as the decoder's initial hidden state, while the decoder's initial cell
    state is all zeros.

    Args:
        input_size: Feature size of each time step fed to both LSTMs.
        output_size: Feature size produced by the final linear layer.
        hidden_size: Hidden-state size shared by the encoder and decoder.
        num_layers: Number of stacked LSTM layers in encoder and decoder.
    """

    def __init__(self, input_size, output_size, hidden_size, num_layers):
        # nn.Module must be initialised before any submodule is assigned.
        super().__init__()

        self.input_dim = input_size
        self.output_dim = output_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.encoder = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                               num_layers=num_layers, batch_first=True)

        self.decoder = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                               num_layers=num_layers, batch_first=True)

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Encode ``x``, decode from the resulting context and project.

        Args:
            x: Tensor laid out batch-first, i.e.
                ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, seq_len, output_size)``.
        """
        # Initial encoder hidden/cell state: zeros, created with the input's
        # dtype and device so the model also works on GPU or in float64.
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        h_0 = torch.zeros(state_shape, dtype=x.dtype, device=x.device)
        c_0 = torch.zeros(state_shape, dtype=x.dtype, device=x.device)

        _, (h_n, _) = self.encoder(x, (h_0, c_0))

        # The context vector is the encoder's last hidden state; the decoder's
        # cell state starts from zero.
        c_d_0 = torch.zeros_like(h_n)

        # NOTE(review): the decoder is fed the same sequence as the encoder,
        # as in the original code. Teacher forcing would need the target
        # sequence as a separate input - confirm the intended design.
        out_decoder, _ = self.decoder(x, (h_n, c_d_0))

        return self.fc(out_decoder)