#PyTorch
#神經網路翻譯系統v2
#序列對序列(Seq2Seq)
#By Andrew Huang

#安裝Spacy Library

In [None]:
# Install spaCy, then download the small English and German models that the
# import cell below loads as en_core_web_sm / de_core_news_sm.
!pip install spacy
!python -m spacy download en_core_web_sm
!python -m spacy download de_core_news_sm

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.4 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
Collecting de_core_news_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.2.5/de_core_news_sm-2.2.5.tar.gz (14.9 MB)
[K     |████████████████████████████████| 14.9 MB 7.1 MB/s 
Building wheels for collected packages: de-core-news-sm
  Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for de-core-news-sm: filename=de_core_news_sm-2.2.5-py3-none-any.whl size=14907055 sha256=1d9fce7c47bf6f993ca0e8bbdf2dde9a2c5d374cdf0887c00f6d0efd9c2115a1
  Stored in directory: /tmp/pip-ephem-wheel-cache-0ne5rtkh/wheels/00/66/69/cb6c921610087d2cab339062345098e30a5ceb665360e7b32a
Successfu

#匯入必要的程式庫

In [None]:
import torch
import torchtext
import numpy
import en_core_web_sm
import de_core_news_sm
from google.colab import drive

# 掛載雲端硬碟

In [None]:
# Mount Google Drive at /content/drive; the dataset root and the checkpoint
# path used by later cells are located under /content/drive/MyDrive.
drive.mount('/content/drive')

Mounted at /content/drive


#處理資料

In [None]:
# Load the spaCy English and German pipelines once at module level; the
# token_pair_src / token_pair_dst helpers below use their tokenizers.
en_nlp = en_core_web_sm.load()
de_nlp = de_core_news_sm.load()

def token_pair_src(text):
  """Split source-language text into a list of token strings.

  The input is converted to ``str`` and stripped of surrounding whitespace
  before being passed to the English spaCy tokenizer (``en_nlp``).
  """
  cleaned = str(text).strip()
  pieces = []
  for tok in en_nlp.tokenizer(cleaned):
    pieces.append(tok.text)
  return pieces

def token_pair_dst(text):
  """Split target-language text into a list of token strings.

  The input is converted to ``str`` and stripped of surrounding whitespace
  before being passed to the German spaCy tokenizer (``de_nlp``).
  """
  cleaned = str(text).strip()
  pieces = []
  for tok in de_nlp.tokenizer(cleaned):
    pieces.append(tok.text)
  return pieces

def load_dataset(root="/content/drive/MyDrive/PythonProject/TranslationZero", pair_lang=(".en", ".de"), batch_size=200, seq_len = 50):
  """Load Multi30k and return batch iterators plus both vocabularies.

  Args:
    root: Directory handed to ``Multi30k.splits`` as its ``root``.
    pair_lang: ``(source_ext, target_ext)`` file extensions of the language pair.
    batch_size: Number of sentence pairs per batch.
    seq_len: Fixed length every sentence is padded/truncated to
      (``fix_length`` of both fields, ``<sos>``/``<eos>`` included).

  Returns:
    ``(train_iterator, val_iterator, test_iterator, src_vocab, dst_vocab)``.

  Notes:
    * The vocabularies are built from the training split only, so tokens that
      occur solely in validation/test data map to ``<unk>`` instead of leaking
      held-out data into the model's vocabulary. Checkpoints trained with a
      vocabulary built from all three splits will not match these sizes.
    * Only the training iterator is shuffled; validation and test batches are
      yielded in dataset order so evaluation runs are reproducible.
  """

  def make_field(tokenizer):
    # Source and target fields differ only in their tokenizer.
    return torchtext.legacy.data.Field(
        unk_token="<unk>",
        pad_token="<pad>",
        eos_token="<eos>",
        init_token="<sos>",
        tokenize=tokenizer,
        sequential=True,
        use_vocab=True,
        fix_length=seq_len)

  def make_iterator(dataset, shuffle):
    return torchtext.legacy.data.Iterator(
        dataset=dataset,
        batch_size=batch_size,
        shuffle=shuffle)

  TEXT_SRC = make_field(token_pair_src)
  TEXT_DST = make_field(token_pair_dst)

  train_data, val_data, test_data = torchtext.legacy.datasets.Multi30k.splits(
      exts=pair_lang, fields=(TEXT_SRC, TEXT_DST), root=root)

  # Training split only: see the docstring note on data leakage.
  TEXT_SRC.build_vocab(train_data, max_size=30000)
  TEXT_DST.build_vocab(train_data, max_size=30000)

  train_iterator = make_iterator(train_data, shuffle=True)
  val_iterator = make_iterator(val_data, shuffle=False)
  test_iterator = make_iterator(test_data, shuffle=False)
  return train_iterator, val_iterator, test_iterator, TEXT_SRC.vocab, TEXT_DST.vocab

# Hyperparameters shared by the model, training and inference cells below.
batch_size = 200  # sentence pairs per batch, passed to load_dataset
seq_len = 30  # fixed padded sentence length, passed to load_dataset
hidden_size = 256  # embedding / transformer width passed to Seq2Seq
num_layer = 2  # number of stacked layers in both encoder and decoder
epoch = 10  # number of passes over the training iterator
# Build the iterators and vocabularies once; the vocabulary sizes determine
# the embedding and output-layer dimensions of the model.
train_set, val_set, test_set, src_vocab, dst_vocab = load_dataset(batch_size=batch_size, seq_len=seq_len)
src_vocab_len = len(src_vocab)
dst_vocab_len = len(dst_vocab)

#建置EncoderNetwork Model

In [None]:
class EncoderNet(torch.nn.Module):
  """Token embedding followed by a stack of Transformer encoder layers.

  Args:
    vocab_size: Number of rows in the embedding table.
    hidden_size: Embedding width and Transformer model dimension
      (the layers are built with ``nhead=8``).
    num_layer: Number of stacked encoder layers.
  """

  def __init__(self, vocab_size, hidden_size, num_layer=1):
    super().__init__()
    self.embedding_layer = torch.nn.Embedding(vocab_size, hidden_size)
    single_layer = torch.nn.TransformerEncoderLayer(hidden_size, nhead=8)
    self.encoder_layer = torch.nn.TransformerEncoder(single_layer, num_layers=num_layer)

  def forward(self, inputs):
    """Embed the token indices and run them through the encoder stack."""
    embedded = self.embedding_layer(inputs)
    return self.encoder_layer(embedded)

#建置DecoderNetwork Model

In [None]:
class DecoderNet(torch.nn.Module):
  """Single-step decoder: embedding, Transformer decoder stack, vocabulary projection.

  Args:
    vocab_size: Size of the embedding table and of the output projection.
    hidden_size: Embedding width and Transformer model dimension
      (the layers are built with ``nhead=8``).
    num_layer: Number of stacked decoder layers.
  """

  def __init__(self, vocab_size, hidden_size, num_layer=1):
    super().__init__()
    self.embedding_layer = torch.nn.Embedding(vocab_size, hidden_size)
    single_layer = torch.nn.TransformerDecoderLayer(hidden_size, nhead=8)
    self.decoder_layer = torch.nn.TransformerDecoder(single_layer, num_layers=num_layer)
    self.fc_layer = torch.nn.Linear(hidden_size, vocab_size)

  def forward(self, inputs, memory):
    """Decode one step.

    A leading axis of length 1 is added to ``inputs`` before embedding and is
    removed again from the projected result, so the return value has one
    score vector over the vocabulary per element of ``inputs``.
    """
    step_tokens = inputs.unsqueeze(0)
    embedded = self.embedding_layer(step_tokens)
    decoded = self.decoder_layer(embedded, memory)
    scores = self.fc_layer(decoded)
    return scores.squeeze(0)

#建置Seq2Seq Model

In [None]:
class Seq2Seq(torch.nn.Module):
  """Encoder-decoder translation model built from EncoderNet and DecoderNet.

  At decoding step ``i`` the decoder receives the previous target token and,
  as memory, the single encoder output at position ``i - 1``.

  Args:
    src_vocab_size: Source vocabulary size (encoder embedding rows).
    dst_vocab_size: Target vocabulary size (decoder embedding rows / output width).
    hidden_size: Model width shared by encoder and decoder.
    num_layer: Number of stacked layers in both encoder and decoder.
  """

  def __init__(self, src_vocab_size, dst_vocab_size, hidden_size, num_layer=1):
    super(Seq2Seq, self).__init__()
    self.encoder = EncoderNet(src_vocab_size, hidden_size, num_layer)
    self.decoder = DecoderNet(dst_vocab_size, hidden_size, num_layer)

  def forward(self, src_inputs, dst_inputs):
    """Teacher-forced pass over a batch.

    Args:
      src_inputs: Source token indices, indexed as ``[position, batch]``.
      dst_inputs: Target token indices, indexed as ``[position, batch]``.
        The source must provide at least ``len(dst_inputs) - 1`` positions.

    Returns:
      Tensor of shape ``(len(dst_inputs), batch, target_vocab)``. Row 0 is
      left as zeros; row ``i`` holds the scores predicted from
      ``dst_inputs[i - 1]``.
    """
    enc_outputs = self.encoder(src_inputs)
    # Take every size from the actual tensors/layers instead of notebook
    # globals, so partial batches and other sequence lengths work.
    steps, batch = dst_inputs.shape[0], dst_inputs.shape[1]
    vocab_size = self.decoder.fc_layer.out_features
    outputs = torch.zeros(steps, batch, vocab_size, device=enc_outputs.device)
    for i in range(1, steps):
      outputs[i] = self.decoder(dst_inputs[i - 1], enc_outputs[i - 1].unsqueeze(0))
    return outputs

  def prediction(self, x, vocab, seq_len = 30):
    """Greedily decode a single source sentence.

    Args:
      x: Source token indices for one sentence, indexed as ``[position, 0]``.
      vocab: Target vocabulary exposing ``stoi`` and ``itos``.
      seq_len: Maximum number of decoding steps; also capped by the number
        of encoder positions available.

    Returns:
      List of token strings starting with ``'<sos>'`` and ending with
      ``'<eos>'`` when the model emits it within the step budget.
    """
    result = ['<sos>']
    # Decode without dropout even when called in the middle of training,
    # then restore whatever mode the module was in.
    was_training = self.training
    self.eval()
    try:
      with torch.no_grad():
        enc_outputs = self.encoder(x)
        target = torch.tensor([vocab.stoi[result[0]]], dtype=torch.long, device=x.device)
        # Never index past the encoder output when the source is shorter than seq_len.
        for i in range(min(seq_len, enc_outputs.shape[0])):
          outputs = self.decoder(target, enc_outputs[i].unsqueeze(0))
          topi = outputs.argmax(1)
          token = vocab.itos[topi.item()]
          result.append(token)
          if token == "<eos>":
            break
          target = topi
    finally:
      self.train(was_training)
    return result

#開始訓練

In [None]:
# Build the model on the GPU when one is present, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.device_count() > 0 else "cpu")
seq2seq = Seq2Seq(src_vocab_len, dst_vocab_len, hidden_size, num_layer)
seq2seq = seq2seq.to(device)
optim = torch.optim.Adam(seq2seq.parameters(), lr = 0.001)
# Padding positions carry no information; excluding them keeps the model from
# being rewarded for emitting '<pad>'.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=dst_vocab.stoi["<pad>"])

for i in range(epoch):
  seq2seq.train()
  for iters, data in enumerate(train_set, 0):
    src_inputs = data.src.long().to(device)
    dst_inputs = data.trg.long().to(device)
    outputs = seq2seq(src_inputs, dst_inputs)
    # Row 0 of `outputs` is never written by the model (decoding starts at
    # step 1), so only rows 1.. are scored against the matching target rows.
    logits = outputs[1:].reshape(-1, outputs.shape[2])
    targets = dst_inputs[1:].reshape(-1)
    loss = loss_fn(logits, targets)

    optim.zero_grad()
    loss.backward()
    optim.step()

    if iters % 10 == 0:
      print("[+] Epoch: [%d/%d] Loss: %.4f" % (i+1, epoch, loss.item()))
      # Preview a translation of the first sentence in the batch with dropout
      # switched off, then return to training mode.
      seq2seq.eval()
      print(seq2seq.prediction(src_inputs[:, 0].unsqueeze(1), dst_vocab, seq_len))
      seq2seq.train()

torch.save(seq2seq.state_dict(), "/content/drive/MyDrive/PythonProject/TranslationZero/seq2seq_modelv2.pth")

[+] Epoch: [1/10] Loss: 10.4038
['<sos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
[+] Epoch: [1/10] Loss: 3.2960
['<sos>', 'Ein', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']
[+] Epoch: [1/10] Loss: 2.8853
['<sos>', 'Ein', 'Mann', '.', '<eos>']
[+] Epoch: [1/10] Loss: 2.6617
['<sos>', 'Ein', 'Mann', 'in', 'einem', 'Mann', 'in', 'einem', 'in', 'einem', 'Mann', 'in', 'einem', 'Mann', 'in', 'einem', '<eos>']
[+] Epoch: [1/10] Loss: 2.4463
['<sos>', 'Ein', 'Mann', 'in', 'einem', 'auf', 'einem', 'auf', 'einem', 'auf', 'einem', 'roten', '.', '<eos>']
[+]

#輸出測試結果

In [None]:
# Rebuild the model and load the saved weights onto whichever device is
# available (map_location lets a GPU-saved checkpoint load on a CPU runtime).
device = torch.device("cuda" if torch.cuda.device_count() > 0 else "cpu")
model = Seq2Seq(src_vocab_len, dst_vocab_len, hidden_size, num_layer)
model = model.to(device)
model.load_state_dict(torch.load("/content/drive/MyDrive/PythonProject/TranslationZero/seq2seq_modelv2.pth", map_location=device))
model = model.eval()

for iters, data in enumerate(test_set, 0):
  src_inputs = data.src.long().to(device)
  dst_inputs = data.trg.long().to(device)
  # Show at most 10 sentences per batch; a batch may hold fewer than 10.
  for i in range(min(10, src_inputs.shape[1])):
    orgin = []
    raw = src_inputs[:, i].tolist()

    # Convert indices back to tokens, stopping after the first '<eos>'.
    for index in raw:
      orgin.append(src_vocab.itos[index])
      if src_vocab.itos[index] == "<eos>":
        break

    result = model.prediction(src_inputs[:, i].unsqueeze(1), dst_vocab, seq_len)
    print("English=")
    print(orgin)
    print("German=")
    print(result)
    print()

English=
['<sos>', 'People', 'are', 'walking', 'on', 'a', 'paved', 'slope', 'surrounded', 'by', 'Chinese', 'vendors', '.', '<eos>']
German=
['<sos>', 'Leute', 'gehen', 'auf', 'einem', 'Feld', 'mit', 'vielen', 'Bäumen', 'gesäumten', 'Weg', '.', '<eos>']

English=
['<sos>', 'Five', 'people', 'wearing', 'winter', 'jackets', 'and', 'helmets', 'stand', 'in', 'the', 'snow', ',', 'with', 'snowmobiles', 'in', 'the', 'background', '.', '<eos>']
German=
['<sos>', 'Fünf', 'Personen', 'mit', 'Rucksäcken', 'und', 'stehen', 'im', 'Hintergrund', 'stehen', 'im', 'Hintergrund', '.', '<eos>']

English=
['<sos>', 'A', 'guy', 'wearing', 'a', 'white', 'shirt', 'is', 'playing', 'a', 'white', 'guitar', '.', '<eos>']
German=
['<sos>', 'Ein', 'Mann', 'in', 'weißem', 'Hemd', 'spielt', 'Gitarre', '.', '<eos>']

English=
['<sos>', 'A', 'young', 'boy', 'in', 'a', 'soccer', 'uniform', 'crying', 'into', 'his', 'palms', '.', '<eos>']
German=
['<sos>', 'Ein', 'Junge', 'in', 'Uniform', 'wirft', 'einen', 'Baseball', 'in

# 自訂翻譯結果

In [None]:
# Translate one sentence typed by the user with the saved checkpoint.
# The model and inputs go to the GPU only when one is present, and
# map_location lets a GPU-saved checkpoint load on a CPU runtime.
device = torch.device("cuda" if torch.cuda.device_count() > 0 else "cpu")
model = Seq2Seq(src_vocab_len, dst_vocab_len, hidden_size, num_layer)
model = model.to(device)
model.load_state_dict(torch.load("/content/drive/MyDrive/PythonProject/TranslationZero/seq2seq_modelv2.pth", map_location=device))
model = model.eval()

data = str(input("Input:"))
# Tokenize exactly like the training data (spaCy, not a whitespace split) so
# punctuation becomes separate tokens, and leave room for '<sos>' and '<eos>'
# so that '<eos>' is never truncated away.
words = token_pair_src(data)[:seq_len - 2]

inputs_list = [src_vocab.stoi['<sos>']]
for word in words:
  inputs_list.append(src_vocab.stoi[word])
inputs_list.append(src_vocab.stoi['<eos>'])

# Pad to the fixed sentence length the model was trained with.
while len(inputs_list) < seq_len:
  inputs_list.append(src_vocab.stoi['<pad>'])

# Arrange the indices as [position, batch=1] before decoding.
inputs = torch.tensor([inputs_list], dtype=torch.long).permute(1, 0).to(device)
outputs = model.prediction(inputs, dst_vocab, seq_len)
print(outputs)

NameError: ignored