In [1]:
import unicodedata
import string
import torch
import math
import pickle
import re
import time

# import files on google colab
# from google.colab import files
from torch.utils.data import DataLoader,Dataset

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# import files on google colab
# uploaded = files.upload()

# **Helper class**

In [5]:
class Lang:
    """
    Vocabulary of a single language.

    Keeps three views of the same vocabulary:
    word2index: token -> integer id
    index2word: integer id -> token
    word2count: token -> number of times it was added (special tokens are not counted)

    Ids 0, 1 and 2 are reserved for the special tokens 'SOS', 'EOS' and 'PAD'.
    """
    def __init__(self,name):
        reserved = ('SOS','EOS','PAD')

        self.name = name
        self.word2index = {token: position for position, token in enumerate(reserved)}
        self.index2word = dict(enumerate(reserved))
        self.word2count = {}
        # the next free id is the number of tokens registered so far
        self.n_words = len(reserved)

    def add_sentence(self,sentence):
        """Register every token of a sentence; tokens are separated by a single space."""
        for token in sentence.split(' '):
            self.add_word(token)

    def add_word(self,word):
        """Register one token: count it if it is already known, otherwise give it the next free id."""
        if word in self.word2index:
            # already in the dictionary, only the counter changes
            self.word2count[word] += 1
            return

        new_id = self.n_words
        self.word2index[word] = new_id
        self.index2word[new_id] = word
        self.word2count[word] = 1
        self.n_words = new_id + 1

In [6]:
def _normalize_english(sentence):
    """Put a space before . ! ?, turn every other run of non-letters into one space, then lowercase."""
    sentence = re.sub(r"([.!?])",r" \1",sentence)
    sentence = re.sub(r"[^a-zA-Z.!?]+", r" ", sentence)
    return sentence.lower()


def _normalize_chinese(sentence):
    """Remove spaces, then separate every remaining character by a single space and lowercase."""
    return " ".join(re.sub(r" ","",sentence)).lower()


def read_Lang(reverse=False,path="cmn-eng.txt"):
    """
    Read the tab-separated English/Chinese corpus and build both vocabularies.

    Each line of the file holds the English sentence in the first column and the
    Chinese sentence in the second; any further columns are ignored. Lines that do
    not contain both columns (for example blank lines) are skipped.

    reverse: False -> English is the input language and Chinese the output language,
             True  -> Chinese is the input language and English the output language
    path: location of the corpus file (defaults to "cmn-eng.txt" in the working directory)

    returns: (input_class, output_class, pairs) where the first two are Lang objects
             and pairs is a list of [input sentence, output sentence] with tokens
             separated by single spaces
    """
    # the context manager closes the file even if reading fails half way
    with open(path,encoding='utf-8') as f:
        rows = [line.rstrip().split('\t') for line in f]
    rows = [row[0:2] for row in rows if len(row) >= 2]

    english = [_normalize_english(row[0]) for row in rows]
    chinese = [_normalize_chinese(row[1]) for row in rows]

    # every lower case latin letter is added to the Chinese dictionary;
    # this extra "sentence" only feeds the vocabulary and is not part of pairs
    alph = " ".join(string.ascii_lowercase)
    chinese_vocab = chinese + [alph]

    if reverse:
        pairs = [[c,e] for c,e in zip(chinese,english)]
        input_class,output_class = Lang('cmn'),Lang('eng')
        input_,output_ = chinese_vocab,english
    else:
        pairs = [[e,c] for e,c in zip(english,chinese)]
        input_class,output_class = Lang('eng'),Lang('cmn')
        input_,output_ = english,chinese_vocab

    for sentence in input_:
        input_class.add_sentence(sentence)

    for sentence in output_:
        output_class.add_sentence(sentence)

    return input_class,output_class,pairs

In [7]:
class language_loader(Dataset):
    """
    Dataset that turns [input sentence, output sentence] pairs into padded index tensors.

    pairs: sequence of [input sentence, output sentence]; tokens are separated by single spaces
    input_lang / output_lang: vocabulary objects exposing a word2index mapping
    device: device the returned tensors are moved to

    Every input is padded to in_max tokens (longest input + 'EOS') and every output
    to out_max tokens ('SOS' + longest output + 'EOS').
    """
    def __init__(self,pairs,input_lang,output_lang,device):
        self.pairs = pairs
        self.input_lang = input_lang
        self.output_lang = output_lang
        self.device = device

        # The padded length must be measured in tokens, not characters, and has to
        # leave room for the special tokens added in __getitem__; otherwise short
        # corpora overflow the padded tensor.
        self.in_max = max((len(pair[0].split(" ")) + 1 for pair in self.pairs),default=0)
        self.out_max = max((len(pair[1].split(" ")) + 2 for pair in self.pairs),default=0)

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self,idx):
        """
        returns (input_tensor, output_tensor, input_length, output_length):
        input_tensor: LongTensor of shape (in_max,), token ids followed by the input PAD id
        output_tensor: LongTensor of shape (out_max,), token ids followed by the output PAD id
        input_length / output_length: LongTensor of shape (1,), number of real tokens
                                      including the special tokens
        """
        pair = self.pairs[idx]

        input_s = pair[0].split(" ") + ['EOS']
        output_s = ['SOS'] + pair[1].split(" ") + ['EOS']

        input_length = torch.LongTensor([len(input_s)])
        output_length = torch.LongTensor([len(output_s)])

        # each side is padded with the PAD id of its own vocabulary
        src_pad_idx = self.input_lang.word2index['PAD']
        trg_pad_idx = self.output_lang.word2index['PAD']

        input_tensor = torch.full((self.in_max,),src_pad_idx,dtype=torch.long)
        output_tensor = torch.full((self.out_max,),trg_pad_idx,dtype=torch.long)

        input_ids = [self.input_lang.word2index[word] for word in input_s]
        output_ids = [self.output_lang.word2index[word] for word in output_s]

        input_tensor[:len(input_ids)] = torch.tensor(input_ids,dtype=torch.long)
        output_tensor[:len(output_ids)] = torch.tensor(output_ids,dtype=torch.long)

        return input_tensor.to(self.device),output_tensor.to(self.device),input_length.to(self.device),output_length.to(self.device)

In [8]:
# Build both vocabularies and the sentence pairs; with the default reverse=False
# English is the input language and Chinese the output language.
input_lang,output_lang,pairs = read_Lang()
# Wrap the pairs in the Dataset defined above ...
l = language_loader(pairs,input_lang,output_lang,device)
# ... and serve them in shuffled batches of 128 examples.
d = DataLoader(l,shuffle=True,batch_size=128)

## **Encoder RNN**

In [9]:
class EncoderRNN(torch.nn.Module):
    """
    Bidirectional GRU encoder.

    Embeds the source token ids, runs them through a bidirectional GRU as a packed
    sequence, and projects the final forward/backward hidden states to a single
    vector that can be used as the initial decoder hidden state.
    """
    def __init__(self,input_size,embedded_size,hidden_size,device,drop_p=0.1):
        """
        inputs:
        input_size: the size of the dictionary of the input language
        embedded_size: length of the embedding vector of each token
        hidden_size: hyper-parameter that represents the hidden state length of the model
        device: the device that this model is operating on: either "cpu" or "cuda"
        drop_p: dropout probability
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedded_size = embedded_size
        self.drop_p = drop_p
        self.device = device

        self.embedding = torch.nn.Embedding(self.input_size, self.embedded_size)
        self.rnn = torch.nn.GRU(self.embedded_size,self.hidden_size,bidirectional=True)

        # maps the concatenated forward/backward state (2*hidden) back to hidden_size
        self.fc = torch.nn.Linear(self.hidden_size*2,self.hidden_size)
        self.dropout = torch.nn.Dropout(p=self.drop_p)

    def forward(self,input_t,src_len):
        """
        input_t: a tensor of type Long that passes in the indices of the embedded dictionary
                 of shape (src_max_len,batch_size)
        src_len: the true length of every sequence in the batch, sorted in decreasing
                 order (pack_padded_sequence is called with its default enforce_sorted=True)

        returns:
        encoder_outputs: (src_len,batch_size,2*hidden_size), the hidden states of both
                         directions at every time step
        hidden: (batch_size,hidden_size), projection of the final forward and backward states
        """
        # embedded of shape (src_len, batch_size, embedded)
        embedded = self.dropout(self.embedding(input_t))

        # pack_padded_sequence requires the lengths to live on the CPU, even when
        # the data itself is on the GPU
        lengths = torch.as_tensor(src_len).cpu()
        packed_embedded = torch.nn.utils.rnn.pack_padded_sequence(embedded, lengths)

        # hidden is of shape (num_layers * num_directions, batch, hidden_size)
        packed_outputs, hidden = self.rnn(packed_embedded)

        # encoder_outputs = [src len, batch_size, num_direction * hidden_size]
        encoder_outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_outputs)

        # initial decoder hidden is the final hidden state of the forwards and backwards
        # encoder RNNs fed through a linear layer
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1)))

        return encoder_outputs,hidden

In [10]:
# Smoke test of the encoder on a toy batch (everything stays on the CPU here).
# in_ is read by the encoder as (src_max_len=3, batch_size=3) token ids.
in_ = torch.tensor([[1,2,5],[2,3,6],[10,4,3]])
# true lengths of the three sequences, already sorted in decreasing order
_len = torch.tensor([2,2,1])
Embed_dim = 30
# toy vocabulary size; every id in in_ has to be smaller than this
input_size = 20
hidden_size = 40
E = EncoderRNN(input_size,Embed_dim,hidden_size,drop_p=0.1,device=device)

# encoder_outputs and h are reused by the attention smoke test further down
encoder_outputs,h = E(in_,_len)

# **Attention Layer**

In [11]:
class Attention(torch.nn.Module):
    """
    Additive attention over the encoder outputs.

    Scores every source position against the current decoder hidden state and
    returns the scores normalised with a softmax over the source positions.
    """
    def __init__(self,hidden_size,device):
        """
        hidden_size: the hidden layer size
        device: the device this model is operating on
        """
        super().__init__()
        self.hidden_size = hidden_size
        self.device = device

        # convh0 is not used in forward; it is kept so the parameter set (and
        # therefore saved state_dicts) stays the same
        self.convh0 = torch.nn.Linear(self.hidden_size,self.hidden_size)
        self.attn = torch.nn.Linear((self.hidden_size* 2)+self.hidden_size, self.hidden_size)
        self.v = torch.nn.Linear(self.hidden_size, 1, bias = False)

    def forward(self, hidden, encoder_outputs,mask=None):
        """
        hidden: [batch_size, hid dim], current decoder hidden state
        encoder_outputs: [src len, batch_size, hid dim * 2]
        mask: optional [batch_size, src len]; positions equal to 0 get no attention

        returns: attention weights of shape [batch_size, src len]; every row sums to 1
        """
        src_len = encoder_outputs.shape[0]

        # repeat the hidden state for every source position: [batch_size, src len, hid dim]
        hidden =  hidden.unsqueeze(1).repeat(1, src_len, 1)

        # encoder_outputs = [batch_size, src len, hid dim * 2]
        encoder_outputs = encoder_outputs.permute(1, 0, 2)

        # energy has shape (batch_size,src len,hid dim)
        energy = torch.tanh(self.attn(torch.cat((hidden,encoder_outputs),dim=2)))

        # one score per source position: (batch_size,src len)
        attention = self.v(energy).squeeze(2)

        # `is not None` is an identity test; `mask != None` would go through the
        # tensor comparison operator instead
        if mask is not None:
            # a very negative score becomes a weight of 0 after the softmax
            attention = attention.masked_fill(mask == 0,-1e10)

        return torch.nn.functional.softmax(attention, dim=1)

In [12]:
# Smoke test of the attention layer on the toy encoder results computed above;
# no mask is passed, so every source position can receive attention.
a = Attention(hidden_size,device)
# the cell output is the (batch_size, src len) matrix of attention weights
a(h,encoder_outputs)

tensor([[0.5132, 0.4868],
        [0.4971, 0.5029],
        [0.5097, 0.4903]], grad_fn=<SoftmaxBackward>)

# **Decoder RNN**

In [13]:
class DecoderRNN(torch.nn.Module):
    """
    One-step GRU decoder with attention over the encoder outputs.

    At every call it embeds the previous target token, builds a context vector as
    the attention-weighted sum of the encoder outputs, advances the GRU by one step
    and scores every word of the output dictionary.
    """
    def __init__(self, output_size, embedded_size,hidden_size,device,drop_p=0.1):
        """
        output_size: the size of the dictionary of the output language
        embedded_size: length of the embedding vector of each token
        hidden_size: hyper-parameter that represents the hidden state length of the model
        device: handed on to the attention layer
        drop_p: dropout probability applied to the embedding
        """
        super().__init__()
        self.output_size, self.hidden_size = output_size, hidden_size
        self.embedded_size, self.drop_p = embedded_size, drop_p

        # the context vector comes from a bidirectional encoder: 2 * hidden_size
        context_size = 2 * self.hidden_size

        self.attention = Attention(self.hidden_size,device)
        self.embedding = torch.nn.Embedding(output_size,embedded_size)

        # the GRU consumes [context vector ; embedded token]
        self.rnn = torch.nn.GRU(context_size + self.embedded_size,self.hidden_size)

        # the classifier consumes [GRU output ; context vector ; embedded token]
        self.out = torch.nn.Linear(self.hidden_size + context_size + self.embedded_size,self.output_size)
        self.dropout = torch.nn.Dropout(self.drop_p)

    def forward(self,input_vec,hidden,encoder_outputs,mask=None):
        """
        input_vec: Long tensor holding the index of the previous word, shape (1,batch_size)
        hidden: decoder hidden state, shape (batch_size,hid dim)
        encoder_outputs: shape (src_len,batch_size,hid dim*2), from the bidirectional encoder
        mask: optional (batch_size,src_len) mask handed to the attention layer

        returns (scores, hidden, weights):
        scores: (batch_size,output_size) unnormalised scores over the output dictionary
        hidden: (batch_size,hid dim) updated decoder hidden state
        weights: (batch_size,1,src_len) attention weights used for this step
        """
        # both dimensions are read up front, so an input that is not 2-D fails right here
        n_batch = input_vec.shape[1]
        n_steps = input_vec.shape[0]

        # (1,batch_size,embedded size)
        token_emb = self.dropout(self.embedding(input_vec))

        # (batch_size,src_len) -> (batch_size,1,src_len)
        weights = self.attention(hidden,encoder_outputs,mask=mask).unsqueeze(1)

        # weighted sum of the encoder states, batch first: (batch_size,1,hid dim*2)
        batch_first_states = encoder_outputs.transpose(0,1)
        context = weights.bmm(batch_first_states)

        # back to time first: (1,batch_size,hid dim*2)
        context = context.transpose(0,1)

        # a single GRU step on [embedded token ; context]
        step_input = torch.cat([token_emb,context],dim=-1)
        step_out,new_hidden = self.rnn(step_input,hidden.unsqueeze(0))

        # drop the length-1 time dimension before the classifier
        features = torch.cat(
            [step_out.squeeze(0),context.squeeze(0),token_emb.squeeze(0)],
            dim=1,
        )
        scores = self.out(features)

        return scores, new_hidden.squeeze(0),weights

In [14]:
# Smoke test: build a decoder with a toy dictionary of 100 output words and
# 50-dimensional embeddings; hidden_size is reused from the encoder smoke test above.
output_size=100
embedded_size=50
decoder = DecoderRNN(output_size,embedded_size,hidden_size,device)

# **NMT**

In [15]:
class NMT(torch.nn.Module):
    """
    Sequence-to-sequence translation model: bidirectional GRU encoder plus an
    attention decoder. Calling the model on a batch returns the training loss.
    """
    def __init__(self,hidden_size,embedded_size,input_lang,output_lang,device,drop_p=0.1):
        """
        hidden_size: size of the hidden layer
        embedded_size: size of the embedding
        input_lang: input language object
        output_lang: output language object
        device: the device this model is operating on
        drop_p: the drop out probability of the drop out layer
        """
        super().__init__()
        self.input_lang = input_lang
        self.output_lang = output_lang
        self.hidden_size = hidden_size
        self.embedded_size = embedded_size
        self.drop_p = drop_p

        self.input_size = self.input_lang.n_words
        self.output_size = self.output_lang.n_words

        self.SRC_PAD_IDX = self.input_lang.word2index['PAD']
        self.TRG_PAD_IDX = self.output_lang.word2index['PAD']

        self.encoder = EncoderRNN(self.input_size,self.embedded_size,self.hidden_size,device=device,drop_p=self.drop_p)
        self.decoder = DecoderRNN(self.output_size,self.embedded_size,self.hidden_size,device=device,drop_p=self.drop_p)

        # padded target positions do not contribute to the loss
        self.loss = torch.nn.CrossEntropyLoss(ignore_index=self.TRG_PAD_IDX)

        self.device = device

    def mask(self,input_t):
        """
        input_t: (src len, batch_size) token ids
        returns: boolean tensor of shape (batch_size, src len), True where the token is not PAD
        """
        return (input_t != self.SRC_PAD_IDX).permute(1,0)

    def forward(self,data,teacher_enforce=0.5):
        """
        data contains the following info:
        input_t: of shape (batch_size,src len)
        output_t: of shape (batch_size,trg len)
        src_len: of shape (batch_size, 1) that contains the length of all the src sentences
        trg_len: of shape (batch_size, 1) that contains the length of all the trg sentences

        teacher_enforce: probability, drawn once per decoding step, of feeding the
                         ground-truth token instead of the model's own prediction

        returns: the cross-entropy loss over all target positions after 'SOS'
        """
        input_t,output_t,src_len,trg_len = data
        src_len = src_len.flatten().long()

        # the encoder packs the batch, which needs the sequences sorted by decreasing length
        src_idx = torch.argsort(src_len,descending=True)

        # longest input / output of this batch, as plain Python ints so they can be
        # used as sizes and range bounds
        max_in = int(src_len.max())
        max_out = int(trg_len.max())

        # time first, cut to the longest sequence of the batch, reordered like src_len:
        # (src len, batch_size) and (trg len, batch_size)
        input_t = input_t.permute(1,0)[:max_in,:][:,src_idx]
        output_t = output_t.permute(1,0)[:max_out,:][:,src_idx]
        batch_size = input_t.shape[1]

        src_len = src_len[src_idx]
        src_mask = self.mask(input_t)

        #### Encoder part ####
        # encoder outputs of shape (src_len,batch_size,hid dim*2), hidden of shape (batch_size,hid dim)
        encoder_outputs,encoder_hidden = self.encoder(input_t,src_len)

        #### Decoder part #####
        # the decoder starts from the projected encoder state and the 'SOS' row (1,batch_size)
        decoder_hidden = encoder_hidden
        decoder_input = output_t[0].unsqueeze(0)

        # allocated directly on the target device; row 0 ('SOS') is never filled
        outputs = torch.zeros(max_out, batch_size, self.output_size, device=self.device)

        for k in range(1,max_out):
            # output (batch_size,output_dim), decoder_hidden (batch_size,hid dim)
            output,decoder_hidden,_ = self.decoder(decoder_input,decoder_hidden,encoder_outputs,mask=src_mask)

            outputs[k] = output

            top1 = torch.argmax(output,dim=1).unsqueeze(0)

            # check whether or not to use teacher enforced learning for the next step
            use_teacher = torch.rand(1).item() < teacher_enforce
            decoder_input = output_t[k].unsqueeze(0) if use_teacher else top1

        # reshape (not view): the tensors come from permute/advanced indexing and
        # are not guaranteed to be contiguous
        predictions = outputs[1:].reshape(-1, self.output_size)
        targets = output_t[1:].reshape(-1)
        loss = self.loss(predictions,targets)

        return loss

# **Training**


In [16]:
def train(model,trainloader,optimizer,teacher_enforce=0.5,clip=1):
    """
    Run one training epoch.

    model: module whose call model(data, teacher_enforce=...) returns the loss of a batch
    trainloader: sized iterable of batches
    optimizer: optimizer over the model parameters
    teacher_enforce: teacher forcing probability handed to the model
    clip: maximum gradient norm; gradients are rescaled to this norm before every step

    returns: the loss averaged over the batches of trainloader
    """
    epoch_loss = 0

    model.train()
    for data in trainloader:
        # gradients accumulate by default, so clear the ones of the previous batch
        optimizer.zero_grad()

        # forward pass
        loss = model(data,teacher_enforce=teacher_enforce)

        # call backward on loss
        loss.backward()

        # keep the update bounded when the gradients explode
        torch.nn.utils.clip_grad_norm_(model.parameters(),clip)

        # perform gradient descent
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss/len(trainloader)

    return avg_loss

# **Evaluation**


In [17]:
def evaluation(model,testloader):
    """
    Average loss of the model over testloader.

    The model is switched to evaluation mode, no gradients are recorded and teacher
    forcing is turned off (teacher_enforce=0).
    """
    model.eval()

    total_loss = 0
    with torch.no_grad():
        for batch in testloader:
            batch_loss = model(batch,teacher_enforce=0)
            total_loss += batch_loss.item()

    return total_loss/len(testloader)

# **Translate**

In [18]:
def translate(model,input_,device,maxIter=100):
    """
    Greedily translate one English sentence with a trained model.

    model: object exposing input_lang, output_lang, encoder, decoder and eval()
    input_: the raw English sentence
    device: device the index tensors are created on
    maxIter: bound on the number of decoding steps; "Not converged" is printed when
             decoding stops because of this bound instead of an 'EOS' token

    returns (word, attention):
    word: the translation; tokens are joined with spaces when the output language is
          'eng' and without separator otherwise
    attention: list with the flattened attention weights of every decoding step

    raises KeyError when the sentence contains a token that is not in the input dictionary
    """
    # same normalisation as the training data: lower case, space before . ! ?,
    # everything that is not a letter or . ! ? becomes a single space
    input_ = input_.lower()
    input_ = re.sub(r"([.!?])",r" \1",input_)
    input_ =  re.sub(r"[^a-zA-Z.!?]+", r" ",input_)

    input_s = input_.split(" ") + ['EOS']

    # column vector of token ids, shape (src len, 1): a batch with a single sentence
    token_ids = [model.input_lang.word2index[token] for token in input_s]
    input_t = torch.tensor(token_ids,dtype=torch.long,device=device).view(-1,1)

    with torch.no_grad():
        model.eval()

        # the lengths stay on the CPU: pack_padded_sequence (used by the encoder)
        # does not accept lengths that live on the GPU
        src_len = torch.LongTensor([len(input_s)])
        encoder_outputs,encoder_hidden = model.encoder(input_t,src_len)

        # the decoder starts from the encoder summary
        decoder_hidden = encoder_hidden

        # the first word passed into the decoder network is the SOS
        sos_idx = model.output_lang.word2index['SOS']
        decoder_input = torch.tensor([[sos_idx]],dtype=torch.long,device=device)

        word = []
        attention = []
        iter_ = 0

        while True:
            if iter_ > maxIter:
                print("Not converged")
                break
            output,decoder_hidden,att = model.decoder(decoder_input,decoder_hidden,encoder_outputs)

            attention.append(att.flatten())

            # find the top scoring candidate
            top1 = torch.argmax(output,dim=1)

            # find the current token corresponding to the top scoring candidate
            curr_token = model.output_lang.index2word[top1.item()]

            # the prediction becomes the next decoder input
            decoder_input = top1.view(1,1)
            if curr_token == 'EOS':
                break
            word.append(curr_token)

            iter_ +=1

        if model.output_lang.name == 'eng':
            word = " ".join(word)
        else:
            word = "".join(word)
    return word,attention

# **Training Process**

In [19]:
from sklearn.model_selection import KFold
import numpy as np
# Seed of the fold shuffle: with a fixed seed the 10 folds, and therefore the
# sentences every saved checkpoint was trained and tested on, are the same on every run.
KFOLD_SEED = 0
kf = KFold(n_splits=10,shuffle=True,random_state=KFOLD_SEED)

# dictionaries and sentence pairs (English input, Chinese output); the pairs are
# turned into an array so they can be indexed with the index arrays produced by kf.split
input_class,output_class,pairs = read_Lang()
pairs = np.array(pairs)

# hyper-parameters shared by every model built below
HIDDEN_SIZE = 512
EMBEDDED_SIZE = 256
INPUT_LANG = input_class
OUTPUT_LANG = output_class
DROP_P = 0.5

In [0]:
# Number of passes over the training split of every fold
EPOCH = 10
teacher_enforce = 0.5

# one list of per-epoch losses per fold
kfold_train_l = []
kfold_test_l = []

k_split_num = 1

# lowest final test loss seen so far and the fold that reached it
best_loss = float('inf')
best_split = None

for train_idx, test_idx in kf.split(pairs):
  train_pairs,test_pairs = pairs[train_idx],pairs[test_idx]

  train_data = language_loader(train_pairs,input_class,output_class,device=device)
  trainloader = DataLoader(train_data,batch_size=128,shuffle=True)

  test_data = language_loader(test_pairs,input_class,output_class,device=device)
  testloader = DataLoader(test_data,batch_size=128)

  train_loss_vec = []
  test_loss_vec = []

  # every fold starts from a freshly initialised model and optimizer
  model = NMT(HIDDEN_SIZE,EMBEDDED_SIZE,INPUT_LANG,OUTPUT_LANG,drop_p=DROP_P,device=device)
  model = model.to(device)
  optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)

  for e in range(EPOCH):
    start = time.time()

    train_loss = train(model,trainloader,optimizer,teacher_enforce=teacher_enforce)
    test_loss = evaluation(model,testloader)

    train_loss_vec.append(train_loss)
    test_loss_vec.append(test_loss)

    end = time.time()
    elapsed = end-start
    print("At {0:d} validation, iteration {1:d}, the train loss is {2:.3f} and test loss is {3:.3f},time it takes is {4:.3f}".format(k_split_num,e+1,train_loss,test_loss,elapsed))

  # Every fold writes its own checkpoint file. The old guard compared against a
  # best_loss that was never updated, so it let every fold through anyway;
  # the files are kept because they are loaded by fold number afterwards.
  torch.save(model.state_dict(), "NMT_model_{}.pt".format(k_split_num))

  # remember which fold finished with the lowest test loss
  final_test_loss = test_loss_vec[-1] if test_loss_vec else float('inf')
  if final_test_loss < best_loss:
    best_loss = final_test_loss
    best_split = k_split_num
  k_split_num += 1

  kfold_train_l.append(train_loss_vec)
  kfold_test_l.append(test_loss_vec)

print("Best fold: {} (final test loss {:.3f})".format(best_split,best_loss))

In [29]:
# Rebuild a model with the same hyper-parameters as during training so the
# parameter names and shapes match the checkpoint.
model = NMT(HIDDEN_SIZE,EMBEDDED_SIZE,INPUT_LANG,OUTPUT_LANG,drop_p=DROP_P,device=device)
model.to(device)
# Restore the weights saved for fold 2. map_location makes torch.load place the
# stored tensors on the CPU, so the file can be read on a machine without a GPU.
# NOTE(review): torch.load unpickles the file - only load checkpoints you trust.
model.load_state_dict(torch.load("NMT_model_2.pt",map_location=torch.device('cpu')))

<All keys matched successfully>

## BLEU score

In [28]:
from nltk.translate.bleu_score import sentence_bleu
def bleu_score(ref,can,n_gram=4):
    """
    Sentence level BLEU score on a 0-100 scale.

    ref: list of reference sentences, each one a list of tokens
    can: the candidate sentence as a list of tokens
    n_gram: highest n-gram order taken into account (1, 2, 3 or 4); the orders
            1..n_gram are weighted uniformly and higher orders get weight 0

    returns: sentence_bleu(ref, can, weights) * 100, or 0 (after printing
             "wrong n_gram") when n_gram is not one of 1, 2, 3, 4
    """
    max_order = 4
    if n_gram not in (1,2,3,4):
        print("wrong n_gram")
        return 0

    # uniform weights over the first n_gram orders, e.g. n_gram=1 -> (1, 0, 0, 0)
    # and n_gram=3 -> (1/3, 1/3, 1/3, 0); the weights always add up to 1
    weights = tuple(1.0/n_gram if order < n_gram else 0 for order in range(max_order))
    return sentence_bleu(ref, can, weights)*100

def ref_transform(ref):
    """
    Turn a reference sentence into a list of tokens for the BLEU score.

    The punctuation marks ， ； : 。 ？ 、 ！ are replaced by a space and the sentence
    is split on whitespace. str.split() without argument never yields empty
    strings, so no further clean-up of blank tokens is needed.
    """
    ref=re.sub(r"([，；:。？、！])",r" ",ref)
    return ref.split()

def can_transform(result):
    """
    Turn a translated sentence into a list of characters for the BLEU score.

    The punctuation marks ， ； : 。 ？ 、 ！ are replaced by a space; every character
    that is not a space then becomes one token.
    """
    without_punctuation = re.sub(r"([，；:。？、！])",r" ",result)
    return [char for char in without_punctuation if char != ' ']

In [0]:
# Average 4-gram BLEU score of the loaded model.
# NOTE: pairs is the whole corpus, so the sentences scored here include the ones
# the folds were trained on.
score=0
n_count=0
for i in range(len(pairs)):
    ref=pairs[i][1]
    ref=ref_transform(ref)
    if len(ref)>7:         # only evaluate sentences with 8 or more Chinese characters
        result=translate(model,pairs[i][0],device)[0]
        can=can_transform(result)
        temp=bleu_score([ref],can,4)
        score=score+temp
        n_count=n_count+1

# number of sentences that were scored, followed by their mean score
print(n_count)
if n_count:
    print(score/n_count)
else:
    # nothing qualified, so there is no average to report
    print("no reference sentence with 8 or more Chinese characters was found")