In [None]:
####################
### REQUIREMENTS ###
####################

from __future__ import unicode_literals, print_function, division
from io import open
from itertools import islice # For reading only a part of the data file
from collections import OrderedDict # For defining a variable-length nn.Sequential()
from collections import defaultdict # Used in readData()
import unicodedata
import string
import re
import random
import math # For math.ceil() in readLine()

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

#from google.colab import drive
#drive.mount('/content/gdrive/')

# Device on which the batch tensors are created and to which the models are moved (CPU only).
device = torch.device("cpu")

In [None]:
#########################
### CONTROL CONSTANTS ###
#########################

DS_SIZE = 1000 # Number of sentence pairs requested from readData() (passed as N); also used for progress percentages
LOAD_DATASET_FROM_FILE = True # If False, readData() rebuilds the batch files before they are loaded
LOAD_MODEL_FROM_FILE = True # If True, trainIters() resumes from the 'bytenet.train.tar' checkpoint
torch.backends.cudnn.enabled = False # Turn cuDNN off
num_of_batch_files = 1 # Overwritten with readData()'s return value when the dataset is rebuilt
# NOTE(review): this only creates a Python variable; it does not set the CUDA_LAUNCH_BLOCKING
# environment variable, and none of the visible cells read it.
CUDA_LAUNCH_BLOCKING = 1
START_ITER = 0 # Iteration counter trainIters() starts from when no checkpoint is loaded
PATH = '.\\AML\\old\\' # Prefix prepended (by plain string concatenation) to every data/checkpoint/log file name
LOGGING = False # If True, the models and training steps print tensor sizes to LOG
LOG = open(PATH + 'log', 'w', encoding='utf-8') # Opened (and truncated) even when LOGGING is False; closed after the training call
TEACHER_FORCING = False # Selects train_TF (True) or train_no_TF (False) in trainIters()

In [None]:
##########################
### DATA-PREPROCESSING ###
##########################

# ..._BATCH_SIZE is NOT the number of sentences in a batch! It's the size of the tensor (i.e. accounts for the character sequence length):
# readData() compares (number of pairs in a bucket) * (longer padded sequence length) against these bounds.
MIN_BATCH_SIZE = 2400 # Leftover buckets below this size are dropped at the end of readData()
MAX_BATCH_SIZE = 3000 # A bucket is turned into a batch as soon as it reaches this size
BATCHES_PER_FILE = 100000 # Number of batches readData() accumulates before saving them to one file

# The special tokens are single control characters written as octal escapes ('\2' is chr(2), and so on).
SOS_token = '\2' # Written into the first position of the decoder input
EOS_token = '\3' # Appended to the end of every line by readLine()
PSC_token = '\4' # Padding Sequence Character (increase the length of the encoder input sequence by 20% using PSC tokens)
NAC_token = '\5' # Not A Character (used for padding only, doesn't carry any meaning)
UCF_token = '\7' # Unknown Character Found (replace all unknown characters with this token)

PAD_TO = 50 # The input and target (in training) sequences will be padded to the nearest multiple of PAD_TO for efficient batching.

# The vocabulary: the special tokens, the ASCII printable characters, and a few extra symbols and
# umlauted/German letters. A character's position in this string is its index (see letterToIndex()).
all_letters = SOS_token + EOS_token + UCF_token + PSC_token + NAC_token + string.printable + "£€°\u00E4\u00EF\u00F6\u00FC\u00DF\u00C4\u00CF\u00D6\u00DC\u1E9E"
n_letters = len(all_letters) # Vocabulary size: embedding input size and number of decoder output classes

# Normalise a Unicode string to the model's vocabulary (all_letters), thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Clean one line of text so that it only contains characters from all_letters.

    Steps, in order:
      1. Replace the HTML entities found in the corpus with the characters they stand for,
         undo the ' ##AT##-##AT## ' hyphen encoding and remove newlines.
      2. Decompose the text (NFD) and drop every non-spacing mark except the umlaut (U+0308).
      3. Drop an umlaut unless the character right before it is one of 'AIOUaiou'.
      4. Replace every character that is not in all_letters with UCF_token and recompose (NFC).

    s -- the input string.
    Returns the cleaned string. An empty (or newline-only) input yields an empty string;
    the previous implementation raised IndexError in that case.
    """
    html_subs = {'&quot;': '"', '&apos;': '\'', '&amp;': '&', '&#91;': '[', '&#93;': ']', '&lt;': '<',  '&gt;': '>', '&#124;': '|'}

    # The substitutions are applied in dictionary order, so '&amp;' is resolved before the
    # entities that follow it.
    for entity, replacement in html_subs.items():
        s = s.replace(entity, replacement)
    s = s.replace(' ##AT##-##AT## ', '-').replace('\n', '')

    # unicodedata.category(c) == 'Mn' <==> Unicode Non-spacing Mark. Leave the umlaut!!!
    decomposed = [c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn' or c == '\u0308']

    # Keep an umlaut only when it directly follows one of 'AIOUaiou'; the first character is
    # always kept. Iterating with enumerate() (instead of indexing element 0) handles empty input.
    filtered = [c for i, c in enumerate(decomposed)
                if i == 0 or c != '\u0308' or decomposed[i - 1] in 'AIOUaiou']

    # Insert a UCF_token in place of every character outside the vocabulary, then recombine
    # base letters with their umlauts.
    return unicodedata.normalize('NFC', ''.join(
        c if c in all_letters or c == '\u0308' else UCF_token
        for c in filtered))

def letterToIndex(letter):
    """Return the position of `letter` in all_letters; the assertion fails if it is not found."""
    index = all_letters.find(letter)
    assert index != -1
    return index

def readLine(line, type):
    """Clean one raw line and pad it to a multiple of PAD_TO.

    line -- the raw line as read from the data file.
    type -- 'enc' for an encoder (source) line; any other value is treated as a decoder line.

    The stripped line is passed through unicodeToAscii(). Encoder lines then receive
    int(0.2 * len(line)) PSC tokens (the count is taken from the raw, unstripped line).
    Every line is terminated with EOS_token and filled with NAC tokens up to the next
    multiple of PAD_TO.
    """
    text = unicodeToAscii(line.strip())
    if type == 'enc':
        text += PSC_token * int(0.2 * len(line))
    text += EOS_token

    padded_length = math.ceil(len(text) / PAD_TO) * PAD_TO
    return text + NAC_token * (padded_length - len(text))

# Turn a bucket of (source, target) string pairs into a pair of <batch_size x line_length> tensors of letter indices.
def bucketToBatch(bucket):
    """Convert a bucket of string pairs into two int64 index tensors.

    bucket -- non-empty list of (source_string, target_string) tuples. The width of each
              tensor is taken from the first pair; readData() only puts pairs of equal
              lengths into the same bucket.

    Returns (source_tensor, target_tensor) on `device`, where element [i][j] is
    letterToIndex() of the j-th character of the i-th string. Rows are filled with one
    tensor assignment per string instead of one per character.
    """
    def encode(strings):
        batch = torch.zeros(len(strings), len(strings[0]), dtype=torch.int64, device=device)
        for row, text in enumerate(strings):
            if text:
                indices = [letterToIndex(char) for char in text]
                # Positions beyond the end of a shorter string keep the initial 0.
                batch[row, :len(text)] = torch.tensor(indices, dtype=torch.int64, device=device)
        return batch

    source_tensor = encode([pair[0] for pair in bucket])
    target_tensor = encode([pair[1] for pair in bucket])

    return (source_tensor, target_tensor)

def readData(prefix, lang1, lang2, N = None):
    """Read a parallel corpus, bucket the sentence pairs by padded length and save them as batch files.

    prefix, lang1, lang2 -- the two input files are PATH + '<prefix>.<lang1>' (encoder side)
                            and PATH + '<prefix>.<lang2>' (decoder side), read in parallel.
    N                    -- read only the first N line pairs; None reads the entire files.

    Each pair is cleaned and padded with readLine() and collected in a bucket keyed by
    (padded source length, padded target length). A bucket becomes a batch (bucketToBatch())
    once its size reaches MAX_BATCH_SIZE; after the last line, leftover buckets of at least
    MIN_BATCH_SIZE are batched too and smaller ones are dropped. Batches are written with
    torch.save() to PATH + 'bucketised_batches_<k>', BATCHES_PER_FILE batches per file.

    Returns the number of batch files written.
    """
    num_of_batch_files = 0
    bucketised_batches = []
    # Sentence pairs waiting to be batched, keyed by (len(source), len(target)) after padding.
    bucketised_tensor_pairs = defaultdict(list)

    def save_batches():
        # Write the accumulated batches to the next numbered file and start a new list.
        nonlocal num_of_batch_files, bucketised_batches
        torch.save(bucketised_batches, PATH + 'bucketised_batches_%d' % num_of_batch_files)
        num_of_batch_files += 1
        bucketised_batches = []

    # Progress is reported relative to the number of lines requested; DS_SIZE is only the
    # fallback when the whole file is read (N is None).
    total = DS_SIZE if N is None else N
    lines_per_percent = total // 100

    # Both files are closed when the block is left, also if an error occurs while reading.
    with open(PATH + '%s.%s' % (prefix, lang1), encoding='utf-8') as lines1, \
         open(PATH + '%s.%s' % (prefix, lang2), encoding='utf-8') as lines2:
        for i, (line1, line2) in enumerate(islice(zip(lines1, lines2), N), 1):
            if lines_per_percent and i % (lines_per_percent * 5) == 0:
                print("%d%% of the datafile was read." % (i / lines_per_percent))
            tmp1 = readLine(line1, 'enc')
            tmp2 = readLine(line2, 'dec')
            key = (len(tmp1), len(tmp2))
            cur_bucket = bucketised_tensor_pairs[key]
            cur_bucket.append((tmp1, tmp2))
            if len(cur_bucket) * max(key) >= MAX_BATCH_SIZE:
                bucketised_batches.append(bucketToBatch(cur_bucket))
                del bucketised_tensor_pairs[key]
                if len(bucketised_batches) >= BATCHES_PER_FILE:
                    save_batches()

    # Flush the buckets that never reached MAX_BATCH_SIZE but are still big enough.
    for key, bucket in bucketised_tensor_pairs.items():
        if len(bucket) * max(key) >= MIN_BATCH_SIZE:
            bucketised_batches.append(bucketToBatch(bucket))
            if len(bucketised_batches) >= BATCHES_PER_FILE:
                save_batches()

    if bucketised_batches:
        save_batches()

    return num_of_batch_files

# Rebuild the batch files from the raw corpus unless they are to be reused from disk.
if not LOAD_DATASET_FROM_FILE:
    num_of_batch_files = readData('train', 'en','de', DS_SIZE)
# Only the first batch file is loaded. The file name is built exactly as readData() builds it
# when saving (PATH is a plain prefix, no extra separator is inserted), so the file that was
# written is the file that is read.
bucketised_batches = torch.load(PATH + 'bucketised_batches_0', map_location=device)

In [None]:
########################
### MODEL DEFINITION ###
########################

import numpy as np
from torch.autograd import Variable

# CONVOLUTION PARAMETER DEFINITIONS

# wdt = width; str = stride; pad = padding; dil = dilation
conv1x1 = {'wdt': 1, 'str': 1, 'pad': 0, 'dil': 1} # Kernel of width 1: no padding needed to keep the sequence length
conv1xk = {'wdt': 3, 'str': 1, 'pad': 1} # Keep wdt odd and pad to (wdt-1)/2 (ResBlock multiplies pad by the dilation). On changing wdt MAKE SURE TO TEST masking in ResBlock::forward!!!



# ENCODER & DECODER

class ResBlock(nn.Module):
    """Residual block: ReLU -> 1x1 conv -> ReLU -> dilated 1xk conv -> ReLU -> 1x1 conv, plus a skip connection.

    channels -- base number of channels. Encoder blocks (type == 'enc') take and return
                `channels` channels; every other type takes and returns 2 * `channels`.
    dilation -- dilation of the middle convolution. Its padding is conv1xk['pad'] * dilation,
                so the output has the same length as the input.
    type     -- 'enc' for an encoder block; for any other value the middle convolution is
                masked in forward() so that it cannot look at later positions.
    """
    def __init__(self, channels, dilation, type):
        super(ResBlock, self).__init__()
        self.type = type
        channel_factor = 2 if type != 'enc' else 1

        # Note that the size of padding in conv1xk is chosen so that the size of the output of the block is equal to the size of the input.
        # Be careful about changing the layer names! forward() looks up 'conv1xd_2' by name.
        self.resblock = nn.Sequential(OrderedDict([
            ('relu_1', nn.ReLU()),
            ('conv1x1_1', nn.Conv1d(channel_factor * channels, channels, conv1x1['wdt'], stride=conv1x1['str'], padding=conv1x1['pad'], dilation=conv1x1['dil'])),
            ('relu_2', nn.ReLU()),
            ('conv1xd_2', nn.Conv1d(channels, channels, conv1xk['wdt'], stride=conv1xk['str'], padding=conv1xk['pad']*dilation, dilation=dilation)),
            ('relu_3', nn.ReLU()),
            ('conv1x1_3', nn.Conv1d(channels, channel_factor * channels, conv1x1['wdt'], stride=conv1x1['str'], padding=conv1x1['pad'], dilation=conv1x1['dil']))
         ]))

    def forward(self, input):
        """Apply the block to `input` (batch x channels x length) and add the input back.

        For non-encoder blocks the kernel taps to the right of the centre of 'conv1xd_2' are
        zeroed before every call, i.e. the weights that would read later positions. The slice
        is zeroed in place, so it works on whatever device the weights live on.
        """
        if self.type != 'enc':
            weight = self.resblock._modules['conv1xd_2'].weight
            wdt = weight.size(2)
            weight.data[:, :, wdt // 2 + 1:] = 0

        output = self.resblock(input)

        if LOGGING:
            # Only the sizes that exist here are logged; the former step1/step2/step3 names
            # were never defined and raised NameError as soon as LOGGING was switched on.
            print('def ResBlock::forward(self, input)', file=LOG)
            print(input.size(), file=LOG)
            print(output.size(), file=LOG, end='\n')

        return output + input

class CNN(nn.Module):
    """Convolutional encoder or decoder built from an embedding and a stack of ResBlocks.

    input_dim  -- number of rows of the embedding table (vocabulary size).
    channels   -- embedding size and base channel count.
    res_sets   -- number of sets of residual blocks.
    res_blocks -- residual blocks per set; block b of every set uses dilation 2 ** b.
    type       -- 'enc' builds an encoder that returns `channels` feature channels per
                  position; any other value builds a decoder that takes 2 * `channels`
                  channels (target embedding + encoder output) and returns log-probabilities
                  over n_letters classes per position.
    """
    def __init__(self, input_dim, channels, res_sets, res_blocks, type):
        super(CNN, self).__init__()
        channel_factor = 2 if type != 'enc' else 1
        self.type = type
        self.embed = nn.Embedding(input_dim, channels)
        self.channels = channels

        layers = OrderedDict()
        for res_set in range(res_sets):
            for res_block in range(res_blocks):
                layers['res_set_' + str(res_set) + '|res_block_' + str(res_block)] = ResBlock(channels, 2 ** res_block, type)

        layers['fin|conv1x1'] = nn.Conv1d(channel_factor * channels, channels, conv1x1['wdt'], stride=conv1x1['str'], padding=conv1x1['pad'], dilation=conv1x1['dil'])
        layers['fin|ReLU'] = nn.ReLU()
        if type != 'enc':
            layers['fin|conv1xd'] = nn.Conv1d(channels, n_letters, conv1xk['wdt'], stride=conv1xk['str'], padding=conv1xk['pad'])
            layers['fin|logsoftmax'] = nn.LogSoftmax(dim=1)
        self.CNN = nn.Sequential(layers)

    def forward(self, source=None, target=None, encoder_output=None):
        """Run the network.

        Encoder (type == 'enc'): `source` is a (batch x length) tensor of letter indices;
        returns a (batch x channels x length) tensor.

        Decoder: `target` is a (batch x length) tensor of letter indices and `encoder_output`
        a (batch x channels x encoder_length) tensor. The target embedding fills the first
        `channels` input channels; the encoder output fills the remaining ones for the
        positions both sequences have, and the rest stays zero. Returns
        (batch x n_letters x length) log-probabilities.
        """
        if self.type == 'enc':
            if LOGGING:
                print('def CNN::forward(self, source, target, encoder_output) self.type == enc', file=LOG)
                print('source', file=LOG)
                print(source.size(), file=LOG, end='\n')

            emb = torch.transpose(self.embed(source), 1, 2)
        else:
            if LOGGING:
                print('def CNN::forward(self, source, target, encoder_output) self.type == dec', file=LOG)
                print('target', file=LOG)
                print(target.size(), file=LOG)
                print('encoder_output', file=LOG)
                print(encoder_output.size(), file=LOG)

            target_emb = torch.transpose(self.embed(target), 1, 2)

            if LOGGING:
                print('embedding of the target', file=LOG)
                print(target_emb.size(), file=LOG, end='\n')

            # Allocate the stacked input next to the embedding (same dtype and device) and
            # copy the overlapping part of the encoder output with a single slice assignment.
            emb = target_emb.new_zeros(target_emb.size(0), 2 * target_emb.size(1), target_emb.size(2))
            emb[:, :self.channels, :] = target_emb
            overlap = min(target_emb.size(2), encoder_output.size(2))
            emb[:, self.channels:2 * self.channels, :overlap] = encoder_output[:, :, :overlap]

            # Zero the kernel taps to the right of the centre of the last convolution, i.e.
            # the weights that would read later positions.
            weight = self.CNN._modules['fin|conv1xd'].weight
            wdt = weight.size(2)
            weight.data[:, :, wdt // 2 + 1:] = 0
        return self.CNN(emb)

In [None]:
################
### TRAINING ###
################

# TIMING [COPY-PASTED FROM THE PYTORCH TUTORIAL]

import time
import math

def asMinutes(s):
    """Format a duration given in seconds as '<minutes>m <seconds>s'."""
    minutes = math.floor(s / 60)
    seconds = s - minutes * 60
    return '%dm %ds' % (minutes, seconds)

def timeSince(since, percent):
    """Return '<elapsed> (- <remaining>)' for a task started at `since`.

    since   -- start time as returned by time.time().
    percent -- fraction of the work completed so far; the remaining time is extrapolated
               as elapsed / percent - elapsed.
    """
    elapsed = time.time() - since
    remaining = elapsed / percent - elapsed
    return '%s (- %s)' % (asMinutes(elapsed), asMinutes(remaining))

# TRAIN

EMBED_DIM = 20 # Passed to CNN() as `channels`: the embedding size and base channel count of encoder and decoder

# Constants from the paper.

ADAM_LEARNING_RATE = 0.0003 # Learning rate of both Adam optimizers created in trainIters()
RES_SETS = 6        # Number of sets of residual blocks
RES_BLOCKS = 5      # Number of residual blocks per set

# Define 1 GD step. W/o Teacher Forcing
def train_no_TF(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """Perform one gradient step without teacher forcing.

    The decoder input starts with SOS and is extended one character at a time with the
    decoder's own greedy prediction. At step k the decoder sees the first k characters of
    that sequence and the first k positions of the encoder output; its prediction at
    position k-1 is scored against target_tensor[:, k-1]. The per-character losses are
    back-propagated one by one (gradients accumulate) and both optimizers step once at the end.

    input_tensor, target_tensor -- (batch x length) tensors of letter indices.
    Returns the sum of the per-character losses divided by the target length, as a 0-dim
    tensor (zero if the target has a single column and no step is run).
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    target_length = target_tensor.size(1)

    encoder_output = encoder(source=input_tensor)

    # The decoder input lives on the same device as the target it is compared with.
    output = torch.zeros(target_tensor.size(0), target_length, dtype=torch.int64, device=target_tensor.device, requires_grad=False)
    output[:, 0] = letterToIndex(SOS_token)
    # A tensor accumulator keeps the return type stable even when the loop does not run.
    loss = torch.zeros((), device=target_tensor.device)

    for char_index in range(1, target_length):
        decoder_output = decoder(target=output[:,:char_index], encoder_output=encoder_output[:,:,:char_index])
        step_output = decoder_output[:,:,char_index-1]
        loss_per_char = criterion(step_output, target_tensor[:,char_index-1])
        loss += loss_per_char.detach()

        if LOGGING:
            print('In def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)', file=LOG)
            print('char_index = %d' % char_index, file=LOG)
            print(output.size(), file=LOG)
            print(decoder_output.size(), file=LOG)

        # The encoder graph is shared by all steps, so it has to be retained.
        loss_per_char.backward(retain_graph=True)
        # Greedy choice for the next input character, computed with torch so that no
        # round-trip through numpy (CPU only) is needed.
        output[:,char_index] = torch.argmax(step_output.detach(), dim=1)

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss / target_length

# Define 1 GD step. W/ Teacher Forcing
def train_TF(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """Perform one gradient step with teacher forcing.

    The decoder is fed the ground-truth target shifted right by one position (SOS followed
    by all target characters except the last), so the prediction at position t is made from
    the characters before t, exactly as in train_no_TF(). Feeding the unshifted target would
    show every position its own label, because the decoder's masked convolutions still read
    the current position.

    input_tensor, target_tensor -- (batch x length) tensors of letter indices.
    Returns the detached loss of the whole batch as computed by `criterion`.
    """
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    encoder_output = encoder(source=input_tensor)

    # Teacher forcing: decoder input = [SOS, target_0, ..., target_{L-2}].
    sos_column = target_tensor.new_full((target_tensor.size(0), 1), letterToIndex(SOS_token))
    decoder_input = torch.cat((sos_column, target_tensor[:, :-1]), dim=1)
    decoder_output = decoder(target=decoder_input, encoder_output=encoder_output)

    loss = criterion(decoder_output, target_tensor)

    if LOGGING:
        # There is no per-character loop here, hence no char_index to report.
        print('In def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)', file=LOG)
        print(encoder_output.size(), file=LOG)
        print(decoder_output.size(), file=LOG)

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.detach()

# Define the actual training.
def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, save_every=10000):
    """Train the encoder/decoder pair for `n_iters` iterations.

    encoder, decoder -- the two models; each gets its own Adam optimizer with
                        ADAM_LEARNING_RATE.
    n_iters          -- number of iterations to run (on top of the checkpoint's counter).
    print_every      -- print timing and the average loss whenever the iteration counter is
                        a multiple of this value.
    plot_every       -- record an average loss for the final plot at the same kind of interval.
    save_every       -- write a checkpoint to PATH + 'bytenet.train.tar' at this interval.

    If LOAD_MODEL_FROM_FILE is set, models, optimizers and the iteration counter are first
    restored from that checkpoint; otherwise counting starts at START_ITER. Iteration i
    trains on batch i % len(bucketised_batches). The recorded averages are passed to
    showPlot() at the end.
    """
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every
    start_iter = START_ITER

    # Choose whether to use teacher forcing or not.
    train_step = train_TF if TEACHER_FORCING else train_no_TF

    encoder_optimizer = optim.Adam(encoder.parameters(), ADAM_LEARNING_RATE)
    decoder_optimizer = optim.Adam(decoder.parameters(), ADAM_LEARNING_RATE)

    if LOAD_MODEL_FROM_FILE:
        checkpoint = torch.load(PATH + 'bytenet.train.tar', map_location=device)
        start_iter = checkpoint['iter']
        encoder.load_state_dict(checkpoint['encoder_state_dict'])
        decoder.load_state_dict(checkpoint['decoder_state_dict'])
        encoder_optimizer.load_state_dict(checkpoint['encoder_optimizer_state_dict'])
        decoder_optimizer.load_state_dict(checkpoint['decoder_optimizer_state_dict'])

        encoder.to(device)
        decoder.to(device)

        print('Loaded the model from %sbytenet.train.tar, which has already been trained for %d iterations.' % (PATH, start_iter))

    # Always train in training mode, not only after a checkpoint was loaded.
    encoder.train()
    decoder.train()

    criterion = nn.NLLLoss()

    for iteration in range(start_iter + 1, start_iter + n_iters + 1):
        input_tensor, target_tensor = bucketised_batches[iteration % len(bucketised_batches)]
        if LOGGING:
            print('def trainIters(encoder, decoder, n_iters, print_every, plot_every, save_every, learning_rate)', file=LOG)
            print('iter = %d' % iteration, file=LOG)
            print(input_tensor.size(), file=LOG)
            print(target_tensor.size(), file=LOG, end='\n')

        # Plain floats are accumulated so that the running totals and the plotted values do
        # not hold on to tensors.
        loss = float(train_step(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion))
        print_loss_total += loss
        plot_loss_total += loss

        if iteration % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            real_iter = iteration - start_iter
            print('%s (%d %d%%) %.4f' % (timeSince(start, real_iter / n_iters), iteration, real_iter / n_iters * 100, print_loss_avg))

        if iteration % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

        if iteration % save_every == 0:
            torch.save({
                'iter': iteration,
                'encoder_state_dict': encoder.state_dict(),
                'decoder_state_dict': decoder.state_dict(),
                'encoder_optimizer_state_dict': encoder_optimizer.state_dict(),
                'decoder_optimizer_state_dict': decoder_optimizer.state_dict(),
            }, PATH + 'bytenet.train.tar')

    showPlot(plot_losses)

In [None]:
import matplotlib.pyplot as plt
plt.switch_backend('agg')
import matplotlib.ticker as ticker
import numpy as np 

def showPlot(points):
    """Plot `points` (a sequence of loss values) with y-axis ticks every 0.2.

    Exactly one figure is created. The former extra plt.figure() call opened a second,
    empty figure that was never used or closed.
    """
    fig, ax = plt.subplots()
    # this locator puts ticks at regular intervals
    loc = ticker.MultipleLocator(base=0.2)
    ax.yaxis.set_major_locator(loc)
    ax.plot(points)

# Build the encoder and the decoder with the same vocabulary size, channel count and residual layout.
encoder = CNN(n_letters, EMBED_DIM, RES_SETS, RES_BLOCKS, 'enc').to(device)
decoder = CNN(n_letters, EMBED_DIM, RES_SETS, RES_BLOCKS, 'dec').to(device)

# Train for 1,000,000 iterations: print the average loss every 10, record a plot point every
# 100 and write a checkpoint every 50 iterations.
trainIters(encoder, decoder, 1000000, print_every=10, plot_every=100, save_every=50)

# Close the debug log opened in the CONTROL CONSTANTS cell.
LOG.close()

In [None]:
# Total number of parameters (tensor elements) of the encoder and the decoder together.
sum(p.numel() for p in encoder.parameters()) + sum(p.numel() for p in decoder.parameters()) 

In [None]:
##################
### EVALUATION ###
##################

# TIMING [COPY-PASTED FROM THE PYTORCH TUTORIAL]

import time
import math

def asMinutes(s):
    """Render `s` seconds as '<minutes>m <seconds>s' (same helper as in the TRAINING cell)."""
    whole_minutes = math.floor(s / 60)
    s -= whole_minutes * 60
    return '%dm %ds' % (whole_minutes, s)

def timeSince(since, percent):
    """Return '<elapsed> (- <remaining>)' (same helper as in the TRAINING cell).

    since   -- start time as returned by time.time().
    percent -- fraction of the work done so far, used to extrapolate the remaining time.
    """
    spent = time.time() - since
    return '%s (- %s)' % (asMinutes(spent), asMinutes(spent / percent - spent))

# EVALUATE

# Repeated from the TRAINING cell; used below to construct the encoder and decoder for evaluation.
EMBED_DIM = 20
RES_SETS = 6        # Number of sets of residual blocks
RES_BLOCKS = 5      # Number of residual blocks per set

# Evaluate on 1 batch.
def evaluate(input_tensor, target_tensor, encoder, decoder, criterion):
    """Compute the loss of one batch with greedy decoding (no parameter update).

    The decoder input starts with SOS and is extended one character at a time with the
    decoder's own greedy prediction. At step k the prediction at position k-1 is scored
    against target_tensor[:, k-1] with `criterion`.

    input_tensor, target_tensor -- (batch x length) tensors of letter indices.
    Returns the sum of the per-step losses divided by the target length, as a 0-dim tensor
    (zero if the target has a single column and no step is run).
    """
    target_length = target_tensor.size(1)

    encoder_output = encoder(source=input_tensor)

    # The decoder input lives on the same device as the target it is compared with.
    output = torch.zeros(target_tensor.size(0), target_length, dtype=torch.int64, device=target_tensor.device, requires_grad=False)
    output[:, 0] = letterToIndex(SOS_token)
    # A tensor accumulator keeps the return type stable even when the loop does not run.
    loss = torch.zeros((), device=target_tensor.device)

    for char_index in range(1, target_length):
        decoder_output = decoder(target=output[:,:char_index], encoder_output=encoder_output[:,:,:char_index])
        step_output = decoder_output[:,:,char_index-1].detach()
        loss += criterion(step_output, target_tensor[:,char_index-1])
        # Greedy choice for the next input character, computed with torch so that no
        # round-trip through numpy (CPU only) is needed.
        output[:,char_index] = torch.argmax(step_output, dim=1)

    return loss / target_length



# Evaluate on all batches.
def evaluateIters(encoder, decoder, print_every = 10):
    """Evaluate the checkpointed model on every batch in bucketised_batches.

    encoder, decoder -- models whose weights are overwritten with the state stored in
                        PATH + 'bytenet.eval.tar'; both are switched to eval mode.
    print_every      -- print a progress line whenever the batch index is a multiple of
                        this value.

    Returns (sum of the per-batch losses returned by evaluate(), number of sentence pairs
    processed).
    """
    checkpoint = torch.load(PATH + 'bytenet.eval.tar', map_location=device)
    start_iter = checkpoint['iter']
    encoder.load_state_dict(checkpoint['encoder_state_dict'])
    decoder.load_state_dict(checkpoint['decoder_state_dict'])

    models = (encoder, decoder)
    for model in models:
        model.to(device)
    for model in models:
        model.eval()

    print('Loaded the model from %sbytenet.eval.tar, which has already been trained for %d iterations.' % (PATH, start_iter))

    # Losses are summed (not averaged) over the pairs of a batch.
    criterion = nn.NLLLoss(reduction='sum')

    start = time.time()

    total_loss = 0
    pairs_seen = 0
    batch_count = len(bucketised_batches)

    for batch_index, (input_tensor, target_tensor) in enumerate(bucketised_batches):
        total_loss += evaluate(input_tensor, target_tensor, encoder, decoder, criterion)
        pairs_seen += input_tensor.size(0)

        if batch_index % print_every == 0:
            elapsed = timeSince(start, pairs_seen / DS_SIZE)
            print('%s (%d %d%%) %d pairs processed so far' % (elapsed, batch_index, batch_index / batch_count * 100, pairs_seen))

    return total_loss, pairs_seen

In [None]:
# Evaluation only: everything below runs with gradient tracking disabled.
with torch.no_grad():
    # Freshly constructed models; evaluateIters() overwrites their weights from the
    # 'bytenet.eval.tar' checkpoint.
    encoder = CNN(n_letters, EMBED_DIM, RES_SETS, RES_BLOCKS, 'enc').to(device)
    decoder = CNN(n_letters, EMBED_DIM, RES_SETS, RES_BLOCKS, 'dec').to(device)


    print_loss_total, num_of_pairs = evaluateIters(encoder, decoder, print_every=1)

In [None]:
# Total loss returned by evaluateIters(), the number of sentence pairs it processed, and the loss per pair.
print(print_loss_total)
print(num_of_pairs)
print(print_loss_total / num_of_pairs)