## PREREQUISITES
We have to install the new spacy version for python. 

We also need a language model, in this case I use fr_core_news_lg. 

At the end of this part, we use spacy's validate to assess compatibility. 

In [1]:
#%%capture
## UPDATES --- 
#!conda install -y -c conda-forge spacy
#!conda install -c pytorch torchtext -y 
#!conda install -y -c conda-forge scikit-learn 
#!conda install -y -c anaconda numpy

In [2]:
#%%capture
# SPACY LANGUAGE ------ 
    #* Downloads 571 MB  
#!python -m spacy download fr_core_news_lg # Maybe try fr_dep_news_trf
# More info at https://spacy.io/usage/models 

Spacy's validate command tells us that fr_core_news_lg is compatible with the version of spacy that is installed (3.1.0).

In [3]:
# VERIFY SPACY ------ 
!python -m spacy validate

[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation:
/opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/spacy[0m

NAME              SPACY            VERSION                            
fr_core_news_lg   >=3.1.0,<3.2.0   [38;5;2m3.1.0[0m   [38;5;2m✔[0m



In [4]:
# SPACY INFO ------ 
!python -m spacy info 


[1m

spaCy version    3.1.1                         
Location         /opt/conda/envs/pytorch-py3.6/lib/python3.6/site-packages/spacy
Platform         Linux-5.11.0-27-generic-x86_64-with-debian-stretch-sid
Python version   3.6.11                        
Pipelines        fr_core_news_lg (3.1.0)       



### LIBRARIES

In [5]:
# LIBRARIES ------
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.legacy.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from torchtext.data.utils import get_tokenizer
from collections import Counter
from torchtext.vocab import Vocab
#from torchtext.utils import download_from_url, extract_archive
import io
#from utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

### GET THE DATA 
sample_path= "/home/camilo/Documents/Own Projects/Gutenberg 2/ML_GutenbergFR/data_output/Normal/Sample_ML.txt"


In [6]:
# TORCH, GPU INFORMATION ------
print(
torch.__version__,
torch.cuda.current_device(),
torch.cuda.device(0),
torch.cuda.device_count(),
torch.cuda.get_device_name(0)
)

1.9.0 0 <torch.cuda.device object at 0x7fab1c653e10> 1 Quadro P400


### PATHS ------

In [7]:
train_filepaths = "/home/camilo/Documents/Own Projects/Gutenberg 2/ML_GutenbergFR/data_output/Normal/Sample_ML.txt"

### TOKENIZER

We use the language model that we downloaded. In this case it is the french corpus trained on news and media sources. 

As we can see, the tokenizer keeps, by default, uppercase and punctuation.

In [8]:
# TOKENIZER ------
#spacy_lang = spacy.load("fr_core_news_lg")
#def tokenize_fr(text):
#    return [tok.text for tok in spacy_lang.tokenizer(text)]
#* Example --- 
#tokenize_fr("Bonjour, ceci est un exemple. ...  ")

# NEW 
def split(word):
    return list(word)
tokenizer=get_tokenizer(split)
tokenizer("hello bye")

['h', 'e', 'l', 'l', 'o', ' ', 'b', 'y', 'e']

In [9]:
### VOCAB AT CHARACTER LEVEL ------ 
from torchtext.vocab import build_vocab_from_iterator

def build_vocab(filepath, tokenizer, pos):
  #counter = Counter()
  with io.open(filepath, encoding="utf8") as f:
    for string_ in f:
      string_= string_.split(", ,")[pos]
      yield tokenizer(string_)
  #return Vocab(counter, specials=['<unk>', '<pad>', '<bos>', '<eos>'])
  #return Vocab(counter)
fr_char_vocab=build_vocab_from_iterator(build_vocab(train_filepaths, tokenizer, pos=0),specials=['<unk>', '<pad>', '<bos>', '<eos>','\n'] )
#fr_char_vocab = build_vocab(train_filepaths, tokenizer)

In [10]:
print("Number of characters in the vocab :",fr_char_vocab.__len__())

Number of characters in the vocab : 53


In [11]:
out_index=fr_char_vocab["<unk>"]
fr_char_vocab.set_default_index(out_index)
fr_char_vocab["hello"] # => 0 because "<unk>" is 0... 

0

In [12]:
def split_T_from_F(sentence,pos):
    string_= sentence.split(", ,")[pos]
    return string_

def data_process(filepath):
  raw_iter = iter(io.open(filepath, encoding="utf8"))
  data = []
  for raw_sentence in raw_iter:
    T_sentence_tensor = torch.tensor([fr_char_vocab[token] for token in tokenizer(split_T_from_F(raw_sentence,0))],
                            dtype=torch.long)
    F_sentence_tensor = torch.tensor([fr_char_vocab[token] for token in tokenizer(split_T_from_F(raw_sentence,1))],
                            dtype=torch.long)
    data.append((T_sentence_tensor,F_sentence_tensor))
  return data

train_data = data_process(train_filepaths)

In [13]:
# VERIF => Truth Mistake
print(train_data[0])

print([fr_char_vocab[token] for token in tokenizer("Truth")])
print([fr_char_vocab[token] for token in tokenizer("Mistake\n")])

(tensor([47, 12, 13, 10, 26]), tensor([ 0,  9,  8, 10,  7, 44,  6,  4]))
[47, 12, 13, 10, 26]
[0, 9, 8, 10, 7, 44, 6, 4]


In [21]:
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

BATCH_SIZE = 128
PAD_IDX = fr_char_vocab['<pad>']
BOS_IDX = fr_char_vocab['<bos>']
EOS_IDX = fr_char_vocab['<eos>']

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

def generate_batch(data_batch):
  T_batch, F_batch = [], []
  for (T_item, F_item) in data_batch:
    T_batch.append(torch.cat([torch.tensor([BOS_IDX]), T_item, torch.tensor([EOS_IDX])], dim=0)) # 
    F_batch.append(torch.cat([torch.tensor([BOS_IDX]), F_item, torch.tensor([EOS_IDX])], dim=0))
  T_batch = pad_sequence(T_batch, padding_value=PAD_IDX)
  F_batch = pad_sequence(F_batch, padding_value=PAD_IDX)
  return F_batch, T_batch

def train_val_dataset(dataset, val_split=0.25):
    train_idx, val_idx = train_test_split(list(range(len(dataset))), test_size=val_split)
    datasets = {}
    datasets['train'] = Subset(dataset, train_idx)
    datasets['val'] = Subset(dataset, val_idx)
    return datasets

datasets = train_val_dataset(train_data)

train_iter = DataLoader(datasets["train"], batch_size=BATCH_SIZE,
                        shuffle=False, collate_fn=generate_batch)
valid_iter = DataLoader(datasets["val"], batch_size=BATCH_SIZE,
                        shuffle=True, collate_fn=generate_batch)

In [61]:
for i,content in enumerate(train_iter):
    if(i==0):
        print(content[0].shape)
        print(content[1].shape)
        print(content[0][0])
        print(content[0][1])
fr_char_vocab.vocab.lookup_token(2)

torch.Size([75, 128])
torch.Size([71, 128])
tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2])
tensor([21, 19, 22, 14, 13, 18, 15, 15, 19,  6, 32, 16, 10, 16, 23, 23, 16, 13,
        18,  7, 14, 14, 14, 19,  8, 10, 16,  6, 19, 11, 16, 13, 18, 14,  7, 25,
        16, 11, 17, 27,  8, 19, 14, 43, 27, 17,  8, 27, 17, 10, 14, 21, 14, 17,
        14, 19,  8, 19, 19,  7,  9, 11, 18, 16, 14, 14,  7, 22,  6, 14, 25, 20,
         8,  7, 16, 38, 16, 16, 26, 19, 18, 24,  8, 16, 18,  8, 27, 14, 10, 24,
        16, 14, 14,  8,  7, 19, 18, 14, 10, 16, 10, 29,  6, 19, 17, 16,  7,  7,
         7,  7,  7, 19, 23, 14, 16, 10,  8,

'<bos>'

In [11]:
### GET THE DATA 
#sample_path= "/home/camilo/Documents/Own Projects/Gutenberg 2/ML_GutenbergFR/data_output/Normal/Sample_ML.txt"
#import pandas as pd 
#sample_data = pd.read_csv(sample_path, sep=", ,")
#train_data= sample_data.iloc[0:800]
#valid_data= sample_data.iloc[800:]

  after removing the cwd from sys.path.


### ENCODER

In [62]:
# ENCODER ------
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        super(Encoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size # FOR LSTM : hidden and cell states
        self.num_layers = num_layers # FOR LSTM : layers

        self.embedding = nn.Embedding(input_size, embedding_size) # Char representation in embedding space
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p) # recurrent layers

    def forward(self, x):
        # x shape: (seq_length, N) where N is batch size

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (seq_length, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedding) # h and c states init randomly
        # outputs shape: (seq_length, N, hidden_size) => discarded
        # hidden shape: [num_layers* n_directions (1), N, hidden_size]
        return hidden, cell

In [118]:
next(iter(train_iter))[0].shape
x= Encoder(len(fr_char_vocab),10,20,5,0.2)
temp_result=x.forward(next(iter(train_iter))[0])
h= temp_result[0]
c= temp_result[1]

In [110]:
print(h.shape)
print(c.shape)

torch.Size([5, 128, 20])
torch.Size([5, 128, 20])


### DECODER

In [119]:
class Decoder(nn.Module):
    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        super(Decoder, self).__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size # For LSTM : hidden and cell states
        self.num_layers = num_layers # for LSTM : layers to connect 

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size) # To have a fixed size output

    def forward(self, x, hidden, cell):
        # x shape: (N) where N is for batch size, we want it to be (1, N), seq_length
        # is 1 here because we are sending in a single word and not a sentence
        x = x.unsqueeze(0)

        embedding = self.dropout(self.embedding(x))
        # embedding shape: (1, N, embedding_size)
        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell)) # this time we pass h and c states from the encoder
        # outputs shape: (1, N, hidden_size) "!!!" because we pass only a "character" => sequence length = 1 time-step

        predictions = self.fc(outputs)

        # predictions shape: (1, N, length_target_vocabulary) to send it to
        # loss function we want it to be (N, length_target_vocabulary) so we're
        # just gonna remove the first dim
        predictions = predictions.squeeze(0)

        return predictions, hidden, cell

In [120]:
next(iter(train_iter))[0].shape

torch.Size([75, 128])

In [121]:
dec= Decoder(75,10,20,len(fr_char_vocab.vocab),5,0.25)
x=dec.forward(next(iter(train_iter))[0][0], h,c)
x[0].shape # predictions 

torch.Size([128, 53])

### SEQ2SEQ

In [107]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(fr_char_vocab.vocab)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden, cell = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]
        print(x.shape) # 1, batch

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Store next output prediction
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # With probability of teacher_force_ratio we take the actual next word
            # otherwise we take the word that the Decoder predicted it to be.
            # Teacher Forcing is used so that the model gets used to seeing
            # similar inputs at training and testing time, if teacher forcing is 1
            # then inputs at test time might be completely different than what the
            # network is used to. This was a long comment.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs

In [152]:

### We're ready to define everything we need for training our Seq2Seq model ###

# Training hyperparameters
num_epochs = 5
learning_rate = 0.001
batch_size = 8

# Model hyperparameters
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size_encoder = len(fr_char_vocab.vocab)
input_size_decoder = len(fr_char_vocab.vocab)
output_size = len(fr_char_vocab.vocab)
encoder_embedding_size = 10
decoder_embedding_size = 10
hidden_size = 64  # Needs to be the same for both RNN's
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard to get nice loss plot
writer = SummaryWriter(f"runs/loss_plot")
step = 0

train_iterator, valid_iterator = BucketIterator.splits(
    (datasets["train"], datasets["val"]),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x),
    device=device
)

In [155]:
encoder_net = Encoder(
    input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout
).to(device)

decoder_net = Decoder(
    input_size_decoder,
    decoder_embedding_size,
    hidden_size,
    output_size,
    num_layers,
    dec_dropout,
).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = fr_char_vocab.vocab.lookup_indices(["<pad>"])
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)


def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 109,977 trainable parameters


In [119]:
def translate_sentence(model, sentence, french, device, max_length=50):
    print(sentence)

    # sys.exit()

    # Load fr tokenizer
    #spacy_ger = spacy.load("fr_core_news_lg")

    # Create tokens using spacy and everything in lower case (which is what our vocab is)
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_lang.tokenizer(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    print("POST TOKENS")
    print(tokens)

    # sys.exit()
    # Add <SOS> and <EOS> in beginning and end respectively
    tokens.insert(0, french.init_token)
    tokens.append(french.eos_token)
    print(tokens)

    # Go through each german token and convert to an index
    text_to_indices = [french.vocab.stoi[token] for token in tokens]
    print(text_to_indices)
    # Convert to Tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

    outputs = [french.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == french.vocab.stoi["<eos>"]:
            break

    translated_sentence = [french.vocab.itos[idx] for idx in outputs]

    # remove start token
    return translated_sentence[1:]

In [117]:
sentence="bonjour comment allez vous "
for epoch in range(num_epochs):
    print(f"[Epoch {epoch} / {num_epochs}]")

    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    #save_checkpoint(checkpoint)

    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, french, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        # Get input and targets and get to cuda
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward prop
        output = model(inp_data, target)

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we can view it in a similar
        # way that we have output_words * batch_size that we want to send in into
        # our cost function, so we need to do some reshapin. While we're at it
        # Let's also remove the start token while we're at it
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip to avoid exploding gradient issues, makes sure grads are
        # within a healthy range
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard
        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1

[Epoch 0 / 5]
bonjour comment allez vous 
POST TOKENS
['bonjour', 'comment', 'allez', 'vous']
Translated example sentence: 
 ['m', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p', 'p']


KeyError: 238