# Title
The title of the notebook should be coherent with file name. Namely, file name should be:    
*author's initials_progressive number_title.ipynb*    
For example:    
*EF_01_Data Exploration.ipynb*

## Purpose
State the purpose of the notebook.

## Methodology
Quickly describe assumptions and processing steps.

## WIP - improvements
Use this section only if the notebook is not final.

Notable TODOs:
- todo 1;
- todo 2;
- todo 3.

## Results
Describe and comment on the most important results.

## Suggested next steps
State suggested next steps, based on results obtained in this notebook.

# Setup

## Library import
We import all the required Python libraries.

In [1]:
# Data manipulation
import pandas as pd
import numpy as np

# Options for pandas
pd.options.display.max_columns = 50
pd.options.display.max_rows = 30

# Visualizations
import plotly
import plotly.graph_objs as go
import plotly.offline as ply
plotly.offline.init_notebook_mode(connected=True)

import cufflinks as cf
cf.go_offline(connected=True)
cf.set_config_file(theme='white')

import matplotlib as plt

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
    
%autoreload 2

## Local library import
We import all the required local libraries.

In [60]:
# Include local library paths
import nltk
import re

import sys
sys.path.append('./src') # uncomment and fill to import local libraries

from textloader import *
from tp5 import *

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Parameter definition
We set all relevant parameters for our notebook. By convention, parameters are uppercase, while all the 
other variables follow Python's guidelines.


# Data import
We retrieve all the required data for the analysis.

In [61]:
data_path = "./data/trump_full_speech.txt"
# Explicit encoding: the decoded text must not depend on the platform default.
with open(data_path, encoding="utf-8") as f:
    lines = f.readlines()
# Only the first three lines of the file make up the corpus; slicing (instead
# of indexing 0, 1, 2) also copes with a file that has fewer lines.
text = "".join(lines[:3])
# Drop the speaker tag that prefixes the speech segments.
text = re.sub(r'Trump: ', '', text)
# NOTE(review): without re.MULTILINE this only matches when the whole text is
# "Trump" followed by one character, so it is effectively a no-op here.
# Left unchanged — confirm whether per-line matching was intended.
text = re.sub(r'^Trump.$', '', text)
# Remove bracketed annotations. The quantifier is non-greedy so that two
# annotations on the same line do not swallow the speech between them.
text = re.sub(r'\[.*?\] ', '', text)
text = text.strip()
traindataset = TextDataset(text)
trainloader = DataLoader(traindataset, batch_size=16, collate_fn=collate_fn, shuffle=False)

In [62]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cpu')

In [63]:
# The vocabulary size sets both the embedding table and the output layer.
vocab_size = len(id2lettre)
# Move the model to `device`: the training code moves the hidden states
# there, so a model left on the CPU would clash with them on a CUDA machine.
model = LSTM(
    num_embeddings=vocab_size,
    embedding_dim=100,
    hidden_size=150,
    output_size=vocab_size,
).to(device)
criterion = MaskedCrossEntropy()

In [64]:
def train_step(model, criterion, x, y, m):
    """Accumulate the loss of `model` over a sequence, one time step at a time.

    Args:
        model: object exposing `initHidden(batch_size)`,
            `one_step(x_t, *hidden)` and `decode(hidden_state)`.
        criterion: callable `criterion(logits, target, mask)` returning the
            loss of a single time step.
        x: input tensor, iterated along its first dimension; the size of its
            second dimension is passed to `initHidden` (presumably
            (seq_len, batch) — TODO confirm). It is NOT modified.
        y: targets, indexed per time step as `y[i]`.
        m: mask, indexed per time step as `m[i]`.

    Returns:
        The sum over all time steps of `criterion(logits, y[i], m[i])`.
    """
    # Out-of-place unsqueeze: the in-place variant altered the caller's
    # tensor, so calling this function twice on the same batch kept adding
    # trailing dimensions.
    steps = x.unsqueeze(-1)
    h = [t.to(device) for t in model.initHidden(steps.shape[1])]
    loss = 0
    for i, x_t in enumerate(steps):
        h = model.one_step(x_t, *h)
        # Only the first element of the hidden state is decoded.
        logits = model.decode(h[0])
        loss += criterion(logits, y[i], m[i])
    return loss


In [65]:
# Take the first batch and keep a short window of it for quick experiments.
batch = next(iter(trainloader))
n = 7
# Targets are the inputs shifted by one position; the mask hides padding.
y = batch[1:n + 1]
m = y != PAD_IX
x = batch[:n]

In [86]:
# The vocabulary size is shared by the embedding table and the output layer.
n_symbols = len(id2lettre)
model = LSTM(
    num_embeddings=n_symbols,
    embedding_dim=100,
    hidden_size=150,
    output_size=n_symbols,
).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# NOTE(review): an earlier cell instantiates `MaskedCrossEntropy()` while this
# one binds `maskedCrossEntropy` without calling it — presumably a class and
# a plain function respectively; confirm both exist in the local modules.
criterion = maskedCrossEntropy

In [79]:
from tqdm import tqdm
def train(train, model, criterion, optimizer, scheduler, n_epochs):
    """Train `model` for `n_epochs` passes over the `train` batches.

    Each batch is split into inputs (all positions but the last), targets
    (all positions but the first) and a mask that is False wherever the
    target equals `PAD_IX`.

    Args:
        train: iterable of batch tensors (presumably (seq_len, batch) —
            TODO confirm against the collate function).
        model: module exposing `initHidden(batch_size)`, a forward call
            `model(x, hidden)` and `decode(...)`.
        criterion: callable `criterion(decoded, targets, mask)` returning a
            scalar loss tensor.
        optimizer: optimizer updating the parameters of `model`.
        scheduler: optional learning-rate scheduler, stepped once per epoch;
            pass None to train without one.
        n_epochs: number of passes over `train`.

    Returns:
        List with the mean training loss of each epoch.
    """
    losses = []
    pbar = tqdm(range(n_epochs), total=n_epochs, file=sys.stdout)
    for _ in pbar:
        # Make sure training mode is active even if the model was switched
        # to eval mode before this call.
        model.train()
        epoch_losses = []
        for batch in train:
            x = batch[:-1].to(device)
            y = batch[1:].to(device)
            m = (batch[1:] != PAD_IX).to(device)

            h = [t.to(device) for t in model.initHidden(x.shape[1])]
            d = model.decode(model(x, h))
            loss = criterion(d, y, m)

            # Clear gradients before the backward pass so that gradients
            # left over from code run earlier cannot leak into the first
            # update.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_losses.append(loss.item())

        # The scheduler argument used to be silently ignored.
        if scheduler is not None:
            scheduler.step()

        lo = np.mean(epoch_losses)
        losses.append(lo)
        pbar.set_description(f'Train: Loss: {np.round(lo, 4)}')

    return losses

In [81]:
# Train for 25 epochs without a learning-rate scheduler.
losses = train(
    train=trainloader,
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=None,
    n_epochs=25,
)

In [17]:
def generate(model, start, maxlength):
    """Sample a character sequence from `model`, seeded with `start`.

    Args:
        model: module exposing `initHidden(batch_size)`,
            `one_step(x, *hidden)` and `decode(hidden_state)`.
        start: seed; it is looked up directly in `lettre2id`, so it must be
            a single known character.
        maxlength: maximum number of characters to sample after the seed.

    Returns:
        Tuple `(text, predictions)`: `text` is the decoded sequence including
        the seed (and the end-of-sequence symbol when it was sampled);
        `predictions` holds the decoder output of every step as nested lists.
    """
    x = torch.tensor(string2code(start)).unsqueeze(-1).to(device)
    h = [v.to(device) for v in model.initHidden(1)]
    l = [lettre2id[start]]
    predictions = []
    # Remember the current mode so that sampling does not leave the model
    # in eval mode for a later training run.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(maxlength):
                h = model.one_step(x, *h)
                d = model.decode(h[0])
                predictions.append(d.squeeze(0).tolist())
                # Categorical normalises its `probs` argument, so exp(d) is
                # a valid input whether d holds logits or log-probabilities.
                probs = torch.exp(d)
                sampled = torch.distributions.categorical.Categorical(probs).sample()
                l.append(sampled.item())
                if sampled.item() == EOS_IX:
                    break
                # Feed the sampled character back as the next input. It used
                # to be stored in an unused variable, so the model received
                # the seed at every step.
                x = sampled.unsqueeze(-1).to(device)
    finally:
        model.train(was_training)
    return code2string(l), predictions

In [30]:
# Sample up to 100 characters, seeded with the letter "H".
string, predictions = generate(model=model, start="H", maxlength=100)

In [31]:
# Display the text sampled by the previous cell. The former content `pr` is
# an undefined name and raises NameError on a fresh run.
string

"Hh[4yC&g^/og-H?PgdV|r~zz<D2L~q~U3SK#>NW~G*s?|;c3|UHtK(cX9)5r-d#)ej)__<;qs%i'{/UxP/'Wb'Y_mK9cV-CK4^|"

In [71]:
# Run the model on the current batch from a fresh hidden state, then decode.
initial_hidden = [v.to(device) for v in model.initHidden(x.shape[1])]
hidden_states = model(x, initial_hidden)
d = model.decode(hidden_states)

In [84]:
# Inspect the loss of every batch without updating the model.
losses = []
for x in trainloader:
    # Targets are the inputs shifted by one position; the mask hides padding.
    y = x[1:].to(device)
    m = (x[1:] != PAD_IX).to(device)
    x = x[:-1].to(device)

    h = [t.to(device) for t in model.initHidden(x.shape[1])]
    d = model.decode(model(x, h))
    loss = criterion(d, y, m)

    losses.append(loss.item())
    print(loss)


tensor(4.4635, grad_fn=<MeanBackward0>)
tensor(4.4298, grad_fn=<MeanBackward0>)
tensor(4.4320, grad_fn=<MeanBackward0>)
tensor(4.4291, grad_fn=<MeanBackward0>)
tensor(4.4344, grad_fn=<MeanBackward0>)
tensor(4.4259, grad_fn=<MeanBackward0>)
tensor(4.4202, grad_fn=<MeanBackward0>)
tensor(4.4348, grad_fn=<MeanBackward0>)
tensor(4.4231, grad_fn=<MeanBackward0>)
tensor(4.4267, grad_fn=<MeanBackward0>)
tensor(4.4260, grad_fn=<MeanBackward0>)
tensor(4.4160, grad_fn=<MeanBackward0>)
tensor(4.4333, grad_fn=<MeanBackward0>)
tensor(4.4234, grad_fn=<MeanBackward0>)
tensor(4.4217, grad_fn=<MeanBackward0>)
tensor(4.4324, grad_fn=<MeanBackward0>)
tensor(4.4292, grad_fn=<MeanBackward0>)
tensor(4.4232, grad_fn=<MeanBackward0>)
tensor(4.4287, grad_fn=<MeanBackward0>)
tensor(4.4237, grad_fn=<MeanBackward0>)
tensor(4.4280, grad_fn=<MeanBackward0>)
tensor(4.4224, grad_fn=<MeanBackward0>)
tensor(4.4274, grad_fn=<MeanBackward0>)
tensor(4.4325, grad_fn=<MeanBackward0>)
tensor(4.4358, grad_fn=<MeanBackward0>)


In [15]:
# Single-character seed, used below to inspect how a start string is encoded.
start = " "

In [16]:
# Encode the seed and append a trailing dimension, as the sampling code does.
start_codes = torch.tensor(string2code(start))
start_codes.unsqueeze(-1).to(device)

tensor([[96]])

# Data processing
Put the core of the notebook here. Feel free to further split this section into subsections.

# References
We report here relevant references:
1. author1, article1, journal1, year1, url1
2. author2, article2, journal2, year2, url2