<a href="https://colab.research.google.com/github/akarshippili/pytorch_stuff/blob/main/nlp_pytorch_lstm_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!unzip mydata.zip

Archive:  mydata.zip
   creating: mydata/
  inflating: mydata/val.csv          
  inflating: mydata/train.csv        
  inflating: mydata/test.csv         


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy.data import Field, BucketIterator,TabularDataset

import spacy
import numpy as np

import random
import math
import time

In [None]:
# import pandas as pd
# df = pd.read_csv("/content/mydata/train.csv")

In [None]:
# Single seed shared by every random number source used in this notebook.
SEED = 42

# Python's `random` module (used by Seq2Seq for the teacher-forcing coin flip).
random.seed(SEED)
# NumPy's global generator.
np.random.seed(SEED)
# PyTorch CPU generator, then the CUDA generator.
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to pick deterministic kernels.
torch.backends.cudnn.deterministic = True

In [None]:
spacy_en = spacy.load('en')

In [None]:
def tokenize_en(text):
    """
    Split an English string into its tokens.

    Runs only the tokenizer of the module-level ``spacy_en`` pipeline and
    returns the text of every token, in order, as a list of strings.
    """
    pieces = []
    for token in spacy_en.tokenizer(text):
        pieces.append(token.text)
    return pieces

In [None]:
# Source field (the review text): tokenized with tokenize_en, lower-cased,
# and wrapped in <sos> ... <eos> markers.
SRC = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

# Target field (the summary): configured exactly like the source field.
TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

In [None]:
feilds = [('text', SRC), ('summary', TRG)] 

In [None]:
# Load the train/validation CSVs; columns are mapped to fields in the order
# given by `feilds` (text -> SRC, summary -> TRG).
# skip_header=True keeps the column-name row from becoming a training example.
# test.csv from the same archive is read by column name ("text", "summary")
# further down, so these files are assumed to start with the same header row
# -- TODO confirm against the actual CSVs.
train_data,val_data = TabularDataset.splits(
    path="/content/mydata", format='csv',
    train="train.csv",
    validation="val.csv",
    fields=feilds,
    skip_header=True
)

In [None]:
# for batch in train_iterator:
#   for e in batch:
#     print(e[0])
#     print(e[1])
#     break
#   break

In [None]:
# Both vocabularies are built from the training split only.
# max_size caps the vocabulary size; min_freq=3 leaves out tokens seen fewer
# than three times (they are looked up as the unknown token instead).
SRC.build_vocab(train_data, max_size=10000, min_freq=3)
TRG.build_vocab(train_data, max_size=10000, min_freq=3)

In [None]:
# Vocabulary sizes, special tokens included.
# NOTE(review): the "(de)" / "(en)" labels look like leftovers from a
# German->English translation example; here SRC is the review text and TRG is
# the summary, both English.
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 8325
Unique tokens in target (en) vocabulary: 2683


In [None]:
print(SRC.vocab.__dict__.keys())
print(list(SRC.vocab.__dict__.values()))
e = list(SRC.vocab.__dict__.values())
for i in e:
  print(i)

dict_keys(['freqs', 'itos', 'unk_index', 'stoi', 'vectors'])
0
None


In [None]:
print(train_data[5].__dict__.keys())
print(train_data[5].__dict__.values())

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Number of examples per batch.
BATCH_SIZE = 32

# Batch iterators for the training and validation splits.
# sort_key is the summary's token count, and sort_within_batch=True asks the
# iterator to order every batch by that key. Batches are created on `device`.
train_iterator, valid_iterator = BucketIterator.splits(
    (train_data, val_data), 
    batch_size = BATCH_SIZE,
    sort_key=lambda x: len(x.summary),
    sort_within_batch=True,
    device = device)

In [None]:
# Collect token counts for every training example and print the first ten.
# NOTE(review): the "German"/"English" wording and the *_ger / *_eng names look
# like leftovers from a translation example -- here *_ger holds review-text
# lengths and *_eng holds summary lengths.
count = 0
max_len_eng = []  # token count of every summary
max_len_ger = []  # token count of every review text
for data in train_data:
  max_len_ger.append(len(data.text))
  max_len_eng.append(len(data.summary))
  # Only the first ten examples are echoed; lengths are collected for all.
  if count < 10 :
    print("German - ",*data.text, " Length - ", len(data.text))
    print("English - ",*data.summary, " Length - ", len(data.summary))
    print()
  count += 1

# Longest and shortest example per column, in tokens.
print("Maximum Length of English sentence {} and German sentence {} in the dataset".format(max(max_len_eng),max(max_len_ger)))
print("Minimum Length of English sentence {} and German sentence {} in the dataset".format(min(max_len_eng),min(max_len_ger)))

In [None]:
# Inspect one batch only. The previous loop kept iterating over the entire
# training set after the first batch had been printed; taking the first item
# of the iterator shows the same information without the wasted pass.
data = next(iter(train_iterator))

print("Shapes", data.text.shape, data.summary.shape)
print()
print("German - ",*data.text, " Length - ", len(data.text))
print()
print("English - ",*data.summary, " Length - ", len(data.summary))

# Keep the sample tensors for the inspection cells that follow
# (temp_ger = review-text ids, temp_eng = summary ids).
temp_ger = data.text
temp_eng = data.summary

In [None]:
import pandas as pd

# `temp_eng_idx` was never defined anywhere in the notebook (it only existed
# as leftover kernel state). Derive it from the sample summary batch saved in
# `temp_eng`: a NumPy array of token ids, one column per sequence in the batch.
temp_eng_idx = temp_eng.cpu().numpy()

# Column labels S_1 ... S_n follow the actual batch width instead of a
# hard-coded 32, so a smaller batch does not break the constructor.
df_eng_idx = pd.DataFrame(
    data = temp_eng_idx,
    columns = [str("S_")+str(x) for x in np.arange(1, temp_eng_idx.shape[1] + 1)],
)
df_eng_idx.index.name = 'Time Steps'
df_eng_idx.index = df_eng_idx.index + 1  # time steps are shown 1-based
# df_eng_idx.to_csv('/content/idx.csv')
df_eng_idx

In [None]:
# `idx_2_word` was never defined in the notebook; build it from the target
# vocabulary, whose `itos` list maps an index to its token.
idx_2_word = dict(enumerate(TRG.vocab.itos))

# Same table as df_eng_idx, with every token id replaced by its token.
# (The empty DataFrame that used to be created first was overwritten
# immediately and has been dropped.)
df_eng_word = df_eng_idx.replace(idx_2_word)
# df_eng_word.to_csv('/content/Words.csv')
df_eng_word

In [None]:
class Encoder(nn.Module):
    """Embeds a source sequence and runs it through a multi-layer LSTM.

    Only the final hidden and cell states are returned; the per-step LSTM
    outputs are discarded.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        """
        Args:
            input_dim: number of rows in the embedding table.
            emb_dim: embedding size.
            hid_dim: LSTM hidden size.
            n_layers: number of stacked LSTM layers.
            dropout: dropout probability, used on the embeddings and passed
                to the LSTM.
        """
        super().__init__()

        # Read by Seq2Seq to check that encoder and decoder are compatible.
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        """Return ``(hidden, cell)`` produced by the LSTM for ``src``."""
        embedded = self.embedding(src)
        embedded = self.dropout(embedded)
        _, final_state = self.rnn(embedded)
        hidden, cell = final_state
        return hidden, cell

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder.

    Each call consumes one token per sequence together with the previous
    hidden/cell states and returns scores over the output vocabulary plus the
    updated states.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        """
        Args:
            output_dim: size of the output vocabulary (embedding rows and
                width of the final linear layer).
            emb_dim: embedding size.
            hid_dim: LSTM hidden size.
            n_layers: number of stacked LSTM layers.
            dropout: dropout probability, used on the embeddings and passed
                to the LSTM.
        """
        super().__init__()

        # Sizes kept on the instance; Seq2Seq reads all three.
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers

        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, input, hidden, cell):
        """Run one decoding step.

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` has one row of
            vocabulary scores per element of ``input``.
        """
        # Add a leading length-1 dimension so the LSTM sees a one-step sequence.
        step_tokens = input.unsqueeze(0)
        step_embedded = self.dropout(self.embedding(step_tokens))

        step_output, state = self.rnn(step_embedded, (hidden, cell))
        hidden, cell = state

        # Drop the length-1 dimension again before projecting to the vocabulary.
        prediction = self.fc_out(step_output.squeeze(0))
        return prediction, hidden, cell

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    The encoder's final hidden/cell states seed the decoder, which is then
    fed either the ground-truth token or its own previous prediction.
    """

    def __init__(self, encoder, decoder, device):
        super().__init__()

        # The decoder is initialised with the encoder's states, so both must
        # agree on hidden size and depth.
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"

        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        """Produce decoder scores for every target position.

        Args:
            src: source token ids, laid out as [src len, batch size].
            trg: target token ids, laid out as [trg len, batch size].
            teacher_forcing_ratio: probability of feeding the ground-truth
                token (instead of the model's own prediction) at each step.

        Returns:
            Tensor of shape [trg len, batch size, decoder.output_dim].
            Row 0 is never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape[0], trg.shape[1]
        vocab_size = self.decoder.output_dim

        # One slot of vocabulary scores per target position.
        outputs = torch.zeros(trg_len, batch_size, vocab_size, device=self.device)

        # The encoder's final states become the decoder's initial states.
        hidden, cell = self.encoder(src)

        # Decoding starts from the first target row (the <sos> tokens).
        step_input = trg[0, :]

        for t in range(1, trg_len):
            scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = scores

            # Coin flip for this step, then the greedy prediction.
            use_ground_truth = random.random() < teacher_forcing_ratio
            greedy = scores.argmax(1)

            if use_ground_truth:
                step_input = trg[t]
            else:
                step_input = greedy

        return outputs

In [None]:
# Vocabulary sizes decide the embedding table sizes and the decoder's output width.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
# Embedding sizes for encoder and decoder.
ENC_EMB_DIM = 100
DEC_EMB_DIM = 100
# HID_DIM and N_LAYERS are shared: Seq2Seq asserts that encoder and decoder agree on both.
HID_DIM = 512
N_LAYERS = 2
# Dropout probabilities for encoder and decoder.
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

# Assemble the full model and move its parameters to the selected device.
model = Seq2Seq(enc, dec, device).to(device)

In [None]:
def init_weights(m):
    """Fill every parameter of ``m`` (sub-modules included) with values drawn
    uniformly between -0.5 and 0.5, in place."""
    for weight in m.parameters():
        nn.init.uniform_(weight.data, -0.5, 0.5)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(8325, 100)
    (rnn): LSTM(100, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(2683, 100)
    (rnn): LSTM(100, 512, num_layers=2, dropout=0.5)
    (fc_out): Linear(in_features=512, out_features=2683, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [None]:
def count_parameters(model):
    """Return the total number of elements across the parameters of ``model``
    that have ``requires_grad`` set."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 9,194,619 trainable parameters


In [None]:
optimizer = optim.Adam(model.parameters(),lr=0.00001)

In [None]:
# Batches are padded to a common length, so without ignore_index the loss
# would also score the padding positions. Look up the padding token's index in
# the target vocabulary and tell the loss to skip those targets.
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one optimisation pass over ``iterator``.

    Args:
        model: called as ``model(src, trg)``; expected to return scores shaped
            [trg len, batch size, output dim].
        iterator: yields batches exposing ``.text`` and ``.summary``.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss taking (scores, target ids).
        clip: maximum gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        source, target = batch.text, batch.summary

        optimizer.zero_grad()

        scores = model(source, target)
        vocab_dim = scores.shape[-1]

        # Position 0 is the start token and is never predicted, so drop it and
        # flatten the remaining positions across the batch.
        flat_scores = scores[1:].view(-1, vocab_dim)
        flat_target = target[1:].view(-1)

        loss = criterion(flat_scores, flat_target)
        loss.backward()

        # Limit the gradient norm before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [None]:
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss over ``iterator`` without updating the model.

    The model is switched to eval mode, gradients are disabled, and it is
    called with a teacher-forcing ratio of 0, so it always consumes its own
    predictions.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()

    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            source, target = batch.text, batch.summary

            # Third argument 0 turns teacher forcing off.
            scores = model(source, target, 0)
            vocab_dim = scores.shape[-1]

            # Skip position 0 (start token) and flatten across the batch.
            flat_scores = scores[1:].view(-1, vocab_dim)
            flat_target = target[1:].view(-1)

            running_loss += criterion(flat_scores, flat_target).item()

    return running_loss / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes
    and the remaining whole seconds, returned as ``(minutes, seconds)``."""
    span = end_time - start_time
    whole_minutes = int(span / 60)
    leftover_seconds = int(span - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [None]:
def translate_sentence(model, sentence, src, trg, device, max_length=50):
    """Greedily decode an output sequence for ``sentence``.

    Changes compared with the previous version:
      * decoding stops only on the target field's end token. The old extra
        check against ``trg.vocab.stoi["eos"]`` resolved to the unknown-token
        index whenever the word "eos" was not in the vocabulary, so any
        ``<unk>`` prediction ended decoding early;
      * strings are tokenized with the source field's own tokenizer instead of
        reloading a spaCy pipeline on every call;
      * the model is put in eval mode while decoding (dropout off) and its
        previous mode is restored afterwards;
      * start/end tokens come from the fields rather than hard-coded strings.

    Args:
        model: module exposing ``encoder`` and ``decoder`` with the call
            signatures used below.
        sentence: raw string, or an already tokenized sequence of strings.
        src: source field; ``tokenize``, ``init_token``, ``eos_token`` and
            ``vocab.stoi`` are used.
        trg: target field; ``init_token``, ``eos_token``, ``vocab.stoi`` and
            ``vocab.itos`` are used.
        device: device the input tensors are created on.
        max_length: maximum number of decoding steps.

    Returns:
        List of predicted token strings without the leading start token. The
        end token is included as the last element when the model produced it.
    """
    # Lower-case to match the vocabulary (the fields are built with lower=True).
    if isinstance(sentence, str):
        tokens = [token.lower() for token in src.tokenize(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # Wrap in start/end markers and convert to a [src len, 1] index tensor.
    tokens = [src.init_token] + tokens + [src.eos_token]
    text_to_indices = [src.vocab.stoi[token] for token in tokens]
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    sos_idx = trg.vocab.stoi[trg.init_token]
    eos_idx = trg.vocab.stoi[trg.eos_token]

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            hidden, cell = model.encoder(sentence_tensor)

            outputs = [sos_idx]
            for _ in range(max_length):
                previous_word = torch.LongTensor([outputs[-1]]).to(device)
                output, hidden, cell = model.decoder(previous_word, hidden, cell)
                best_guess = output.argmax(1).item()
                outputs.append(best_guess)

                # Stop once the model predicts the end of the sentence.
                if best_guess == eos_idx:
                    break
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    # Drop the start token and map indices back to token strings.
    return [trg.vocab.itos[idx] for idx in outputs[1:]]

In [None]:
sentance = "yummy product convenient love warm mine microwave seconds dive"

In [None]:
# Number of passes over the training data and the gradient-norm limit handed to train().
N_EPOCHS = 50
CLIP = 1

# Lowest validation loss seen so far; only improvements are checkpointed.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    # Decode the fixed sample sentence before each epoch to watch the output evolve.
    print(translate_sentence(model,sentance, SRC, TRG, device, max_length=50) )

    start_time = time.time()
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    # Wall-clock duration of training plus validation for this epoch.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep the weights of the best epoch (by validation loss) on disk.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'modelv1.pt')
    
    # PPL is reported as exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

In [None]:
# Reload the best checkpoint. map_location lets a checkpoint written on a GPU
# runtime load on whatever device this session uses.
model.load_state_dict(torch.load('modelv1.pt', map_location=device))

# The loss is computed on valid_iterator (the validation split); test.csv is
# never turned into an iterator in this notebook, so the figure is reported as
# a validation loss rather than a test loss.
test_loss = evaluate(model, valid_iterator, criterion)
print(f'| Val. Loss: {test_loss:.3f} | Val. PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 10.766 | Test PPL: 47384.592 |


In [None]:
import pandas as pd

In [None]:
df = pd.read_csv("/content/mydata/test.csv")

In [None]:
# Print n randomly chosen test reviews with their reference and generated summaries.
n=30
for i in range(n):
  # randrange(len(df)) is always a valid row position. The previous
  # round(random.random()*df.shape[0]) could evaluate to df.shape[0], which is
  # one past the last row.
  index = random.randrange(len(df))
  review = df["text"].iloc[index]
  print("Review : " + review)
  print("original : ", df["summary"].iloc[index])
  print(translate_sentence(model, review, SRC, TRG, device, max_length=50) )
  print("*"*1000)

In [None]:
!git clone https://github.com/pytorch/examples.git

Cloning into 'examples'...
remote: Enumerating objects: 2876, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 2876 (delta 3), reused 6 (delta 1), pack-reused 2863[K
Receiving objects: 100% (2876/2876), 39.18 MiB | 17.70 MiB/s, done.
Resolving deltas: 100% (1440/1440), done.


In [None]:
# `!cd` runs in a throwaway subshell and changes nothing for later cells.
# The clone above lands in /content/examples, while the following cells call
# scripts under /content/word_language_model, so put a copy there (once).
![ -d /content/word_language_model ] || cp -r /content/examples/word_language_model /content/word_language_model

In [None]:
!python /content/word_language_model/main.py --cuda --data /content/word_language_model/data/wikitext-2 --epochs 15

| epoch   1 |   200/ 2983 batches | lr 20.00 | ms/batch 47.59 | loss  7.63 | ppl  2068.24
| epoch   1 |   400/ 2983 batches | lr 20.00 | ms/batch 45.26 | loss  6.86 | ppl   953.24
| epoch   1 |   600/ 2983 batches | lr 20.00 | ms/batch 45.22 | loss  6.50 | ppl   663.25
| epoch   1 |   800/ 2983 batches | lr 20.00 | ms/batch 45.10 | loss  6.30 | ppl   544.83
| epoch   1 |  1000/ 2983 batches | lr 20.00 | ms/batch 45.18 | loss  6.16 | ppl   471.85
| epoch   1 |  1200/ 2983 batches | lr 20.00 | ms/batch 45.04 | loss  6.07 | ppl   432.85
| epoch   1 |  1400/ 2983 batches | lr 20.00 | ms/batch 45.18 | loss  5.96 | ppl   387.37
| epoch   1 |  1600/ 2983 batches | lr 20.00 | ms/batch 45.18 | loss  5.96 | ppl   388.76
| epoch   1 |  1800/ 2983 batches | lr 20.00 | ms/batch 45.17 | loss  5.82 | ppl   336.07
| epoch   1 |  2000/ 2983 batches | lr 20.00 | ms/batch 45.22 | loss  5.80 | ppl   330.01
| epoch   1 |  2200/ 2983 batches | lr 20.00 | ms/batch 45.26 | loss  5.68 | ppl   294.24
| epoch   

In [None]:
!python /content/word_language_model/generate.py --cuda --data /content/word_language_model/data/wikitext-2 

/content/word_language_model/data/wikitext-2/train.txt
/content/word_language_model/data/wikitext-2/valid.txt
/content/word_language_model/data/wikitext-2/test.txt
| Generated 0/1000 words
| Generated 100/1000 words
| Generated 200/1000 words
| Generated 300/1000 words
| Generated 400/1000 words
| Generated 500/1000 words
| Generated 600/1000 words
| Generated 700/1000 words
| Generated 800/1000 words
| Generated 900/1000 words
