# Tasks
1. load text data from the book text file
2. prepare the data for training
    - hot-encode the characters
    - create data loader for training (split the data into (batch,sequences) for input and target)
3. create the LSTM model
4. training
5. testing

# Import libraries and define constants

In [1]:
import helper
from src import *
import random
from torch.utils.data import DataLoader
import torch
import traintracker

In [2]:
dataset_path = "../dataset"
text_file_name = "anna.txt"
weights_path="../model_weights"
train_data_path="../train_data"
train_on_gpu=False

# Load Data

In [3]:
with open(dataset_path + "/" + text_file_name, 'r') as txt_file:
    txt = txt_file.read()

In [4]:
txt[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [5]:
chars = set(txt)
helper.save_dict_to_json(dataset_path + "\chars.json", list(chars))

## Testing Dataset class

In [6]:
from IPython.core.display_functions import display

for i in range(1000):
    print(f"test {i+1}")
    batch_Size=random.randint(1,10)
    no_characters=random.randint(20,100)
    seq_len=random.randint(1,10)
    characters="".join(random.choices(list(chars),k=no_characters))
    data_loader_test(characters,batch_size=batch_Size,seq_length=seq_len)
    print()
display("all test cases passed")

test 1
test text no of characters=64
H9ZpS3PS-O@-RUUAA@oASFJUvp -YL5mupFeRPFHfYGz!
u3(X FIpZYh-Y!HZj4
converting characters to numbers ....
Done
characters after clipping to fit n sequence size=63
no of batches=7  , batch_size 1 , seq_length=9 , no of chars=63
Test Text H9ZpS3PS-O@-RUUAA@oASFJUvp -YL5mupFeRPFHfYGz!
u3(X FIpZYh-Y!HZj4
Data text H9ZpS3PS-O@-RUUAA@oASFJUvp -YL5mupFeRPFHfYGz!
u3(X FIpZYh-Y!HZj
labels text 9ZpS3PS-O@-RUUAA@oASFJUvp -YL5mupFeRPFHfYGz!
u3(X FIpZYh-Y!HZj4

test 2
test text no of characters=31
hBDqgtBFsh6sisuDtYuhyP8hsi5yWr6
converting characters to numbers ....
Done
characters after clipping to fit n sequence size=30
no of batches=15  , batch_size 1 , seq_length=2 , no of chars=30
Test Text hBDqgtBFsh6sisuDtYuhyP8hsi5yWr6
Data text hBDqgtBFsh6sisuDtYuhyP8hsi5yWr
labels text BDqgtBFsh6sisuDtYuhyP8hsi5yWr6

test 3
test text no of characters=90
u2ym2qTNKMN%2"422GKBBG2FN%"x%4VFVju'TIM,JG'%43KuKlyFy`c,'uJ%2wBNBjBUGmuxJF%u(J_yVyF2D`,RqT
converting characters to numb

'all test cases passed'

# Train and test dataset

## Data split

In [6]:
test_percentage=0.25
train_size=len(txt)-int(len(txt)*test_percentage)
train_text=txt[:train_size]
test_text=txt[train_size:]

## train and test loaders

In [7]:
batch_Size=64
seq_len=50

In [8]:
train_dataset=CharsDataset(chars,train_text,seq_length=seq_len)
test_dataset=CharsDataset(chars,test_text,seq_length=seq_len)

train_loader=DataLoader(train_dataset,batch_size=batch_Size,drop_last=True)
test_loader=DataLoader(test_dataset,batch_size=batch_Size,drop_last=True)

converting characters to numbers ....
Done
converting characters to numbers ....
Done


In [26]:
train_data_text=[]
train_labels_text=[]
train_itr=iter(train_dataset)
for i in range(5):
  data,label=next(train_itr)
  for j in range(seq_len):
      char_idx=torch.argmax(data[j]).item()
      char=train_dataset.int_to_chars[char_idx]
      train_data_text.append(char)

      char_idx=label[j].item()
      char=train_dataset.int_to_chars[char_idx]
      train_labels_text.append(char)

train_data_text="".join(train_data_text)
train_labels_text="".join(train_labels_text)

print("data loader")
print(train_data_text)
print("Actual")
print(txt[:250])
print("\n\n")
print(" labels")
print(train_labels_text)
print("Actual")
print(txt[1:251])


data loader
Chapter 1


Happy families are all alike; every unhappy family is unhappy in its own
way.

Everything was in confusion in the Oblonskys' house. The wife had
discovered that the husband was carrying on an intrigue with a French
girl, who had been a go
Actual
Chapter 1


Happy families are all alike; every unhappy family is unhappy in its own
way.

Everything was in confusion in the Oblonskys' house. The wife had
discovered that the husband was carrying on an intrigue with a French
girl, who had been a go



 labels
hapter 1


Happy families are all alike; every unhappy family is unhappy in its own
way.

Everything was in confusion in the Oblonskys' house. The wife had
discovered that the husband was carrying on an intrigue with a French
girl, who had been a gov
Actual
hapter 1


Happy families are all alike; every unhappy family is unhappy in its own
way.

Everything was in confusion in the Oblonskys' house. The wife had
discovered that the husband was carrying on an intrigu

In [27]:
str(train_dataset)

'sequence_length:50 , no_sequences 29778 , no_chars 83'

In [28]:
test_itr=iter(train_loader)
data,labels=next(test_itr)
print(f"data shape {data.shape} \nlabels shape {labels.shape}")


data shape torch.Size([64, 50, 83]) 
labels shape torch.Size([64, 50])


# Create starting model

In [12]:
traintracker.last_saved_hyperparameters(train_data_path)['model architecture']

'CharsRnn(\n  (lstm): LSTM(83, 256, num_layers=2, batch_first=True, dropout=0.5)\n  (dropout): Dropout(p=0.5, inplace=False)\n  (fc): Linear(in_features=256, out_features=83, bias=True)\n)'

In [9]:
hidden_layers=2
hidden_nodes=256

In [10]:
# hidden nodes by default 2* input size which is no of characters exits
charsRnn=CharsRnn(chars=chars,hidden_nodes=hidden_nodes,hidden_layers=hidden_layers,train_on_gpu=train_on_gpu)

In [11]:
print(charsRnn)

CharsRnn(
  (lstm): LSTM(83, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=256, out_features=83, bias=True)
)


In [12]:
charsRnn.load_weights(traintracker.latest_weights_path(weights_path),cuda_weights=True)

In [13]:
lstm_hidden=charsRnn.init_hidden(batch_size=batch_Size)
out,hidden=charsRnn(data,lstm_hidden)
print(f"out shape {out.shape} \nhidden shape {hidden[0].shape}\nlabels shape {labels.view(train_dataset.seq_length*batch_Size).shape}")

NameError: name 'data' is not defined

# Model training and hyper parameters running

In [18]:
train_losses,test_losses=train(model=charsRnn,train_loader=train_loader,test_loader=test_loader,lr=0.001,epochs=2,cuda=train_on_gpu,weight_saving_path=weights_path,train_data_save_path=train_data_path)

Testing before training ...
 testing [==........] time remaining (m) = 0.54 Avg Test_Loss=10.38660384


KeyboardInterrupt



In [13]:
print(charsRnn.predict_text("the children should respect thier parents",no_chars=500))

the children should respect thier parents%bfPmj%j6-tj6A)V6j%mwt6m %6Pj%j6Pj%5A)Fpt6P.-%NrA)V6j%PF,- j%5-wj%A

%mw%j6-mr%6Ar - B%Pwp% j-??-p%Pwp% Pjm 
PIjmAw%A
% A,-% -IAwpB%Pwp%5P %jA%N-%PFF%jA% jA?B%j-Pr iiP% jmFF%j6Pjt5P %Pw1t jAr1%PjtjA%j6-% P,-% m j-rB%Pwp%6-t6Pp%j6-% -IAwp%5-r-%Pw1%AwF1% jPrj-p%Pwp%5-wj%A

%Pwp%jP&mwV%P%V-wjF-,Pw%Pwp%56mI6% PmpB%5PF&-p%P%?-w-rjmI% jPrj-p%5mj6%j6-mr% m j-rB%5mj6%6P??1tA
% AIm-j1B%Pwp%6P.mwV%j6-r-%5P %P% jPj-%A
% jAAp% jrPwV-B%5P %wmV6jT%Rwp%6-%6Pp%wAj%j6mw&%PNA)j%j6-tjmw-pB%N)j% 6-%6-Prp%6-r%j6-t j


In [14]:
def predict_text(self, context: str, no_chars: int, hidden=None):
    self.eval()
    with torch.no_grad():
        if hidden is None:
            hidden = self.init_hidden(1)
        # feed the context
        for c in context:
            next_char, hidden = self._next_char(c, hidden)
        next_char = context[-1]
        # predict next chars
        for i in range(no_chars):
            next_char, hidden = self._next_char(next_char, hidden)
            context += next_char

    return context

In [16]:
print(predict_text(charsRnn,"the children should respect thier parents",no_chars=1000))

KeyboardInterrupt: 

In [None]:
x="There happened to him at that instant what does happen to people when \
they are unexpectedly caught in something very disgraceful. He did not\
succeed in adapting his face to the position in which he was placed\
towards his wife by the discovery of his fault. Instead of being hurt"

In [21]:
def one_hot_encode(arr, n_labels):

    # Initialize the the encoded array
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)

    # Fill the appropriate elements with ones
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1.

    # Finally reshape it to get back to the original array
    one_hot = one_hot.reshape((*arr.shape, n_labels))

    return one_hot

In [25]:
def predict(net, char, h=None, top_k=None):
        ''' Given a character, predict the next character.
            Returns the predicted character and the hidden state.
        '''

        # tensor inputs
        x = np.array([[net.chars_to_int[char]]])
        x = one_hot_encode(x, len(net.chars))
        inputs = torch.from_numpy(x)

        if(train_on_gpu):
            inputs = inputs.cuda()

        # detach hidden state from history
        h = tuple([each.data for each in h])
        # get the output of the model
        out, h = net(inputs, h)

        # get the character probabilities
        p = F.softmax(out, dim=1).data
        if(train_on_gpu):
            p = p.cpu() # move to cpu

        # get top characters
        if top_k is None:
            top_ch = np.arange(len(net.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()

        # select the likely next character with some element of randomness
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p/p.sum())


        # return the encoded value of the predicted char and the hidden state
        return net.int_to_chars[char], h

In [26]:
def sample(net, size, prime='The', top_k=None):

    if(train_on_gpu):
        net.cuda()
    else:
        net.cpu()

    net.eval() # eval mode

    # First off, run through the prime characters
    chars = [ch for ch in prime]
    h = net.init_hidden(1)
    for ch in prime:
        char, h = predict(net, ch, h, top_k=top_k)

    chars.append(char)

    # Now pass in the previous character and get a new one
    for ii in range(size):
        char, h = predict(net, chars[-1], h, top_k=top_k)
        chars.append(char)

    return ''.join(chars)

In [27]:
print(sample(charsRnn, 1000, prime='Anna', top_k=5))

AnnaT%86-%IP?%j6-%?Pmwj-r%A
%j6-%,Pr-%Pwp% Pjm 
PIjmAw%j6-% jrmwV%Pwp%5Pmj%P%FPp1%jA%j6-%?rmwI-%Pwp%6Ar - %A
%j6-%?-r6-Prp%j6-%I6mFpr-w%Pwp%6-%6Pp%PFFA5%j6-,T%bf-FFB%Pwp% 6PFFJ%Q% 6PFF% A,-t,) jPI6- J%Qj%5P %PwAj6-r%jAA%,Pw% jP1-p%mwjA%j6-t PIr-,-wj%PNA)j%mj%mj%m %wAj%Aw%j6-%prA.P% Pj%pA5w%j6-%Irm- %j6-% )N*-Ij%5mj6%j6-%IA,,m  mAwT%buA)% --%1A)TbttbQ7,%wAj% jrPmV6j%mw%P%IAw jrP1%PVPmwT%Q%IPw7j%N)j%mj%5P t --mwV%6-r%jA%6m %6) NPwpB%j6Pj%6-% P5%j6-%,-w%A
% )I6%P%?-P Pwj B%Pwp% A%j-PB%6-%IrAI&-p%6m % -w.-r1B%P%?rAj-Ij-p%N-
Ar-%6-r%6PwpB%P%IPjjF-%6) NPwpB%j6-% A)wp%A
%6m t
PI-TttbQj7 %wA%,-j%mjBb%6-% Pmp%jA%6m,tPwp%P%,Pr-%A
%j6-% jP1TttbQ%pAw7j%&wA5%j6Pj%j6-1%5rPwV%Pj%j6Pj% jPjmAwT%Qjt6Pp%wAj%jr1mwV%P%,Awj6%Pwp%j6-1%6Pp% A,-% jP1%j6Pj%5A)Fp% 6-%jAFp%6m,%5mj6%P% 6Pp-%mwjA%j6-%I6PFF-w&B%Pwp% A%j6-%IPr- T%buA)%I)Fj)r-T%2) NPwp%5-r-% jP1%Pwp% --%j6-% j-?%Pwp% A,-j6mwV%5P % j-?%6P.-% --,mwVT%x6A)FptN-%PF,A j%jP&mwV%Aw%6m %6Ar -%Pwpt A% jr- 6-p%j6-% -r.Pwj%5mj6%j6-% Aw%jA%PjjrPIjmAw%5mj6%6-r% Ar