# Tasks
1. load text data from the book text file
2. prepare the data for training
    - one-hot encode the characters
    - create a data loader for training (split the data into batches of sequences, each with an input and a target)
3. create the LSTM model
4. training
5. testing

# Import libraries and define constants

In [1]:
import helper
from src import *
import random
from torch.utils.data import DataLoader
import torch


In [2]:
# Notebook configuration. All paths are relative to the working directory.
dataset_path = "../dataset"        # folder that holds the raw text file
text_file_name = "anna.txt"        # corpus file inside dataset_path
weights_path = "../model_weights"  # handed to train() as weight_saving_path
train_data_path = "../train_data"  # handed to train() as epoch_data_saving_path
train_on_gpu = False               # forwarded to the CharsRnn constructor

# Load Data

In [3]:
# Read the whole book into memory as a single string.
# The encoding is passed explicitly so decoding does not depend on the
# platform's locale default (assumes the file is UTF-8; plain ASCII is a subset).
with open(dataset_path + "/" + text_file_name, 'r', encoding="utf-8") as txt_file:
    txt = txt_file.read()

In [4]:
# Preview the first 100 characters of the loaded text.
txt[:100]

'Chapter 1\n\n\nHappy families are all alike; every unhappy family is unhappy in its own\nway.\n\nEverythin'

In [5]:
# Distinct characters of the corpus; later passed to CharsDataset and used
# (via len(chars)) to size the CharsRnn model.
chars = set(txt)
# Persist the character list next to the dataset.
# "/" is used as the separator, as in the cell that opens the text file:
# the previous "\chars.json" relied on the unrecognised escape "\c" and only
# produced a path inside dataset_path on Windows.
helper.save_dict_to_json(dataset_path + "/chars.json", list(chars))

## Testing Dataset class

In [6]:
from IPython.display import display  # public import location for display()

# Fixed-seed, local RNG: a failing case can be reproduced and the global
# `random` state is left untouched.
test_rng = random.Random(0)
# Sorted so the sampled strings do not depend on set iteration order.
test_alphabet = sorted(chars)

# Randomized smoke test: feed random strings with random batch/sequence sizes
# to data_loader_test.
for i in range(1000):
    print(f"test {i+1}")
    # Test-local names, so the notebook-level `batch_Size` used for training
    # is never overwritten when this cell is (re-)run.
    test_batch_size = test_rng.randint(1, 10)
    test_no_characters = test_rng.randint(20, 100)
    test_seq_len = test_rng.randint(1, 10)
    characters = "".join(test_rng.choices(test_alphabet, k=test_no_characters))
    data_loader_test(characters, batch_size=test_batch_size, seq_length=test_seq_len)
    print()
display("all test cases passed")

test 1
test text no of characters=51
H3f;ScJ&X&I.zQ9c`tfWX.dw %&c&s%cUCq:v%r;x/R%A%J;A%E
converting characters to numbers ....
Done
characters after clipping to fit n sequence size=50
no of batches=9  , batch_size 6 , seq_length=1 , no of chars=54
Test Text H3f;ScJ&X&I.zQ9c`tfWX.dw %&c&s%cUCq:v%r;x/R%A%J;A%E
Data text H3f;ScJ&X&I.zQ9c`tfWX.dw %&c&s%cUCq:v%r;x/R%A%J;A%
labels text 3f;ScJ&X&I.zQ9c`tfWX.dw %&c&s%cUCq:v%r;x/R%A%J;A%E

test 2
test text no of characters=26
S%lAlJ.P:@wAQ$w%6B:B('ySUr
converting characters to numbers ....
Done
characters after clipping to fit n sequence size=24
no of batches=1  , batch_size 4 , seq_length=6 , no of chars=24
Test Text S%lAlJ.P:@wAQ$w%6B:B('ySUr
Data text S%lAlJ.P:@wAQ$w%6B:B('yS
labels text %lAlJ.P:@wAQ$w%6B:B('ySU

test 3
test text no of characters=30
W!"_SI_(A(Sne,N!hWvdL!re,dddAh
converting characters to numbers ....
Done
characters after clipping to fit n sequence size=27
no of batches=2  , batch_size 8 , seq_length=3 , no of chars=48
Test 

'all test cases passed'

# Train and test dataset

## Data split

In [7]:
# Hold out the tail of the text for testing; the head is used for training.
test_percentage = 0.25
train_size = len(txt) - int(test_percentage * len(txt))
train_text, test_text = txt[:train_size], txt[train_size:]

## train and test loaders

In [8]:
# Number of sequences per batch, used by both data loaders and by init_hidden.
batch_Size = 64

In [9]:
# Wrap each text split in a CharsDataset (train first, then test).
train_dataset, test_dataset = (
    CharsDataset(chars, split_text) for split_text in (train_text, test_text)
)

# Same loader settings for both splits; incomplete final batches are dropped.
train_loader, test_loader = (
    DataLoader(split_dataset, batch_size=batch_Size, drop_last=True)
    for split_dataset in (train_dataset, test_dataset)
)

converting characters to numbers ....
Done
converting characters to numbers ....
Done


In [10]:
# Show the training dataset's string summary.
str(train_dataset)

'sequence_length:5 , no_sequences 297783 , no_chars 83'

In [11]:
# Pull a single batch from the training loader and print its tensor shapes.
# NOTE(review): `test_itr` iterates over train_loader, not test_loader — the
# name is misleading; confirm before relying on it elsewhere.
test_itr=iter(train_loader)
data,labels=next(test_itr)
print(f"data shape {data.shape} \nlabels shape {labels.shape}")


data shape torch.Size([64, 5, 83]) 
labels shape torch.Size([64, 5])


# Create starting model

In [12]:
# Model size: two stacked hidden layers, each twice as wide as the input
# (the input width is the number of distinct characters).
hidden_layers = 2
hidden_nodes = 2 * len(chars)

In [13]:
# Build the network. hidden_nodes was set to twice the input size, where the
# input size is the number of distinct characters.
charsRnn = CharsRnn(
    no_chars=len(chars),
    hidden_nodes=hidden_nodes,
    hidden_layers=hidden_layers,
    train_on_gpu=train_on_gpu,
)

In [14]:
# Print the model's module structure.
print(charsRnn)

CharsRnn(
  (lstm): LSTM(83, 166, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc): Linear(in_features=166, out_features=83, bias=True)
)


In [15]:
# Shape check: run the previously fetched batch through the untrained model
# and compare the output shape with the flattened labels.
lstm_hidden = charsRnn.init_hidden(batch_size=batch_Size)
out, hidden = charsRnn(data, lstm_hidden)
print(
    f"out shape {out.shape} \nhidden shape {hidden[0].shape}\nlabels shape {labels.view(train_dataset.seq_length*batch_Size).shape}"
)

out shape torch.Size([320, 83]) 
hidden shape torch.Size([2, 64, 166])
labels shape torch.Size([320])


# Model training and hyperparameter runs

In [16]:
# Train for one epoch; train() returns the train and test losses.
train_losses, test_losses = train(
    model=charsRnn,
    train_loader=train_loader,
    test_loader=test_loader,
    lr=0.01,
    epochs=1,
    weight_saving_path=weights_path,
    epoch_data_saving_path=train_data_path,
)

Testing before training ...
 Testing [=.........]time remaining (m) = 0.55 Avg Test_Loss=4.43361861

KeyboardInterrupt: 