# [ global ]

In [1]:
# inbuilt 
import os
import sys
import math

# most common
import numpy as np
import matplotlib.pyplot as plt

# pytorch
import torch as tt
import torch.nn as nn
import torch.optim as oo
import torch.functional as ff
import torch.distributions as dd
import torch.utils.data as ud

# custom
import known
import known.ktorch as kt
from known.basic import pj
print(f'{sys.version=}\n{np.__version__=}\n{tt.__version__=}\n{known.__version__=}')

from torch.utils.data import Dataset, IterableDataset, DataLoader
import glob

import unicodedata
import string

import known 
import known.ktorch as kt

sys.version='3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]'
np.__version__='1.24.1'
tt.__version__='1.13.1+cpu'
known.__version__='0.0.1'


In [2]:
class NamesData(Dataset):
    """Map-style dataset of names grouped by category (one text file per category).

    Every file matched by ``path`` is one category: the category name is the
    file's base name without extension and each line of the file is one name,
    reduced to the characters in ``all_letters``.

    ``__getitem__`` ignores the requested index and draws a random
    (category, name) pair from ``self.rng`` instead, so the object behaves like
    a random sampler whose nominal length is the total number of names.

    This is a ``Dataset`` (not a ``DataLoader``): it is the thing that gets
    wrapped by ``DataLoader(ds, ...)``.

    Attributes set by ``__init__``:
        all_categories: list of category names, in sorted file order.
        category_lines: dict mapping category name -> list of names.
        n_categories:   number of categories.
        length:         total number of names over all categories.
        rng:            ``numpy.random.Generator`` used for sampling.
        last:           ``(line, category)`` of the most recent sample
                        (only exists after the first ``__getitem__`` call).
    """

    # Alphabet used for the one-hot encoding; position in this string is the index.
    all_letters = string.ascii_letters + " .,;'"
    n_letters = len(all_letters)

    @staticmethod
    def findFiles(path):
        """Return the list of file paths matching the glob pattern ``path``."""
        return glob.glob(path)

    @staticmethod
    def unicodeToAscii(s):
        """Strip combining marks from ``s`` and drop characters not in ``all_letters``."""
        return ''.join(
            c for c in unicodedata.normalize('NFD', s)
            if unicodedata.category(c) != 'Mn'
            and c in __class__.all_letters
        )

    @staticmethod
    def readLines(filename):
        """Read a UTF-8 file and return its lines converted with ``unicodeToAscii``."""
        # Context manager so the handle is closed deterministically.
        with open(filename, encoding='utf-8') as handle:
            lines = handle.read().strip().split('\n')
        return [__class__.unicodeToAscii(line) for line in lines]

    @staticmethod
    def letterToIndex(letter):
        """Return the index of ``letter`` in ``all_letters``, or -1 if it is absent."""
        return __class__.all_letters.find(letter)

    @staticmethod
    def letterToTensor(letter):
        """One-hot encode a single letter as a ``(1, n_letters)`` float tensor.

        A letter outside ``all_letters`` yields an all-zero row (previously the
        -1 "not found" index silently marked the last column).
        """
        tensor = tt.zeros(1, __class__.n_letters)
        index = __class__.letterToIndex(letter)
        if index >= 0:
            tensor[0][index] = 1
        return tensor

    @staticmethod
    def lineToTensor(line):
        """One-hot encode ``line`` as a ``(len(line), 1, n_letters)`` float tensor.

        Characters outside ``all_letters`` produce an all-zero row instead of
        being mis-encoded through index -1.
        """
        tensor = tt.zeros(len(line), 1, __class__.n_letters)
        for li, letter in enumerate(line):
            index = __class__.letterToIndex(letter)
            if index >= 0:
                tensor[li][0][index] = 1
        return tensor

    def __len__(self):
        """Total number of names across all categories."""
        return self.length

    def __getitem__(self, index):
        """Return a randomly drawn sample; ``index`` is ignored.

        Returns:
            (line_tensor, category_tensor) where ``line_tensor`` has shape
            ``(len(line), n_letters)`` and ``category_tensor`` is a one-hot
            ``long`` vector of length ``n_categories``.
        """
        # Pick a category uniformly, then a name uniformly inside that category.
        category = self.all_categories[self.rng.integers(0, len(self.all_categories))]
        lines = self.category_lines[category]
        line = lines[self.rng.integers(0, len(lines))]
        category_tensor = tt.zeros(self.n_categories, dtype=tt.long)
        category_tensor[self.all_categories.index(category)] = 1
        line_tensor = __class__.lineToTensor(line)
        # Remember the raw sample so callers can inspect what was drawn.
        self.last = (line, category)
        return line_tensor.squeeze(1), category_tensor

    def __init__(self, path='data/names/*.txt', seed=None):
        """Load every file matching ``path`` as one category.

        Args:
            path: glob pattern selecting the category files.
            seed: seed forwarded to ``numpy.random.default_rng``.

        Raises:
            FileNotFoundError: if ``path`` matches no file (such a dataset
                could never produce a sample).
        """
        super().__init__()
        # glob returns files in arbitrary order; sort so that category indices
        # are stable across runs and platforms.
        fileslike = sorted(__class__.findFiles(path))
        if not fileslike:
            raise FileNotFoundError(f'no files match pattern: {path!r}')
        self.category_lines = {}
        self.all_categories = []
        for filename in fileslike:
            category = os.path.splitext(os.path.basename(filename))[0]
            self.all_categories.append(category)
            self.category_lines[category] = __class__.readLines(filename)
        self.n_categories = len(self.all_categories)
        self.length = sum(len(v) for v in self.category_lines.values())
        self.rng = np.random.default_rng(seed)

    def categoryFromOutput(self, output):
        """Map a score tensor to ``(category_name, category_index)`` of its top entry.

        Uses ``output.topk(1)`` and reads the first element of the returned indices.
        """
        _, top_i = output.topk(1)
        category_i = top_i[0].item()
        return self.all_categories[category_i], category_i

In [3]:
# Build the dataset with its defaults: files matching 'data/names/*.txt' and
# seed=None, so the random sampling is not reproducible between runs.
ds = NamesData()

In [4]:
# Wrap the dataset in a DataLoader (one sample per batch) and keep an iterator
# so single batches can be pulled with next().
dl=iter(DataLoader(ds, batch_size=1))


In [5]:
# Pull one batch: x is the encoded name, y the one-hot category vector.
x,y = next(dl)


# Print type/len/shape summaries of both tensors (the second argument on the
# y call appears to also print the object itself), then the raw
# (name, category) pair the dataset drew last.
known.about(x)
known.about(y, True)
print(ds.last)

type: <class 'torch.Tensor'>
len: 1
shape: torch.Size([1, 7, 57])
type: <class 'torch.Tensor'>
len: 1
shape: torch.Size([1, 18])
Object:
tensor([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
('Bernard', 'French')


In [None]:
# Build the recurrent classifier: one-hot letters in, a 128-unit hidden layer,
# then a layer with one unit per category; the activation is a softmax over dim 1.
# NOTE(review): the training cell pairs this model with nn.NLLLoss, which is
# documented to expect log-probabilities, while softmax yields probabilities —
# confirm whether log_softmax was intended.
rnnc = kt.ELMANC(
    has_bias=True,
    actF=lambda x: tt.softmax(x, dim=1),
    input_size=NamesData.n_letters,         # input features
    hidden_sizes=[128, ds.n_categories],       # hidden features at each layer
    dropout=0.0,        # dropout after each layer, only if hidden_sizes > 1
    batch_first=True,  # if true, expects input as (batch_size, seq_len, input_size) else (seq_len, batch_size, input_size)
    dtype=tt.float32,
    device=None,
    stack_output=True)


In [None]:
# Smoke-test the untrained model on the batch x pulled earlier, without
# tracking gradients. The call is unpacked into two values
# (presumably outputs P and hidden states H — TODO confirm against kt.ELMANC).
with tt.no_grad():
    P, H = rnnc(x)
# Print summaries of both returned objects.
known.about(P)
known.about(H)

In [None]:

# --- run configuration -------------------------------------------------------
model = rnnc
epochs = 100_000
batch_size = 1
shuffle = True
validation_freq = epochs // 10

# --- loss, optimiser and learning-rate schedule ------------------------------
# NOTE(review): nn.NLLLoss is documented to expect log-probabilities; the model
# above is built with a softmax activation — confirm this pairing is intended.
criterion = nn.NLLLoss()
lr = 0.005
weight_decay = 0.0
optimizer = oo.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
# Linear decay of the learning-rate factor from 1.0 to 0.7 over `epochs` steps.
# The scheduler is created here but is not handed to the trainer in this cell.
lrs = oo.lr_scheduler.LinearLR(
    optimizer,
    start_factor=1.0,
    end_factor=0.7,
    total_iters=epochs,
)

# --- monitors and bookkeeping (defined here, not passed to fit() below) -------
early_stop_train = kt.QuantiyMonitor('TrainLoss', patience=50, delta=1e-5)
early_stop_val = kt.QuantiyMonitor('ValLoss', patience=50, delta=1e-5)
checkpoint_freq = epochs // 4
save_path = 'sample.rnn'
loss_plot_start = epochs // 50

# --- trainer wiring ----------------------------------------------------------
trainer = kt.Trainer(model)
trainer.optimizer = optimizer
trainer.criterion = criterion

# Train on the names dataset; no validation set is supplied.
trainer.fit(
    training_data=ds,
    validation_data=None,
    epochs=epochs,
    batch_size=batch_size,
    shuffle=shuffle,
    validation_freq=validation_freq,
    save_path=save_path,
    use_rnn=True,
    verbose=1,
)

trainer.plot_results(loss_plot_start=loss_plot_start)

# Evaluate on the training dataset itself and report the first returned value.
mtl, tl = trainer.evaluate(ds, use_rnn=True)
print('loss', mtl)
print('=================================================')
print('=================================================')

In [None]:
def predict(input_line, n_predictions=3):
    """Run the model on one name and draw one bar chart per slice of its output.

    The name is one-hot encoded with ``NamesData.lineToTensor``, passed through
    the global ``rnnc`` without gradient tracking, and the first returned value
    is iterated along its leading dimension. For every slice, element ``[0]`` is
    plotted as a bar chart over ``ds.all_categories``, titled with the 1-based
    slice number.

    Args:
        input_line: the name to classify.
        n_predictions: accepted but not used by the current implementation.

    NOTE(review): ``lineToTensor`` yields ``(seq_len, 1, n_letters)`` while the
    model is constructed with ``batch_first=True`` — confirm the intended layout.
    """
    category_names = ds.all_categories
    print('\n> %s' % input_line)
    with tt.no_grad():
        encoded = NamesData.lineToTensor(input_line)
        # Keep only the first returned value; anything else is discarded.
        output, *_ = rnnc(encoded)
        print(output.shape)
    for position, scores in enumerate(output, start=1):
        plt.figure(figsize=(20,3))
        plt.title(f'{position}')
        plt.bar(category_names, scores[0])
        plt.show()




In [None]:
# Example query: plot the model's category scores for the name 'Dovesky'.
predict('Dovesky')

In [None]:
# Example query: plot the model's category scores for the name 'Jackson'.
predict('Jackson')


In [None]:
# Example query: plot the model's category scores for the name 'Satoshi'.
predict('Satoshi')