In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [14]:
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import Dataset,DataLoader
import collections 
import re

In [65]:
class Vocab:
    def __init__(self, tokens=[], min_freq=0, reserved_tokens=[]):
        #count token frequencies
        counter = collections.Counter(tokens)
        self.token_freqs = sorted(counter.items(), key=lambda x:x[1], reverse=True)
        
        #list of unique tokens
        self.itos = list(sorted(set(['<unk>'] + reserved_tokens + [token for token, freq in self.token_freqs if freq >= min_freq])))
        self.stoi = {token: idx for idx,token in enumerate(self.itos) }
        
    def __len__(self):
        #length of vocabulary
        return len(self.itos)
    
    def __getitem__(self, tokens):
        #make tokens into indices
        if not isinstance(tokens,(list,tuple)):
            return self.stoi[tokens]
        return [self.__getitem__(token) for token in tokens]
    
    def to_tokens(self,indices):
        #make indixes into tokens
        if hasattr(indices,'__len__') and len(indices)>1:
            return [self.itos[int(index)] for index in indices]
        return slef.itos[indices]
    
    @property
    def unk(self):
        return self.token_to_idx['<unk>']
class text_dataset(Dataset):
    def __init__(self, num_steps,train= True, train_size=10000, val_size=5000):
        super().__init__()
        corpus,self.vocab = self.build(self._load())
        array = torch.tensor([corpus[i:i+num_steps+1] for i in range(len(corpus)-num_steps)])
        if train:
            self.X,self.Y = array[:,:-1][:train_size],array[:,1:][:train_size]
        else:
            self.X,self.Y = array[:,:-1][train_size:train_size+val_size],array[:,1:][train_size:train_size+val_size]
        
    def _load(self,path= '/kaggle/input/time-machine/timemachine.txt'):
        with open(path,"r") as f:
            return f.read()
        
    def _preprocess(self, raw_text):
        return re.sub('[^A-Za-z]+',' ',raw_text).lower()
    
    def _tokenize(self,text):
        return list(text)
    
    def build(self,raw_text,vocab=None):
        tokens = self._tokenize(self._preprocess(raw_text))
        if vocab is None: vocab = Vocab(tokens)
        corpus = [vocab[token] for token in tokens]
        return corpus, vocab
    
    def __getitem__(self,index):
        sample = self.X[index],self.Y[index]
        return sample
    
    def __len__(self):
        return self.X.shape[0]

In [33]:
class RNN(nn.Module):
    def __init__(self,input_size,hidden_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size)
    
    def forward(self,inputs,H=None):
        return self.rnn(inputs, H)

see docs for torch.RNN\
output shape:T,N,channel_len

In [73]:
class RNNLM(nn.Module):
    '''language model based on RNN'''
    def __init__(self, rnn, vocab_size, clip_value=100.0):
        super().__init__()
        self.rnn = rnn
        self.vocab_size = vocab_size
        self.clip_value = clip_value
#         self.lr = lr
#         self.W_hq = nn.Parameter(torch.randn(self.rnn.hidden_size,vocab_size)*self.rnn.sigma)
#     #the output size set to be vocab_size
#         self.b_q = nn.Parameter(torch.zeros(self.vocab_size))
        self.linear = nn.Linear(self.rnn.hidden_size,vocab_size)
        
    def one_hot(self,X):
        return F.one_hot(X.T, self.vocab_size).type(torch.float32)
    
    def output_layer(self,hiddens):
#         outputs = [torch.matmul(H,self.W_hq) + self.b_q for H in rnn_outputs]
#         return torch.stack(outputs, 1)#num_steps,batch_size,vocab_size
        return self.linear(hiddens).transpose(0,1)
#passing through the linear it is T,B,C(like the input) after that (B,C,T)
    def forward(self,X,state=None):
        '''
        X:batch_size,time_step
        '''
        embs = self.one_hot(X)
        # embs.shape:timesteps,batch_size,embedding_dimension
        rnn_outputs,_ = self.rnn(embs,state)
        return self.output_layer(rnn_outputs)
        
    def clip_gradient(self):
        norm = 0
        for parameter in self.parameters():
            if parameter.requires_grad:
                norm += torch.sum(parameter.grad**2)
            norm = torch.sqrt(norm)
        if norm > self.clip_value:
            for param in self.parameters():
                param.grad*=self.clip_value/norm
    def generate(self,prefix,num_preds,vocab,device = "cuda" if torch.cuda.is_available() else "cpu"):
        state,outputs = None,[vocab[prefix[0]]]#2D array that batch_size,1(index of prefix[0])
        for i in range(len(prefix) + num_preds - 1):
            X = torch.tensor([[outputs[-1]]],device=device)
            embs = self.one_hot(X)
            rnn_outputs, state = self.rnn(embs,state)
            if i < len(prefix) - 1:
                outputs.append(vocab[prefix[i+1]])
            else:
                Y = self.output_layer(rnn_outputs)
                outputs.append(int(Y.argmax(axis=2).reshape(1)))
        return ''.join([vocab.itos[i] for i in outputs])

In [74]:
train_data = text_dataset(num_steps=32,train=True)
test_data = text_dataset(num_steps=32,train=False)

In [75]:
rnn = RNN(input_size=len(train_data.vocab),hidden_size=32)
model = RNNLM(rnn,vocab_size=len(train_data.vocab) )#this decodes to a size same as the input

In [76]:
for name,x in model.named_parameters():
    print(name,i)

rnn.rnn.weight_ih_l0 9
rnn.rnn.weight_hh_l0 9
rnn.rnn.bias_ih_l0 9
rnn.rnn.bias_hh_l0 9
linear.weight 9
linear.bias 9


ih and hh biases are different

In [77]:
lr = 1
num_epochs = 100
optimizer = torch.optim.AdamW(model.parameters(),lr=lr)

In [78]:
train_loader = DataLoader(dataset=train_data,batch_size=1024,shuffle=True)
test_loader = DataLoader(dataset=test_data,batch_size=1024,shuffle=True)

In [82]:
for epoch in range(num_epochs):    
    for i,(sample,targets) in enumerate(train_loader):
        out = model(sample)
#         out.shape:num_steps,batch_size,channel/vocab
#         print(sample.shape)
        batch_size,num_steps,vocab_size = out.shape
#         if i==0:
#             print(out.shape,targets.shape)
#             break
#         print(out.shape)
        out = out.reshape(num_steps*batch_size,-1)
        targets = targets.view(num_steps*batch_size,)
        
        loss = F.cross_entropy(out,targets)
        optimizer.zero_grad()
        loss.backward()
#         model.clip_gradient()
        optimizer.step() 

In [83]:
loss.item()
model.generate('father is using mobile', 20, train_data.vocab)

'father is using mobile the the the the the'