# Text classification using Charecter-*RNN*

* Data loading
    * letter2tensor
    * name2tensor
* model defining
    * RNN model
* training
    * RNN from scratch
* inference
    * inference of a name

In [1]:
!wget https://download.pytorch.org/tutorial/data.zip
!unzip data.zip

--2020-02-10 13:23:29--  https://download.pytorch.org/tutorial/data.zip
Resolving download.pytorch.org (download.pytorch.org)... 13.35.163.29, 13.35.163.98, 13.35.163.63, ...
Connecting to download.pytorch.org (download.pytorch.org)|13.35.163.29|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2882130 (2.7M) [application/zip]
Saving to: ‘data.zip’


2020-02-10 13:23:30 (96.5 MB/s) - ‘data.zip’ saved [2882130/2882130]

Archive:  data.zip
   creating: data/
  inflating: data/eng-fra.txt        
   creating: data/names/
  inflating: data/names/Arabic.txt   
  inflating: data/names/Chinese.txt  
  inflating: data/names/Czech.txt    
  inflating: data/names/Dutch.txt    
  inflating: data/names/English.txt  
  inflating: data/names/French.txt   
  inflating: data/names/German.txt   
  inflating: data/names/Greek.txt    
  inflating: data/names/Irish.txt    
  inflating: data/names/Italian.txt  
  inflating: data/names/Japanese.txt  
  inflating: data/names/Korean.tx

In [0]:
import torch
import torch.nn as nn
import glob
import torch.optim as optim
import os

import string
import unicodedata

In [3]:
def findfiles(path):
    return glob.glob(path)

findfiles("data/names/*.txt")

['data/names/Vietnamese.txt',
 'data/names/Greek.txt',
 'data/names/Dutch.txt',
 'data/names/Japanese.txt',
 'data/names/Spanish.txt',
 'data/names/Italian.txt',
 'data/names/French.txt',
 'data/names/Scottish.txt',
 'data/names/Portuguese.txt',
 'data/names/Polish.txt',
 'data/names/Chinese.txt',
 'data/names/German.txt',
 'data/names/English.txt',
 'data/names/Korean.txt',
 'data/names/Russian.txt',
 'data/names/Czech.txt',
 'data/names/Arabic.txt',
 'data/names/Irish.txt']

In [0]:
#initiate all english alphabets and allowed special charecters
all_letters = string.ascii_letters + ".,;'"
n_letters = len(all_letters)

# convert text from unicode to ascii
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

In [0]:
def readlines(filename):
    lines = open(filename).read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

In [11]:
# save each language based names in a category dict

category = {}

for filename in glob.glob("data/names/*.txt"):
    category_name = os.path.splitext(os.path.basename(filename))[0]
    category[category_name] = readlines(filename)

diff_category = category.keys()
n_category = len(category.keys())
print(diff_category)
print(n_category)

dict_keys(['Vietnamese', 'Greek', 'Dutch', 'Japanese', 'Spanish', 'Italian', 'French', 'Scottish', 'Portuguese', 'Polish', 'Chinese', 'German', 'English', 'Korean', 'Russian', 'Czech', 'Arabic', 'Irish'])
18


# names to tensor

In [0]:
#pick a name
def name2tensor(name):
    #initialse a torch tensor of size all_letters (all alphabets in english)
    # vector of (1 X all_letters) dimension.
    init_tensor = torch.zeros(len(name), 1, n_letters)
    #break into charecter
    #if breaked charecter is present replace value to 1 in initialised tensor.
    for idx, char in enumerate(name):
        init_tensor[idx][0][all_letters.find(char)] = 1
    return init_tensor


In [63]:
name2tensor('jones').size()

torch.Size([5, 1, 56])

# Network

In [0]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size+hidden_size, hidden_size)
        self.i2o = nn.Linear(input_size+hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def inithidden(self):
        return torch.zeros(1, self.hidden_size)

In [0]:
n_hidden = 128
rnn = RNN(input_size=n_letters, hidden_size=n_hidden, output_size=n_category)

In [69]:
rnn

RNN(
  (i2h): Linear(in_features=184, out_features=128, bias=True)
  (i2o): Linear(in_features=184, out_features=18, bias=True)
  (softmax): LogSoftmax()
)

In [70]:
input = name2tensor('albert')
hidden = torch.zeros(1, n_hidden)

output, next_hidden = rnn(input[0], hidden)
print(output)

tensor([[-2.8669, -2.8953, -2.8668, -2.8744, -2.9019, -2.8706, -2.8947, -3.0067,
         -2.8875, -2.8354, -2.9108, -2.8204, -2.8739, -2.8474, -2.9808, -2.8686,
         -2.9509, -2.8925]], grad_fn=<LogSoftmaxBackward>)


In [0]:
all_categories = list(category.keys())

# Training :

In [74]:
#lets create index for each category
def categoryfromoutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return all_categories[category_i], category_i

print(categoryfromoutput(output))

('German', 11)


In [79]:
import random

# get training example
def randomchoice(l):
    return l[random.randint(0,len(l)-1)]

def randomtrainingexample():
    select_category = randomchoice(all_categories)
    line = randomchoice(category[select_category])
    category_tensor = torch.tensor([all_categories.index(category[select_category])], dtype=torch.long)
    line_tensor = name2tensor(line)
    return select_category, line, category_tensor, line_tensor

for i in range(10):
    select_category, line, category_tensor, line_tensor = randomtrainingexample()
    print(f"Category : {select_category}, /line= {line}") 

ValueError: ignored

In [0]:
criterion = nn.NLLLoss()

In [0]:
learning_rate = 0.005