<a href="https://colab.research.google.com/github/abishek2019/DeepLearning/blob/main/Classifying_Names_(Character_Level_RNN)_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Load and preprocess data (Get data_paths, read_data, transform and load)
import torch
import torch.nn as nn
import torch.utils.data as d
import glob
import unicodedata
import string
import os

torch.manual_seed(1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dir = '/content/drive/MyDrive/pytorch-Deep-Learning-master/nameclassifier_data/data/names/*.txt'
# Hyperparamters
batch_size = 10
epochs = 10
learning_rate = 0.001
hidden_size = 128

vocabulary = string.ascii_letters + '.,;'
# The total size of all the allowed ascii characters that will train our language model.
vocab_size = len(vocabulary)
languages = []

def unicodetoASCII(s):
  return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn' and c in vocabulary)

def lineToTensor(single_name):
  single_name_tensor = torch.zeros(len(single_name), 1, vocab_size)
  for i, letter in enumerate(single_name):
    single_name_tensor[i][0][vocabulary.find(letter)] = 1
  return single_name_tensor

def labelToTensor(label):
  return torch.tensor([label], dtype=torch.long)

# Custom dataset class
class NamesDataset(d.Dataset):
  def __init__(self, root_dir):
    file_paths = glob.glob(root_dir)
    self.names = []
    self.labels = []
    for i, each_file_path in enumerate(file_paths):
      country_name = os.path.splitext(os.path.basename(each_file_path))[0]
      languages.append(country_name)
      names_list = open(each_file_path, encoding = 'utf-8').read().strip().split('\n')
      names_list = [unicodetoASCII(name) for name in names_list]
      self.names.extend(names_list)
      self.labels.extend([i] * len(names_list))

  def __len__(self):
    return len(self.names)

  # not used
  def __getitem__(self, index):
    names = self.names[index]
    labels = self.labels[index]
    return names, labels

dataset = NamesDataset(dir)
# dataset.names = [all names], dataset.labels = [indices all labels]

print(dataset.names)
print(dataset.labels)
print(languages)
print(len(languages))

# Not used in this program as we have generated random names for training. Used if training in batches.
train_size = int(0.75 * len(dataset))
test_size = len(dataset) - train_size
train_set, test_set = d.random_split(dataset, [train_size, test_size])
trainloader = d.DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=2)
testloader = d.DataLoader(test_set, batch_size=batch_size, shuffle=False, num_workers=2)

In [None]:
# 2. Define the model
class RNNClassifier(nn.Module):
  def __init__(self, input_size, hidden_size, output_size):
    super(RNNClassifier, self).__init__()
    self.hidden_size = hidden_size
    self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
    self.i2o = nn.Linear(input_size + hidden_size, output_size)
    self.softmax = nn.Softmax(dim = 1)

  def forward(self, x, hidden_x):
    combined = torch.cat((x, hidden_x), 1)
    hidden_x = self.i2h(combined)
    x = self.i2o(combined)
    x = self.softmax(x)
    return x, hidden_x

  # Init the hidden states- helps RNN provide a consistent starting point and capture long-term dependencies(ability to retain earlier info).
  def init_hidden(self):
    return torch.zeros(1, self.hidden_size)

output_size = len(languages)
model = RNNClassifier(vocab_size, hidden_size, output_size).to(device)

In [None]:
# 3. Define loss and optimizer
import torch.optim as optim
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [None]:
# 4. Train the model
# Demo Train
import random
x = lineToTensor('Albert').to(device)
hidden_x = torch.zeros(1, 128).to(device)
output, next_hidden = model(x[0], hidden_x)
print(output.shape)

def randomLib():
  names = dataset.names
  random_name = names[random.randint(0, len(names) - 1)]
  random_name_index = names.index(random_name)
  name_label = dataset.labels[random_name_index]
  name_tensor = lineToTensor(random_name).to(device)
  label_tensor = labelToTensor(name_label).to(device)
  return name_tensor, label_tensor

def trainFromRandomName():
  running_loss = 0.0
  for i in range(10000):
    # Get the data
    name_tensor, label_tensor = randomLib()
    hidden_x = model.init_hidden().to(device)
    # Zero_grad
    optimizer.zero_grad()
    # Forward
    for ix in range(name_tensor.size(0)):
      output, hidden_x = model(name_tensor[ix], hidden_x)
    loss = loss_fn(output, label_tensor)
    # Backward
    loss.backward()
    # Optimize
    optimizer.step()
    running_loss += loss.item()
    if ((i + 1) % 5000 == 0):
      print(f'Epoch: {epoch + 1}  Iter: {i + 1}   Loss: {running_loss / 5000:.3f}')
      running_loss = 0.0

def trainFromTrainLoader():
  print('-----TRAINING FROM TRAINLOADER-----')
  running_loss = 0.0
  for i, (names, labels) in enumerate(trainloader):
    # Get the data
    for ix, name in enumerate(names):
      hidden_x = model.init_hidden().to(device)
      name_tensor = lineToTensor(name).to(device)
      label_tensor = labelToTensor(labels[ix]).to(device)
      # Set Zero Grad
      optimizer.zero_grad()
      # Forward
      for idx in range(name_tensor.size(0)):
        output, hidden_x = model(name_tensor[idx], hidden_x)
      loss = loss_fn(output, label_tensor)
      # Backward
      loss.backward()
      # Optimize
      optimizer.step()
      running_loss += loss.item()
    if (i + 1) % 500 == 0:
      print(f'Epoch: {epoch}     Batch: {i+1}    Loss = {running_loss/5000}')
      running_loss = 0.0

model.train()
print('-----TRAINING FROM RANDOM EXAMPLE-----')
for epoch in range(epochs):
  # trainFromTrainLoader()
  trainFromRandomName()
print('Training Finished')
torch.save(model.state_dict(), 'modelRNN.pth')

torch.Size([1, 18])
-----TRAINING FROM RANDOM EXAMPLE-----
Epoch: 1  Iter: 5000   Loss: 2.558
Epoch: 1  Iter: 10000   Loss: 2.893
Epoch: 2  Iter: 5000   Loss: 2.931
Epoch: 2  Iter: 10000   Loss: 2.866
Epoch: 3  Iter: 5000   Loss: 2.799
Epoch: 3  Iter: 10000   Loss: 2.774
Epoch: 4  Iter: 5000   Loss: 2.785
Epoch: 4  Iter: 10000   Loss: 2.771
Epoch: 5  Iter: 5000   Loss: 2.859
Epoch: 5  Iter: 10000   Loss: 2.898
Epoch: 6  Iter: 5000   Loss: 2.893
Epoch: 6  Iter: 10000   Loss: 2.897
Epoch: 7  Iter: 5000   Loss: 2.911
Epoch: 7  Iter: 10000   Loss: 2.918
Epoch: 8  Iter: 5000   Loss: 2.918
Epoch: 8  Iter: 10000   Loss: 2.920
Epoch: 9  Iter: 5000   Loss: 2.921
Epoch: 9  Iter: 10000   Loss: 2.905
Epoch: 10  Iter: 5000   Loss: 2.881
Epoch: 10  Iter: 10000   Loss: 2.889
Training Finished


In [None]:
# 5. Test the model
model = RNNClassifier(vocab_size, hidden_size, output_size).to(device)
model.load_state_dict(torch.load('modelRNN.pth'))
model.eval()

correct = 0
total = 0
language_correct = {language: 0 for language in languages}
each_language_total = {language: 0 for language in languages}
# No gradient
with torch.no_grad():
  for i in range(20000):
    # Get the data
    name_tensor,label_tensor = randomLib()
    hidden_x = model.init_hidden().to(device)
    # Forward
    for ix in range(name_tensor.size(0)):
      output, hidden_x = model(name_tensor[ix], hidden_x)
    # Max probabilty
    _, prediction = torch.max(output.data, 1)
    if prediction == label_tensor:
      correct += 1
      language_correct[languages[label_tensor.item()]] += 1
    each_language_total[languages[label_tensor.item()]] += 1
    total += 1
print(f"Total Acccuracy is {1000 * correct/total:.3f}%.")
# print('-----Classwise accuracy-----')
# for language, values in language_correct.items():
#   print(f'Language: {language}\tTotal predicted: {each_language_total[language]}\tAccuracy: {100 * values/ each_language_total[language]:.3f}%')

Total Acccuracy is 79.900%.


In [None]:
# 6. Predict
input_name = 'Abi'
with torch.no_grad():
  hidden_x = model.init_hidden().to(device)
  input_tensor = lineToTensor(input_name).to(device)
  for letter_tensor in input_tensor:
    output, hidden_x = model(letter_tensor, hidden_x)
  # _, prediction = torch.max(output.data, 1)
  # prediction = languages[prediction.item()]
  number_of_predictions = 3
  _, predictions = output.topk(number_of_predictions,1, True)
  print(f'Input Name: {input_name}')
  print(predictions)
  for i in range(number_of_predictions):
    value = _[0][i].item()
    print(f'{languages[predictions[0][i].item()]} Probability: ({value:.3f})')



Input Name: Abi
tensor([[5, 3, 4]])
Chinese Probability: (1.000)
Arabic Probability: (0.000)
English Probability: (0.000)
