# CS6120 Natural Language Processing by Professor Uzair Ahmad. 

## Tensorflow Tutorial. 

## In this tutorial we will learn how to build deep neural networks with TensorFlow/Keras. As a worked example, we will build a classifier that predicts the category (language of origin) of a given surname. 

## This work is inspired by https://github.com/DrUzair/NLP/tree/master/textclassification/surnames/mlp. 

# Importing Libraries

In [1]:
import glob
import unicodedata
import string
import pathlib
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Data Preparation

In [2]:
# Character vocabulary: ASCII letters (lowercase first, then uppercase)
# followed by a few punctuation marks that occur in surnames.
all_letters = "".join([string.ascii_letters, " .,;'-"])
# Vocabulary size; also the width of each one-hot letter vector.
n_letters = len(all_letters)

def findFiles(path):
    """Return the list of file paths matching the glob pattern `path`."""
    matches = glob.glob(path)
    return matches

# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return `s` reduced to the characters contained in `all_letters`.

    The string is NFD-normalized first, which splits accented characters into
    a base character plus combining marks; combining marks (category 'Mn')
    and any character outside `all_letters` are then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [
        ch for ch in decomposed
        if ch in all_letters and unicodedata.category(ch) != 'Mn'
    ]
    return ''.join(kept)

# Read a file and split into lines
def readLines(filename):
    """Read a UTF-8 text file and return its lines converted to plain ASCII.

    Leading/trailing whitespace of the whole file is stripped before it is
    split on newlines; every line is then passed through unicodeToAscii.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one ASCII string per line of the file.
    """
    # The context manager closes the file handle even if reading fails.
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicodeToAscii(line) for line in lines]

# Find letter index from all_letters, e.g. "a" = 0
def letterToIndex(letter):
    """Return the position of `letter` in `all_letters`, or -1 if it is absent."""
    position = all_letters.find(letter)
    return position

# Turn a line into a <line_length x 1 x n_letters> array,
# i.e. an array of one-hot letter vectors
def lineToTensor(line):
    """Encode `line` as an array of one-hot letter vectors.

    Args:
        line: The string to encode.

    Returns:
        A NumPy array of shape (len(line), 1, n_letters). Row i has a single
        1 at the index of line[i] in `all_letters`; rows for characters that
        are not in `all_letters` stay all-zero.
    """
    tensor = np.zeros((len(line), 1, n_letters))
    for li, letter in enumerate(line):
        index = letterToIndex(letter)
        # letterToIndex returns -1 for unsupported characters; using -1 as an
        # index would silently set the LAST column instead, so skip those.
        if index >= 0:
            tensor[li, 0, index] = 1
    return tensor

# Build the category_lines dictionary, a list of lines per category.
# all_categories keeps the category names; a category's position in that list
# is what the rest of the notebook uses as its integer class label.
category_lines = {}
all_categories = []
# glob returns matches in arbitrary, filesystem-dependent order. Sorting keeps
# the category -> label mapping identical across runs and machines, which is
# required for saved weights to stay meaningful in a new session.
for filename in sorted(findFiles('/content/drive/MyDrive/names_data/names/*.txt')):
    # The file stem (e.g. "Arabic" for ".../Arabic.txt") is the category name;
    # pathlib handles both "/" and "\\" separators where the platform uses them.
    category = pathlib.Path(filename).stem
    all_categories.append(category)
    category_lines[category] = readLines(filename)

n_categories = len(all_categories)

import random

def randomChoice(l):
    """Return one element of the sequence `l`, picked uniformly at random."""
    position = random.randrange(len(l))
    return l[position]


def randomTrainingExample():
    """Draw a random (category, surname) pair together with its encodings.

    Returns:
        A tuple (category, line, category_tensor, line_tensor): the category
        name, the surname, a one-element array holding the category's index
        in `all_categories`, and the one-hot encoding of the lower-cased
        surname produced by lineToTensor.
    """
    category = randomChoice(all_categories)
    line = randomChoice(category_lines[category])
    label_index = all_categories.index(category)
    return (
        category,
        line,
        np.array([label_index]),
        lineToTensor(line.lower()),
    )

def test():
    """Print the category and surname of ten randomly drawn training examples."""
    for _ in range(10):
        sample = randomTrainingExample()
        # Only the first two fields (names, not tensors) are shown.
        category, line = sample[0], sample[1]
        print('category =', category, '/ line =', line)

test()

category = Vietnamese / line = Phi
category = Spanish / line = Reyes
category = Arabic / line = Zogby
category = Dutch / line = Vliert
category = Greek / line = Dasios
category = German / line = Breisacher
category = Chinese / line = Rao
category = Italian / line = Piovene
category = French / line = Langlois
category = Greek / line = Kouropoulos


# Generate all training samples

In [3]:
# Feature vectors and integer class labels, filled in by generatedata().
train_lines = []
train_target = []

def generatedata():
  """Append one feature vector and one label per surname to the training lists.

  For every surname, the one-hot rows of its lower-cased letters are summed
  over the letter axis, giving a (1, n_letters) letter-count vector that is
  appended to `train_lines`. The index of the surname's category in
  `all_categories` is appended to `train_target`.
  """
  for cate in all_categories:
    label = all_categories.index(cate)  # identical for every surname of this category
    names = category_lines[cate]
    train_lines.extend(lineToTensor(name.lower()).sum(0) for name in names)
    train_target.extend([label] * len(names))

generatedata()

In [4]:
# Stack the per-surname (1, n_letters) vectors into one array and drop the
# singleton axis so each row is a flat feature vector.
train_lines = np.array(train_lines).squeeze(axis=1)
# Integer class labels as an array, aligned row-by-row with train_lines.
train_target = np.array(train_target)

# Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. Stratifying on the labels and
# fixing random_state makes the split class-balanced and repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_lines,
    train_target,
    test_size=0.20,
    random_state=42,
    stratify=train_target,
)

# Build model

In [6]:
from tensorflow.python.eager.monitoring import Metric
# Fix TensorFlow's global random seed so repeated runs are more reproducible.
tf.random.set_seed(42)

def build_model(X, n_classes=None):
  """Build and compile a fully connected (MLP) surname classifier.

  Architecture: Input -> Dense(1024, relu) -> Dense(512, swish) ->
  Dense(n_classes, softmax), compiled with Adam (learning rate 1e-3),
  sparse categorical cross-entropy loss and sparse categorical accuracy.

  Options left out of this baseline: L2 kernel regularization, extra
  256/128-unit swish layers, and an ExponentialDecay learning-rate schedule.

  Args:
    X: 2-D feature array; only X.shape[1] (the number of features) is read,
      to size the input layer.
    n_classes: Width of the softmax output layer. Defaults to `n_categories`
      so the output always matches the number of loaded categories.

  Returns:
    The compiled tf.keras Model.
  """
  if n_classes is None:
    n_classes = n_categories
  inp = tf.keras.Input((X.shape[1],))
  # The Input layer already fixes the input width, so Dense needs no input_dim.
  x = tf.keras.layers.Dense(1024, activation='relu')(inp)
  x = tf.keras.layers.Dense(512, activation='swish')(x)
  x = tf.keras.layers.Dense(n_classes, activation='softmax')(x)
  model = tf.keras.models.Model(inputs=inp, outputs=x)
  opt = tf.keras.optimizers.Adam(learning_rate=1e-03)
  # Labels are integer class ids (not one-hot), hence the "sparse" loss/metric.
  model.compile(
      loss='sparse_categorical_crossentropy',
      optimizer=opt,
      metrics=['sparse_categorical_accuracy'],
  )
  return model


In [7]:
# Instantiate the network; build_model only reads X_train.shape[1] to size the input layer.
model = build_model(X_train)

In [8]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 58)]              0         
                                                                 
 dense (Dense)               (None, 1024)              60416     
                                                                 
 dense_1 (Dense)             (None, 512)               524800    
                                                                 
 dense_2 (Dense)             (None, 18)                9234      
                                                                 
Total params: 594,450
Trainable params: 594,450
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Save only the weights, and only when validation accuracy reaches a new maximum.
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath='tutorial.h5',
    monitor='val_sparse_categorical_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

# Scale the learning rate by 0.1 when validation accuracy has not improved
# for 3 epochs.
lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_sparse_categorical_accuracy',
    factor=0.1,
    patience=3,
)

# NOTE(review): the test split is also used as validation data, so checkpoint
# selection sees the same samples that the final accuracy is reported on;
# consider carving a separate validation split out of the training data.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=30,
    batch_size=32,
    callbacks=[model_checkpoint_callback, lr_schedule],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


# Loading the Best model

In [10]:
# Rebuild the same architecture and restore the best weights saved during training.
model = build_model(X_train)
# Use the same relative path the checkpoint callback wrote to ('tutorial.h5'),
# so loading does not depend on the working directory being /content.
model.load_weights('tutorial.h5')

In [11]:
# Print the architecture of the rebuilt model with output shapes and parameter counts.
model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 58)]              0         
                                                                 
 dense_3 (Dense)             (None, 1024)              60416     
                                                                 
 dense_4 (Dense)             (None, 512)               524800    
                                                                 
 dense_5 (Dense)             (None, 18)                9234      
                                                                 
Total params: 594,450
Trainable params: 594,450
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Run the model on the held-out split; each output row holds one softmax score per output unit.
y_pred = model.predict(X_test)

In [13]:
# Convert each row of class scores into the index of the highest-scoring class.
y_pred = np.argmax(y_pred, axis=1)

# Calculating Overall Accuracy

In [14]:
from sklearn.metrics import accuracy_score

# Fraction of held-out samples whose predicted class index equals the true label.
overall_accuracy = accuracy_score(y_test, y_pred)
print(f'The Overall accuracy score is {overall_accuracy}')

The Overall accuracy score is 0.7529265255292652


# Inference 

I am going to test the model with a few samples: 

Sabbagh ---> Arabic

Gregory ---> English

Bicchieri ---> Italian

Theofilopoulos ---> Greek

Sokolof ---> Polish

Xiong ---> Chinese

In [18]:

# Classify six surnames typed by the user.
for _ in range(6):
  surname = input()
  # Apply the same preprocessing as the training data (ASCII folding via
  # unicodeToAscii, then lower-casing) so accented or unsupported characters
  # are dropped instead of being encoded at a wrong index.
  cleaned = unicodeToAscii(surname.strip()).lower()
  if not cleaned:
    print(f'{surname!r} contains no supported characters; skipping')
    continue
  # Summing the one-hot rows over the letter axis yields a (1, n_letters)
  # letter-count vector, i.e. a batch containing a single sample.
  tensor = lineToTensor(cleaned).sum(0)
  pred = model.predict(tensor).argmax(axis=1)
  print(f'{surname} ----> {all_categories[pred[0]]}')

xiong
xiong ----> Chinese
sokolof
sokolof ----> Polish
sabbagh
sabbagh ----> Arabic
gregory
gregory ----> English
bicchieri
bicchieri ----> Italian
theofilopoulos
theofilopoulos ----> Greek
