## Name 2 lang classification task using tensorflow

  The task that we are trying to approach is given a name we have to predict its nationality or which lang it belongs to.
  For e.g   ('Whelan', 'Irish'),
 ('William', 'Irish'),
 ('Abana', 'Spanish'),
 ('Abano', 'Spanish'),
 ('Abarca', 'Spanish').
 The data set is present in txt file.

### Imports : 

In [0]:
from io import open
import os, string, random, time, math
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
import tensorflow as tf
print(tf.__version__)

1.14.0


In [0]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras import optimizers

In [0]:
from sklearn.model_selection import train_test_split

In [0]:
from IPython.display import clear_output

### Reading the text file and saving the data in some data-structures.

In [0]:
languages = []
data = []
X = []
Y = []

with open('name2lang.txt', 'r') as f: 
    for line in f:
        line = line.split(',')
        name = line[0].strip()
        lang = line[1].strip()
        if not lang in languages:
            languages.append(lang)
        X.append(name)
        Y.append(lang)
        data.append((name, lang))

n_languages = len(languages) # will be used to design o/p layer of the model.

### Creating a dictionary to store letter and its frequency in the dataset.

In [0]:
char_to_freq = {}

In [0]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)

In [9]:
print(n_letters)
print(languages)

57
['Portuguese', 'Irish', 'Spanish', 'Vietnamese', 'Chinese', 'Greek', 'Czech', 'Dutch', 'Japanese', 'French', 'German', 'Scottish', 'English', 'Russian', 'Polish', 'Arabic', 'Korean', 'Italian']


In [10]:
# lets see what is the max len in X
len(max(X,key = len))

19

In [0]:
for l in all_letters:
  char_to_freq[l] = 0

In [0]:
char_to_freq

In [0]:
for name in X:
  for n in name:
    char_to_freq[n] += 1

In [0]:
char_to_freq

In [0]:
import collections
sorted_char_to_f = sorted(char_to_freq.items(), key=lambda kv: kv[1])

In [0]:
char_to_freq_count = collections.OrderedDict()
for pair in sorted_char_to_f:
  char_to_freq_count[pair[0]] = pair[1]

In [0]:
char_to_freq_count

In [0]:
import pandas as pd
df_char_freq_rank = pd.DataFrame(index = char_to_freq_count.keys())

In [19]:
df_char_freq_rank['index'] = range(1,58)
df_char_freq_rank['count'] = char_to_freq_count.values()
df_char_freq_rank.head()

Unnamed: 0,index,count
.,1,0
",",2,0
;,3,0
X,4,14
q,5,38


In [20]:
df_char_freq_rank.tail()

Unnamed: 0,index,count
n,53,9348
i,54,10178
e,55,10269
o,56,10778
a,57,14743


In [21]:
df_char_freq_rank['index']['q']

5

### Functions to construct numerical representation for a given name and lang.

In [0]:
def name_rep(name):
  rep = []
  for l in name:
    rep.append(df_char_freq_rank['index'][l])
  return rep
  

In [0]:
def lang_rep(lang):
  rep = np.zeros(n_languages)
  rep[languages.index(lang)] = 1
  return rep

In [40]:
(lang_rep('Irish'))

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.])

In [25]:
name_rep(',qXq')

[2, 5, 4, 5]

In [0]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0, stratify=Y)

In [42]:
print(len(x_train),len(x_test))

16040 4010


### Creating final train and test sets

In [0]:
# converting all the train and test examples to list

X_train = []
Y_train = []

for xtr,ytr in zip(x_train,y_train):
  X_train.append(name_rep(xtr))
  Y_train.append(lang_rep(ytr))

X_test = []
Y_test = []

for xtes,ytes in zip(x_test,y_test):
  X_test.append(name_rep(xtes))
  Y_test.append(lang_rep(ytes))


In [44]:
print(X_train[:5])
print('='*40)
print(Y_train[:5])

[[38, 39, 57, 50, 51, 48, 54, 53], [30, 45, 49, 54, 42, 56, 47], [32, 34, 55, 49, 56, 39, 55, 53, 56, 48], [16, 45, 53, 53, 55, 49, 49], [18, 51, 45, 54]]
[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0.]), array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0.]), array([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0.])]


### Padding the data to improve tranning speed.

In [0]:
max_name_len = 19 
X_train = sequence.pad_sequences(X_train,maxlen=max_name_len)
X_test = sequence.pad_sequences(X_test,maxlen=max_name_len)

In [59]:
print(X_train[0])
print(X_train.shape)

[ 0  0  0  0  0  0  0  0  0  0  0 38 39 57 50 51 48 54 53]
(16040, 19)


### Building the model

In [67]:
# now we will create our model
top_alpha = 14743
embedding_vector_len = 32
model  = Sequential()
model.add(Embedding(top_alpha,embedding_vector_len,input_length = max_name_len))
model.add(LSTM(100,return_sequences=True,dropout=0.5))
model.add(LSTM(100,return_sequences=False,dropout=0.5))
model.add(Dense(n_languages,activation='softmax'))

opt = optimizers.Adam(lr=0.01, decay=1e-6)

model.compile(loss='categorical_crossentropy',optimizer = opt,metrics = ['accuracy'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 19, 32)            471776    
_________________________________________________________________
lstm_8 (LSTM)                (None, 19, 100)           53200     
_________________________________________________________________
lstm_9 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_6 (Dense)              (None, 18)                1818      
Total params: 607,194
Trainable params: 607,194
Non-trainable params: 0
_________________________________________________________________
None


In [61]:
type(X_test)

numpy.ndarray

In [53]:
# converting X's and Y's to np array

Y_train = np.array(Y_train)
Y_test = np.array(Y_test)

print(type(Y_train),type(Y_test))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


### Tranning the model.

In [68]:
model.fit(X_train,Y_train,nb_epoch = 30,batch_size = 128)

# final eval
scores = model.evaluate(X_test,Y_test,verbose = 0)
print("Accuracy: %.2f%%" % (scores[1]*100))

  """Entry point for launching an IPython kernel.


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Accuracy: 80.82%
