In [1]:
# Include the dependent modules
import tensorflow as tf
from tensorflow import keras

import numpy as np
import pandas as pd

import os

In [3]:
# parameter declaration
maxlen = 30
labels = 2

In [4]:
input = pd.read_csv("gender_data.csv", header=None)
input.columns = ["name", "male_or_female"]
input['namelen'] = [ len(str(i))  for i in input['name']]

In [5]:
len(input['namelen'])

15290

In [6]:
actualInput = input[(input['namelen'] >= 2)]

In [7]:
len(actualInput), len(input)

(15226, 15290)

In [8]:
removedName = input[(input['namelen'] < 2)]

In [9]:
actualInput.groupby('male_or_female')['name'].count()

male_or_female
f    6705
m    8475
Name: name, dtype: int64

In [10]:
names = input['name']
gender = input['male_or_female']
vocab = set(' '.join([str(i) for i in names ]))
vocab.add('END')
lenOfVocab = len(vocab)

In [11]:
print(vocab)
print('Length of vocab is ', lenOfVocab)
print('Lenght of input is ', len(actualInput))

{'e', '4', 'x', '9', 'b', 'p', '.', 'q', '7', 'd', '6', '8', 'z', 'w', 'r', 'k', 'y', '3', 'j', 'n', 'c', 'g', ' ', 'l', 'u', '1', 't', 'a', '0', 'm', '5', 'f', '2', 'o', 'END', 'i', 'v', 's', 'h'}
Length of vocab is  39
Lenght of input is  15226


In [12]:
charIndex = dict((c, i) for i, c in enumerate(vocab))

In [14]:
print(charIndex)

{'e': 0, '4': 1, 'x': 2, '9': 3, 'b': 4, 'p': 5, '.': 6, 'q': 7, '7': 8, 'd': 9, '6': 10, '8': 11, 'z': 12, 'w': 13, 'r': 14, 'k': 15, 'y': 16, '3': 17, 'j': 18, 'n': 19, 'c': 20, 'g': 21, ' ': 22, 'l': 23, 'u': 24, '1': 25, 't': 26, 'a': 27, '0': 28, 'm': 29, '5': 30, 'f': 31, '2': 32, 'o': 33, 'END': 34, 'i': 35, 'v': 36, 's': 37, 'h': 38}


In [15]:
# train test split
msk = np.random.rand(len(actualInput)) < 0.8
train = actualInput[msk]
test = actualInput[~msk]

In [18]:
train_X = []
trunc_train_name = [str(i)[0:30] for i in train.name]

In [19]:
print(len(trunc_train_name))

12196


In [20]:
for i in trunc_train_name:
    tmp = [charIndex[j] for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(charIndex["END"])
    train_X.append(tmp)

In [21]:
np.asarray(train_X).shape

(12196, 30)

In [22]:
def setFlag(i):
    temp = np.zeros(39)
    temp[i] = 1
    return temp

In [23]:
setFlag(3)

array([0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.])

In [24]:
train_X = []
train_Y = []
trunc_train_name = [str(i)[0:maxlen] for i in train.name]

for i in trunc_train_name:
    tmp = [setFlag(charIndex[j]) for j in str(i)]
    for k in range(0, maxlen - len(str(i))):
        tmp.append(setFlag(charIndex["END"]))
    train_X.append(tmp)
    
for i in train.male_or_female:
    if i == 'm':
        train_Y.append([1, 0])
    else:
        train_Y.append([0, 1])


In [25]:
np.asarray(train_X).shape

(12196, 30, 39)

In [26]:
np.asarray(train_Y).shape

(12196, 2)

In [29]:
# Build the model: 2 stacked LSTM
model = keras.Sequential()
model.add(keras.layers.LSTM(512, return_sequences=True, input_shape=(maxlen, lenOfVocab)))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.LSTM(512, return_sequences=False))
model.add(keras.layers.Dropout(0.2))
model.add(keras.layers.Dense(2))
model.add(keras.layers.Activation('softmax'))

In [30]:
# Compile the model
model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [31]:
test_X = []
test_Y = []
trunc_test_name = [str(i)[0:maxlen] for i in test.name]

for i in trunc_test_name:
    tmp = [setFlag(charIndex[j]) for j in str(i)]
    for k in range(0, maxlen - len(str(i))):
        tmp.append(setFlag(charIndex["END"]))
    test_X.append(tmp)

for i in test.male_or_female:
    if i == 'm':
        test_Y.append([1, 0])
    else:
        test_Y.append([0, 1])


In [32]:
np.asarray(test_X).shape

(3030, 30, 39)

In [33]:

print(np.asarray(test_X).shape)
print(np.asarray(test_Y).shape)

(3030, 30, 39)
(3030, 2)


In [34]:
batch_size=1000
model.fit(np.array(train_X), np.array(train_Y),batch_size=batch_size, epochs=10,validation_data=(np.array(test_X), np.array(test_Y)))

Train on 12196 samples, validate on 3030 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x120e91518>

In [35]:
score, acc = model.evaluate(np.array(test_X), np.array(test_Y))
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.4679110310652075
Test accuracy: 0.7788778876314069


In [37]:
name = ['anandhan', 'priya', 'chandru', 'sarathi', 'santhya']
X = []
trunc_name = [i[0:maxlen] for i in name]
for i in trunc_name:
    tmp = [setFlag(charIndex[j]) for j in str(i)]
    for k in range(0, maxlen - len(str(i))):
        tmp.append(setFlag(charIndex["END"]))
    X.append(tmp)
predictions = model.predict(np.asarray(X))
           

In [39]:
predictions

array([[0.662015  , 0.337985  ],
       [0.13195457, 0.86804545],
       [0.7649355 , 0.23506448],
       [0.15344869, 0.8465513 ],
       [0.4765105 , 0.52348953]], dtype=float32)

In [40]:
batch_size = 1000
model.fit(np.array(train_X), np.array(train_Y), batch_size=batch_size, epochs=50, validation_data=(np.array(test_X), np.array(test_Y)) ) 

Train on 12196 samples, validate on 3030 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x13151fd68>

In [41]:
score, acc = model.evaluate(np.array(test_X), np.array(test_Y))
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 0.3434886864703087
Test accuracy: 0.8838283829169699


In [42]:

name=["sandhya","jaspreet","rajesh","kaveri","aditi deepak","arihant","sasikala","aditi","ragini rajaram"]
X=[]
trunc_name = [i[0:maxlen] for i in name]
for i in trunc_name:
    tmp = [setFlag(charIndex[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(setFlag(charIndex["END"]))
    X.append(tmp)
pred=model.predict(np.asarray(X))
pred

array([[0.0616538 , 0.93834627],
       [0.09346944, 0.9065306 ],
       [0.96234685, 0.03765318],
       [0.04838698, 0.95161307],
       [0.00319322, 0.99680686],
       [0.98605204, 0.01394794],
       [0.00693881, 0.9930611 ],
       [0.25296992, 0.74703   ],
       [0.00792741, 0.99207264]], dtype=float32)

In [43]:

name=["abhi","abhi deepak","mr. abhi"]
X=[]
trunc_name = [i[0:maxlen] for i in name]
for i in trunc_name:
    tmp = [setFlag(charIndex[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(setFlag(charIndex["END"]))
    X.append(tmp)
pred=model.predict(np.asarray(X))
pred

array([[0.23649578, 0.7635042 ],
       [0.08896113, 0.9110389 ],
       [0.99321437, 0.0067856 ]], dtype=float32)

In [44]:
name=["rajini","rajinikanth","mr. rajini"]
X=[]
trunc_name = [i[0:maxlen] for i in name]
for i in trunc_name:
    tmp = [setFlag(charIndex[j]) for j in str(i)]
    for k in range(0,maxlen - len(str(i))):
        tmp.append(setFlag(charIndex["END"]))
    X.append(tmp)
pred=model.predict(np.asarray(X))
pred

array([[0.02565806, 0.974342  ],
       [0.9927898 , 0.00721019],
       [0.97167814, 0.02832186]], dtype=float32)

In [45]:
#save our model and data
# model.save_weights('gender_model',overwrite=True)
model.save('gender_model',overwrite=True)
# train.to_csv("train_split.csv")
# test.to_csv("test_split.csv")

In [46]:
train.to_csv("train_split.csv")
test.to_csv("test_split.csv")

In [47]:
evals = model.predict(np.array(test_X))
prob_m = [i[0] for i in evals]

In [48]:
out = pd.DataFrame(prob_m)
out['name'] = test.name.reset_index()['name']
out['male_or_female']=test.male_or_female.reset_index()['male_or_female']

In [49]:
out.head(10)
out.columns = ['prob_m','name','actual']
out.head(10)
out.to_csv("gender_pred_out.csv")