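Here, we import the core libraries (pandas and NumPy). The scikit-learn and Keras imports appear further down, in the cells where they are first used.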

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labelled names. Missing cells are replaced with 0, so a row without
# a name ends up as the literal string "0" once the names are cast to str below.
dataset = pd.read_csv("../input/std-name-gender/names.csv").fillna(0)
dataset

Unnamed: 0,Name,Gender
0,Hussain,M
1,Shk. Munir,M
2,Hamna,F
3,Jabbar,M
4,Rana Hayyat,M
...,...,...
13393,Rohi,F
13394,Mahtab Emmad,M
13395,Mouhammed Latif,M
13396,Iman,F


In [3]:
# Keep only the first row for each distinct name and renumber the index from 0.
dataset = dataset.drop_duplicates(subset='Name', ignore_index=True)
dataset

Unnamed: 0,Name,Gender
0,Hussain,M
1,Shk. Munir,M
2,Hamna,F
3,Jabbar,M
4,Rana Hayyat,M
...,...,...
5131,Muaz Baqir,M
5132,Haroon Abdul Qadeer,M
5133,Shakeel Furqan,M
5134,Iqra Abdul,F


In [4]:
# Sanity check: True if any cell in the frame is still missing.
dataset.isna().to_numpy().any()

False

In [5]:
# Names as plain Python strings (non-string cells, e.g. the 0 fill value, are stringified).
name = dataset['Name'].map(str)

In [6]:
# Gender column as a plain Python list, in the same row order as `name`.
label = dataset['Gender'].tolist()

In [7]:
from sklearn.preprocessing import LabelEncoder
# Encoder that will turn the gender strings into integer class ids.
le=LabelEncoder()

In [8]:
# Fit the encoder on the gender strings and encode them as integer class ids.
labels=le.fit_transform(label)

In [9]:
# Show the learned classes; a label's integer id is its position in this array.
le.classes_

array(['F', 'M'], dtype='<U1')

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
# Bag-of-characters features: analyzer='char' counts characters instead of words.
cv=CountVectorizer(analyzer='char')


In [12]:
# `name` already holds strings, so it can be vectorised directly; the sparse
# count matrix is densified because the later cells index it like an ndarray.
names = cv.fit_transform(name).toarray()

In [13]:
# Number of character-count features per name.
names.shape[1]

29

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the character-count features for evaluating the baseline;
# the fixed seed keeps the split reproducible.
feature_train, feature_test, label_train, label_test = train_test_split(
    names,
    labels,
    test_size=0.2,
    random_state=42,
)

Now we train the baseline model: a multinomial Naive Bayes classifier on the character-count features.

In [15]:
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(feature_train, label_train)
model

MultinomialNB()

In [16]:
# Baseline predictions (integer class ids) for the held-out names.
label_pred=model.predict(feature_test)

In [17]:
import sklearn.metrics as m

In [18]:
# Fraction of held-out names the baseline classifies correctly.
m.accuracy_score(y_true=label_test, y_pred=label_pred)

0.7538910505836576

In [19]:
from keras.preprocessing.text import Tokenizer
# char_level=True: the vocabulary is built from individual characters, not words.
tokenizer = Tokenizer(char_level=True)
# Learn the character -> integer index mapping from every name.
tokenizer.fit_on_texts(name)
# One list of character indices per name.
sequence_of_int = tokenizer.texts_to_sequences(name)

In [20]:
from keras.preprocessing.sequence import pad_sequences
# Force every name to exactly 15 indices; padding='post' appends zeros to short names.
padsequences=pad_sequences(sequence_of_int,maxlen=15,padding='post')

In [21]:
# Length of each padded sequence.
padsequences.shape[1]

15

In [22]:
# `keras.utils.to_categorical` is the public import path; `keras.utils.np_utils`
# is an internal module that later Keras releases no longer ship.
from keras.utils import to_categorical
# Build the one-hot labels from the original gender list instead of from
# `labels` itself, so re-running this cell does not one-hot encode an array
# that is already one-hot.
labels=to_categorical(le.transform(label))

In [23]:
from keras.models import Sequential
from keras.layers import Dense,LSTM,Embedding,Dropout

In [24]:
# (number of names, padded sequence length)
padsequences.shape

(5136, 15)

In [25]:
from sklearn.model_selection import train_test_split

# Re-split for the neural model: padded character sequences as features,
# one-hot genders as targets, 10% held out. This rebinds the split variables
# used by the Naive Bayes baseline.
feature_train, feature_test, label_train, label_test = train_test_split(
    padsequences,
    labels,
    test_size=0.1,
    random_state=42,
)

In [26]:
# Character-level LSTM classifier. This rebinds `model`, replacing the
# Naive Bayes baseline.
# Derive the embedding dimensions from the fitted tokenizer and padded data
# rather than hard-coding them, so a changed vocabulary cannot produce
# out-of-range embedding indices.
vocab_size=len(tokenizer.word_index)+1  # +1 because index 0 is the padding value
sequence_length=padsequences.shape[1]

model=Sequential()
model.add(Embedding(vocab_size,64,input_length=sequence_length))
# First LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(2048,return_sequences=True))
model.add(LSTM(256,return_sequences=False))
model.add(Dropout(0.2))
# NOTE(review): two sigmoid units with binary cross-entropy score the one-hot
# columns independently; softmax + categorical cross-entropy is the usual
# pairing for mutually exclusive classes. Left unchanged to keep results comparable.
model.add(Dense(2,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 15, 64)            1920      
_________________________________________________________________
lstm (LSTM)                  (None, 15, 2048)          17309696  
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               2360320   
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 2)                 514       
Total params: 19,672,450
Trainable params: 19,672,450
Non-trainable params: 0
_________________________________________________________________


In [27]:
# Train for 50 epochs in batches of 1000; the held-out split doubles as the
# validation set reported after each epoch.
model.fit(
    feature_train,
    label_train,
    batch_size=1000,
    epochs=50,
    validation_data=(feature_test, label_test),
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f38481fa390>

In [29]:
# Persist the trained network to model.h5 in the working directory.
model.save('model.h5')

In [30]:
import sklearn.metrics as m

In [31]:
# Raw network outputs for the held-out names: one score per class per name.
label_pred=model.predict(feature_test)

In [32]:
# Collapse the per-class scores to the index of the highest-scoring class.
label_pred = label_pred.argmax(axis=1)

In [33]:
# Turn the one-hot test labels back into integer class ids.
l_test = label_test.argmax(axis=1)

In [34]:
# Fraction of held-out names the LSTM classifies correctly.
m.accuracy_score(y_true=l_test, y_pred=label_pred)

0.8988326848249028

In [35]:
# Per-class precision / recall / F1; printed because the report is a
# pre-formatted multi-line string.
report = m.classification_report(l_test, label_pred)
print(report)

              precision    recall  f1-score   support

           0       0.93      0.66      0.77       134
           1       0.89      0.98      0.93       380

    accuracy                           0.90       514
   macro avg       0.91      0.82      0.85       514
weighted avg       0.90      0.90      0.89       514



In [36]:
# Rows are true classes, columns are predicted classes.
m.confusion_matrix(y_true=l_test, y_pred=label_pred)

array([[ 89,  45],
       [  7, 373]])

In [37]:
from keras.models import load_model
import pickle

In [38]:
# Persist the fitted tokenizer so inference can reuse the same character
# mapping. The context manager guarantees the file is flushed and closed;
# write-only binary mode is enough for dumping.
with open('tokenizer.pkl','wb') as tokenizer_file:
    pickle.dump(tokenizer,tokenizer_file,protocol=pickle.HIGHEST_PROTOCOL)

In [40]:
# Reload the tokenizer from disk. Read-only mode is sufficient, and the
# context manager closes the handle.
# Only unpickle files you created yourself: pickle can execute arbitrary code.
with open('tokenizer.pkl','rb') as tokenizer_file:
    tokenizer=pickle.load(tokenizer_file)

In [41]:
# Inspect the integer index -> character mapping of the reloaded tokenizer.
tokenizer.index_word

{1: 'a',
 2: ' ',
 3: 'm',
 4: 'r',
 5: 'h',
 6: 'i',
 7: 's',
 8: 'e',
 9: 'd',
 10: 'u',
 11: 'n',
 12: 'b',
 13: 'l',
 14: 'z',
 15: 'o',
 16: 't',
 17: 'k',
 18: 'y',
 19: 'f',
 20: 'j',
 21: 'q',
 22: '.',
 23: 'g',
 24: 'w',
 25: '-',
 26: 'c',
 27: 'v',
 28: 'p',
 29: '0'}

In [42]:
def input(n):
    """Encode a single name as a padded integer sequence for the LSTM.

    NOTE: this function shadows the built-in ``input``. The name is kept
    because ``pred`` calls it.

    Parameters
    ----------
    n : the name to encode. It is converted with ``str`` first, so non-string
        values (such as the 0 used to fill missing cells) are accepted.

    Returns
    -------
    Integer array with one row and 15 columns: the character indices from
    ``tokenizer``, padded with zeros on the right.
    """
    # Wrapping the name in a list makes the tokenizer encode it as ONE text and
    # return a single index sequence. The previous per-character approach took
    # element [0] of each character's result, which raised IndexError whenever
    # a character was missing from the vocabulary (its result is an empty list).
    encoded = tokenizer.texts_to_sequences([str(n)])
    # Same length and padding side as the training sequences.
    return pad_sequences(encoded, maxlen=15, padding='post')

In [43]:
# Passing a bare string makes the tokenizer treat each character as its own
# text, hence one single-element list per character in the result.
tokenizer.texts_to_sequences('aadam')

[[1], [1], [9], [1], [3]]

In [44]:
# Example: encode one name into the padded form the network expects.
input('aadam')

array([[1, 1, 9, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)

In [61]:
def pred(n):
  """Return 'M' or 'F' for the name ``n`` using the trained network.

  The name is encoded with ``input``, scored by ``model``, and the index of
  the highest score is mapped to a letter: index 1 is 'M', anything else 'F'.
  """
  scores = model.predict(input(n))
  winner = np.argmax(scores, axis=1)[0]
  if winner == 1:
    return 'M'
  return 'F'

In [62]:
# Spot check on a single name.
pred('Aadam')

'M'

In [64]:
# Spot check on a second name.
pred('Mehak')

'F'

In [58]:
# The file has no header row (header=None), so its single column of names is
# labelled 0. Missing cells are replaced with 0.
unlabeled = pd.read_csv(
    "../input/unlabeled-std-names/names_unlabeled.csv",
    header=None,
).fillna(0)
unlabeled

Unnamed: 0,0
0,Karam
1,Abdul Ahad
2,Mehak
3,M. Abdur Rahim
4,Midhat Azhar
...,...
1900,Muneer Saaleh
1901,Wajiha Abdul
1902,Amer Qutab-ud-Din
1903,Agha Abdul kareem


In [65]:
# Predict a gender for every unlabeled name, keyed by the name itself
# (a repeated name simply overwrites its earlier entry).
# A comprehension keeps its loop variable local. The previous `for name in ...`
# loop rebound the notebook-level `name` Series to the last string, which broke
# the vectoriser and tokenizer cells if they were run again afterwards.
name_gender = {raw_name: pred(raw_name) for raw_name in unlabeled[0]}

In [82]:
# One row per name (names become the index); the predictions sit in column 0.
gender_frame = pd.DataFrame.from_dict(name_gender, orient='index')
gender_frame[0]

'{"Karam":"M","Abdul Ahad":"M","Mehak":"F","M. Abdur Rahim":"M","Midhat Azhar":"F","Mouhammed Abdul Rafay":"M","Ghosia":"F","Shuja Taimor":"M","Kashif":"M","Muhammad Mumtaz":"M","Sundas Rasheed Arshad":"M","Sahir Sohail":"M","Kareem":"M","Shammaa":"F","Zabir":"M","Atiya":"F","Babar":"M","Mohammed Rai":"M","Jameela":"F","Nozhat":"F","Sheikh Zakir":"M","Khurshid Muamar Qutab-ud-Din":"M","Khurshid":"M","Muazzum":"M","Mahnoor Yosaf":"F","Muhammad Jamal":"M","Mohammad Rahid":"M","Rasheed":"M","Sh. Alam":"M","Badar":"M","Shamim":"M","Mahtab Zameer":"M","Sarwar":"M","Hina":"F","Azhar":"M","Farman Abdul":"M","Kanwal":"M","Baqir Abdul Qadir":"M","Shabi Niaz":"M","Zameer":"M","Nawabzada Amjad":"M","Ibraheem":"M","Midhat":"F","Rafi Masroor":"M","Ayesha":"F","Rahat":"F","Sajeela":"F","Syyed Arshad":"M","Shakir":"M","Alam Aun":"M","Amra":"F","Najeeb Haseeb":"M","Zil-e-Ali Aftab":"M","Benazir":"F","Shujaat Haseeb":"M","Atif":"M","Rukhsana":"F","Haseeb":"M","Shams-ul-Islam Amjad":"M","Tufail":"M","Ag

In [84]:
import json

# Serialise the name -> gender mapping and write it to genders.json.
with open('genders.json', 'w') as handle:
    handle.write(json.dumps(name_gender))

In [75]:
# Count names that are exactly the empty string. Missing cells were replaced
# with 0 when the file was loaded, so they are not counted here.
int((unlabeled[0] == "").sum())

0