In [38]:
import collections
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy
import random
from string import ascii_uppercase
import pandas as pd


In [39]:
with open('data/male.txt','r') as min:
    male_names = [name.strip('\r\n') for name in min.readlines()]
with open('data/female.txt','r') as fin:
    female_names = [name.strip('\r\n') for name in fin.readlines()]


print("Male name list : ", [name for name in male_names[:10]])
print("Female name list : ", [name for name in female_names[:10]])

Male name list :  ['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', 'Abby', 'Abdel', 'Abdul', 'Abdulkarim']
Female name list :  ['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale']


In [40]:
male_names = filter(str.isalpha, [str(m) for m in male_names])
female_names = filter(str.isalpha, [str(f) for f in female_names])

In [41]:
all_names = []
for name in male_names:
    all_names.append((name.upper(),'M'))
for name in female_names:
    all_names.append((name.upper(),'F'))

In [42]:
print("Male name list : ", [name for name in all_names[5000:5010]])

Male name list :  [('HALIMEDA', 'F'), ('HALLEY', 'F'), ('HALLI', 'F'), ('HALLIE', 'F'), ('HALLY', 'F'), ('HANA', 'F'), ('HANNA', 'F'), ('HANNAH', 'F'), ('HANNI', 'F'), ('HANNIBAL', 'F')]


In [43]:
def one_hot_encoding(elses):
    one_hot = {}
    for i,l in enumerate(elses):
    
        bits = [0] *  len(elses)
        bits[i] = 1
        one_hot[l] = bits
        #print(i,l)
    #print("\n",one_hot)
    return one_hot

In [44]:
mono_alpha_hot = one_hot_encoding(ascii_uppercase)
#print(mono_alpha_hot)

In [45]:
# for a,b in enumerate(mono_hot):
#     print(a,":",mono_hot[b])

In [46]:
bi_alpha = [a+b for a in ascii_uppercase for b in ascii_uppercase]
bi_alpha_hot = one_hot_encoding(bi_alpha)
#print(bi_alpha)

In [47]:
tri_alpha = [a+b+c for a in ascii_uppercase for b in ascii_uppercase for c in ascii_uppercase]
tri_alpha_hot = one_hot_encoding(tri_alpha)

In [92]:
feature_name = []
feature_name.extend(['Start with '+a for a in mono_alpha_hot.keys()])
feature_name.extend( ['2nd Character '+ a for a in  mono_alpha_hot.keys()] )
feature_name.extend( ['2nd Character from last '+ a for a in  mono_alpha_hot.keys()] )
feature_name.extend( ['Ends with '+ a for a in  mono_alpha_hot.keys()] )
feature_name.extend( ['Freqency of '+ a for a in ascii_uppercase] )
feature_name.extend( ['Contains '+ a for a in list(bi_alpha)] )
print(feature_name)

['Start with A', 'Start with B', 'Start with C', 'Start with D', 'Start with E', 'Start with F', 'Start with G', 'Start with H', 'Start with I', 'Start with J', 'Start with K', 'Start with L', 'Start with M', 'Start with N', 'Start with O', 'Start with P', 'Start with Q', 'Start with R', 'Start with S', 'Start with T', 'Start with U', 'Start with V', 'Start with W', 'Start with X', 'Start with Y', 'Start with Z', '2nd Character A', '2nd Character B', '2nd Character C', '2nd Character D', '2nd Character E', '2nd Character F', '2nd Character G', '2nd Character H', '2nd Character I', '2nd Character J', '2nd Character K', '2nd Character L', '2nd Character M', '2nd Character N', '2nd Character O', '2nd Character P', '2nd Character Q', '2nd Character R', '2nd Character S', '2nd Character T', '2nd Character U', '2nd Character V', '2nd Character W', '2nd Character X', '2nd Character Y', '2nd Character Z', '2nd Character from last A', '2nd Character from last B', '2nd Character from last C', '2

In [74]:
def get_feature(name,gender):
    feature = []
    name = name.strip()
    
    feature.extend(mono_alpha_hot[name[0]])
    feature.extend(mono_alpha_hot[name[1]])
    feature.extend(mono_alpha_hot[name[-2]])
    feature.extend(mono_alpha_hot[name[-1]])
    
    frequency = {key: 0 for key in list(ascii_uppercase)}
    updates = dict(collections.Counter(name))
    frequency.update(updates)
    feature.extend(frequency.values())
    #print("Feature:",feature)
    
    frequency = {key: 0 for key in bi_alpha}
    updates = dict(collections.Counter(zip(name,name[1:])))
    #print("update:",updates)
    updates = {(A+B):n for (A,B),n in zip(updates.keys(),updates.values())}
    #print("update:",updates.keys())
    frequency.update(updates)
    #print("update:",frequency)
    feature.extend(frequency.values())
    #print("feature:",feature)
    
    if(gender == 'M'):
        classification = 0
    else:
        classification = 1
    
    return (feature,classification)
        
    

In [75]:
feature_list = [get_feature(name,gender) for name,gender in all_names]
random.shuffle(feature_list)

In [81]:
train = feature_list[:6000]
test = feature_list[6000:]

x_train, y_train = zip(*train)
x_test, y_test = zip(*test)
#print(y_train)
print(numpy.array(x_test).shape)
print(numpy.array(y_train).shape)



(1904, 806)
(6000,)


In [82]:
classifier = RandomForestClassifier(n_estimators=150, min_samples_split=20)
classifier.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [83]:
y_pred = []

for i in range(0,len(x_test)):
    y_pred.extend(classifier.predict(numpy.array(x_test[i]).reshape(1,-1)))


In [96]:
# print(numpy.array(x_test[1]).reshape(1,-1))
# print("Prdicted:",y_pred)
# print("Y test",y_test)
# print(x_test)
print(accuracy_score(y_test, y_pred))

0.8371848739495799


In [102]:
important_features = sorted(enumerate(classifier.feature_importances_), key=lambda x : x[1], reverse=True)
# print(important_features)
print ("Most Important Features : ")
[(feature_name[idx],prob) for idx, prob in important_features][:10]

Most Important Features : 


[('Ends with A', 0.10513883832604982),
 ('Freqency of A', 0.026461699607904312),
 ('Ends with D', 0.02265644420881819),
 ('Ends with E', 0.021285042463551188),
 ('2nd Character from last O', 0.020811534160498735),
 ('Ends with R', 0.016314166600900563),
 ('Freqency of E', 0.014548614955857726),
 ('Ends with O', 0.014061409566110678),
 ('Freqency of I', 0.01394933928285662),
 ('Ends with I', 0.012523178355017232)]

In [109]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
print(f1_score(y_test,y_pred))
print(f1_score(y_test, y_pred, average="macro"))
print(precision_score(y_test, y_pred, average="macro"))
print(recall_score(y_test, y_pred, average="macro"))    

0.8740861088545897
0.8218870365966856
0.827488651699178
0.8174906751922052
