## GENDER CLASSIFICATION
### Names data from https://mbejda.github.io/

In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("data/binary_class.csv")
df.head()

Unnamed: 0,name,gender
0,tashanika l,f
1,denetra c,f
2,tomesha d,f
3,trellany c,f
4,cynthia s,f


In [2]:
df.groupby('gender').count()

Unnamed: 0_level_0,name
gender,Unnamed: 1_level_1
f,22606
m,98116


In [3]:
#To remove preceding and trailing spaces, removing blank names
df["name"] = df["name"].str.strip()

#To count how many words are there
count = df['name'].str.count(' ').add(1).value_counts(sort=True)

print(count) #no.of words

2.0     73100
1.0     39471
3.0      7317
4.0       516
5.0       139
6.0        58
7.0        35
8.0        33
9.0        32
10.0       15
12.0        4
11.0        2
Name: name, dtype: int64


In [4]:
df[df.name.str.count(' ') <3]

Unnamed: 0,name,gender
0,tashanika l,f
1,denetra c,f
2,tomesha d,f
3,trellany c,f
4,cynthia s,f
...,...,...
120772,mark e,m
120773,jerread c,m
120774,john e,m
120775,joshua a,m


In [5]:
#39k names only for single word
#print("single words:",len(df[df.name.str.count(' ') == 0]))
#taking strings with 3 words and trying to remove single letter initials
import string

df = df[df.name.str.count(' ') <3]
#df["new_name"] = df['name'].str.replace('[^\w\s]','')

df['name'] = df['name'].str.split().map(lambda sl: " ".join(s for s in sl if len(s) > 2))


In [6]:
print(df['name'].str.count(' ').add(0).value_counts(sort=True))
df[df.name.str.count(' ') == 1]

0    105781
1     12781
2      1326
Name: name, dtype: int64


Unnamed: 0,name,gender
713,mary jean,f
1169,qua shondri,f
2510,robert jr.,m
2526,david jr.,m
2551,james iii,m
...,...,...
111537,ronald jr.,m
111573,leo jr.,m
111611,william jr.,m
117191,michael ric,m


In [7]:
#102k names for single word
df = df[df.name.str.count(' ') == 0]
df = df.replace('',np.nan, regex=True)
df = df.dropna()

In [8]:
#To see how many names do we have for male/female
df.groupby('gender').count()

Unnamed: 0_level_0,name
gender,Unnamed: 1_level_1
f,17159
m,88533


In [9]:
#Defining features taking into consideration the phonemes and patterns for starting of the name and ending of the name (upto 3 letters)
def features(name):
    name = name.lower()
    #print(name)
    return {
        'first-letter': name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter': name[-1],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
    }

In [10]:
features = np.vectorize(features)
#defining our input X and output y
X = features(df['name'])
y = df['gender']

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) #80/20 split


In [12]:
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
dv = DictVectorizer()
dv.fit_transform(X_train)

<84553x4977 sparse matrix of type '<class 'numpy.float64'>'
	with 507318 stored elements in Compressed Sparse Row format>

In [13]:
#different classifiers to test on vectorised approaches
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm

dT_classifier = DecisionTreeClassifier()
svm_classifier = svm.LinearSVC()
transform_features =dv.transform(X_train)
dT_classifier.fit(transform_features, y_train)
svm_classifier.fit(transform_features, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [14]:
#Best result is with only first name and Decision Tree classifier
sample_name = ["Narendra"]
data_vector =dv.transform(features(sample_name))
transformed_vector = data_vector.toarray()
print(dT_classifier.predict_proba(transformed_vector),dT_classifier.predict(transformed_vector))
print(svm_classifier.decision_function(transformed_vector),svm_classifier.predict(transformed_vector))

[[0. 1.]] ['m']
[-0.46028587] ['f']


In [None]:
#Accuracies
print("Decision Tree (Train) -",dT_classifier.score(dv.transform(X_train), y_train)*100,"%") 
print("Decision Tree (Test) -",dT_classifier.score(dv.transform(X_test), y_test)*100,"%") 

print("SVM (Train) -",svm_classifier.score(dv.transform(X_train), y_train)*100,"%") 
print("SVM (Test) -",svm_classifier.score(dv.transform(X_test), y_test)*100,"%") 

Decision Tree (Train) - 99.00062682577791 %
Decision Tree (Test) - 97.1711055395241 %


In [None]:
#Saving Model
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

dt_clf = Pipeline([('vectorizer', dv),('dT', dT_classifier)])
svm_clf = Pipeline([('vectorizer', dv), ('svm', svm_classifier)])

joblib.dump(dt_clf, 'vectorizer_and_decision_tree.pkl') 
joblib.dump(svm_clf, 'vectorizer_and_svm.pkl') 


In [None]:
##prediction from model
from joblib import dump, load

clf = load('vectorizer_and_decision_tree.pkl') 
print (clf)

name_list = ["Narendra","Parth", "Donald", "Melanie","Virat","Radha","Camilla","Chhavi"]

clf.predict(features(name_list))