## Gender Classification

Aim:  Gender classification according to given names.

In [1]:
# Importing necessary moduls
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.externals import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import warnings # ignore warning
warnings.filterwarnings("ignore")



In [2]:
# Loading data
path = "data/names_dataset.csv"
data = pd.read_csv(path, index_col="index")
df = data.copy()

In [3]:
# Overviews of df
df.head()

Unnamed: 0_level_0,name,sex
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Mary,F
1,Anna,F
2,Emma,F
3,Elizabeth,F
4,Minnie,F


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 95040 entries, 0 to 1858703
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   name    95040 non-null  object
 1   sex     95040 non-null  object
dtypes: object(2)
memory usage: 2.2+ MB


In [5]:
# Missing Value Table
def missing_value_table(df):
    missing_value = df.isnull().sum().sort_values(ascending=False)
    missing_value_percent = 100 * df.isnull().sum()//len(df)
    missing_value_table = pd.concat([missing_value, missing_value_percent], axis=1)
    missing_value_table_return = missing_value_table.rename(columns = {0 : 'Missing Values', 1 : '% Value'})
    cm = sns.light_palette("lightblue", as_cmap=True)
    missing_value_table_return = missing_value_table_return.style.background_gradient(cmap=cm)
    return missing_value_table_return
  
missing_value_table(df)

Unnamed: 0,Missing Values,% Value
sex,0,0
name,0,0


In [6]:
# Sex distribution
print("Number of Female Names: ", df[df.sex == "F"].size)
print("Number of Female Names: ", df[df.sex == "M"].size)

Number of Female Names:  121208
Number of Female Names:  68872


In [7]:
df.sex.replace({'F':0,'M':1}, inplace=True)

In [8]:
cv = CountVectorizer()
X = cv.fit_transform(df.name)

In [9]:
gender_vectorizer = open("gender_vectorizer.pkl","wb")
joblib.dump(cv,gender_vectorizer)
gender_vectorizer.close()

## Modeling

In [10]:
# Feature
X 
# Label
y = df.sex
# Train and Test Data Splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
print("Accuracy on test set: ", clf.score(X_test,y_test)*100,"%")
print("Accuracy on test set: ", clf.score(X_train,y_train)*100,"%")

Accuracy on test set:  64.13085065680399 %
Accuracy on test set:  100.0 %


## Saving Model

In [13]:
NaiveBayesModel = open("naivebayesgendermodel.pkl","wb")

In [14]:
joblib.dump(clf,NaiveBayesModel)

In [15]:
NaiveBayesModel.close()