In [1]:
import pandas as pd
import re 
import numpy as np 
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
url ="https://raw.githubusercontent.com/brownyu/FirstName-to-predict-gender/master/name_gender.csv"
data = pd.read_csv(url)

In [3]:
data

Unnamed: 0,name,gender
0,Aaban&&,M
1,Aabha*,F
2,Aabid,M
3,Aabriella,F
4,Aada_,F
...,...,...
95020,Zyvion,M
95021,Zyvon,M
95022,Zyyanna,F
95023,Zyyon,M


In [4]:
#check whether the dataset is imbalanced 
data['gender'].value_counts()

F    60304
M    34721
Name: gender, dtype: int64

In [5]:
data['gender'].value_counts(normalize=True)

F    0.634612
M    0.365388
Name: gender, dtype: float64

In [6]:
#check for any empty value 
data.isna().sum()

name      0
gender    0
dtype: int64

In [7]:
#create a function to do data clearning 
def preprocessing(text):
    text=re.sub(r'[^a-zA-Z]','',text)#only get all the letter 
    text=text.lower()#to lower case letter 

    return text

In [8]:
#applying self-defined function to name 
data['name_clean']=data['name'].apply(preprocessing)
data

Unnamed: 0,name,gender,name_clean
0,Aaban&&,M,aaban
1,Aabha*,F,aabha
2,Aabid,M,aabid
3,Aabriella,F,aabriella
4,Aada_,F,aada
...,...,...,...
95020,Zyvion,M,zyvion
95021,Zyvon,M,zyvon
95022,Zyyanna,F,zyyanna
95023,Zyyon,M,zyyon


In [9]:
#lable encoder the target varaible
LabelEncoder=LabelEncoder()
data['gender']=LabelEncoder.fit_transform(data['gender'])
data

Unnamed: 0,name,gender,name_clean
0,Aaban&&,1,aaban
1,Aabha*,0,aabha
2,Aabid,1,aabid
3,Aabriella,0,aabriella
4,Aada_,0,aada
...,...,...,...
95020,Zyvion,1,zyvion
95021,Zyvon,1,zyvon
95022,Zyyanna,0,zyyanna
95023,Zyyon,1,zyyon


In [10]:
X=data['name_clean']
y=data['gender']

In [11]:
#vectorized the X 
count_vec= CountVectorizer()
x_trans=count_vec.fit_transform(X)

In [12]:
#since the data is imbalanced, apply over sample to the dataset 
rus=RandomOverSampler()
X_resample,y_resample=rus.fit_resample(x_trans,y)

In [13]:
y_resample.value_counts()

1    60304
0    60304
Name: gender, dtype: int64

In [14]:
X_resample

<120608x95025 sparse matrix of type '<class 'numpy.int64'>'
	with 120608 stored elements in Compressed Sparse Row format>

In [15]:
#split to train and test data 
x_train,x_test,y_train,y_test = train_test_split(X_resample,y_resample,test_size=0.2,random_state=42)

In [16]:
x_train.shape

(96486, 95025)

In [17]:
y_train.shape

(96486,)

In [18]:
y_test.shape

(24122,)

In [19]:
x_test.shape

(24122, 95025)

Random Forest Classifier 

In [41]:
#build base Random forest classifier with random state 
rf_model =RandomForestClassifier(random_state=78)
rf_model.fit(x_train,y_train)

In [42]:
y_pred = rf_model.predict(x_test)

In [43]:
accuracy_score(y_test, y_pred)

0.8174695298897272

In [64]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [65]:
random_grid 

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4],
 'bootstrap': [True, False]}

In [66]:
rf_random = RandomizedSearchCV(estimator = rf_model, param_distributions = random_grid, n_iter = 10, cv = 2, verbose=2, random_state=78, n_jobs = -1)

In [67]:
rf_random.fit(x_train,y_train)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


In [68]:
rf_random.best_params_

{'n_estimators': 200,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 30,
 'bootstrap': True}

This hyper parameter is trained by undersample dataset 

In [70]:
rf_random.best_score_

0.5009990461184526

Naive Bayes Classifier 

In [20]:
from sklearn.naive_bayes import MultinomialNB
from datetime import datetime
from sklearn.metrics import confusion_matrix

start=datetime.now()
nb = MultinomialNB()
nb.fit(x_train,y_train)


In [21]:
y_pred_nb = nb.predict(x_test)
accuracy_score(y_pred_nb,y_test)

0.4957714949009203

Simple neural network 

In [22]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout

In [23]:
model = Sequential()
model.add(Dense(3, input_dim=x_train.shape[1], activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(3, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [24]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 3)                 285078    
                                                                 
 dropout (Dropout)           (None, 3)                 0         
                                                                 
 dense_1 (Dense)             (None, 3)                 12        
                                                                 
 dense_2 (Dense)             (None, 1)                 4         
                                                                 
Total params: 285,094
Trainable params: 285,094
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
model.fit(x_train.toarray(), y_train, epochs=4, batch_size=500)

MemoryError: Unable to allocate 68.3 GiB for an array with shape (96486, 95025) and data type int64

In [28]:
loss, accuracy = model.evaluate(x_test.toarray(), y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Testing Accuracy:  0.5051


In [27]:
!pip list 

Package                      Version
---------------------------- -----------
absl-py                      1.2.0
adal                         1.2.7
altair                       4.2.0
anyio                        3.6.1
argcomplete                  2.0.0
argon2-cffi                  21.3.0
argon2-cffi-bindings         21.2.0
asgiref                      3.5.2
asttokens                    2.0.5
astunparse                   1.6.3
async-generator              1.10
attrs                        21.4.0
azure-common                 1.1.28
azure-core                   1.24.2
azure-graphrbac              0.61.1
azure-mgmt-authorization     2.0.0
azure-mgmt-containerregistry 10.0.0
azure-mgmt-core              1.3.0
azure-mgmt-keyvault          10.0.0
azure-mgmt-resource          21.1.0
azure-mgmt-storage           20.0.0
azureml-core                 1.43.0
Babel                        2.10.3
backcall                     0.2.0
backports.tempfile           1.0
backports.weakref            1.0.post1