In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import  MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import confusion_matrix, mean_squared_error


# Training Set Exploration

In [2]:
train = pd.read_csv("name_gender_dataset.csv")
print(train.shape)
train.head()


(147269, 4)


Unnamed: 0,Name,Gender,Count,Probability
0,James,M,5304407,0.014517
1,John,M,5260831,0.014398
2,Robert,M,4970386,0.013603
3,Michael,M,4579950,0.012534
4,William,M,4226608,0.011567


In [3]:
train_count = train.groupby(by='Gender').count()
train_count

Unnamed: 0_level_0,Name,Count,Probability
Gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,89749,89749,89749
M,57520,57520,57520


In [4]:
# Male: 1 Female: 0
train_use = train[['Name', 'Gender']]
train_use.Gender = train_use.Gender.apply(lambda x: 1 if x == "M" else 0)
train_use.Name = train_use.Name.str.lower()
print(train_use.shape)
train_use.head()

(147269, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,Name,Gender
0,james,1
1,john,1
2,robert,1
3,michael,1
4,william,1


In [5]:
# Check missing values

train_use.isnull().sum()

Name      0
Gender    0
dtype: int64

In [6]:
train_use.Gender.unique()

array([1, 0])

In [7]:
X_feature = train_use.Name
X_feature

0           james
1            john
2          robert
3         michael
4         william
           ...   
147264     zylenn
147265     zymeon
147266     zyndel
147267     zyshan
147268      zyton
Name: Name, Length: 147269, dtype: object

In [8]:
y_label = train_use.Gender
y_label

0         1
1         1
2         1
3         1
4         1
         ..
147264    1
147265    1
147266    1
147267    1
147268    1
Name: Gender, Length: 147269, dtype: int64

# Feature Extraction

In [9]:
cv = CountVectorizer()
X_matrix = cv.fit_transform(X_feature)

X_matrix

<147269x126918 sparse matrix of type '<class 'numpy.int64'>'
	with 154775 stored elements in Compressed Sparse Row format>

In [10]:
# tfidf = TfidfTransformer()
# X_matrix_tfidf = tfidf.fit_transform(X_matrix)
# X_matrix_tfidf

# Train Test Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_matrix, y_label, test_size=0.5, random_state=10)


In [12]:
X_train

<73634x126918 sparse matrix of type '<class 'numpy.int64'>'
	with 77360 stored elements in Compressed Sparse Row format>

In [13]:
X_test

<73635x126918 sparse matrix of type '<class 'numpy.int64'>'
	with 77415 stored elements in Compressed Sparse Row format>

In [14]:
y_train

51709     1
4226      0
71132     0
15726     0
18440     1
         ..
9372      0
105595    1
93553     0
94735     1
83209     1
Name: Gender, Length: 73634, dtype: int64

In [15]:
y_test

94055     0
96922     0
21172     1
83469     1
91171     0
         ..
6931      1
104064    0
105074    0
17545     0
52976     0
Name: Gender, Length: 73635, dtype: int64

# Model Training

**Multinomial NB**

In [16]:
clf1 = MultinomialNB()
clf1.fit(X_train, y_train)

MultinomialNB()

In [17]:
y_pred1 = clf1.predict(X_test)

In [18]:
# Accuracy of model
print("Accuracy of Model on training sets: ", clf1.score(X_train, y_train)*100, "%")
print("Accuracy of Model on testing sets: ", clf1.score(X_test, y_test)*100, "%")
print("RMSE on testing set =", mean_squared_error(y_test, y_pred1))

Accuracy of Model on training sets:  94.71847244479451 %
Accuracy of Model on testing sets:  58.00638283425001 %
RMSE on testing set = 0.41993617165749986


**Bernoulli NB**

In [19]:
clf2 = BernoulliNB()
clf2.fit(X_train, y_train)

BernoulliNB()

In [20]:
y_pred2 = clf2.predict(X_test)

In [21]:
# Accuracy of model
print("Accuracy of Model on training sets: ", clf2.score(X_train, y_train)*100, "%")
print("Accuracy of Model on testing sets: ", clf2.score(X_test, y_test)*100, "%")
print("RMSE on testing set =", mean_squared_error(y_test, y_pred2))

Accuracy of Model on training sets:  61.817910204525084 %
Accuracy of Model on testing sets:  61.64324030691927 %
RMSE on testing set = 0.3835675969308074


**Random Forest**

In [22]:
clf3 = SGDClassifier()
clf3.fit(X_train, y_train)

SGDClassifier()

In [23]:
y_pred3 = clf3.predict(X_test)

In [24]:
# Accuracy of model
print("Accuracy of Model on training sets: ", clf3.score(X_train, y_train)*100, "%")
print("Accuracy of Model on testing sets: ", clf3.score(X_test, y_test)*100, "%")
print("RMSE on testing set =", mean_squared_error(y_test, y_pred3))

Accuracy of Model on training sets:  61.3643153977782 %
Accuracy of Model on testing sets:  61.43817478101447 %
RMSE on testing set = 0.38561825218985535


# Sample Prediction

In [25]:
sample1 = ["HAMSA", "STEVEN", "PETER", "RICHARD", "KERRI", "LUCA", "ZACHARY", "EDWARD", "DAVID", "OLIVIER"]
gender1 = np.array([0, 1, 1, 1, 0, 1, 1, 1, 1, 1])
gender1

array([0, 1, 1, 1, 0, 1, 1, 1, 1, 1])

In [26]:
vect1 = cv.transform(sample1).toarray()
print("true:", gender1)
print("-----------------")
print("clf1:", clf1.predict(vect1))
print("clf2:", clf2.predict(vect1))
print("clf3:", clf3.predict(vect1))

true: [0 1 1 1 0 1 1 1 1 1]
-----------------
clf1: [1 1 1 1 0 1 1 1 1 1]
clf2: [0 0 1 0 0 0 0 1 1 0]
clf3: [0 0 1 0 0 0 0 0 1 0]


In [27]:
sample2 = ["MARK", "CHUCHU", "EDWARD", "CARMEN", "ROBERT", "ROBERT", "WESLEY", "DANIEL", "JEFFREY", "JONATHAN"]
gender2 = np.array([1, 0, 1, 0, 1, 1, 1, 1, 1, 1])
gender2

array([1, 0, 1, 0, 1, 1, 1, 1, 1, 1])

In [28]:
vect2 = cv.transform(sample2).toarray()
print("true:", gender2)
print("-----------------")
print("clf1:", clf1.predict(vect2))
print("clf2:", clf2.predict(vect2))
print("clf3:", clf3.predict(vect2))

true: [1 0 1 0 1 1 1 1 1 1]
-----------------
clf1: [1 0 1 0 1 1 1 1 1 1]
clf2: [1 0 1 0 0 0 0 1 0 0]
clf3: [1 0 0 0 1 1 0 1 0 0]


As a result of looking at scores and RMSE, also performing 2 sample predictions, we fould that clf1 has the best performance.

# Prediction on DataSet

In [35]:
faculties = pd.read_csv("merged.csv")
print(faculties.shape)
faculties.head()

(749, 11)


Unnamed: 0,Id,Name,LastName,FirstName,RankName,Department,TotalArticleCount,TotalConfProcCount,TotalGrantCount,TotalPatentCount,TotalCourseCount
0,1485,"BALAKRISHNAN, HAMSA",BALAKRISHNAN,HAMSA,Professor,Aeronautics and Astronautics,46,63,15,1,2
1,2030,"BARRETT, STEVEN R. H",BARRETT,STEVEN,Associate,Aeronautics and Astronautics,87,7,11,1,2
2,2190,"BELOBABA, PETER P",BELOBABA,PETER,Other,Aeronautics and Astronautics,24,2,0,0,2
3,2695,"BINZEL, RICHARD P",BINZEL,RICHARD,Professor,Aeronautics and Astronautics,163,13,23,0,2
4,4030,"CAHOY, KERRI",CAHOY,KERRI,Associate,Aeronautics and Astronautics,47,72,45,8,3


In [30]:
first_name = faculties[['FirstName']]
first_name.head(2)

Unnamed: 0,FirstName
0,HAMSA
1,STEVEN


In [31]:
vect = cv.transform(first_name.FirstName).toarray()

predict = pd.DataFrame(data=clf1.predict(vect), columns=['Gender_idx'])
predict['Gender'] = predict.Gender_idx.apply(lambda x: "Female" if x == 0 else "Male")
predict.head()

Unnamed: 0,Gender_idx,Gender
0,1,Male
1,1,Male
2,1,Male
3,1,Male
4,0,Female


In [32]:
first_name['Gender'] = predict.Gender
print(first_name.shape)
first_name.head()

(749, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  first_name['Gender'] = predict.Gender


Unnamed: 0,FirstName,Gender
0,HAMSA,Male
1,STEVEN,Male
2,PETER,Male
3,RICHARD,Male
4,KERRI,Female


In [36]:
faculties.insert(4, "Gender", first_name.Gender)
faculties.head()

Unnamed: 0,Id,Name,LastName,FirstName,Gender,RankName,Department,TotalArticleCount,TotalConfProcCount,TotalGrantCount,TotalPatentCount,TotalCourseCount
0,1485,"BALAKRISHNAN, HAMSA",BALAKRISHNAN,HAMSA,Male,Professor,Aeronautics and Astronautics,46,63,15,1,2
1,2030,"BARRETT, STEVEN R. H",BARRETT,STEVEN,Male,Associate,Aeronautics and Astronautics,87,7,11,1,2
2,2190,"BELOBABA, PETER P",BELOBABA,PETER,Male,Other,Aeronautics and Astronautics,24,2,0,0,2
3,2695,"BINZEL, RICHARD P",BINZEL,RICHARD,Male,Professor,Aeronautics and Astronautics,163,13,23,0,2
4,4030,"CAHOY, KERRI",CAHOY,KERRI,Female,Associate,Aeronautics and Astronautics,47,72,45,8,3


In [37]:
faculties.to_csv("faculties_with_gender.csv", index=False)