## Gender Classification Of Names
### Using Machine Learning To Detect/Predict Gender of Individuals 
+ Sklearn
+ Pandas
+ Text Extraction

In [1]:
# EDA packages
import pandas as pd
import numpy as np


In [2]:
# ML Packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer


In [3]:
# import nltk
# nltk.download('names')
# from nltk.corpus import names
# from nltk import NaiveBayesClassifier as NBC
# from nltk import classify
# import random

# # Fetch NLTK Names dataset
# maleNames = [ (name, 'male') for name in names.words('male.txt') ]
# femaleNames = [ (name, 'female') for name in names.words('female.txt') ]

# allNames = maleNames + femaleNames
# random.shuffle(allNames)
# nltkData = pd.DataFrame(allNames,columns =['name','sex']).reset_index()
# nltkData.sex = nltkData.sex.replace('female', 'F')
# nltkData.sex = nltkData.sex.replace('male', 'M')
# save_to_file(nltkData,saveFileToPath)
# nltkData.sex.value_counts()
# #namesDataDF = nltkData.copy()

In [4]:
# Load our data
namesDataDF = pd.read_csv('names_dataset.csv')

In [5]:
namesDataDF.head()

Unnamed: 0,index,name,sex
0,0,Mary,F
1,1,Anna,F
2,2,Emma,F
3,3,Elizabeth,F
4,4,Minnie,F


In [6]:
namesDataDF.size

285075

In [7]:
# Data Cleaning
# Checking for column name consistency
namesDataDF.columns

Index(['index', 'name', 'sex'], dtype='object')

In [8]:
# Data Types
namesDataDF.dtypes

index     int64
name     object
sex      object
dtype: object

In [9]:
# Checking for Missing Values
namesDataDF.isnull().isnull().sum()

index    0
name     0
sex      0
dtype: int64

In [10]:
# Number of Female Names
namesDataDF[namesDataDF.sex == 'F'].count()

index    60597
name     60597
sex      60597
dtype: int64

In [11]:
# Number of Male Names
namesDataDF[namesDataDF.sex == 'M'].count()

index    34428
name     34428
sex      34428
dtype: int64

In [12]:
df_names = namesDataDF

In [13]:
# Replacing All F and M with 0 and 1 respectively
df_names.sex.replace({'F':0,'M':1},inplace=True)

In [14]:
df_names.sex.unique()

array([0, 1], dtype=int64)

In [15]:
df_names.dtypes

index     int64
name     object
sex       int64
dtype: object

In [16]:
Xfeatures = df_names['name']

In [17]:
# Feature Extraction 
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [18]:
cv.get_feature_names()

['aaban',
 'aabha',
 'aabid',
 'aabriella',
 'aada',
 'aadam',
 'aadan',
 'aadarsh',
 'aaden',
 'aadesh',
 'aadhav',
 'aadhavan',
 'aadhi',
 'aadhira',
 'aadhvik',
 'aadhya',
 'aadhyan',
 'aadi',
 'aadian',
 'aadil',
 'aadin',
 'aadish',
 'aadison',
 'aadit',
 'aadith',
 'aadithya',
 'aaditri',
 'aaditya',
 'aadiv',
 'aadon',
 'aadrian',
 'aadrika',
 'aadrit',
 'aadvik',
 'aadvika',
 'aadya',
 'aadyn',
 'aafia',
 'aafreen',
 'aagam',
 'aage',
 'aagot',
 'aahaan',
 'aahan',
 'aahana',
 'aahil',
 'aahir',
 'aahliyah',
 'aahna',
 'aahron',
 'aaidan',
 'aaiden',
 'aaidyn',
 'aaila',
 'aailiyah',
 'aailyah',
 'aaima',
 'aaira',
 'aairah',
 'aaisha',
 'aaishah',
 'aaiyana',
 'aaiza',
 'aaja',
 'aajah',
 'aajaylah',
 'aajon',
 'aakanksha',
 'aakarsh',
 'aakash',
 'aakeem',
 'aakilah',
 'aakira',
 'aakiyah',
 'aakriti',
 'aala',
 'aalaiya',
 'aalaiyah',
 'aalana',
 'aalanah',
 'aalani',
 'aalap',
 'aalaya',
 'aalayah',
 'aalayiah',
 'aalayjah',
 'aalayna',
 'aalaysha',
 'aalaysia',
 'aalea',
 

# Training the Naives Bayes Classifier model

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Features 
X
# Labels
y = df_names.sex

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)


0.6418310970797159

In [23]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_test,y_test)*100,"%")

Accuracy of Model 64.18310970797158 %


In [24]:
# Accuracy of our Model
print("Accuracy of Model",clf.score(X_train,y_train)*100,"%")

Accuracy of Model 100.0 %


### Sample Prediction of Naives Bayes Classifier

In [25]:
# Sample1 Prediction
sample_name = ["Mary"]
vect = cv.transform(sample_name).toarray()
vect

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [26]:
# Female is 0, Male is 1
clf.predict(vect)

array([0], dtype=int64)

In [27]:
# Sample2 Prediction
sample_name1 = ["Mark"]
vect1 = cv.transform(sample_name1).toarray()
clf.predict(vect1)

array([1], dtype=int64)

In [28]:
# Sample3 Prediction of Russian Names
sample_name2 = ["Natasha"]
vect2 = cv.transform(sample_name2).toarray()
clf.predict(vect2)

array([0], dtype=int64)

In [29]:
# Sample3 Prediction of Random Names
sample_name3 = ["Nefertiti","Nasha","Ama","Ayo","Xhavier","Ovetta","Tathiana","Xia","Joseph","Xianliang"]
vect3 = cv.transform(sample_name3).toarray()
clf.predict(vect3)

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0], dtype=int64)

In [30]:
# A function to do it
def genderpredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")
    

In [31]:
genderpredictor("Martha")

Female


Features fxn
apply the fxn
vectorizer
fit
transform
classifier
fit
predict


In [32]:
namelist = ["Yaa","Yaw","Femi","Masha"]
for i in namelist:
    print(genderpredictor(i))

Female
None
Male
None
Female
None
Female
None


### Using a custom function for feature analysis

In [33]:
# By Analogy most female names ends in 'A' or 'E' or has the sound of 'A'
def features(name):
    name = name.lower()
    return {
        'first-letter': name[0], # First letter
        'first2-letters': name[0:2], # First 2 letters
        'first3-letters': name[0:3], # First 3 letters
        'last-letter': name[-1],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
    }

In [34]:
# Vectorize the features function
features = np.vectorize(features)
print(features(["Anna", "Hannah", "Peter","John","Vladmir","Mohammed"]))

[{'first-letter': 'a', 'first2-letters': 'an', 'first3-letters': 'ann', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna'}
 {'first-letter': 'h', 'first2-letters': 'ha', 'first3-letters': 'han', 'last-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah'}
 {'first-letter': 'p', 'first2-letters': 'pe', 'first3-letters': 'pet', 'last-letter': 'r', 'last2-letters': 'er', 'last3-letters': 'ter'}
 {'first-letter': 'j', 'first2-letters': 'jo', 'first3-letters': 'joh', 'last-letter': 'n', 'last2-letters': 'hn', 'last3-letters': 'ohn'}
 {'first-letter': 'v', 'first2-letters': 'vl', 'first3-letters': 'vla', 'last-letter': 'r', 'last2-letters': 'ir', 'last3-letters': 'mir'}
 {'first-letter': 'm', 'first2-letters': 'mo', 'first3-letters': 'moh', 'last-letter': 'd', 'last2-letters': 'ed', 'last3-letters': 'med'}]


In [35]:
# Extract the features for the dataset
df_X = features(df_names['name'])

In [36]:
df_y = df_names['sex']

In [37]:
from sklearn.feature_extraction import DictVectorizer
 
corpus = features(["Mike", "Julia"])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)
 

  (0, 1)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (0, 7)	1.0
  (0, 9)	1.0
  (0, 10)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (1, 6)	1.0
  (1, 8)	1.0
  (1, 11)	1.0


In [38]:
dv.get_feature_names()

['first-letter=j',
 'first-letter=m',
 'first2-letters=ju',
 'first2-letters=mi',
 'first3-letters=jul',
 'first3-letters=mik',
 'last-letter=a',
 'last-letter=e',
 'last2-letters=ia',
 'last2-letters=ke',
 'last3-letters=ike',
 'last3-letters=lia']

In [39]:
# Train Test Split
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X, df_y, test_size=0.33, random_state=13)

In [40]:
dfX_train

array([{'first-letter': 'a', 'first2-letters': 'ad', 'first3-letters': 'ady', 'last-letter': 'n', 'last2-letters': 'yn', 'last3-letters': 'syn'},
       {'first-letter': 'j', 'first2-letters': 'ju', 'first3-letters': 'jun', 'last-letter': 'a', 'last2-letters': 'ya', 'last3-letters': 'iya'},
       {'first-letter': 'f', 'first2-letters': 'fr', 'first3-letters': 'fre', 'last-letter': 't', 'last2-letters': 'nt', 'last3-letters': 'ont'},
       ...,
       {'first-letter': 'l', 'first2-letters': 'la', 'first3-letters': 'lae', 'last-letter': 'n', 'last2-letters': 'nn', 'last3-letters': 'ynn'},
       {'first-letter': 'm', 'first2-letters': 'me', 'first3-letters': 'mer', 'last-letter': 'y', 'last2-letters': 'ry', 'last3-letters': 'ury'},
       {'first-letter': 'n', 'first2-letters': 'no', 'first3-letters': 'nor', 'last-letter': 'e', 'last2-letters': 'le', 'last3-letters': 'lle'}],
      dtype=object)

# Decision

In [41]:
dv = DictVectorizer()
dv.fit_transform(dfX_train)


<63666x8165 sparse matrix of type '<class 'numpy.float64'>'
	with 381996 stored elements in Compressed Sparse Row format>

In [42]:
# Model building Using DecisionTree

from sklearn.tree import DecisionTreeClassifier
 
dclf = DecisionTreeClassifier()
my_xfeatures =dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [46]:
# Build Features and Transform them
sample_name_eg = ["Alex"]
transform_dv =dv.transform(features(sample_name_eg))

vect3 = transform_dv.toarray()

# Predicting Gender of Name
# Male is 1,female = 0
dclf.predict(vect3)

if dclf.predict(vect3) == 0:
    print("Female")
else:
    print("Male")

Male


In [47]:
# Second Prediction With Nigerian Name
name_eg1 = ["william"]
transform_dv =dv.transform(features(name_eg1))
vect4 = transform_dv.toarray()
if dclf.predict(vect4) == 0:
    print("Female")
else:
    print("Male")

Male


In [48]:
#  Prediction With example  Name
name_eg1 = ["William K.L. Dickson	"]
transform_dv =dv.transform(features(name_eg1))
vect4 = transform_dv.toarray()
if dclf.predict(vect4) == 0:
    print("Female")
else:
    print("Male")

Male


In [49]:
# A function to do it
def genderpredictor1(a):
    test_name1 = [a]
    transform_dv =dv.transform(features(test_name1))
    #print(transform_dv)
    vector = transform_dv.toarray()
    if dclf.predict(vector) == 0:
        return "Female"
    else:
        return "Male"
    

In [50]:
random_name_list = ["Alex","Alice","Chioma","Vitalic","Clairese","Chan","George"]

In [51]:
for n in random_name_list:
    print(genderpredictor1(n))

Male
Female
Female
Female
Female
Male
Male


In [52]:
## Accuracy of Models Decision Tree Classifier Works better than Naive Bayes
# Accuracy on training set
print(dclf.score(dv.transform(dfX_train), dfy_train)) 
 

0.989492036565828


In [53]:
# Accuracy on test set
print(dclf.score(dv.transform(dfX_test), dfy_test))

0.8627826142415256


### Saving Our Model

In [54]:
from sklearn.externals import joblib



In [55]:
decisiontreModel = open("decisiontreemodel.pkl","wb")

In [56]:
joblib.dump(dclf,decisiontreModel)

In [57]:
decisiontreModel.close

<function BufferedWriter.close>

In [58]:
#Alternative to Model Saving
import pickle
dctreeModel = open("namesdetectormodel.pkl","wb")

In [59]:
pickle.dump(dclf,dctreeModel)

In [60]:
dctreeModel.close()

##### Save Multinomial NB Model

In [61]:
NaiveBayesModel = open("naivebayesgendermodel.pkl","wb")

In [62]:
joblib.dump(clf,NaiveBayesModel)

In [63]:
NaiveBayesModel.close()

In [64]:
# Thanks
# By Jesse JCharis
# Jesus Saves @ JCharisTech
# J-Secur1ty

### Genderize

In [65]:
from dateutil.parser import parse 
import numpy as np
import pandas as pd
import unicodedata
from datetime import datetime
from datetime import date

In [66]:
# Helper method for save_to_file

def get_df_name(df):
    name =[x for x in globals() if globals()[x] is df][0]
    return name

In [67]:
# Function: Save Dataframe to CSV

def save_to_file(dataFrame, folderPath):
    today = date.today()
    todayDate = today.strftime("%b-%d-%Y")
    dataFrame.to_csv(folderPath+ '\\' + get_df_name(dataFrame) + '_' + todayDate  + '.csv',index=False, encoding="utf8" )

In [89]:
fileLocation = 'C:\\Yuva\\ITU\\3rd Sem\\Research Topics\\Movies & Gender\\Data\\IMDb\\stage2ExplodeAndMergeDF_Nov-17-2019.csv'
saveFileToPath = "C:\\Yuva\\ITU\\3rd Sem\\Research Topics\\Movies & Gender\\Data\\IMDb"

In [90]:
# Import as Dataframe - check later
dfGenderlessData = pd.read_csv(fileLocation,encoding="utf8")

In [91]:
dfGenderlessData

Unnamed: 0,director_ids,director_names,movie_id,title,review_count_user,review_count_critic,metascore,rating_value,rating_count,date_published,release_date,release_year,movie_year,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0005690,William K.L. Dickson,tt0000001,Carmencita - spanyol tánc (1894) - IMDb,12,2,,5.8,1255,1894-03-10,10 March 1894 (USA) See more »,1894.0,1894,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763"
1,nm0721526,Émile Reynaud,tt0000002,A bohóc és kutyái (1892) - IMDb,0,,,6.5,148,1892-10-28,28 October 1892 (France) See more »,1892.0,1892,1844,1918,director,"tt2184231,tt0000003,tt2184201,tt0413219"
2,nm0721526,Émile Reynaud,tt0000003,Szegény Pierrot (1892) - IMDb,12,5,,6.6,819,1892-10-28,28 October 1892 (France) See more »,1892.0,1892,1844,1918,director,"tt2184231,tt0000003,tt2184201,tt0413219"
3,nm0721526,Émile Reynaud,tt0000004,Egy jó pohár sör (1892) - IMDb,1,,,6.6,90,1892-10-28,28 October 1892 (France) See more »,1892.0,1892,1844,1918,director,"tt2184231,tt0000003,tt2184201,tt0413219"
4,nm0005690,William K.L. Dickson,tt0000005,A patkolókovács (1893) - IMDb,18,3,,6.2,1499,1893-05-09,9 May 1893 (USA) See more »,1893.0,1893,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763"
5,nm0005690,William K.L. Dickson,tt0000006,Kínai ópiumbarlang (1894) - IMDb,,,,5.7,70,1894-10-17,17 October 1894 (UK) See more »,1894.0,1894,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763"
6,nm0005690,William K.L. Dickson,tt0000007,A Corbett-Courtney bokszmeccs (1894) - IMDb,5,2,,5.5,499,2008-12-31,,,1894,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763"
7,nm0374658,William Heise,tt0000007,A Corbett-Courtney bokszmeccs (1894) - IMDb,5,2,,5.5,499,2008-12-31,,,1894,1847,1910,"cinematographer,director,producer","tt0285863,tt0241393,tt0229665,tt0241715"
8,nm0005690,William K.L. Dickson,tt0000008,Edison asszisztense tüsszent (1894) - IMDb,19,4,,5.6,1341,1894-01-09,9 January 1894 (USA) See more »,1894.0,1894,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763"
9,nm0085156,Alexander Black,tt0000009,Miss Jerry (1894) - IMDb,1,2,,5.6,58,1894-10-09,9 October 1894 (USA) See more »,1894.0,1894,1859,1940,"director,writer,cinematographer",tt0000009


In [92]:
def specialCharacterConversion(name):
    text = unicodedata.normalize('NFKD', name).encode('ascii', 'ignore').decode("utf-8").lower() 
    return text
    

In [93]:
df4 = dfGenderlessData.copy()

In [94]:
#Creating First name to perform vlookup 

dfGenderlessData['FirstName'] = dfGenderlessData['director_names'].apply(lambda x: x.split(' ')[0]).apply(specialCharacterConversion)
dfGenderlessData.head()

Unnamed: 0,director_ids,director_names,movie_id,title,review_count_user,review_count_critic,metascore,rating_value,rating_count,date_published,release_date,release_year,movie_year,birthYear,deathYear,primaryProfession,knownForTitles,FirstName
0,nm0005690,William K.L. Dickson,tt0000001,Carmencita - spanyol tánc (1894) - IMDb,12,2.0,,5.8,1255,1894-03-10,10 March 1894 (USA) See more »,1894.0,1894,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",william
1,nm0721526,Émile Reynaud,tt0000002,A bohóc és kutyái (1892) - IMDb,0,,,6.5,148,1892-10-28,28 October 1892 (France) See more »,1892.0,1892,1844,1918,director,"tt2184231,tt0000003,tt2184201,tt0413219",emile
2,nm0721526,Émile Reynaud,tt0000003,Szegény Pierrot (1892) - IMDb,12,5.0,,6.6,819,1892-10-28,28 October 1892 (France) See more »,1892.0,1892,1844,1918,director,"tt2184231,tt0000003,tt2184201,tt0413219",emile
3,nm0721526,Émile Reynaud,tt0000004,Egy jó pohár sör (1892) - IMDb,1,,,6.6,90,1892-10-28,28 October 1892 (France) See more »,1892.0,1892,1844,1918,director,"tt2184231,tt0000003,tt2184201,tt0413219",emile
4,nm0005690,William K.L. Dickson,tt0000005,A patkolókovács (1893) - IMDb,18,3.0,,6.2,1499,1893-05-09,9 May 1893 (USA) See more »,1893.0,1893,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",william


In [114]:
# nameDatabaseLocationNLTK = 'C:\\Yuva\\ITU\\3rd Sem\\Research Topics\\Movies & Gender\\Genderize\\Lookup_nltk.csv'
nameDatabaseLocationNLTK = 'C:\\Yuva\\ITU\\3rd Sem\\Research Topics\\Movies & Gender\\Genderize\\nltkData_Dec-15-2019.csv'
# nameDatabaseLocationGuardian = 'C:\\Yuva\\ITU\\3rd Sem\\Research Topics\\Movies & Gender\\Genderize\\GuardianWebiste_names.csv'
# nameDatabaseLocationSSN =  'C:\\Yuva\\ITU\\3rd Sem\\Research Topics\\Movies & Gender\\Genderize\\SSN\\SSNData.csv'

# Import as Dataframe

# nameSSNDF = pd.read_csv(nameDatabaseLocationSSN,encoding="utf8")
# nameGuardianDF = pd.read_csv(nameDatabaseLocationGuardian,encoding="utf8")
nameNLTKDF = pd.read_csv(nameDatabaseLocationNLTK,encoding="utf8")

# Remove duplicate entries from SSN dataset
# nameSSNDF = nameSSNDF.drop_duplicates(subset='FirstName')
nameNLTKDF = nameNLTKDF.drop_duplicates(subset='FirstName')


# convert firstname to lower
# nameSSNDF.FirstName = nameSSNDF.FirstName.str.lower()
# nameGuardianDF.FirstName = nameGuardianDF.FirstName.str.lower()
nameNLTKDF.FirstName = nameNLTKDF.FirstName.str.lower()
nameNLTKDF

Unnamed: 0,FirstName,Gender
0,yancy,Male
1,barret,Male
2,heda,Female
3,siward,Male
4,fredia,Female
5,iolande,Female
6,aina,Female
7,dael,Female
8,churchill,Male
9,andri,Male


In [115]:
dfGenderlessData

Unnamed: 0,director_ids,director_names,movie_id,title,review_count_user,review_count_critic,metascore,rating_value,rating_count,date_published,release_date,release_year,movie_year,birthYear,deathYear,primaryProfession,knownForTitles,FirstName
0,nm0005690,William K.L. Dickson,tt0000001,Carmencita - spanyol tánc (1894) - IMDb,12,2,,5.8,1255,1894-03-10,10 March 1894 (USA) See more »,1894.0,1894,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",william
1,nm0721526,Émile Reynaud,tt0000002,A bohóc és kutyái (1892) - IMDb,0,,,6.5,148,1892-10-28,28 October 1892 (France) See more »,1892.0,1892,1844,1918,director,"tt2184231,tt0000003,tt2184201,tt0413219",emile
2,nm0721526,Émile Reynaud,tt0000003,Szegény Pierrot (1892) - IMDb,12,5,,6.6,819,1892-10-28,28 October 1892 (France) See more »,1892.0,1892,1844,1918,director,"tt2184231,tt0000003,tt2184201,tt0413219",emile
3,nm0721526,Émile Reynaud,tt0000004,Egy jó pohár sör (1892) - IMDb,1,,,6.6,90,1892-10-28,28 October 1892 (France) See more »,1892.0,1892,1844,1918,director,"tt2184231,tt0000003,tt2184201,tt0413219",emile
4,nm0005690,William K.L. Dickson,tt0000005,A patkolókovács (1893) - IMDb,18,3,,6.2,1499,1893-05-09,9 May 1893 (USA) See more »,1893.0,1893,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",william
5,nm0005690,William K.L. Dickson,tt0000006,Kínai ópiumbarlang (1894) - IMDb,,,,5.7,70,1894-10-17,17 October 1894 (UK) See more »,1894.0,1894,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",william
6,nm0005690,William K.L. Dickson,tt0000007,A Corbett-Courtney bokszmeccs (1894) - IMDb,5,2,,5.5,499,2008-12-31,,,1894,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",william
7,nm0374658,William Heise,tt0000007,A Corbett-Courtney bokszmeccs (1894) - IMDb,5,2,,5.5,499,2008-12-31,,,1894,1847,1910,"cinematographer,director,producer","tt0285863,tt0241393,tt0229665,tt0241715",william
8,nm0005690,William K.L. Dickson,tt0000008,Edison asszisztense tüsszent (1894) - IMDb,19,4,,5.6,1341,1894-01-09,9 January 1894 (USA) See more »,1894.0,1894,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",william
9,nm0085156,Alexander Black,tt0000009,Miss Jerry (1894) - IMDb,1,2,,5.6,58,1894-10-09,9 October 1894 (USA) See more »,1894.0,1894,1859,1940,"director,writer,cinematographer",tt0000009,alexander


In [116]:
# Merge genders and first names
# nameGenderSSNDF = dfGenderlessData.merge(nameSSNDF, on='FirstName',how ='left')
# nameGenderGuardianDF = dfGenderlessData.merge(nameGuardianDF, on='FirstName',how ='left')
nameGenderNLTKDF = dfGenderlessData.merge(nameNLTKDF, on='FirstName',how ='left')

In [117]:
# nameGenderSSNDF.Gender.value_counts(dropna=False)
# nameGenderGuardianDF.Gender.value_counts(dropna=False)
nameGenderNLTKDF.Gender.value_counts(dropna=False)

Male      487123
NaN       178938
Female    113613
Name: Gender, dtype: int64

In [None]:
#GuardianMissingGenderDF = nameGenderGuardianDF[pd.isnull(nameGenderGuardianDF['Gender'])]

# Classify the gender for the missing values

In [118]:
# Wherever the value is NA, predict the gender using our trained model

# nameGenderSSNDF.loc[nameGenderSSNDF['Gender'].isna(),'Gender'] = nameGenderSSNDF['FirstName'].apply(genderpredictor1)
# nameGenderGuardianDF.loc[nameGenderGuardianDF['Gender'].isna(),'Gender'] = nameGenderGuardianDF['FirstName'].apply(genderpredictor1)
nameGenderNLTKDF.loc[nameGenderNLTKDF['Gender'].isna(),'Gender'] = nameGenderNLTKDF['FirstName'].apply(genderpredictor1)


In [119]:
# nameGenderSSNDF.Gender.value_counts(dropna=False)
# nameGenderGuardianDF.Gender.value_counts(dropna=False)
nameGenderNLTKDF.Gender.value_counts(dropna=False)

Male      621340
Female    158334
Name: Gender, dtype: int64

In [80]:
#nameGenderSSNDF.loc[nameGenderSSNDF['FirstName'] == 'george']

In [120]:
stage3FinalProcessedDF = nameGenderNLTKDF.copy()

In [121]:
stage3FinalProcessedDF

Unnamed: 0,director_ids,director_names,movie_id,title,review_count_user,review_count_critic,metascore,rating_value,rating_count,date_published,release_date,release_year,movie_year,birthYear,deathYear,primaryProfession,knownForTitles,FirstName,Gender
0,nm0005690,William K.L. Dickson,tt0000001,Carmencita - spanyol tánc (1894) - IMDb,12,2,,5.8,1255,1894-03-10,10 March 1894 (USA) See more »,1894.0,1894,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",william,Male
1,nm0721526,Émile Reynaud,tt0000002,A bohóc és kutyái (1892) - IMDb,0,,,6.5,148,1892-10-28,28 October 1892 (France) See more »,1892.0,1892,1844,1918,director,"tt2184231,tt0000003,tt2184201,tt0413219",emile,Male
2,nm0721526,Émile Reynaud,tt0000003,Szegény Pierrot (1892) - IMDb,12,5,,6.6,819,1892-10-28,28 October 1892 (France) See more »,1892.0,1892,1844,1918,director,"tt2184231,tt0000003,tt2184201,tt0413219",emile,Male
3,nm0721526,Émile Reynaud,tt0000004,Egy jó pohár sör (1892) - IMDb,1,,,6.6,90,1892-10-28,28 October 1892 (France) See more »,1892.0,1892,1844,1918,director,"tt2184231,tt0000003,tt2184201,tt0413219",emile,Male
4,nm0005690,William K.L. Dickson,tt0000005,A patkolókovács (1893) - IMDb,18,3,,6.2,1499,1893-05-09,9 May 1893 (USA) See more »,1893.0,1893,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",william,Male
5,nm0005690,William K.L. Dickson,tt0000006,Kínai ópiumbarlang (1894) - IMDb,,,,5.7,70,1894-10-17,17 October 1894 (UK) See more »,1894.0,1894,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",william,Male
6,nm0005690,William K.L. Dickson,tt0000007,A Corbett-Courtney bokszmeccs (1894) - IMDb,5,2,,5.5,499,2008-12-31,,,1894,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",william,Male
7,nm0374658,William Heise,tt0000007,A Corbett-Courtney bokszmeccs (1894) - IMDb,5,2,,5.5,499,2008-12-31,,,1894,1847,1910,"cinematographer,director,producer","tt0285863,tt0241393,tt0229665,tt0241715",william,Male
8,nm0005690,William K.L. Dickson,tt0000008,Edison asszisztense tüsszent (1894) - IMDb,19,4,,5.6,1341,1894-01-09,9 January 1894 (USA) See more »,1894.0,1894,1860,1935,"cinematographer,director,producer","tt0219560,tt6687694,tt1428455,tt1496763",william,Male
9,nm0085156,Alexander Black,tt0000009,Miss Jerry (1894) - IMDb,1,2,,5.6,58,1894-10-09,9 October 1894 (USA) See more »,1894.0,1894,1859,1940,"director,writer,cinematographer",tt0000009,alexander,Male


In [122]:
stage3FinalProcessedDF = stage3FinalProcessedDF.drop(columns=['FirstName','release_year'])

In [123]:
save_to_file(stage3FinalProcessedDF,saveFileToPath)

# End

# Testing Accuracy of trained model on SSN Data

In [124]:
# Load SSN
SSNDatabaseLocation = 'C:\\Yuva\\ITU\\3rd Sem\\Research Topics\\Movies & Gender\\Genderize\\SSN\\SSNData.csv'


In [125]:
genderSSN = pd.read_csv(SSNDatabaseLocation)

# Remove duplicate entries from SSN dataset
genderSSN = genderSSN.drop_duplicates(subset='FirstName')

genderSSN.head()

Unnamed: 0,FirstName,Gender
0,Emma,Female
1,Olivia,Female
2,Ava,Female
3,Isabella,Female
4,Sophia,Female


In [127]:
#nameGenderGuardianDF.loc[nameGenderGuardianDF['Gender'].isna(),'Gender'] = nameGenderGuardianDF['director_names'].apply(genderpredictor1)

genderSSN['PredictedGender'] = genderSSN.FirstName.apply(genderpredictor1)

In [129]:
genderSSN['Correct'] = np.where((genderSSN.Gender == genderSSN.PredictedGender),1,0)

# df['que'] = np.where((df['one'] >= df['two']) & (df['one'] <= df['three']) , df['one'], np.nan)

In [144]:
AccuracyModelOnSSNData = genderSSN.Correct.value_counts()[1]/genderSSN.Correct.count()
print("Accuracy of the model on SSN Data is :",round((AccuracyModelOnSSNData*100),2),"%")

Accuracy of the model on SSN Data is : 88.75 %
