In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# EDA package
import pandas as pd
import numpy as np

In [3]:
# ML packages
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
#from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Load the dataset
df = pd.read_csv('drive/My Drive/JuzzChatBot/100names.csv')
df.head()

Unnamed: 0,names,sex
0,Kyaw,male
1,Quek Kim Lee,female
2,Herry Chai,male
3,Julie,female
4,Gerd,male


In [5]:
df.size

196

In [6]:
#Data Cleaning
# Check for column name consistency
df.columns

Index(['names', 'sex'], dtype='object')

In [7]:
#Data types
df.dtypes

names    object
sex      object
dtype: object

In [8]:
# Check for missing values
df.isnull().isnull().sum()

names    0
sex      0
dtype: int64

In [9]:
# Number of female names
df[df.sex == 'female'].size

70

In [10]:
# Number of Male names
df[df.sex == 'male'].size


126

In [11]:
df_name = df

In [12]:
#Replacing all female and male with 0 and 1 
df_name.sex.replace({'female':0,'male':1}, inplace=True)

In [13]:
df_name.sex.unique()

array([1, 0])

In [14]:
df_name.dtypes

names    object
sex       int64
dtype: object

In [15]:
Xfeatures = df_name['names']

In [16]:
# Feature Extraction
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [17]:
cv.get_feature_names()



['aden',
 'adilah',
 'adrian',
 'ahammad',
 'alex',
 'alistar',
 'ananda',
 'ang',
 'ann',
 'ansley',
 'ariel',
 'bebeh',
 'beh',
 'boo',
 'brenda',
 'catherine',
 'cavien',
 'chai',
 'chan',
 'chee',
 'cheng',
 'chia',
 'chiew',
 'chong',
 'choo',
 'choong',
 'chua',
 'cova',
 'de',
 'dino',
 'fariz',
 'fern',
 'gek',
 'gerd',
 'gildon',
 'gina',
 'goh',
 'gunjikar',
 'halim',
 'hansen',
 'hauy',
 'heng',
 'herry',
 'hong',
 'hsin',
 'hui',
 'hwee',
 'inis',
 'jarrod',
 'jen',
 'jennie',
 'jenny',
 'jens',
 'jessie',
 'jiaming',
 'jiekai',
 'jimmy',
 'johnny',
 'joo',
 'joshua',
 'julie',
 'jun',
 'kaiyun',
 'karan',
 'karthick',
 'keam',
 'kean',
 'keat',
 'kee',
 'kheong',
 'khim',
 'kim',
 'kong',
 'kuah',
 'kwan',
 'kyaw',
 'lau',
 'lay',
 'lee',
 'leh',
 'leng',
 'li',
 'lim',
 'linda',
 'ling',
 'lional',
 'loong',
 'louis',
 'madelil',
 'md',
 'mei',
 'michael',
 'min',
 'ming',
 'mohammad',
 'mon',
 'monica',
 'nair',
 'najib',
 'namrata',
 'naren',
 'ng',
 'ngiap',
 'nicholas

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
#Features
X
#Labels
y = df_name.sex

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33,random_state = 42)

In [21]:
# Naive Baynes Classifier
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train,y_train)
clf.score(X_test,y_test)

0.6060606060606061

In [22]:
# Accuracy of our model
print("Accuracy of Model",clf.score(X_test,y_test)*100,"%")

Accuracy of Model 60.60606060606061 %


In [23]:
#Accuracy of our model
print("Accuracy of model", clf.score(X_train,y_train)*100,"%")

Accuracy of model 100.0 %


In [24]:
#Sample1 Prediction
sample_name = ["Mark"]
vect =cv.transform(sample_name).toarray()

In [25]:
#Female is 0, Male is 1
clf.predict(vect)

array([1])

In [28]:
# Sample2 Prediction
sample_name1 =["Peter"]
vect2 =cv.transform(sample_name1).toarray()

In [29]:
#Female is 0, Male is 1
clf.predict(vect2)

array([1])

In [36]:
# Sample3 Prediction
sample_name2 =["Jessie"]
vect3 =cv.transform(sample_name2).toarray()

In [37]:
#Female is 0, Male is 1
clf.predict(vect3)

array([0])