In [6]:
# importing the required libraries
import pandas as pd
from sklearn import preprocessing
import scikitplot as skplt # for plotting the confusion matrix

# importing the required ML classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# importing ML evaluation metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn import metrics, model_selection
from sklearn.model_selection import ShuffleSplit,cross_val_score # for Monte Carlo Cross Validation
from sklearn.model_selection import LeavePOut,cross_val_score # for leave P-out Cross Validation 

In [7]:
df_gender = pd.read_csv('gender-prediction.csv')
df_gender

Unnamed: 0,height,weight,beard,hair_length,shoe_size,scarf,eye_color,gender
0,71,176,yes,short,44,no,black,male
1,68,165,no,bald,41,no,black,male
2,62,132,no,medium,37,yes,blue,female
3,65,138,no,long,38,no,gray,female
4,70,197,yes,medium,43,no,gray,male
...,...,...,...,...,...,...,...,...
80,80,124,no,short,46,no,brown,male
81,83,110,no,long,35,no,brown,female
82,85,130,yes,bald,48,no,black,male
83,78,111,no,medium,37,yes,green,female


In [8]:
# converting the string based inputs into integer as the model only understands integers
labels = preprocessing.LabelEncoder()
beard_encd = labels.fit_transform(df_gender.beard)
hair_length_encd = labels.fit_transform(df_gender.hair_length)
scarf_encd = labels.fit_transform(df_gender.scarf)
eye_color_encd = labels.fit_transform(df_gender.eye_color)
gender_encd = labels.fit_transform(df_gender.gender)

In [9]:
# selecting a classifier
# model = RandomForestClassifier()

# create a Gaussian classifier
model = GaussianNB()

# create a Decision tree classifier
# model = DecisionTreeClassifier()

In [10]:
# separating independent and dependent variables from the data frame
x = list(zip(df_gender.height, df_gender.weight, beard_encd, hair_length_encd, df_gender.shoe_size, scarf_encd, eye_color_encd))
y = gender_encd

In [11]:
# # Leave POut stuff
# lpo = LeavePOut(p=2)
# lpo.get_n_splits(x)
# tree = RandomForestClassifier(n_estimators = 10, max_depth = 5, n_jobs= -1)
# score = cross_val_score(tree, x, y, cv = lpo)
# print("Cross Validation Scores are {}".format(score))
# print("Average Cross Validation score :{}".format(score.mean()))

In [12]:
# for shuffle split model
# mc = ShuffleSplit(n_splits = 5,test_size = 0.33,random_state = 7)

In [13]:
# print("Cross validation scores with Monte Carlo Cross Validation")
# cross_val_score(model, x, y, cv = mc).mean() 

In [14]:
# make train/test split 67/33
X_train, x_test, Y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state = 2)

In [15]:
# training the model using the training data
model.fit(X_train,Y_train)

GaussianNB()

In [16]:
# test the model using the test input only
prediction = model.predict(x_test)

In [17]:
# calcualting and printing the accuracy
model_acc = accuracy_score(y_test, prediction)*100
print(model_acc)

93.10344827586206


In [18]:
# generate classification report
model_cl_rep = metrics.classification_report(y_test, prediction)
print(model_cl_rep)

              precision    recall  f1-score   support

           0       1.00      0.82      0.90        11
           1       0.90      1.00      0.95        18

    accuracy                           0.93        29
   macro avg       0.95      0.91      0.92        29
weighted avg       0.94      0.93      0.93        29



In [19]:
# generate confusion matrix
model_cm = metrics.confusion_matrix(y_test, prediction)
print(model_cm)

[[ 9  2]
 [ 0 18]]


In [20]:
# changing the model 
model = SVC()

In [21]:
# training the model using the training data
model.fit(X_train,Y_train)

SVC()

In [22]:
# test the model using the test input only
prediction_SVC = model.predict(x_test)

In [23]:
# calcualting and printing the accuracy
model_acc_SVC = accuracy_score(y_test, prediction_SVC)*100
print(model_acc_SVC)

86.20689655172413


In [24]:
# generate classification report
model_cl_rep_SVC = metrics.classification_report(y_test, prediction_SVC)
print(model_cl_rep_SVC)

              precision    recall  f1-score   support

           0       0.82      0.82      0.82        11
           1       0.89      0.89      0.89        18

    accuracy                           0.86        29
   macro avg       0.85      0.85      0.85        29
weighted avg       0.86      0.86      0.86        29



In [25]:
# generate confusion matrix
model_cm_SVC = metrics.confusion_matrix(y_test, prediction_SVC)
print(model_cm_SVC)

[[ 9  2]
 [ 2 16]]


In [26]:
# changing the classifier as required
model_MLPC = MLPClassifier()

In [27]:
# training the model using the training data
model_MLPC.fit(X_train,Y_train)



MLPClassifier()

In [28]:
# test the model using the test input only
prediction_MLPC = model_MLPC.predict(x_test)

In [29]:
# calcualting and printing the accuracy
model_acc_MLPC = accuracy_score(y_test, prediction_MLPC)*100
print(model_acc_MLPC)

93.10344827586206


In [30]:
# generate classification report
model_cl_rep_MLPC = metrics.classification_report(y_test, prediction_MLPC)
print(model_cl_rep_MLPC)

              precision    recall  f1-score   support

           0       0.91      0.91      0.91        11
           1       0.94      0.94      0.94        18

    accuracy                           0.93        29
   macro avg       0.93      0.93      0.93        29
weighted avg       0.93      0.93      0.93        29



In [31]:
# generate confusion matrix
model_cm_MLPC = metrics.confusion_matrix(y_test, prediction_MLPC)
print(model_cm_MLPC)

[[10  1]
 [ 1 17]]
