In [1]:
# importing the required libraries
import pandas as pd
from sklearn import preprocessing
import scikitplot as skplt # for plotting the confusion matrix

# importing the required ML classifiers
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

# importing ML evaluation metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn import metrics, model_selection
from sklearn.model_selection import ShuffleSplit,cross_val_score # for Monte Carlo Cross Validation

In [2]:
df_gender = pd.read_csv('gender-prediction.csv')
df_gender

Unnamed: 0,height,weight,beard,hair_length,shoe_size,scarf,eye_color,gender
0,71,176,yes,short,44,no,black,male
1,68,165,no,bald,41,no,black,male
2,62,132,no,medium,37,yes,blue,female
3,65,138,no,long,38,no,gray,female
4,70,197,yes,medium,43,no,gray,male
...,...,...,...,...,...,...,...,...
80,80,124,no,short,46,no,brown,male
81,83,110,no,long,35,no,brown,female
82,85,130,yes,bald,48,no,black,male
83,78,111,no,medium,37,yes,green,female


In [3]:
# converting the string based inputs into integer as the model only understands integers
labels = preprocessing.LabelEncoder()
beard_encd = labels.fit_transform(df_gender.beard)
hair_length_encd = labels.fit_transform(df_gender.hair_length)
scarf_encd = labels.fit_transform(df_gender.scarf)
eye_color_encd = labels.fit_transform(df_gender.eye_color)
gender_encd = labels.fit_transform(df_gender.gender)

In [4]:
# selecting a classifier
# model = RandomForestClassifier()

# create a Gaussian classifier
# model = GaussianNB()

# create a Decision tree classifier
model = DecisionTreeClassifier()

In [5]:
# separating independent and dependent variables from the data frame
x = list(zip(df_gender.height, df_gender.weight, beard_encd, hair_length_encd, df_gender.shoe_size, scarf_encd, eye_color_encd))
y = gender_encd

In [9]:
# for shuffle split model
mc = ShuffleSplit(n_splits = 5,test_size = 0.33,random_state = 7)

In [10]:
print("Cross validation scores with Monte Carlo Cross Validation")
cross_val_score(model, x, y, cv = mc).mean() 

Cross validation scores with Monte Carlo Cross Validation


0.9517241379310345

In [25]:
# make train/test split 67/33
X_train, x_test, Y_train, y_test = train_test_split(x, y, test_size = 0.20, random_state = 2)

In [27]:
# training the model using the training data
model.fit(X_train,Y_train)

RandomForestClassifier()

In [28]:
# test the model using the test input only
prediction = model.predict(x_test)

In [29]:
# calcualting and printing the accuracy
model_acc = accuracy_score(y_test, prediction)*100
print(model_acc)

100.0


In [30]:
# generate classification report
model_cl_rep = metrics.classification_report(y_test, prediction)
print(model_cl_rep)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00        10

    accuracy                           1.00        16
   macro avg       1.00      1.00      1.00        16
weighted avg       1.00      1.00      1.00        16



In [31]:
# generate confusion matrix
model_cm = metrics.confusion_matrix(y_test, prediction)
print(model_cm)

[[ 6  0]
 [ 0 10]]


In [32]:
# changing the model 
model = SVC()

In [33]:
# training the model using the training data
model.fit(X_train,Y_train)

SVC()

In [34]:
# test the model using the test input only
prediction_SVC = model.predict(x_test)

In [35]:
# calcualting and printing the accuracy
model_acc_SVC = accuracy_score(y_test, prediction_SVC)*100
print(model_acc_SVC)

81.25


In [36]:
# generate classification report
model_cl_rep_SVC = metrics.classification_report(y_test, prediction_SVC)
print(model_cl_rep_SVC)

              precision    recall  f1-score   support

           0       0.80      0.67      0.73         6
           1       0.82      0.90      0.86        10

    accuracy                           0.81        16
   macro avg       0.81      0.78      0.79        16
weighted avg       0.81      0.81      0.81        16



In [37]:
# generate confusion matrix
model_cm_SVC = metrics.confusion_matrix(y_test, prediction_SVC)
print(model_cm_SVC)

[[4 2]
 [1 9]]


In [38]:
# changing the classifier as required
model_MLPC = MLPClassifier()

In [39]:
# training the model using the training data
model_MLPC.fit(X_train,Y_train)

MLPClassifier()

In [40]:
# test the model using the test input only
prediction_MLPC = model_MLPC.predict(x_test)

In [41]:
# calcualting and printing the accuracy
model_acc_MLPC = accuracy_score(y_test, prediction_MLPC)*100
print(model_acc_MLPC)

62.5


In [42]:
# generate classification report
model_cl_rep_MLPC = metrics.classification_report(y_test, prediction_MLPC)
print(model_cl_rep_MLPC)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         6
           1       0.62      1.00      0.77        10

    accuracy                           0.62        16
   macro avg       0.31      0.50      0.38        16
weighted avg       0.39      0.62      0.48        16



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [43]:
# generate confusion matrix
model_cm_MLPC = metrics.confusion_matrix(y_test, prediction_MLPC)
print(model_cm_MLPC)

[[ 0  6]
 [ 0 10]]
