In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/ESS_practice_data/ESSdata_Thinkful.csv')

In [3]:
data.shape

(8594, 13)

In [4]:
data = pd.get_dummies(data)

In [5]:
data.head()

Unnamed: 0,idno,year,tvtot,ppltrst,pplfair,pplhlp,happy,sclmeet,sclact,gndr,agea,partner,cntry_CH,cntry_CZ,cntry_DE,cntry_ES,cntry_NO,cntry_SE
0,5.0,6,3.0,3.0,10.0,5.0,8.0,5.0,4.0,2.0,60.0,1.0,1,0,0,0,0,0
1,25.0,6,6.0,5.0,7.0,5.0,9.0,3.0,2.0,2.0,59.0,1.0,1,0,0,0,0,0
2,26.0,6,1.0,8.0,8.0,8.0,7.0,6.0,3.0,1.0,24.0,2.0,1,0,0,0,0,0
3,28.0,6,4.0,6.0,6.0,7.0,10.0,6.0,2.0,2.0,64.0,1.0,1,0,0,0,0,0
4,29.0,6,5.0,6.0,7.0,5.0,8.0,7.0,2.0,2.0,55.0,1.0,1,0,0,0,0,0


In [6]:
for column in data.columns: 
    data[column] = data[column].fillna(data[column].median())

In [7]:
data.isnull().sum()

idno        0
year        0
tvtot       0
ppltrst     0
pplfair     0
pplhlp      0
happy       0
sclmeet     0
sclact      0
gndr        0
agea        0
partner     0
cntry_CH    0
cntry_CZ    0
cntry_DE    0
cntry_ES    0
cntry_NO    0
cntry_SE    0
dtype: int64

# Modeling Preparation

In [8]:
data['gndr'].value_counts()

1.0    4326
2.0    4268
Name: gndr, dtype: int64

In [9]:
X = data.drop('gndr', axis=1)
Y = data['gndr']

**The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.**

**The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.**

**The F_beta score can be interpreted as a weighted harmonic mean of the precision and recall, where an F_beta score reaches its best value at 1 and worst score at 0.**

**The F_beta score weights recall beta as much as precision. beta = 1.0 means recall and precsion are equally important. The support is the number of occurrences of each class in y_true.**

# Initial Multi-Layer Perceptron Classifier

In [10]:
mlp = MLPClassifier(hidden_layer_sizes=(20,))

In [11]:
mlp.fit(X, Y)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(20,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [12]:
scores_mlp = cross_val_score(mlp, X, Y, cv=3)
print('Multi-Level Perceptron Classifier #1 Performance: {:.2%} +/- {:.2%}'.format(scores_mlp.mean(),scores_mlp.std()))

Multi-Level Perceptron Classifier #1 Performance: 50.50% +/- 1.48%




In [13]:
Y_pred_mlp = mlp.predict(X)

In [14]:
print(classification_report(Y, Y_pred_mlp))

              precision    recall  f1-score   support

         1.0       0.50      1.00      0.67      4326
         2.0       0.74      0.01      0.01      4268

    accuracy                           0.51      8594
   macro avg       0.62      0.50      0.34      8594
weighted avg       0.62      0.51      0.34      8594



## Varying Activation Function

In [15]:
mlp2 = MLPClassifier(activation='logistic')

In [16]:
mlp2.fit(X, Y)

MLPClassifier(activation='logistic', alpha=0.0001, batch_size='auto',
              beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [17]:
scores_mlp2 = cross_val_score(mlp2, X, Y, cv=3)
print('Multi-Level Perceptron Classifier #2 Performance: {:.2%} +/- {:.2%}'.format(scores_mlp2.mean(),scores_mlp2.std()))

Multi-Level Perceptron Classifier #2 Performance: 48.73% +/- 1.74%


In [18]:
Y_pred_mlp2 = mlp2.predict(X)

In [19]:
print(classification_report(Y, Y_pred_mlp2))

              precision    recall  f1-score   support

         1.0       0.55      0.07      0.13      4326
         2.0       0.50      0.94      0.65      4268

    accuracy                           0.50      8594
   macro avg       0.53      0.51      0.39      8594
weighted avg       0.53      0.50      0.39      8594



## Varying Alpha

In [20]:
mlp3 = MLPClassifier(alpha = 0.001)

In [21]:
mlp3.fit(X, Y)



MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [22]:
scores_mlp3 = cross_val_score(mlp3, X, Y, cv=3)
print('Multi-Level Perceptron Classifier #3 Performance: {:.2%} +/- {:.2%}'.format(scores_mlp3.mean(),scores_mlp3.std()))

Multi-Level Perceptron Classifier #3 Performance: 49.62% +/- 0.65%




In [23]:
Y_pred_mlp3 = mlp3.predict(X)

In [24]:
print(classification_report(Y, Y_pred_mlp3))

              precision    recall  f1-score   support

         1.0       0.50      0.71      0.58      4326
         2.0       0.48      0.27      0.35      4268

    accuracy                           0.49      8594
   macro avg       0.49      0.49      0.47      8594
weighted avg       0.49      0.49      0.47      8594



## Varying Hidden Layer Size

In [25]:
mlp4 = MLPClassifier(hidden_layer_sizes=(6,5,4,3,2))

In [26]:
mlp4.fit(X, Y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(6, 5, 4, 3, 2), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=None, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False)

In [27]:
scores_mlp4 = cross_val_score(mlp4, X, Y, cv=3)
print('Multi-Level Perceptron Classifier #4 Performance: {:.2%} +/- {:.2%}'.format(scores_mlp4.mean(),scores_mlp4.std()))

Multi-Level Perceptron Classifier #4 Performance: 50.76% +/- 1.24%


In [28]:
Y_pred_mlp4 = mlp4.predict(X)

In [29]:
print(classification_report(Y, Y_pred_mlp4))

              precision    recall  f1-score   support

         1.0       0.00      0.00      0.00      4326
         2.0       0.50      1.00      0.66      4268

    accuracy                           0.50      8594
   macro avg       0.25      0.50      0.33      8594
weighted avg       0.25      0.50      0.33      8594



  'precision', 'predicted', average, warn_for)


# Random Forest Classifier 

In [30]:
rfc = RandomForestClassifier(max_depth=20)

In [31]:
rfc.fit(X, Y)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=20, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [32]:
scores_rfc = cross_val_score(rfc, X, Y, cv=3)
print('Random Forest Classifier Performance: {:.2%} +/- {:.2%}'.format(scores_rfc.mean(),scores_rfc.std()))

Random Forest Classifier Performance: 48.37% +/- 1.29%


In [33]:
Y_pred_rfc = rfc.predict(X)

In [34]:
print(classification_report(Y, Y_pred_rfc))

              precision    recall  f1-score   support

         1.0       0.97      0.98      0.97      4326
         2.0       0.98      0.97      0.97      4268

    accuracy                           0.97      8594
   macro avg       0.97      0.97      0.97      8594
weighted avg       0.97      0.97      0.97      8594

