In [22]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [115]:
#LOAD DATA
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
adult = fetch_ucirepo(id=2) 
  
# data (as pandas dataframes) 
X = adult.data.features 
y = adult.data.targets 


In [120]:
#CLEAN DATA
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import scipy
from scipy.stats import pearsonr
import sklearn
from sklearn.preprocessing import StandardScaler 
from sklearn.preprocessing import LabelEncoder
from sklearn import datasets, linear_model
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score


all_df = pd.read_csv("./adult.data",index_col=False)

column_titles = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'income']

# Assign the column titles to the DataFrame using the columns attribute.
all_df.columns = column_titles

X = all_df
le = LabelEncoder()
X['workclass'] = le.fit_transform(X['workclass'])
X['education'] = le.fit_transform(X['education'])
X['marital-status'] = le.fit_transform(X['marital-status'])
X['occupation'] = le.fit_transform(X['occupation'])
X['relationship'] = le.fit_transform(X['relationship'])
X['race'] = le.fit_transform(X['race'])
X['sex'] = le.fit_transform(X['sex'])
X['native-country'] = le.fit_transform(X['native-country'])
X = X.drop('income', axis=1)

y = all_df

y['income'].replace(' <=50K', 0, inplace = True)
y['income'].replace(' <=50K.', 0, inplace = True)
y['income'].replace(' >50K', 1, inplace = True)
y['income'].replace(' >50K.', 1, inplace = True)

y = y['income']

In [121]:
#SPLIT DATA
scaler = StandardScaler()
Xs = scaler.fit_transform(X)
Xs_train, Xs_test, y_train, y_test = train_test_split(Xs, y, test_size=0.3,random_state=1, stratify=y)

In [122]:
#APPLY NEURAL NETWORK, MLP CLASSIFIER
from sklearn.neural_network import MLPClassifier
mlp_clf = MLPClassifier(random_state=1, max_iter=500).fit(Xs_train, y_train)
mlp_clf.predict_proba(Xs_test)
classifier_score = mlp_clf.score(Xs_test, y_test)
print('The classifier accuracy score of MLP is {:03.2f}'.format(classifier_score))

The classifier accuracy score of MLP is 0.85


In [55]:
#K-FOLD
param_grid = {
    'hidden_layer_sizes': [2, 3],
    'max_iter': [700, 800]
}
# kf = KFold(n_splits=10, shuffle=True, random_state=1)
inner_cv = KFold(n_splits=4, shuffle=True, random_state=1)
outer_cv = KFold(n_splits=4, shuffle=True, random_state=1)
grid_search = GridSearchCV(MLPClassifier(), param_grid, cv=inner_cv, scoring='neg_mean_squared_error')

In [57]:
grid_search.fit(Xs_train, y_train)
non_nested_scores = grid_search.best_score_
print(non_nested_scores)


GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=MLPClassifier(),
             param_grid={'hidden_layer_sizes': [2, 3], 'max_iter': [300, 500]},
             scoring='neg_mean_squared_error')

In [63]:
nested_cv_results = cross_val_score(grid_search, Xs_train, y_train, cv=outer_cv).mean()
print(nested_cv_results)

-0.24104958623040068


In [64]:
best_params = grid_search.best_params_
best_mlp = grid_search.best_estimator_
print(best_params)
print(best_mlp)

In [68]:
#CHANGE TO LOGISTIC REGRESSION FOR CLASSIFICATION

In [129]:
from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(max_iter=100)
log_reg.fit(Xs_train, y_train)

LogisticRegression()

In [130]:
classifier_score_reg = log_reg.score(Xs_test, y_test)
print('The classifier accuracy score is {:03.2f}'.format(classifier_score_reg))

The classifier accuracy score is 0.83


In [131]:
param_grid = {
    'intercept_scaling': [1, 2, 3],
    'C': [1, 2, 3],
    'max_iter': [200, 300]
}
# kf = KFold(n_splits=10, shuffle=True, random_state=1)
inner_cv = KFold(n_splits=5, shuffle=True, random_state=1)
outer_cv = KFold(n_splits=5, shuffle=True, random_state=1)
grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=inner_cv, scoring='neg_mean_squared_error')

In [132]:
grid_search.fit(Xs_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=1, shuffle=True),
             estimator=LogisticRegression(),
             param_grid={'C': [1, 2, 3], 'intercept_scaling': [1, 2, 3],
                         'max_iter': [200, 300]},
             scoring='neg_mean_squared_error')

In [133]:
non_nested_scores = grid_search.best_score_
print(non_nested_scores)

-0.1765530303722988


In [134]:
nested_cv_results = cross_val_score(grid_search, Xs_train, y_train, cv=outer_cv).mean()
print(nested_cv_results)

-0.17664078816080253


In [136]:
best_params = grid_search.best_params_
best_log = grid_search.best_estimator_
print(best_params)
print(best_log)

{'C': 1, 'intercept_scaling': 1, 'max_iter': 200}
LogisticRegression(C=1, max_iter=200)
