In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Column headers supplied explicitly via `names=` when reading the CSV.
column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'educational-num',
    'marital-status', 'occupation', 'relationship', 'race', 'gender',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'income',
]

# Shared parser options: comma-separated, strip blanks after delimiters,
# apply the column names above, let pandas infer dtypes.
_read_options = dict(delimiter=",",
                     skipinitialspace=True,
                     names=column_names,
                     dtype=None)

adult_df = pd.read_csv("data/adult_training.csv", **_read_options)

# NOTE(review): this reads the same file as `adult_df`, so the "test" frame
# is identical to the training frame -- confirm the intended test-set path.
adult_test_df = pd.read_csv("data/adult_training.csv", **_read_options)

from sklearn.model_selection import train_test_split
from __future__ import division
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.utils import resample
from sklearn.ensemble import BaggingClassifier

#from joblib import dump, load
from sklearn.model_selection import cross_val_score
import sys
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [2]:
def performance_metrics(y_true, y_pred):
    """Compute binary-classification metrics from a confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    dict
        Keys: 'accuracy', 'precision', 'recall', 'f_measure', 'sensitivity',
        'specificity', 'error_rate', 'false_pos'. A ratio whose denominator
        is zero is reported as NaN.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2, i.e. the inputs do not contain
        exactly two distinct labels between them.
    """
    cm = confusion_matrix(y_true=y_true, y_pred=y_pred)

    # The index-based extraction below is only meaningful for a 2x2 matrix;
    # a larger matrix would silently yield wrong numbers and a 1x1 matrix
    # would fail with an opaque IndexError.
    if cm.shape != (2, 2):
        raise ValueError(
            "performance_metrics expects exactly two classes; "
            "got a confusion matrix of shape %s" % (cm.shape,))

    # Cast to float so the ratios are true divisions regardless of whether
    # `from __future__ import division` is active in the running kernel.
    TP = float(cm[1, 1])
    TN = float(cm[0, 0])
    FP = float(cm[0, 1])
    FN = float(cm[1, 0])

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as NaN instead of relying on
        # (possibly suppressed) numpy divide-by-zero warnings.
        return numerator / denominator if denominator else float('nan')

    accuracy = _ratio(TP + TN, TP + FN + FP + TN)
    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    f_measure = _ratio(2 * recall * precision, recall + precision)
    sensitivity = recall  # same formula: TP / (TP + FN)
    specificity = _ratio(TN, TN + FP)
    error_rate = 1 - accuracy
    false_pos = _ratio(FP, FP + TN)

    metrics = {}
    metrics['accuracy'] = accuracy
    metrics['precision'] = precision
    metrics['recall'] = recall
    metrics['f_measure'] = f_measure
    metrics['sensitivity'] = sensitivity
    metrics['specificity'] = specificity
    metrics['error_rate'] = error_rate
    metrics['false_pos'] = false_pos

    return metrics

In [3]:
def _impute_missing_category(df, column, estimator, missing_token='?'):
    """Replace `missing_token` entries of `column` in `df` (in place) with
    predictions from `estimator` trained on the rows where the value is known.

    Features are the one-hot encoding of every column except 'income' and
    `column` itself. Returns `df` for convenience.
    """
    missing_mask = df[column].values == missing_token
    if not missing_mask.any():
        # Nothing to impute; predicting on an empty frame would raise.
        return df

    features = pd.get_dummies(df.drop(columns=['income', column]))
    known_mask = ~missing_mask

    estimator.fit(features[known_mask], df[column][known_mask])
    df.loc[missing_mask, column] = estimator.predict(features[missing_mask])
    return df


def adult_preprocess(df, balanced=False, impute=False, random_state=None):
    """Turn a raw adult-census frame into model inputs.

    Steps: drop the 'fnlwgt' column, deal with '?' placeholders, one-hot
    encode with `pd.get_dummies`, optionally balance the classes, shuffle the
    rows, and split into features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'fnlwgt' and 'income'; with `impute=True` also
        'workclass', 'occupation' and 'native-country'. The caller's frame
        is not modified.
    balanced : bool
        If True, keep every row where 'income_>50K' is 1 and sample the same
        number of rows where it is 0.
    impute : bool
        If True, '?' values in 'workclass' and 'occupation' are predicted by
        a 10-tree random forest and those in 'native-country' by a decision
        tree, each trained on the rows where the value is known.
        If False, every row containing a '?' is removed.
    random_state : int or None
        Seed forwarded to the imputation estimators and the final shuffle.
        The default (None) leaves them unseeded, as before.

    Returns
    -------
    X : numpy.ndarray
        All one-hot encoded columns except the 'income_*' columns.
    Y : numpy.ndarray
        Values of the 'income_>50K' column, row-aligned with X.
    """
    # drop() returns a new frame, so later in-place edits stay local.
    df = df.drop(columns=["fnlwgt"])

    if impute:
        # Imported lazily: only this branch needs scikit-learn.
        from sklearn.ensemble import RandomForestClassifier
        from sklearn import tree

        imputers = [
            ('workclass',
             RandomForestClassifier(n_estimators=10, random_state=random_state)),
            ('occupation',
             RandomForestClassifier(n_estimators=10, random_state=random_state)),
            ('native-country',
             tree.DecisionTreeClassifier(random_state=random_state)),
        ]
        # Order matters: each later imputation sees the earlier filled values.
        for column, estimator in imputers:
            _impute_missing_category(df, column, estimator)
    else:
        # remove rows with '?'s
        df = df[(df != '?').all(axis=1)]

    # convert categorical data into one-hot
    df_one_hot = pd.get_dummies(df)

    # sample equal number of plus and minus
    if balanced:
        over_50k = df_one_hot[df_one_hot['income_>50K'] == 1]
        under_50k = df_one_hot[df_one_hot['income_>50K'] == 0]
        sample_number = len(over_50k)
        df_clean = pd.concat([
            over_50k.sample(n=sample_number, random_state=0),
            under_50k.sample(n=sample_number, random_state=0),
        ])
    else:
        df_clean = df_one_hot

    # randomize data order
    df_clean = df_clean.sample(frac=1, random_state=random_state)

    # Split into inputs and targets. The income dummies are selected by name
    # rather than by position so the target can never leak into X.
    income_columns = [name for name in df_clean.columns
                      if name.startswith('income_')]
    X = df_clean.drop(columns=income_columns).values
    Y = df_clean.loc[:, 'income_>50K'].values

    return X, Y

In [4]:
# Build features/labels from the training frame.
X, Y = adult_preprocess(adult_df, balanced=False, impute=True)

# StandardScaler standardises each feature to zero mean and unit variance
# (it does not rescale to [0, 1]).
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Hold out 20% of the training data for validation.
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.2, random_state = 0)

# Preprocess the test frame and scale it with the statistics learned from the
# training data; refitting the scaler here would put the test features on a
# different scale from the one the model is trained on.
X_test, Y_test = adult_preprocess(adult_test_df, balanced=False, impute=True)
X_test = scaler.transform(X_test)

In [19]:
def create_baseline(activation = 'relu', depth = 2, input_dim=None,
                    hidden_units=60, dropout_rate=0.5):
    """Build and compile a feed-forward binary classifier.

    Architecture: one Dense layer as wide as the input, then `depth`
    repetitions of (Dense(hidden_units) + Dropout(dropout_rate)), then a
    single sigmoid output unit. Note that `depth=0` still contains the
    input-width Dense layer.

    Parameters
    ----------
    activation : str
        Activation used by every layer except the output layer.
    depth : int
        Number of Dense+Dropout pairs after the first layer.
    input_dim : int or None
        Number of input features. When None, falls back to the notebook
        global `X_test.shape[1]` (the previous behaviour).
    hidden_units : int
        Width of each repeated Dense layer.
    dropout_rate : float
        Rate passed to each Dropout layer.

    Returns
    -------
    A compiled Sequential model (binary cross-entropy loss, adam optimizer,
    accuracy metric).
    """
    if input_dim is None:
        # Backward-compatible fallback on the notebook-level test matrix.
        input_dim = X_test.shape[1]

    # create model
    model = Sequential()
    model.add(Dense(input_dim, input_dim=input_dim, kernel_initializer='normal', activation=activation))

    for _ in range(depth):
        model.add(Dense(hidden_units, kernel_initializer='normal', activation=activation))
        model.add(Dropout(dropout_rate))

    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [2]:
def performance_metrics(y_true, y_pred):
    """Compute binary-classification metrics from a confusion matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    dict
        Keys: 'accuracy', 'precision', 'recall', 'f_measure', 'sensitivity',
        'specificity', 'error_rate', 'false_pos'. A ratio whose denominator
        is zero is reported as NaN.

    Raises
    ------
    ValueError
        If the confusion matrix is not 2x2, i.e. the inputs do not contain
        exactly two distinct labels between them.
    """
    cm = confusion_matrix(y_true=y_true, y_pred=y_pred)

    # The index-based extraction below is only meaningful for a 2x2 matrix;
    # a larger matrix would silently yield wrong numbers and a 1x1 matrix
    # would fail with an opaque IndexError.
    if cm.shape != (2, 2):
        raise ValueError(
            "performance_metrics expects exactly two classes; "
            "got a confusion matrix of shape %s" % (cm.shape,))

    # Cast to float so the ratios are true divisions regardless of whether
    # `from __future__ import division` is active in the running kernel.
    TP = float(cm[1, 1])
    TN = float(cm[0, 0])
    FP = float(cm[0, 1])
    FN = float(cm[1, 0])

    def _ratio(numerator, denominator):
        # Undefined ratios are reported as NaN instead of relying on
        # (possibly suppressed) numpy divide-by-zero warnings.
        return numerator / denominator if denominator else float('nan')

    accuracy = _ratio(TP + TN, TP + FN + FP + TN)
    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    f_measure = _ratio(2 * recall * precision, recall + precision)
    sensitivity = recall  # same formula: TP / (TP + FN)
    specificity = _ratio(TN, TN + FP)
    error_rate = 1 - accuracy
    false_pos = _ratio(FP, FP + TN)

    metrics = {}
    metrics['accuracy'] = accuracy
    metrics['precision'] = precision
    metrics['recall'] = recall
    metrics['f_measure'] = f_measure
    metrics['sensitivity'] = sensitivity
    metrics['specificity'] = specificity
    metrics['error_rate'] = error_rate
    metrics['false_pos'] = false_pos

    return metrics

In [26]:
# Row labels for the results table, one per network depth tried below.
depth_list = ["0 hidden layers", "1 hidden layer", "2 hidden layers", "3 hidden layers", "4 hidden layers"]

# Train one network per depth and record its metrics on (X_test, Y_test).
nn_metrics = []
for i in range(len(depth_list)):
    clf = create_baseline(depth=i)
    clf.fit(X_train, Y_train, batch_size=32, epochs=10, verbose=0)

    # Round the sigmoid outputs to hard 0/1 predictions.
    y_pred = clf.predict(X_test).round()

    nn_pm = performance_metrics(Y_test, y_pred)
    print(nn_pm)
    nn_metrics.append(nn_pm)

# One row per depth, metrics rounded to four decimals.
nn_df = pd.DataFrame(nn_metrics, index=depth_list).round(4)


{'error_rate': 0.13344184760910294, 'recall': 0.6522127279683714, 'sensitivity': 0.6522127279683714, 'precision': 0.7596553773024362, 'false_pos': 0.06545307443365696, 'specificity': 0.934546925566343, 'f_measure': 0.7018458793659507, 'accuracy': 0.8665581523908971}
{'error_rate': 0.13123061331040198, 'recall': 0.670195128172427, 'sensitivity': 0.670195128172427, 'precision': 0.7569864592336503, 'false_pos': 0.06824433656957929, 'specificity': 0.9317556634304207, 'f_measure': 0.7109517689237639, 'accuracy': 0.868769386689598}
{'error_rate': 0.13181413347255921, 'recall': 0.6772095395995409, 'sensitivity': 0.6772095395995409, 'precision': 0.7509546033092914, 'false_pos': 0.0712378640776699, 'specificity': 0.9287621359223301, 'f_measure': 0.7121781115879828, 'accuracy': 0.8681858665274408}
{'error_rate': 0.136205890482479, 'recall': 0.583343961229435, 'sensitivity': 0.583343961229435, 'precision': 0.7965865552072449, 'false_pos': 0.04724919093851133, 'specificity': 0.9527508090614887, 'f

Unnamed: 0,accuracy,error_rate,f_measure,false_pos,precision,recall,sensitivity,specificity
0 hidden layers,0.8666,0.1334,0.7018,0.0655,0.7597,0.6522,0.6522,0.9345
1 hidden layer,0.8688,0.1312,0.711,0.0682,0.757,0.6702,0.6702,0.9318
2 hidden layers,0.8682,0.1318,0.7122,0.0712,0.751,0.6772,0.6772,0.9288
3 hidden layers,0.8638,0.1362,0.6735,0.0472,0.7966,0.5833,0.5833,0.9528
4 hidden layers,0.8642,0.1358,0.6784,0.0504,0.7893,0.5948,0.5948,0.9496


In [27]:
# Rank the runs: highest f_measure first, ties broken by highest accuracy.
nn_df = nn_df.sort_values(by=['f_measure', 'accuracy'], ascending=False)
display(nn_df)

Unnamed: 0,accuracy,error_rate,f_measure,false_pos,precision,recall,sensitivity,specificity
2 hidden layers,0.8682,0.1318,0.7122,0.0712,0.751,0.6772,0.6772,0.9288
1 hidden layer,0.8688,0.1312,0.711,0.0682,0.757,0.6702,0.6702,0.9318
0 hidden layers,0.8666,0.1334,0.7018,0.0655,0.7597,0.6522,0.6522,0.9345
4 hidden layers,0.8642,0.1358,0.6784,0.0504,0.7893,0.5948,0.5948,0.9496
3 hidden layers,0.8638,0.1362,0.6735,0.0472,0.7966,0.5833,0.5833,0.9528


In [28]:
# Show the metrics dict stored at index 2 of nn_metrics, i.e. the run the
# training loop built with depth=2 (labelled "2 hidden layers").
nn_metrics[2]

{'accuracy': 0.8681858665274408,
 'error_rate': 0.13181413347255921,
 'f_measure': 0.7121781115879828,
 'false_pos': 0.0712378640776699,
 'precision': 0.7509546033092914,
 'recall': 0.6772095395995409,
 'sensitivity': 0.6772095395995409,
 'specificity': 0.9287621359223301}

In [1]:
# Metrics for whatever predictions are currently held in `y_pred`.
# NOTE(review): `nn_performance` is not used in the visible cells -- confirm
# whether it is needed.
nn_performance = performance_metrics(y_true=Y_test, y_pred=y_pred)

# Single-row summary table built from the depth=2 entry of nn_metrics.
nn_df = pd.DataFrame(data=[nn_metrics[2]], index=["Neural Network"]).round(decimals=4)

NameError: name 'performance_metrics' is not defined

NameError: name 'nn_df' is not defined

In [11]:
# Persist the summary table, keeping the row labels as the first CSV column.
file_name = "nn.csv"
nn_df.to_csv(file_name, index=True, encoding='utf-8')

# Read it back, restoring the first column as the index, to check the file.
csv_nn_df = pd.read_csv(file_name, index_col=0, encoding='utf-8')
display(csv_nn_df)

Unnamed: 0,accuracy,error_rate,f_measure,false_pos,precision,recall,sensitivity,specificity
Neural Network,0.8596,0.1404,0.6664,0.0524,0.779,0.5822,0.5822,0.9476
