In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
warnings.filterwarnings('ignore')

# Column labels applied to both CSV reads below via `names=`.
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
                'marital-status', 'occupation', 'relationship', 'race', 'gender',
                'capital-gain', 'capital-loss', 'hours-per-week', 'native-country','income']


# Training data. skipinitialspace=True strips the blank that follows each comma.
adult_df = pd.read_csv("data/adult_training.csv",
                      delimiter=",",
                      skipinitialspace=True,
                      names = column_names,
                      dtype=None)

# NOTE(review): this reads "data/adult_training.csv" again, so adult_test_df is a
# second copy of the training data and every "test" metric computed from it is a
# training-set metric. TODO: point this at the held-out test file.
adult_test_df = pd.read_csv("data/adult_training.csv",
                      delimiter=",",
                      skipinitialspace=True,
                      names = column_names,
                      dtype=None)

from sklearn.model_selection import train_test_split
from __future__ import division
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


In [2]:
# Preview the first rows of the training frame to check that the column names line up.
adult_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Summary statistics of the training frame (the output covers the numeric columns).
adult_df.describe()

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [4]:
# Pairwise correlation of the numeric columns. The string columns are excluded
# explicitly because newer pandas releases raise on them instead of silently
# dropping them as older releases did.
display(adult_df.select_dtypes(include=[np.number]).corr())

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week
age,1.0,-0.076646,0.036527,0.077674,0.057775,0.068756
fnlwgt,-0.076646,1.0,-0.043195,0.000432,-0.010252,-0.018768
educational-num,0.036527,-0.043195,1.0,0.12263,0.079923,0.148123
capital-gain,0.077674,0.000432,0.12263,1.0,-0.031615,0.078409
capital-loss,0.057775,-0.010252,0.079923,-0.031615,1.0,0.054256
hours-per-week,0.068756,-0.018768,0.148123,0.078409,0.054256,1.0


In [5]:
# Per-column count of cells whose string form is exactly '?' (the missing-value
# placeholder in this data set); only columns with at least one hit are reported.
placeholder_hits = (adult_df.values.astype(str) == '?').sum(axis = 0)
for col_name, hit_count in zip(adult_df.columns, placeholder_hits):
    if hit_count <= 0:
        continue
    print("{} records have \'?\' as \'{}\'".format(hit_count, col_name))

1836 records have '?' as 'workclass'
1843 records have '?' as 'occupation'
583 records have '?' as 'native-country'


In [6]:
def oneHotCatVars(df, df_cols):
    """Return a copy of `df` with the columns in `df_cols` one-hot encoded.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    df_cols : list of str
        Names of the categorical columns to expand with `pd.get_dummies`.

    Returns
    -------
    pandas.DataFrame
        The columns of `df` that are not in `df_cols`, followed by the
        dummy columns generated for `df_cols`.
    """
    untouched = df.drop(columns=df_cols)
    encoded = pd.get_dummies(df[df_cols])

    # Both parts share df's index, so the inner join keeps every row.
    return pd.concat([untouched, encoded], axis=1, join='inner')

In [7]:
def _impute_column_with_classifier(df, column, classifier, missing_marker='?'):
    """Fill the `missing_marker` cells of `column` in place with classifier predictions.

    The classifier is fitted on the rows where `column` is known, using the
    one-hot encoding of every other column except `income`, and then predicts
    the rows where `column` equals `missing_marker`.
    """
    is_missing = (df[column] == missing_marker).values

    # Nothing to predict, or nothing to learn from: leave the column untouched
    # instead of calling fit()/predict() on an empty array.
    if not is_missing.any() or is_missing.all():
        return

    features = pd.get_dummies(df.drop(columns=['income', column]))
    classifier.fit(features[~is_missing], df.loc[~is_missing, column])
    df.loc[is_missing, column] = classifier.predict(features[is_missing])


def adult_preprocess(df, balanced=False, impute=False, random_state=None):
    """Turn a raw adult-income frame into model inputs and binary targets.

    Steps: drop `fnlwgt`, handle the '?' placeholders, one-hot encode the
    categorical columns, optionally balance the classes, shuffle the rows
    and split the result into features and target.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least `fnlwgt` and `income`; with
        `impute=True` also `workclass`, `occupation` and `native-country`.
        The caller's frame is not modified.
    balanced : bool
        If True, sample the same number of rows from each income class
        (the size of the smaller class).
    impute : bool
        If True, replace '?' in `workclass` and `occupation` with random
        forest predictions and in `native-country` with decision tree
        predictions, in that order. If False, drop every row that contains
        a '?'.
    random_state : int or None
        Seed passed to the imputation models and to the final shuffle.
        The default None leaves both unseeded.

    Returns
    -------
    X : numpy.ndarray
        All one-hot encoded columns except the `income_*` dummies.
    Y : numpy.ndarray
        Values of the `income_>50K` dummy column, row-aligned with X.
    """
    df = df.drop(columns=["fnlwgt"])

    if impute:
        # Imported here because the models are only needed on this branch.
        from sklearn.ensemble import RandomForestClassifier
        from sklearn.tree import DecisionTreeClassifier

        # Order matters: each later model is trained on the values filled in
        # by the earlier ones.
        imputers = [
            ('workclass',
             RandomForestClassifier(n_estimators=10, random_state=random_state)),
            ('occupation',
             RandomForestClassifier(n_estimators=10, random_state=random_state)),
            ('native-country',
             DecisionTreeClassifier(random_state=random_state)),
        ]
        for column, classifier in imputers:
            _impute_column_with_classifier(df, column, classifier)
    else:
        # remove rows with '?'s
        df = df[(df != '?').all(axis=1)]

    # convert categorical data into one-hot
    df_one_hot = pd.get_dummies(df)

    if balanced:
        over_50k = df_one_hot[df_one_hot['income_>50K'] == 1]
        under_50k = df_one_hot[df_one_hot['income_>50K'] == 0]
        # Cap at the smaller class so sampling without replacement cannot
        # ask for more rows than a class has.
        sample_number = min(len(over_50k), len(under_50k))
        df_clean = pd.concat([
            over_50k.sample(n=sample_number, random_state=0),
            under_50k.sample(n=sample_number, random_state=0),
        ])
    else:
        df_clean = df_one_hot

    # randomize data order
    df_clean = df_clean.sample(frac=1, random_state=random_state)

    # Split into inputs and targets. The income dummies are removed by name
    # rather than by position so that no label column can leak into X.
    income_columns = [c for c in df_clean.columns if str(c).startswith('income_')]
    X = df_clean.drop(columns=income_columns).values
    Y = df_clean.loc[:, 'income_>50K'].values

    return X, Y

In [8]:
# Build features/targets from the training frame (with '?' values imputed), then
# hold out 20% of the rows as a validation split.
X, Y = adult_preprocess(adult_df, balanced=False, impute=True)
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.2, random_state = 0)

# NOTE(review): adult_test_df is loaded from the same CSV as adult_df, so X_test/Y_test
# contain the training rows. Also, the test frame is one-hot encoded on its own, so X_test
# only lines up column-for-column with X when both frames hold the same category values.
X_test, Y_test = adult_preprocess(adult_test_df, balanced=False, impute=True)

In [9]:
# Class balance of the preprocessed targets: count of 1s (income >50K), then count of 0s.
# print(list(adult_one_hot))
# print(list(X))
print(len(Y[Y==1]))
print(len(Y[Y==0]))

7841
24720


In [10]:
# Fit a logistic-regression model on the training split; only random_state is set,
# every other hyperparameter is left at the library default.
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train,Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [11]:
# Probability cut-off applied to the second predict_proba column on X_test.
p_thres = 0.3

# Class predictions for the validation split, straight from classifier.predict.
Y_pred = classifier.predict(X_val)

# Thresholded predictions for X_test: 1.0 where the probability in column 1 of
# predict_proba reaches p_thres, 0.0 otherwise (kept as floats, as before).
Y_test_pred = (classifier.predict_proba(X_test)[:, 1] >= p_thres).astype(float)

In [12]:
def print_metrics(Y_true, Y_pred):
    """Print the confusion matrix and derived rates for a binary 0/1 classification.

    Parameters
    ----------
    Y_true : array-like
        True class labels, expected to be 0 or 1.
    Y_pred : array-like
        Predicted class labels, expected to be 0 or 1.

    The output lists the 2x2 confusion matrix followed by the correct and
    misclassified counts, accuracy, error rate, sensitivity, specificity,
    precision and false-positive rate, all as percentages.
    """
    # labels=[0, 1] pins the matrix to 2x2 even when only one class appears in
    # the inputs; otherwise the [0,1] / [1,*] lookups below would be out of range.
    cm_test = confusion_matrix(y_true=Y_true, y_pred=Y_pred, labels=[0, 1])

    total = cm_test.sum()
    # Correct predictions sit on the diagonal.
    correct = cm_test.trace()

    acc = correct/total

    print("Confusion Matrix:\n")
    print("      predicted class:")
    print("          0\t1")
    print("        _____________")
    print("true  0| {}\t{}".format(cm_test[0,0], cm_test[0,1]))
    print("class 1| {}\t{}".format(cm_test[1,0], cm_test[1,1]))
    print("")
    print("Correct: \t{}".format(correct))
    print("Misclassified: \t{}".format(total-correct))
    print("Accuracy: \t{:.2f}%".format(acc*100))
    print("Error rate: \t{:.2f}%".format((1-acc)*100))
    print("Sensitivity: \t{:.2f}% (true positive)".format(cm_test[1,1]*100 / cm_test[1].sum()))
    print("Specificity: \t{:.2f}% (true negative)".format(cm_test[0,0]*100 / cm_test[0].sum()))
    print("Precision: \t{:.2f}% (positive predict value)".format(100*cm_test[1,1] / cm_test[:,1].sum()))
    print("False Pos: \t{:.2f}%".format(100*cm_test[0,1] / cm_test[0].sum()))

In [13]:
def performance_metrics(y_true, y_pred):
    """Compute standard binary-classification metrics from 0/1 labels.

    Parameters
    ----------
    y_true : array-like
        True class labels, expected to be 0 or 1.
    y_pred : array-like
        Predicted class labels, expected to be 0 or 1.

    Returns
    -------
    dict
        Keys: 'accuracy', 'precision', 'recall', 'f_measure', 'sensitivity',
        'specificity', 'error_rate', 'false_pos'. 'recall' and 'sensitivity'
        hold the same value.
    """
    # labels=[0, 1] pins the matrix to 2x2 even when only one class appears in
    # the inputs; otherwise the lookups below would be out of range.
    cm = confusion_matrix(y_true=y_true, y_pred=y_pred, labels=[0, 1])

    # Rows are the true class, columns the predicted class.
    TN, FP = cm[0,0], cm[0,1]
    FN, TP = cm[1,0], cm[1,1]

    accuracy = (TP + TN) / (TP + FN + FP + TN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    f_measure = (2 * recall * precision) / (recall + precision)
    specificity = TN / (TN + FP)

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f_measure': f_measure,
        'sensitivity': recall,
        'specificity': specificity,
        'error_rate': 1 - accuracy,
        'false_pos': FP / (FP + TN),
    }

In [14]:
# NOTE(review): Y_test / Y_test_pred derive from adult_test_df, which is read from the
# training CSV, so these figures describe fit on the training rows, not held-out data.
performance_metrics(Y_test, Y_test_pred)

{'accuracy': 0.8306563066244894,
 'error_rate': 0.16934369337551058,
 'f_measure': 0.6899808838412235,
 'false_pos': 0.15408576051779935,
 'precision': 0.6169934640522876,
 'recall': 0.7825532457594695,
 'sensitivity': 0.7825532457594695,
 'specificity': 0.8459142394822007}

In [15]:
# NOTE(review): Y_test / Y_test_pred derive from adult_test_df, which is read from the
# training CSV, so this report describes fit on the training rows, not held-out data.
print_metrics(Y_test, Y_test_pred)

Confusion Matrix:

      predicted class:
          0	1
        _____________
true  0| 20911	3809
class 1| 1705	6136

Correct: 	27047
Misclassified: 	5514
Accuracy: 	83.07%
Error rate: 	16.93%
Sensitivity: 	78.26% (true positive)
Specificity: 	84.59% (true negative)
Precision: 	61.70% (positive predict value)
False Pos: 	15.41%
