In [1]:
import numpy as np
import mltools as ml
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
warnings.filterwarnings('ignore')

# Load the training CSV into a DataFrame. `skipinitialspace=True` makes pandas
# drop any blanks that follow a delimiter, so string cells can later be
# compared against bare literals such as '?'.
adult = pd.read_csv(
    "data/adult_training.csv",
    sep=",",
    skipinitialspace=True,
    # nrows=10000,  # uncomment to load only a subset while iterating
    dtype=None,
)

from sklearn.model_selection import train_test_split
from __future__ import division
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


In [2]:
def adult_preprocess(adult):
    """Clean an adult-income DataFrame and turn it into model-ready arrays.

    Rows containing the placeholder '?' in any column are dropped, the
    feature columns are one-hot encoded, and the binary target is derived
    from the 'income' column.

    Parameters
    ----------
    adult : pandas.DataFrame
        Raw data; must contain an 'income' column with a '>50K' category.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix: the non-categorical columns followed by the one-hot
        columns of every categorical feature. Never contains the target.
    Y : numpy.ndarray
        uint8 vector, 1 where income is '>50K' and 0 otherwise.

    Raises
    ------
    KeyError
        If the 'income' column is missing, or if no remaining row has the
        '>50K' label.
    """
    target_col = 'income'

    # remove rows with '?'s (axis=1 -> a row is kept only if every cell passes)
    adult = adult[(adult != '?').all(axis=1)]

    # Separate features from the target *by name* before encoding. Slicing
    # the encoded frame by position silently leaks the target into X (or
    # drops a real feature) whenever 'income' is not the last categorical
    # column or does not have exactly two categories.
    features = adult.drop(columns=[target_col])

    # convert categorical data into one-hot; dtype is pinned so the result
    # is numeric 0/1 regardless of the pandas default for dummy columns
    X = pd.get_dummies(features, dtype='uint8').values

    # one-hot the target on its own and keep the positive-class indicator;
    # .loc raises KeyError if the '>50K' label never occurs
    income_one_hot = pd.get_dummies(adult[target_col], prefix=target_col, dtype='uint8')
    Y = income_one_hot.loc[:, 'income_>50K'].values

    return X, Y

In [3]:
# Build the feature matrix / target vector, then hold out 20% of the rows
# for evaluation. The fixed random_state keeps the split reproducible.
X, Y = adult_preprocess(adult)
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Class balance check: number of rows labelled 1, then number labelled 0.
print((Y == 1).sum())
print((Y == 0).sum())

7508
22654


In [5]:
# Fit a logistic-regression baseline on the training split using the library
# defaults; random_state is pinned for repeatability.
# NOTE(review): the features are passed in unscaled and warnings are silenced
# globally in the first cell, so a solver non-convergence warning would go
# unnoticed here — worth confirming.
classifier = LogisticRegression(random_state=0)
classifier.fit(X=X_train, y=Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [6]:
# Predict class labels for the held-out rows.
Y_pred = classifier.predict(X=X_test)

In [7]:
# Display the held-out true labels (bare last expression -> cell output).
Y_test

array([0, 0, 0, ..., 0, 1, 0], dtype=uint8)

In [8]:
def print_metrics(Y_true, Y_pred):
    """Print a confusion matrix and accuracy summary for a classifier.

    Y_true holds the reference labels and Y_pred the predicted ones; both are
    handed straight to confusion_matrix. The printed table is laid out for two
    classes labelled 0 and 1. Nothing is returned.
    """
    cm = confusion_matrix(y_true=Y_true, y_pred=Y_pred)

    n_samples = cm.sum()
    # Correctly classified samples are the entries on the main diagonal.
    n_correct = sum(cm[k, k] for k in range(len(cm)))
    accuracy = n_correct / n_samples

    print("Confusion Matrix:\n")
    print("      predicted class:")
    print("          0\t1")
    print("        _____________")
    # Rows are true classes, columns are predicted classes.
    print("true  0| {}\t{}".format(*cm[0]))
    print("class 1| {}\t{}".format(*cm[1]))
    print("")
    print("Correct: \t{}".format(n_correct))
    print("Misclassified: \t{}".format(n_samples - n_correct))
    print("Accuracy: \t{:.2f}%".format(100 * accuracy))
    print("Error rate: \t{:.2f}%".format(100 * (1 - accuracy)))

In [9]:
# Report test-set performance of the fitted classifier.
print_metrics(Y_true=Y_test, Y_pred=Y_pred)

Confusion Matrix:

      predicted class:
          0	1
        _____________
true  0| 4371	161
class 1| 1114	387

Correct: 	4758
Misclassified: 	1275
Accuracy: 	78.87%
Error rate: 	21.13%
