In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Silence all library warnings for the rest of the notebook session.
warnings.filterwarnings('ignore')

# Parser settings shared by every CSV read below: comma-separated fields,
# whitespace after each delimiter dropped, column dtypes inferred by pandas.
# Add nrows=10000 here to work on a smaller subset while experimenting.
CSV_OPTIONS = dict(delimiter=",", skipinitialspace=True, dtype=None)

adult = pd.read_csv("data/adult_training.csv", **CSV_OPTIONS)

# NOTE(review): this reads the *training* file again, so `adult_test` is not
# an independent hold-out set. Presumably a separate test CSV was intended --
# confirm the correct path before trusting any metric computed on it.
adult_test = pd.read_csv("data/adult_training.csv", **CSV_OPTIONS)

from sklearn.model_selection import train_test_split
from __future__ import division
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from sklearn.utils import resample
from sklearn.ensemble import BaggingClassifier

from joblib import dump, load
from sklearn.model_selection import cross_val_score
import sys
import time



In [7]:
def adult_preprocess_balanced(adult, n_per_class=7500, random_state=0):
    """Build a class-balanced, shuffled, one-hot encoded dataset.

    Rows containing a '?' in any column are dropped, every categorical
    column is expanded with ``pd.get_dummies``, and the same number of rows
    is drawn (without replacement) from each income class.

    Parameters
    ----------
    adult : pandas.DataFrame
        Raw adult-income frame. Its ``income`` column must one-hot encode
        to a column named ``'income_>50K'``.
    n_per_class : int, optional
        Rows sampled from each income class (default 7500, the value that
        used to be hard-coded).
    random_state : int or None, optional
        Seed for the per-class sampling and for the final shuffle. The
        default of 0 makes repeated calls return identical arrays.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix, one row per sampled record, with every
        ``income_*`` column removed.
    Y : numpy.ndarray
        Target vector taken from the ``'income_>50K'`` indicator column.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a class has fewer than
        ``n_per_class`` rows.
    """
    target_col = 'income_>50K'

    # remove rows with '?'s
    adult = adult[(adult != '?').all(axis=1)]

    # convert categorical data into one-hot
    adult_one_hot = pd.get_dummies(adult)

    adult_over_50k = adult_one_hot[adult_one_hot[target_col] == 1].sample(
        n=n_per_class, random_state=random_state)
    adult_under_50k = adult_one_hot[adult_one_hot[target_col] == 0].sample(
        n=n_per_class, random_state=random_state)

    # Interleave the two classes; seeding the shuffle keeps the row order
    # (and therefore any later train/validation split) reproducible.
    adult_clean = pd.concat([adult_over_50k, adult_under_50k])
    adult_clean = adult_clean.sample(frac=1, random_state=random_state)

    # split into inputs and targets
    # Drop the income indicators by name rather than by position, so the
    # target cannot leak into X when `income` is not the last CSV column.
    income_cols = [col for col in adult_clean.columns
                   if str(col).startswith('income_')]
    X = adult_clean.drop(income_cols, axis=1).values
    Y = adult_clean.loc[:, target_col].values

    return X, Y

In [8]:
def adult_preprocess_unbalanced(adult):
    """One-hot encode the adult-income frame without rebalancing classes.

    Rows containing a '?' in any column are dropped and every categorical
    column is expanded with ``pd.get_dummies``. Row order and the original
    class proportions are left untouched.

    Parameters
    ----------
    adult : pandas.DataFrame
        Raw adult-income frame. Its ``income`` column must one-hot encode
        to a column named ``'income_>50K'``.

    Returns
    -------
    X : numpy.ndarray
        Feature matrix with every ``income_*`` column removed.
    Y : numpy.ndarray
        Target vector taken from the ``'income_>50K'`` indicator column.
    """
    target_col = 'income_>50K'

    # remove rows with '?'s
    adult = adult[(adult != '?').all(axis=1)]

    # convert categorical data into one-hot
    adult_one_hot = pd.get_dummies(adult)

    # split into inputs and targets
    # Drop the income indicators by name rather than by position, so the
    # target cannot leak into X when `income` is not the last CSV column.
    income_cols = [col for col in adult_one_hot.columns
                   if str(col).startswith('income_')]
    X = adult_one_hot.drop(income_cols, axis=1).values
    Y = adult_one_hot.loc[:, target_col].values

    return X, Y

In [9]:
X, Y = adult_preprocess_balanced(adult)
print(X[1])

# MinMaxScaler rescales every feature to its default feature_range of (0, 1).
# scaler = StandardScaler()  # alternative: zero mean / unit variance
scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on all rows before the split, so the
# validation rows influence the min/max used for training. Fit on X_train
# only if a strictly leak-free validation score is required.
X = scaler.fit_transform(X)

# 80/20 train/validation split with a fixed seed. (An earlier unseeded
# train_test_split call was removed: every variable it produced was
# overwritten below before being used.)
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.2, random_state = 0)

# The models are trained on scaled features, so the test features must go
# through the same fitted scaler before being passed to predict().
# NOTE(review): transform() requires X_test to have exactly the columns the
# scaler was fitted on; a test file whose categorical values differ from
# the training file will need its dummy columns aligned first.
X_test, Y_test = adult_preprocess_balanced(adult_test)
X_test = scaler.transform(X_test)

[    36 196529     14      0      0     40      0      1      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      1      0      0      0      0
      0      1      0      0      0      0      0      0      0      0
      0      0      0      0      0      1      0      0      0      0
      0      0      0      0      0      1      0      0      0      0
      1      1      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      1      0      0]


In [10]:
# SVC kernel types: 'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'

# for i in range(100):
    
#     classifiers[i] = SVC(gamma='auto', kernel = "linear")
    
#     X_train_sample, Y_train_sample = resample(X_train,Y_train, n_samples=500, replace=False)
#     classifiers[i].fit(X_train_sample,Y_train_sample)

# start = time.time()

# bag = SVC(gamma='auto', kernel = "linear")

# # bag =  BaggingClassifier(SVC(gamma='auto', kernel = 'sigmoid'),
# #                              max_samples=0.2, n_estimators = 1000,n_jobs = 5)

# bag.fit(X_train,Y_train)

# # dump(bag,'svmBag_sigmoid.joblib')

# end = time.time()
# print(end - start)

# bag =  BaggingClassifier(SVC(gamma='auto', kernel = 'poly'),
#                              max_samples=0.1, n_estimators = 50,n_jobs = 5)

# bag.fit(X_train,Y_train)

# dump(bag,'svmBag_linear.joblib')

In [11]:
# CValue = 2.0 ** np.arange(-5,16,2)
# GammaValue = 2.0 ** np.arange(-15,4,2)
# for c in CValue:
#     for gamma in GammaValue:
#         clf = SVC(kernel='rbf', C=c, gamma = gamma)
#         scores = cross_val_score(clf, X_train, Y_train, cv=5)
#         print("Accuracy with a gamma %0.5f and c %0.2f: %0.2f (+/- %0.2f)" % (gamma, c, scores.mean(), scores.std() * 2))
#         sys.stdout.flush()
        

In [None]:
# Fit one SVC per kernel with a shared C / gamma, persist each fitted model
# to 'svm_<kernel>.joblib', and report how long each fit took.
# C and gamma both lie on the grid of the commented-out search above;
# presumably they are the best pair it found -- TODO confirm.
# gamma is only used by the 'rbf', 'poly' and 'sigmoid' kernels.
kernels = ['rbf','linear','sigmoid','poly']
c = 2 ** 13
gamma =  2 **(-11)
for kernel in kernels:
    start = time.time()
    clf = SVC(kernel=kernel, C=c, gamma=gamma)
    clf.fit(X_train, Y_train)
    dump(clf, 'svm_' + kernel + '.joblib')
    elapsed = time.time() - start
    # A single formatted string prints identically on Python 2 and 3 and
    # keeps the kernel name separate from the elapsed seconds.
    print('Done  {} in {:.2f}s'.format(kernel, elapsed))
    sys.stdout.flush()



('Done  ', 'rbf14.6808309555')


In [11]:
import time

# Train bagged RBF-SVM ensembles of increasing size (5, 10, ..., 100 members)
# and persist each one to 'svmBag_<n>.joblib'.
c = 2 ** 13
gamma =  2 **(-11)
# bag = SVC(gamma='auto', kernel = "linear")

for n_estimator in range(5, 101, 5):
    # Time each ensemble separately, as the per-kernel training loop does.
    start = time.time()

    # Each member is fitted on a random 20% of the training rows. The fixed
    # random_state makes the drawn subsets, and hence the saved models,
    # reproducible from one run to the next.
    bag = BaggingClassifier(SVC(gamma=gamma, kernel='rbf', C=c),
                            max_samples=0.2, n_estimators=n_estimator,
                            n_jobs=5, random_state=0)

    bag.fit(X_train, Y_train)

    dump(bag, 'svmBag_' + str(n_estimator) + '.joblib')
    print('Done  {} bag in {:.2f}s'.format(n_estimator, time.time() - start))
    sys.stdout.flush()

Done  5 bag
Done  10 bag
Done  15 bag
Done  20 bag
Done  25 bag
Done  30 bag
Done  35 bag
Done  40 bag
Done  45 bag
Done  50 bag
Done  55 bag
Done  60 bag
Done  65 bag
Done  70 bag
Done  75 bag
Done  80 bag
Done  85 bag
Done  90 bag
Done  95 bag
Done  100 bag


In [14]:
def print_metrics(Y_true, Y_pred):
    """Print the confusion matrix and summary metrics of a binary classifier.

    Parameters
    ----------
    Y_true : array-like
        Ground-truth class labels, encoded as 0 / 1.
    Y_pred : array-like
        Predicted class labels, same length and encoding as ``Y_true``.

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    A rate whose denominator is zero (for example precision when nothing
    was predicted positive) is printed as ``nan``.
    """
    # Pin the label order so the matrix is always 2x2, even when one class
    # is missing from both inputs; otherwise the [0,1] / [1,*] lookups
    # below would raise IndexError on a 1x1 matrix.
    cm_test = confusion_matrix(y_true=Y_true, y_pred=Y_pred, labels=[0, 1])

    # Rows are true classes, columns are predicted classes.
    tn, fp, fn, tp = cm_test.ravel()

    total = cm_test.sum()
    correct = cm_test.trace()  # the diagonal holds the correct predictions

    # Float operands force true division, so the result does not depend on
    # `from __future__ import division` having been run in another cell.
    acc = correct / float(total)

    actual_neg = tn + fp
    actual_pos = fn + tp
    predicted_pos = fp + tp

    print("Confusion Matrix:\n")
    print("      predicted class:")
    print("          0\t1")
    print("        _____________")
    print("true  0| {}\t{}".format(tn, fp))
    print("class 1| {}\t{}".format(fn, tp))
    print("")
    print("Correct: \t{}".format(correct))
    print("Misclassified: \t{}".format(total-correct))
    print("Accuracy: \t{:.2f}%".format(acc*100))
    print("Error rate: \t{:.2f}%".format((1-acc)*100))
    print("Sensitivity: \t{:.2f}% (true positive)".format(tp * 100.0 / actual_pos))
    print("Specificity: \t{:.2f}% (true negative)".format(tn * 100.0 / actual_neg))
    print("Precision: \t{:.2f}% (positive predict value)".format(tp * 100.0 / predicted_pos))
    print("False Pos: \t{:.2f}%".format(fp * 100.0 / actual_neg))

In [16]:
# Reload the fitted RBF-kernel SVC that the kernel training loop persisted as
# 'svm_rbf.joblib', so evaluation does not require refitting.
# NOTE(review): joblib.load unpickles the file -- only load trusted files.
clf = load('svm_rbf.joblib')

In [17]:
# Score the reloaded model on the validation split from train_test_split.
y_pred = clf.predict(X_val)

# Print the confusion matrix and the derived accuracy / sensitivity /
# specificity / precision figures.
print_metrics(Y_val,y_pred)

Confusion Matrix:

      predicted class:
          0	1
        _____________
true  0| 1196	348
class 1| 194	1262

Correct: 	2458
Misclassified: 	542
Accuracy: 	81.93%
Error rate: 	18.07%
Sensitivity: 	86.68% (true positive)
Specificity: 	77.46% (true negative)
Precision: 	78.39% (positive predict value)
False Pos: 	22.54%
