In [1]:
import copy
from data import data
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer

In [8]:
# Column names handed to the project-local `data` loader together with the two
# file paths (presumably applied as the header of both files -- confirm in data.py).
# 'money' is the target column that is later split off into the labels.
columns = ['age', 'workclass', 'fnlwgt','education','education-num', 'marital-status', 
           'occupation', 'relationship', 'race', 'sex', 'capital-gain','capital-loss',
           'hours-per-week','native-country', 'money']

adult = data('./adult.data', './adult.test', columns)

# Drop unknown values: any cell whose text contains a literal '?' becomes NaN and
# every row holding a NaN is then removed.  The pattern is a raw string because
# '\?' is not a valid Python escape sequence and triggers a warning otherwise.
# The surviving rows keep their original index labels, so the index has gaps.
adult.train_x = adult.train_x.replace(r'\?', np.nan, regex=True).dropna()
adult.test_x = adult.test_x.replace(r'\?', np.nan, regex=True).dropna()


# Encode money category to binary and create labels
def encode_money(df):
    """Split the ``money`` target column off ``df`` and binarise it.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a string column named ``'money'``.  It is not
        modified; a new frame is returned instead.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``df`` without the ``'money'`` column, and a one-column frame named
        ``'money'`` holding the binarised labels.  The label frame reuses
        ``df.index`` so that features and labels stay aligned row by row even
        when the index has gaps.

    Raises
    ------
    KeyError
        If ``df`` has no ``'money'`` column.
    """
    # Remove every literal '.' from the label text (presumably because one of
    # the files writes its labels with a trailing period -- confirm against the
    # data).  Raw string: '\.' is not a valid Python escape sequence.
    money = df['money'].str.replace(r"\.", "", regex=True)

    # LabelBinarizer sorts the distinct labels; with two classes the first
    # becomes 0 and the second 1.
    binary = LabelBinarizer().fit_transform(money)

    labels = pd.DataFrame(binary, columns=['money'], index=df.index)
    return df.drop('money', axis=1), labels

# For each split, replace the feature frame with the one returned by
# encode_money (no 'money' column) and store the returned label frame next to it.
adult.train_x, adult.train_y = encode_money(adult.train_x)
adult.test_x, adult.test_y = encode_money(adult.test_x)


# Expand one column into its one-hot indicator columns
def transform_data(enc, data, column):
    """Add one indicator column per encoder category to ``data``.

    ``enc`` must already be fitted: its ``transform`` is applied to
    ``data[[column]]`` and the result is densified with ``toarray()``.
    Each category in ``enc.categories_[0]`` yields a new column named after
    the whitespace-stripped category label.  Values are assigned
    positionally, so the index of ``data`` is irrelevant.

    ``data`` is modified in place and also returned; the source ``column``
    itself is left in the frame.
    """
    indicators = enc.transform(data[[column]]).toarray()
    for position, label in enumerate(enc.categories_[0]):
        data[label.strip()] = indicators[:, position]
    return data

# One-hot encode a single column on both splits
def encode(df_train, df_test, column_name):
    """Replace ``column_name`` by one-hot indicator columns in both frames.

    The encoder is fitted on the training frame only and then applied to the
    training and the test frame through ``transform_data``.  Afterwards the
    original, non-encoded column is dropped from each frame.

    Both frames are modified in place; they are returned as a
    ``(df_train, df_test)`` tuple.
    """
    encoder = OneHotEncoder().fit(df_train[[column_name]])

    encoded = []
    for frame in (df_train, df_test):
        frame = transform_data(encoder, frame, column_name)
        # The raw categorical column is no longer needed once expanded
        frame.drop(columns=column_name, inplace=True)
        encoded.append(frame)

    return encoded[0], encoded[1]

# Categorical columns that are expanded into one-hot indicator columns
to_encode = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]

# Encode every categorical column on the train and test split
for category in to_encode:
    adult.train_x, adult.test_x = encode(adult.train_x, adult.test_x, category)

# Independent snapshot taken while the continuous columns are still present
adult_con = copy.deepcopy(adult)

# Numeric columns that are removed from `adult` (the snapshot keeps them)
continous = [
    'age', 'fnlwgt', 'education-num',
    'capital-gain', 'capital-loss', 'hours-per-week',
]

adult.train_x = adult.train_x.drop(columns=continous)
adult.test_x = adult.test_x.drop(columns=continous)

In [9]:
# Depth-limited decision tree on the current `adult` features.
# random_state is pinned: scikit-learn permutes the candidate features at every
# split, so an unseeded tree may differ between runs and the printed numbers
# would not be reproducible.
clf = DecisionTreeClassifier(max_depth=10, min_samples_leaf=5, random_state=0)
clf.fit(adult.train_x,adult.train_y)
predictions = clf.predict(adult.test_x)

# Per-class metrics as a dict keyed by the label's string form ('0' / '1')
report = classification_report(adult.test_y, predictions, output_dict=True)

# Recall of class 1 is the true positive rate; 1 - recall of class 0 is the
# false positive rate.
print("{: <23}{: <10}".format("True Positive Rate:",report['1']['recall']))
print("{: <23}{: <10}\n\n".format("False Positive Rate:",1-report['0']['recall']))

print(classification_report(adult.test_y, predictions))

True Positive Rate:    0.5237837837837838
False Positive Rate:   0.0775596443348886


              precision    recall  f1-score   support

           0       0.86      0.92      0.89     11359
           1       0.69      0.52      0.59      3700

    accuracy                           0.82     15059
   macro avg       0.77      0.72      0.74     15059
weighted avg       0.81      0.82      0.82     15059



In [10]:
# Gaussian naive Bayes on the current `adult` features; the one-column label
# frame is flattened to a 1-D array for fit().
train_labels = adult.train_y.values.ravel()
nb = GaussianNB()
nb.fit(adult.train_x, train_labels)
predictions = nb.predict(adult.test_x)

# Per-class metrics as a dict keyed by the label's string form ('0' / '1')
report = classification_report(adult.test_y, predictions, output_dict=True)

# Recall of class 1 is the true positive rate; 1 - recall of class 0 is the
# false positive rate.
true_positive_rate = report['1']['recall']
false_positive_rate = 1 - report['0']['recall']

print("{: <23}{: <10}".format("True Positive Rate:", true_positive_rate))
print("{: <23}{: <10}\n\n".format("False Positive Rate:", false_positive_rate))

print(classification_report(adult.test_y, predictions))

True Positive Rate:    0.93      
False Positive Rate:   0.5814772427150277


              precision    recall  f1-score   support

           0       0.95      0.42      0.58     11359
           1       0.34      0.93      0.50      3700

    accuracy                           0.54     15059
   macro avg       0.65      0.67      0.54     15059
weighted avg       0.80      0.54      0.56     15059



In [11]:
# Continue with the snapshot that still holds the continuous columns.
# NOTE: this only rebinds the name -- `adult` and `adult_con` now refer to the
# same object, so the column assignments made below are visible through both.
adult = adult_con
def average_binary(data, threshold=None):
    """Binarise a numeric Series against a cut-off.

    Values less than or equal to the cut-off map to 0, all others to 1
    (NaN compares as "not <=", so it maps to 1).

    Parameters
    ----------
    data : pandas.Series
        Numeric values to binarise.  The Series is not modified.
    threshold : scalar, optional
        Cut-off to compare against.  Defaults to ``data.mean()``; pass a
        value explicitly to apply the same cut-off to several Series.

    Returns
    -------
    numpy.ndarray
        A new array of 0/1 values with the same length and dtype as
        ``data.to_numpy()``.
    """
    values = data.to_numpy()
    cutoff = data.mean() if threshold is None else threshold
    # np.where allocates a fresh array, so the caller's data is never written
    # to -- to_numpy() may hand back a view of the Series or a read-only array.
    return np.where(values <= cutoff, 0, 1).astype(values.dtype)

# Continuous columns that are collapsed into a 0/1 "above the mean" flag
to_binary = [
    'age', 'fnlwgt', 'education-num',
    'capital-gain', 'capital-loss', 'hours-per-week',
]

# NOTE(review): every split is binarised against its own mean, so the train and
# test cut-offs are not the same value -- confirm this is intended.
for category in to_binary:
    for frame in (adult.train_x, adult.test_x):
        frame[category] = average_binary(frame[category])


In [12]:
kmeans_y = []

# Build the KMeans models for 3, 5 and 10 clusters.
# random_state is pinned because the centroid initialisation is random; without
# it the fitted centres (and the output printed below) change from run to run.
kmeans = [KMeans(n_clusters=k, random_state=0) for k in (3, 5, 10)]

for kmean in kmeans:
    kmean.fit(adult.train_x)

# Left disabled by the author:
#report = classification_report(adult.test_y.tail(10), kmeans_y[2])

# First centroid of the 3-cluster model
centers = kmeans[0].cluster_centers_
print(centers[0])
#plt.scatter(centers[:, 0], centers[:, 2], c='black', s=200, alpha=0.5);

#print(report)

[ 6.09845468e-01  4.25282778e-01  3.74940258e-01  1.21315915e-01
  6.52381711e-02  4.22733790e-01  3.36147841e-02  6.82650948e-02
  6.75083639e-01  6.04588179e-02  1.22032818e-01  4.01465668e-02
  3.98279433e-04  2.37374542e-02  2.34188307e-02  7.64696511e-03
  5.25728851e-03  1.01959535e-02  2.36577983e-02  1.53735861e-02
  2.93930221e-02  4.62800701e-02  1.85120280e-01  1.91174128e-02
  3.24597738e-01  6.63533535e-02  1.03552653e-03  2.86761192e-02
  1.90138601e-01 -2.83106871e-15  7.16902979e-04  9.99283097e-01
  4.09394740e-16 -1.88737914e-15  2.42861287e-17  7.07767178e-16
  4.81918114e-02  2.38967660e-04  1.98263502e-01  1.71658436e-01
  4.43683288e-02  3.45706548e-02  6.96192449e-02  4.03855345e-02
  7.96558866e-05  1.40035049e-01  2.96319898e-02  1.18607615e-01
  2.77202485e-02  7.66289629e-02  9.92592003e-01  2.38967660e-04
  5.49625617e-03  1.59311773e-03  2.12330153e-15  7.96558866e-05
  7.16902979e-03  2.94726780e-02  4.95459614e-02  6.13350327e-03
  9.07678827e-01  8.16013

In [13]:
# k-nearest-neighbour classifiers for k = 3, 5 and 10
knn = [KNeighborsClassifier(n_neighbors=k) for k in (3, 5, 10)]
knn_y = []

# NOTE(review): accuracy is measured on the last 10 test rows only, so each
# figure moves in steps of 0.1 -- confirm the small sample is intentional.
for clf in knn:
    clf.fit(adult.train_x, adult.train_y.values.ravel())
    pred = clf.predict(adult.test_x.tail(10))
    score = accuracy_score(adult.test_y.tail(10), pred)
    print("Accuracy: {}".format(score))

Accuracy: 0.8
Accuracy: 0.7
Accuracy: 0.8
