In [1]:
import copy
from data import data
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer

In [2]:
# Column names handed to the project `data` loader for both files.
columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status',
           'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss',
           'hours-per-week', 'native-country', 'money']

adult = data('./adult.data', './adult.test', columns)

# Drop unknown values: every cell whose text contains a literal '?' is turned
# into NaN, then any row holding a NaN is discarded.
# The pattern is a raw string so the backslash reaches the regex engine as-is;
# a plain '\?' is an invalid Python escape sequence and triggers a warning.
adult.train_x = adult.train_x.replace(r'\?', np.nan, regex=True).dropna()
adult.test_x = adult.test_x.replace(r'\?', np.nan, regex=True).dropna()


# Encode money category to binary and create labels
def encode_money(df):
    """Split the ``'money'`` column off ``df`` and binarize it.

    A fresh ``LabelBinarizer`` is fit on the frame that is passed in, so each
    call derives its encoding from that frame's own label values.

    Args:
        df: DataFrame containing a ``'money'`` column. The column is removed
            from ``df`` in place.

    Returns:
        A tuple ``(df, labels)`` where ``df`` is the same frame object without
        ``'money'`` and ``labels`` is a DataFrame with a single ``'money'``
        column holding the binarized values, indexed like ``df``.
    """
    lb = LabelBinarizer()
    binary = lb.fit_transform(df['money'])
    df.drop('money', axis=1, inplace=True)
    # Reuse df's index so features and labels stay row-aligned for any
    # index-based pandas operation; a default RangeIndex would not match a
    # frame whose index has gaps (e.g. after rows were dropped).
    return df, pd.DataFrame(binary, columns=['money'], index=df.index)

adult.train_x, adult.train_y = encode_money(adult.train_x)
adult.test_x, adult.test_y = encode_money(adult.test_x)


# Apply an already-fitted one-hot encoder to a single column
def transform_data(enc, data, column):
    """Append one indicator column per encoder category to ``data``.

    Args:
        enc: A fitted encoder exposing ``transform`` and ``categories_``;
            only the first entry of ``categories_`` is used.
        data: DataFrame to extend. It is modified in place.
        column: Name of the column in ``data`` to encode.

    Returns:
        The same ``data`` object, with a new column for every category. Each
        new column is named after the category with surrounding whitespace
        stripped. The original ``column`` is left in place.
    """
    encoded = enc.transform(data[[column]])
    # Sparse results expose toarray(); an encoder configured for dense output
    # hands back an array directly, so accept both.
    matrix = encoded.toarray() if hasattr(encoded, 'toarray') else np.asarray(encoded)
    for position, category in enumerate(enc.categories_[0]):
        # str() keeps non-string categories (e.g. numbers) from failing on .strip().
        data[str(category).strip()] = matrix[:, position]
    return data

# Set up the one hot encoding
def encode(df_train, df_test, column_name):
    """One-hot encode ``column_name`` in both frames using train categories.

    The encoder is fit on ``df_train`` only and then applied to both frames,
    so both receive the same set of indicator columns.

    Args:
        df_train: Training DataFrame; modified in place.
        df_test: Test DataFrame; modified in place.
        column_name: Name of the categorical column to encode.

    Returns:
        A tuple ``(df_train, df_test)`` with the indicator columns added and
        the original ``column_name`` column removed.
    """
    # handle_unknown='ignore': a category that appears only in the test frame
    # is encoded as all zeros instead of aborting the transform with an error.
    enc = OneHotEncoder(handle_unknown='ignore')
    enc.fit(df_train[[column_name]])

    df_train = transform_data(enc, df_train, column_name)
    df_test = transform_data(enc, df_test, column_name)

    # Drop the original, non-encoded column now that its indicators exist.
    df_train.drop(column_name, axis=1, inplace=True)
    df_test.drop(column_name, axis=1, inplace=True)

    return df_train, df_test

# Categorical columns that get replaced by one-hot indicator columns
to_encode = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]

# Encode one column at a time, feeding each result into the next pass
for category in to_encode:
    adult.train_x, adult.test_x = encode(
        df_train=adult.train_x,
        df_test=adult.test_x,
        column_name=category,
    )


# Snapshot taken while the continuous columns are still present
adult_con = copy.deepcopy(adult)


continous = [
    'age',
    'fnlwgt',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Strip the continuous attributes from the working copy, one column per pass
for attribute in continous:
    adult.train_x = adult.train_x.drop(columns=[attribute])
    adult.test_x = adult.test_x.drop(columns=[attribute])

In [3]:
# Decision tree limited to depth 10 with at least 5 samples per leaf.
# random_state is pinned so that ties between equally good splits are broken
# the same way on every run and the reported metrics are repeatable.
clf = DecisionTreeClassifier(max_depth=10, min_samples_leaf=5, random_state=0)
# Labels are flattened to 1-D, matching how the other classifiers in this
# notebook are fit.
clf.fit(adult.train_x, adult.train_y.values.ravel())
predictions = clf.predict(adult.test_x)

report = classification_report(adult.test_y, predictions, output_dict=True)

# TPR is the recall of class 1; FPR is one minus the recall of class 0.
print("{: <23}{: <10}".format("True Positive Rate:",report['1']['recall']))
print("{: <23}{: <10}\n\n".format("False Positive Rate:",1-report['0']['recall']))

print(classification_report(adult.test_y, predictions))

True Positive Rate:    0.524054054054054
False Positive Rate:   0.0775596443348886


              precision    recall  f1-score   support

           0       0.86      0.92      0.89     11359
           1       0.69      0.52      0.59      3700

    accuracy                           0.82     15059
   macro avg       0.77      0.72      0.74     15059
weighted avg       0.81      0.82      0.82     15059



In [4]:
# Gaussian naive Bayes, trained on the flattened label vector
nb = GaussianNB()
nb.fit(adult.train_x, adult.train_y.values.ravel())
predictions = nb.predict(adult.test_x)

report = classification_report(adult.test_y, predictions, output_dict=True)

# Recall of class 1 gives the TPR; the FPR is the complement of class-0 recall
print(
    "{: <23}{: <10}".format("True Positive Rate:", report['1']['recall'])
)
print(
    "{: <23}{: <10}\n\n".format("False Positive Rate:", 1 - report['0']['recall'])
)

print(classification_report(adult.test_y, predictions))

True Positive Rate:    0.93      
False Positive Rate:   0.5814772427150277


              precision    recall  f1-score   support

           0       0.95      0.42      0.58     11359
           1       0.34      0.93      0.50      3700

    accuracy                           0.54     15059
   macro avg       0.65      0.67      0.54     15059
weighted avg       0.80      0.54      0.56     15059



In [5]:
# Work on a fresh copy of the snapshot rather than an alias of it, so that
# re-running this cell starts from untouched data and `adult_con` itself is
# never modified by the steps that follow.
adult = copy.deepcopy(adult_con)
def average_binary(data, threshold=None):
    """Binarize a numeric Series around a cut-off value.

    Args:
        data: pandas Series of numeric values. It is not modified.
        threshold: Cut-off to compare against. When ``None`` (the default),
            the mean of ``data`` is used.

    Returns:
        A new NumPy array with the same dtype as ``data``: 0 where the value
        is less than or equal to the cut-off, 1 otherwise.
    """
    cutoff = data.mean() if threshold is None else threshold
    values = data.to_numpy()
    # np.where builds a new array, so the caller's Series is left untouched
    # and a read-only array from to_numpy() is not a problem.
    return np.where(values <= cutoff, 0, 1).astype(values.dtype)

# Continuous columns that are reduced to a 0/1 "above the mean" flag
to_binary = ['age', 'fnlwgt', 'education-num',
             'capital-gain', 'capital-loss',
             'hours-per-week']

for category in to_binary:
    # The cut-off is taken from the training split only, and read before the
    # training column is binarized. The test split is then thresholded with
    # the same value so both splits are encoded consistently and no test-set
    # statistic feeds into the features.
    train_mean = adult.train_x[category].mean()
    test_values = adult.test_x[category].to_numpy()

    adult.train_x[category] = average_binary(adult.train_x[category])
    adult.test_x[category] = np.where(test_values <= train_mean, 0, 1).astype(test_values.dtype)


In [6]:
kmeans_y = []

# Build one KMeans model per cluster count.
# random_state is pinned so the fitted centers are repeatable; n_init is set
# explicitly because its default differs between scikit-learn releases.
kmeans = [
    KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    for n_clusters in (3, 5, 10)
]

for kmean in kmeans:
    kmean.fit(adult.train_x)

# Show the first center of the 3-cluster model.
# NOTE(review): kmeans_y is never filled in this cell, so no cluster-based
# classification report is produced here.
centers = kmeans[0].cluster_centers_
print(centers[0])

[ 2.79013292e-01  4.78527607e-01  2.74156442e-01  5.36809816e-02
  3.46370143e-02  2.85020450e-01  2.70961145e-02  4.93353783e-02
  7.88471370e-01  2.41564417e-02  7.33640082e-02  3.70654397e-02
  5.11247444e-04  3.47648262e-02  4.89519427e-02  2.03220859e-02
  5.36809816e-03  1.16308793e-02  1.63599182e-02  1.82770961e-02
  3.11860941e-02  3.46370143e-02  1.52862986e-01  6.90184049e-03
  3.39851738e-01  3.64263804e-02  2.30061350e-03  1.21421268e-02
  2.28016360e-01  2.15362986e-01 -6.39679282e-18  4.98466258e-03
  2.31339468e-02  6.91845603e-01  4.66513292e-02  1.80214724e-02
  7.70705521e-02  7.66871166e-04  1.69350716e-01  8.85736196e-02
  4.69069530e-02  9.61145194e-02  7.01687117e-02  1.21037832e-01
  8.94683027e-04  1.00843558e-01  2.50511247e-02  1.08256646e-01
  2.85020450e-02  6.64621677e-02 -6.49480469e-15  5.31186094e-01
  5.54703476e-02  3.17612474e-01  9.57310838e-02 -6.31439345e-16
  1.13752556e-02  2.95245399e-02  1.01738241e-01  8.56339468e-03
  8.48798569e-01 -1.72084

In [7]:
# One k-nearest-neighbours classifier per neighbourhood size
knn = [KNeighborsClassifier(n_neighbors=k) for k in (3, 5, 10)]
knn_y = []

# Fit each model on the full training split, then score it on the last ten
# rows of the test split only
for clf in knn:
    clf.fit(adult.train_x, adult.train_y.values.ravel())
    pred = clf.predict(adult.test_x.tail(10))
    score = accuracy_score(adult.test_y.tail(10), pred)
    print("Accuracy: {}".format(score))

Accuracy: 0.8
Accuracy: 0.9
Accuracy: 0.9
