In [1]:
import copy
from data import data
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [2]:
# Create Column Names
columns = ['age', 'workclass', 'fnlwgt','education','education-num', 'marital-status', 
           'occupation', 'relationship', 'race', 'sex', 'capital-gain','capital-loss',
           'hours-per-week','native-country', 'income']

adult = data('./adult.data', './adult.test', columns)

# Drop Unkown
adult.train_x = adult.train_x[(adult.train_x.values !='?').all(axis=1)]
adult.test_x = adult.test_x[(adult.test_x.values !='?').all(axis=1)]

# Remove Periods
adult.train_x['income'] = adult.train_x['income'].str.replace("\.","",regex=True)
adult.test_x['income'] = adult.test_x['income'].str.replace("\.","",regex=True)

adult.train_y = adult.train_x['income']
adult.test_y = adult.test_x['income']
adult.train_x.drop('income', axis=1, inplace=True)
adult.test_x.drop('income', axis=1, inplace=True)

# Do the onehotencoding
def transform_data(enc, data, column):
    enc_df = pd.DataFrame(enc.transform(data[[column]]).toarray(),columns=enc.categories_[0])
    for item in enc.categories_[0]:
        data[item.strip()] = enc_df[item].to_numpy()
    return data

# Set up the one hot encoding
def encode(df_train, df_test, column_name):
    # Creates the one hot encoder
    enc = OneHotEncoder()
    enc.fit(df_train[[column_name]])
    
    df_train = transform_data(enc, df_train, column_name)
    df_test = transform_data(enc, df_test, column_name)
    
    # Drops the old non-encoded data
    df_train.drop(column_name,axis=1, inplace=True)
    df_test.drop(column_name, axis=1, inplace=True)
    
    return df_train, df_test

# Sets the classes that need to be encoded
to_encode = ['workclass','education', 'marital-status', 'occupation',
             'relationship', 'race', 'sex', 'native-country']

# Encodes all the categories
for category in to_encode:
    adult.train_x, adult.test_x = encode(adult.train_x, adult.test_x, category)


adult_con = copy.deepcopy(adult)


continous = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
             'hours-per-week']


adult.train_x = adult.train_x.drop(continous, axis=1)
adult.test_x = adult.test_x.drop(continous, axis=1)

In [3]:
clf = DecisionTreeClassifier()
clf.fit(adult.train_x,adult.train_y)
predictions = clf.predict(adult.test_x)

print(classification_report(adult.test_y, predictions))

              precision    recall  f1-score   support

       <=50K       0.86      0.89      0.88     11360
        >50K       0.63      0.56      0.59      3700

    accuracy                           0.81     15060
   macro avg       0.75      0.73      0.74     15060
weighted avg       0.81      0.81      0.81     15060



In [4]:
nb = GaussianNB()
nb.fit(adult.train_x, adult.train_y)
predictions = nb.predict(adult.test_x)

print(classification_report(adult.test_y, predictions))

              precision    recall  f1-score   support

       <=50K       0.95      0.42      0.58     11360
        >50K       0.34      0.93      0.50      3700

    accuracy                           0.54     15060
   macro avg       0.65      0.67      0.54     15060
weighted avg       0.80      0.54      0.56     15060



In [5]:
adult = adult_con
def average_binary(data):
    avg = data.mean()
    data = data.to_numpy()
    for index, item in enumerate(data):
        if item <= avg:
            data[index] = 0.0
        else:
            data[index] = 1.0
    return data

to_binary = ['age', 'fnlwgt', 'education-num', 
             'capital-gain', 'capital-loss', 
             'hours-per-week']

for category in to_binary:
    adult.train_x[category] = average_binary(adult.train_x[category])
    adult.test_x[category] = average_binary(adult.test_x[category])


In [6]:
kmeans = []
kmeans_y = []

# Build the Kmeans 
kmeans.append(KMeans(n_clusters=3))
kmeans.append(KMeans(n_clusters=5))
kmeans.append(KMeans(n_clusters=10))

for kmean in kmeans:
    kmean.fit(adult.train_x)
    print(kmean.cluster_centers_)

[[ 6.09876543e-01  4.25248905e-01  3.74990044e-01  1.21306252e-01
   6.52329749e-02  4.22700119e-01  3.36121067e-02  6.82596575e-02
   6.75029869e-01  6.04540024e-02  1.22102748e-01  4.01433692e-02
   3.98247710e-04  2.37355635e-02  2.34169654e-02  7.64635603e-03
   5.25686977e-03  1.01951414e-02  2.36559140e-02  1.53723616e-02
   2.93906810e-02  4.62763839e-02  1.85185185e-01  1.91158901e-02
   3.24571884e-01  6.63480685e-02  1.03544405e-03  2.86738351e-02
   1.90123457e-01  6.38378239e-16  7.16845878e-04  9.99283154e-01
  -1.33573708e-16  2.22044605e-16 -2.63677968e-16  1.87350135e-16
   4.81879729e-02  2.38948626e-04  1.98247710e-01  1.71724413e-01
   4.43647949e-02  3.45679012e-02  6.96136997e-02  4.03823178e-02
   7.96495420e-05  1.40023895e-01  2.96296296e-02  1.18598168e-01
   2.77180406e-02  7.66228594e-02  9.92592593e-01  2.38948626e-04
   5.49581840e-03  1.59299084e-03 -1.45716772e-15  7.96495420e-05
   7.16845878e-03  2.94703305e-02  4.95420151e-02  6.13301474e-03
   9.07686

[[ 5.14945652e-01  3.93342391e-01  3.94701087e-01 ...  8.79076087e-01
   2.03804348e-03  6.79347826e-04]
 [ 5.51236749e-01  4.46744069e-01 -2.77555756e-16 ...  8.90206966e-01
   1.76678445e-03  7.57193337e-04]
 [ 8.88217523e-02  4.52870091e-01  1.19335347e-01 ...  9.19335347e-01
   3.62537764e-03  3.02114804e-04]
 ...
 [ 5.77870564e-01  4.32985386e-01  5.55111512e-16 ...  9.47390397e-01
   1.25260960e-03  4.17536534e-04]
 [ 7.04619388e-01  3.84515290e-01  3.88578059e-16 ...  9.38191282e-01
   1.30123617e-03  3.25260652e-19]
 [ 3.76700962e-01  4.46067043e-01  9.99668105e-01 ...  9.27314968e-01
   1.65947561e-03  1.08420217e-18]]


In [7]:
knn = []
knn_y = []

knn.append(KNeighborsClassifier(3))
knn.append(KNeighborsClassifier(5))
knn.append(KNeighborsClassifier(10))

for clf in knn:
    clf.fit(adult.train_x, adult.train_y.values.ravel())
    pred = clf.predict(adult.test_x.tail(10))
    print("Accuracy: {}".format(accuracy_score(adult.test_y.tail(10), pred)))

Accuracy: 0.8
Accuracy: 0.9
Accuracy: 0.9


In [8]:
clf = SVC(kernel='poly')
clf.fit(adult.train_x, adult.train_y)
pred = clf.predict(adult.test_x)
print(classification_report(adult.test_y, pred))

              precision    recall  f1-score   support

       <=50K       0.87      0.93      0.90     11360
        >50K       0.72      0.57      0.64      3700

    accuracy                           0.84     15060
   macro avg       0.79      0.75      0.77     15060
weighted avg       0.83      0.84      0.83     15060



In [9]:
clf = MLPClassifier(alpha=1e-03, hidden_layer_sizes=(5,2), max_iter=500)
clf.fit(adult.train_x, adult.train_y)

MLPClassifier(alpha=0.001, hidden_layer_sizes=(5, 2), max_iter=500)

In [10]:
pred = clf.predict(adult.test_x)
print(classification_report(adult.test_y, pred))

              precision    recall  f1-score   support

       <=50K       0.88      0.91      0.90     11360
        >50K       0.69      0.63      0.66      3700

    accuracy                           0.84     15060
   macro avg       0.79      0.77      0.78     15060
weighted avg       0.84      0.84      0.84     15060

