In [12]:
import copy
from data import data
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

In [13]:
# Create Column Names
columns = ['age', 'workclass', 'fnlwgt','education','education-num', 'marital-status', 
           'occupation', 'relationship', 'race', 'sex', 'capital-gain','capital-loss',
           'hours-per-week','native-country', 'income']

adult = data('./adult.data', './adult.test', columns)

# Drop Unkown
adult.train_x = adult.train_x[(adult.train_x.values !='?').all(axis=1)]
adult.test_x = adult.test_x[(adult.test_x.values !='?').all(axis=1)]

# Remove Periods
adult.train_x['income'] = adult.train_x['income'].str.replace("\.","",regex=True)
adult.test_x['income'] = adult.test_x['income'].str.replace("\.","",regex=True)

adult.train_y = adult.train_x['income']
adult.test_y = adult.test_x['income']
adult.train_x.drop('income', axis=1, inplace=True)
adult.test_x.drop('income', axis=1, inplace=True)

# Do the onehotencoding
def transform_data(enc, data, column):
    enc_df = pd.DataFrame(enc.transform(data[[column]]).toarray(),columns=enc.categories_[0])
    for item in enc.categories_[0]:
        data[item.strip()] = enc_df[item].to_numpy()
    return data

# Set up the one hot encoding
def encode(df_train, df_test, column_name):
    # Creates the one hot encoder
    enc = OneHotEncoder()
    enc.fit(df_train[[column_name]])
    
    df_train = transform_data(enc, df_train, column_name)
    df_test = transform_data(enc, df_test, column_name)
    
    # Drops the old non-encoded data
    df_train.drop(column_name,axis=1, inplace=True)
    df_test.drop(column_name, axis=1, inplace=True)
    
    return df_train, df_test

# Sets the classes that need to be encoded
to_encode = ['workclass','education', 'marital-status', 'occupation',
             'relationship', 'race', 'sex', 'native-country']

# Encodes all the categories
for category in to_encode:
    adult.train_x, adult.test_x = encode(adult.train_x, adult.test_x, category)


adult_con = copy.deepcopy(adult)


continous = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
             'hours-per-week']


adult.train_x = adult.train_x.drop(continous, axis=1)
adult.test_x = adult.test_x.drop(continous, axis=1)

In [14]:
clf = DecisionTreeClassifier()
clf.fit(adult.train_x,adult.train_y)
predictions = clf.predict(adult.test_x)

print(classification_report(adult.test_y, predictions))

              precision    recall  f1-score   support

       <=50K       0.86      0.89      0.88     11360
        >50K       0.63      0.56      0.59      3700

    accuracy                           0.81     15060
   macro avg       0.75      0.73      0.73     15060
weighted avg       0.80      0.81      0.81     15060



In [15]:
nb = GaussianNB()
nb.fit(adult.train_x, adult.train_y)
predictions = nb.predict(adult.test_x)

print(classification_report(adult.test_y, predictions))

              precision    recall  f1-score   support

       <=50K       0.95      0.42      0.58     11360
        >50K       0.34      0.93      0.50      3700

    accuracy                           0.54     15060
   macro avg       0.65      0.67      0.54     15060
weighted avg       0.80      0.54      0.56     15060



In [16]:
adult = adult_con
def average_binary(data):
    avg = data.mean()
    data = data.to_numpy()
    for index, item in enumerate(data):
        if item <= avg:
            data[index] = 0.0
        else:
            data[index] = 1.0
    return data

to_binary = ['age', 'fnlwgt', 'education-num', 
             'capital-gain', 'capital-loss', 
             'hours-per-week']

for category in to_binary:
    adult.train_x[category] = average_binary(adult.train_x[category])
    adult.test_x[category] = average_binary(adult.test_x[category])


In [17]:
kmeans = []
kmeans_y = []

# Build the Kmeans 
kmeans.append(KMeans(n_clusters=3))
kmeans.append(KMeans(n_clusters=5))
kmeans.append(KMeans(n_clusters=10))

for kmean in kmeans:
    kmean.fit(adult.train_x)
    print(kmean.cluster_centers_)

[[ 4.20363934e-01  4.21386220e-01  3.11694950e-01  5.37722347e-02
   3.44510325e-02  1.69699448e-01  3.15886322e-02  8.42363525e-02
   7.81230832e-01  1.28808015e-02  4.00736046e-02  4.94786342e-02
   5.11142916e-04  2.55571458e-02  3.79268043e-02  1.24718871e-02
   4.39582907e-03  7.05377223e-03  1.34941730e-02  1.21652014e-02
   4.03802903e-02  4.65140053e-02  1.55591903e-01  8.28051523e-03
   3.17521979e-01  5.20343488e-02  1.43120016e-03  8.89388673e-03
   2.56287058e-01  2.58536087e-01  1.22674300e-03  1.51298303e-01
   1.93212022e-02  4.40809650e-01  5.86792067e-02  7.01288080e-02
   2.56798201e-01  3.03576608e-18  2.20813740e-02  1.16847270e-01
   6.64485790e-03  1.67654876e-02  5.55101206e-02  1.79717849e-01
   1.38008587e-02  1.52422817e-01  7.76937232e-03  1.27581272e-01
   3.48599468e-02  9.20057248e-03  1.02228583e-04  3.64547127e-01
   3.94602331e-02  2.00470251e-01  2.51789000e-01  1.43631159e-01
   1.09384584e-02  3.00552034e-02  1.43017788e-01  8.89388673e-03
   8.07094

[[7.40563530e-01 4.15736310e-01 9.92557150e-01 ... 9.14938862e-01
  2.12652844e-03 5.31632111e-04]
 [5.15316542e-01 3.94145677e-01 3.92784207e-01 ... 8.78829135e-01
  2.72294078e-03 6.80735194e-04]
 [7.88208907e-02 4.86062160e-01 1.43543736e-01 ... 8.98429990e-01
  4.80615187e-03 3.20410125e-04]
 ...
 [9.11214953e-02 4.46261682e-01 2.97062750e-02 ... 9.19893191e-01
  3.67156208e-03 3.33778371e-04]
 [7.16517857e-01 4.11830357e-01 1.01283482e-01 ... 9.14899554e-01
  1.11607143e-03 2.79017857e-04]
 [3.73820755e-01 4.81132075e-01 3.67688679e-01 ... 9.13207547e-01
  1.41509434e-03 2.35849057e-04]]


In [18]:
knn = []
knn_y = []

knn.append(KNeighborsClassifier(3))
knn.append(KNeighborsClassifier(5))
knn.append(KNeighborsClassifier(10))

for clf in knn:
    clf.fit(adult.train_x, adult.train_y.values.ravel())
    pred = clf.predict(adult.test_x.tail(10))
    print("Accuracy: {}".format(accuracy_score(adult.test_y.tail(10), pred)))

Accuracy: 0.8
Accuracy: 0.9
Accuracy: 0.9


In [19]:
clf = SVC(kernel='poly')
clf.fit(adult.train_x, adult.train_y)
pred = clf.predict(adult.test_x)
print(classification_report(adult.test_y, pred))

              precision    recall  f1-score   support

       <=50K       0.87      0.93      0.90     11360
        >50K       0.72      0.57      0.64      3700

    accuracy                           0.84     15060
   macro avg       0.79      0.75      0.77     15060
weighted avg       0.83      0.84      0.83     15060



In [38]:
clf = MLPClassifier(alpha=1e-03, hidden_layer_sizes=(5,2), max_iter=500)
clf.fit(adult.train_x, adult.train_y)

MLPClassifier(alpha=0.001, hidden_layer_sizes=(5, 2), max_iter=500)

In [39]:
pred = clf.predict(adult.test_x)
print(classification_report(adult.test_y, pred))

              precision    recall  f1-score   support

       <=50K       0.88      0.92      0.90     11360
        >50K       0.71      0.60      0.65      3700

    accuracy                           0.84     15060
   macro avg       0.79      0.76      0.78     15060
weighted avg       0.84      0.84      0.84     15060

