In [57]:
 #Load common modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
from sklearn.exceptions import DataConversionWarning

warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [58]:
# Assign colum names to the dataset
names = ["age", "workclass", "fnlwgt","education", "education_num", "marital_status", "occupation", "relationship", "race", "sex", "capital_gain", "capital_loss", "hours_per_week", "native_country", "class"]

# Load data
data = pd.read_csv("./data/adult.data", names=names)

test = pd.read_csv("./data/adult.test", names=names)

In [59]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [60]:
test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
0,|1x3 Cross validator,,,,,,,,,,,,,,
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.


In [61]:
test = test.drop(test.index[0])

test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.
5,18,?,103497.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K.


In [62]:
# Split to response variables and target variable
X_train = data.drop("class", axis=1)
y_train = data["class"]

X_test = test.drop("class", axis=1)
y_test = test["class"]

In [63]:
# Print number of instances and features for the data
print("Number of instances: %d" % X_train.shape[0])
print("Number of features: %d" % X_train.shape[1])

# Count class labels
unique, counts = np.unique(y_train, return_counts=True)
print("Number of class labels:")
print(dict(zip(unique, counts)))

Number of instances: 32561
Number of features: 14
Number of class labels:
{' <=50K': 24720, ' >50K': 7841}


In [64]:
# Print number of instances and features for the test
print("Number of instances: %d" % X_test.shape[0])
print("Number of features: %d" % X_test.shape[1])

# Count class labels
unique, counts = np.unique(y_test, return_counts=True)
print("Number of class labels:")
print(dict(zip(unique, counts)))

Number of instances: 16281
Number of features: 14
Number of class labels:
{' <=50K.': 12435, ' >50K.': 3846}


In [65]:
# Seperate numerical and categorical attributes
X_train_num = X_train[["age", "fnlwgt", "education_num", "capital_gain", "capital_loss","hours_per_week"]]
X_train_cat = X_train[["workclass","education","marital_status","occupation","relationship","race","sex","native_country"]]

In [68]:
# Seperate numerical and categorical attributes
X_test_num = X_test[["age", "fnlwgt", "education_num", "capital_gain", "capital_loss","hours_per_week"]]
X_test_cat = X_test[["workclass","education","marital_status","occupation","relationship","race","sex","native_country"]]

In [72]:
# Preprocessing
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Train data
num_attribs = list(X_train_num)
cat_attribs = list(X_train_cat)

num_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])

X_train_tr = full_pipeline.fit_transform(X_train)


# Test data
num_attribs_test = list(X_test_num)
cat_attribs_test = list(X_test_cat)

num_pipeline_test = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("std_scaler", StandardScaler()),
])

full_pipeline_test = ColumnTransformer([
    ("num", num_pipeline_test, num_attribs_test),
    ("cat", OneHotEncoder(), cat_attribs_test),
])


X_test_tr = full_pipeline_test.fit_transform(X_test)

In [73]:
# k Nearest Neighbor
from kNN import KNNClassifier
from sklearn.model_selection import GridSearchCV

param_grid = [
    {"k": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "metric": ["euclidean", "manhattan", "chebyshev"]}
]

classifier = KNNClassifier()
grid_search = GridSearchCV(classifier, param_grid, cv=5)
grid_search.fit(X_train_tr, y_train)

print("Best parameters: ", grid_search.best_params_)
print("All results:")
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results["mean_test_score"], cv_results["params"]):
    print(params, ":", mean_score)

kNN_model = grid_search.best_estimator_

ValueError: inconsistent shapes

In [None]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()
classifier.fit(X_train, y_train)
nb_model = classifier

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier

param_grid = [
    {'max_depth': [1, 5, 10, 50, 100]}
]

tree_clf = DecisionTreeClassifier()
grid_search = GridSearchCV(tree_clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

print("Best parameters: ", grid_search.best_params_)
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results["mean_test_score"], cv_results["params"]):
    print(mean_score, params)

tree_model = grid_search.best_estimator_


In [None]:
import itertools
from sklearn.metrics import classification_report, confusion_matrix

# Function to plot the confusion matrix
def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black", fontsize=20)

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

    
class_names = np.unique(y_test)

models = {'kNN': kNN_model, 'Naive Bayes': nb_model, 'Decision Tree': tree_model}

for name, model in models.items():
    model.fit(X_train, y_train)  # Train model
    y_pred = model.predict(X_test)  # make predictions on test set