In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read in training data and test data
df_train = pd.read_csv('datasets\\adult.data', header=None, delimiter=', ', engine='python')
#print(df_train.head())

df_test = pd.read_csv('datasets\\adult.test', skiprows=1, header=None, delimiter=', ', engine='python')
#print(df_test.head())

In [3]:
# Add headers to training data and test data
df_train.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'annual-income']

df_test.columns = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'annual-income']

In [4]:
# Checking the training data and test data
print(df_train.head())
print(df_train.shape)
print(df_train.info())

print(df_test.head())
print(df_test.shape)
print(df_test.info())

   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country annual-income  
0          2174             0              40  United-States         <=50K  
1             0             

In [5]:
# Check for missing values
print(df_train.isnull().sum())
print(df_test.isnull().sum())

# Check for duplicates
print("Number of duplicates in training data set:", df_train.duplicated().sum())
print("Number of duplicates in test data set:", df_test.duplicated().sum())

# We can see that there are 24 duplicates in training data set and 5 duplicates in test data set

# Remove duplicates for both data sets
df_train.drop_duplicates(inplace=True)
df_test.drop_duplicates(inplace=True)

print("Number of duplicates in training data set:", df_train.duplicated().sum())
print("Number of duplicates in test data set:", df_test.duplicated().sum())

print(df_train.shape)
print(df_test.shape)

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
annual-income     0
dtype: int64
age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
annual-income     0
dtype: int64
Number of duplicates in training data set: 24
Number of duplicates in test data set: 5
Number of duplicates in training data set: 0
Number of duplicates in test data set: 0
(32537, 15)
(16276, 15)


In [6]:
# Check for unusual values
for i in df_train.select_dtypes(include="object").columns:
    print(df_train[i].value_counts())
    print("***"*10)

for i in df_test.select_dtypes(include="object").columns:
    print(df_test[i].value_counts())
    print("***"*10)

# We can see that some columns such as workclass, occupation, and native-country have '?' values

workclass
Private             22673
Self-emp-not-inc     2540
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64
******************************
education
HS-grad         10494
Some-college     7282
Bachelors        5353
Masters          1722
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           645
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           332
1st-4th           166
Preschool          50
Name: count, dtype: int64
******************************
marital-status
Married-civ-spouse       14970
Never-married            10667
Divorced                  4441
Separated                 1025
Widowed                    993
Married-spouse-absent      418
Married-AF-spouse           23
Name: count, dtype: int64
**********************

In [7]:
# Remove rows that have the value '?' in any of the columns
df_train.replace('?', np.nan, inplace=True)
df_test.replace('?', np.nan, inplace=True)

df_train.dropna(inplace=True)
df_test.dropna(inplace=True)

print(df_train.head())
print(df_test.head())
print(df_train.shape)
print(df_test.shape)

   age         workclass  fnlwgt  education  education-num  \
0   39         State-gov   77516  Bachelors             13   
1   50  Self-emp-not-inc   83311  Bachelors             13   
2   38           Private  215646    HS-grad              9   
3   53           Private  234721       11th              7   
4   28           Private  338409  Bachelors             13   

       marital-status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital-gain  capital-loss  hours-per-week native-country annual-income  
0          2174             0              40  United-States         <=50K  
1             0             

In [8]:
# Exploratory data analysis of training data and test data
#print(df_train.describe().T)
#print(df_test.describe().T)

In [9]:
# Training and test data sets
X_train = df_train.drop('annual-income', axis=1)
y_train = df_train['annual-income']
#print(X_train.head())
#print(y_train.head())

X_test = df_test.drop('annual-income', axis=1)
y_test = df_test['annual-income']
y_test.replace('<=50K.', '<=50K', inplace=True)
y_test.replace('>50K.', '>50K', inplace=True)
#print(X_test.head())
#print(y_test.head())

In [10]:
# Convert categorical features to numbers
from sklearn.preprocessing import OneHotEncoder

categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')

one_hot_transform_X_train = one_hot_encoder.fit_transform(X_train[categorical_features])
X_train = pd.concat([X_train, one_hot_transform_X_train], axis=1).drop(columns=categorical_features)
#print(X_train)

one_hot_transform_X_test = one_hot_encoder.transform(X_test[categorical_features])
X_test = pd.concat([X_test, one_hot_transform_X_test], axis=1).drop(columns=categorical_features)
#print(X_test)

In [11]:
# Import K Neighbors Classifier
from sklearn.neighbors import KNeighborsClassifier
kn_classifier = KNeighborsClassifier()
kn_classifier.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [12]:
# Train the K Neighbors Classifier
import time
start_time = time.process_time()
kn_classifier.fit(X_train, y_train)
print(time.process_time() - start_time)

0.015625


In [13]:
# Evaluating the K Neighbors classifier model using the training data and test data
print("Accuracy on training data:", kn_classifier.score(X_train, y_train))
print("Accuracy on test data:", kn_classifier.score(X_test, y_test))

# At first the accuracy was 0

start_time = time.process_time()
y_predict = kn_classifier.predict(X_test)
print(time.process_time() - start_time)

Accuracy on training data: 0.8308172135770928
Accuracy on test data: 0.7693125207572236
5.859375


In [14]:
# Use GridSearchCV to tune hyper parameters of K Neighors classifier model
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

parameters_grid = {
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21],
    'weights': ['uniform', 'distance']
}

start_time = time.process_time()
grid_search = GridSearchCV(kn_classifier, parameters_grid, cv=5, n_jobs=-1)

grid_search.fit(X_train, y_train)
print(time.process_time() - start_time)

print("Best parameters for K Neighbors classifier:", grid_search.best_params_)
best_kn = grid_search.best_estimator_

start_time = time.process_time()
y_predict = best_kn.predict(X_test)
print(time.process_time() - start_time)

print('Accuracy of tuned K Neighbors classifier:', accuracy_score(y_test, y_predict))
print('Classification Report for tuned K Neighbors classifier:')
print(classification_report(y_test, y_predict))

1.015625
Best parameters for K Neighbors classifier: {'metric': 'manhattan', 'n_neighbors': 21, 'weights': 'distance'}
21.90625
Accuracy of tuned K Neighbors classifier: 0.7982729990036532
Classification Report for tuned K Neighbors classifier:
              precision    recall  f1-score   support

       <=50K       0.81      0.96      0.88     11355
        >50K       0.73      0.29      0.41      3700

    accuracy                           0.80     15055
   macro avg       0.77      0.63      0.65     15055
weighted avg       0.79      0.80      0.76     15055



In [15]:
# Import Decision Tree classifier
from sklearn.tree import DecisionTreeClassifier
dt_classifier = DecisionTreeClassifier()
print(dt_classifier.get_params())

#Train the Decision Tree classifier
start_time = time.process_time()
dt_classifier.fit(X_train, y_train)
print(time.process_time() - start_time)

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'random_state': None, 'splitter': 'best'}
0.03125


In [16]:
# Evaluating the Decision Tree classifier model using the training data and test data
print("Accuracy on training data:", dt_classifier.score(X_train, y_train))

print("Accuracy on test data:", dt_classifier.score(X_test, y_test))

start_time = time.process_time()
y_predict = dt_classifier.predict(X_test)
print(time.process_time() - start_time)

Accuracy on training data: 0.9999668203988188
Accuracy on test data: 0.80631019594819
0.0


In [17]:
# Use GridSearchCV to tune hyper parameters of Decision Tree classifier model
parameters_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 3, 5, 7, 9, 11],
    'max_features': [None, 'sqrt', 'log2'],
    'max_leaf_nodes': [2, 4, 6, 8, 10, 12, 14, 16],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 4, 6]
}

start_time = time.process_time()
grid_search = GridSearchCV(dt_classifier, parameters_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
print(time.process_time() - start_time)

print("Best parameters for Decision Tree classifier:", grid_search.best_params_)
best_dt = grid_search.best_estimator_

start_time = time.process_time()
y_predict = best_dt.predict(X_test)
print(time.process_time() - start_time)

print('Accuracy of tuned Decision Tree classifier:', accuracy_score(y_test, y_predict))
print('Classification Report for tuned Decision Tree classifier:')
print(classification_report(y_test, y_predict))

86.953125
Best parameters for Decision Tree classifier: {'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': 16, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.015625
Accuracy of tuned Decision Tree classifier: 0.8467618731318499
Classification Report for tuned Decision Tree classifier:
              precision    recall  f1-score   support

       <=50K       0.86      0.96      0.90     11355
        >50K       0.79      0.51      0.62      3700

    accuracy                           0.85     15055
   macro avg       0.82      0.73      0.76     15055
weighted avg       0.84      0.85      0.83     15055

