In [77]:
#Import statements
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [78]:
#Load in the data (skipinitialspace strips the blank that follows each comma in the file)
data = pd.read_csv('income_evaluation.csv', skipinitialspace=True)
#Check to see if there are any missing values in the raw file
print(data.isnull().values.any())
#Missing entries are encoded as the placeholder string '?'; turn them into real NaNs
data[data == '?'] = np.nan
#Replace the NaNs with the most frequent value (mode) of their column.
#The filled column is assigned back explicitly: calling fillna(inplace=True) on the
#selection data[col] is chained assignment, which pandas warns about and which does not
#update `data` at all once Copy-on-Write is enabled.
for col in data.columns:
    data[col] = data[col].fillna(data[col].mode()[0])
#Check to see if there are any missing values left
print(data.isna().values.any())
#Check for any duplicated rows
print(data.duplicated().any())
#Delete duplicated rows (the first occurrence of each is kept, the index is not reset)
data = data.drop_duplicates()

False
False
True


In [80]:
#Take a first look at the data: summary statistics (count, mean, std, min, quartiles, max)
#of the numeric columns, displayed as the cell's output
data.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32537.0,32537.0,32537.0,32537.0,32537.0,32537.0
mean,38.585549,189780.8,10.081815,1078.443741,87.368227,40.440329
std,13.637984,105556.5,2.571633,7387.957424,403.101833,12.346889
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,236993.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [81]:
#Create a map for our target threshold in income (note: '<=50K' is encoded as 1, '>50K' as 0)
income_map = {'<=50K': 1, '>50K': 0}
#Apply the map to a temporary first: Series.map turns every label missing from the map
#into NaN, so a typo'd label, or simply re-running this cell on the already encoded
#column, would otherwise silently wipe out the target.
income_encoded = data['income'].map(income_map)
unmapped = income_encoded.isna()
if unmapped.any():
    unexpected = sorted(data.loc[unmapped, 'income'].astype(str).unique())
    raise ValueError(
        f"Unexpected values in 'income' column: {unexpected}; "
        f"expected only {list(income_map)}. Reload the data before re-running this cell."
    )
data['income'] = income_encoded
#Check the data (all columns, categorical ones included)
data.describe(include='all')


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
count,32537.0,32537,32537.0,32537,32537.0,32537,32537,32537,32537,32537,32537.0,32537.0,32537.0,32537,32537.0
unique,,8,,16,,7,14,6,5,2,,,,41,
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,
freq,,24509,,10494,,14970,5979,13187,27795,21775,,,,29735,
mean,38.585549,,189780.8,,10.081815,,,,,,1078.443741,87.368227,40.440329,,0.759074
std,13.637984,,105556.5,,2.571633,,,,,,7387.957424,403.101833,12.346889,,0.427652
min,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,0.0
25%,28.0,,117827.0,,9.0,,,,,,0.0,0.0,40.0,,1.0
50%,37.0,,178356.0,,10.0,,,,,,0.0,0.0,40.0,,1.0
75%,48.0,,236993.0,,12.0,,,,,,0.0,0.0,45.0,,1.0


In [82]:
#Encode the categorical variables
from sklearn.preprocessing import LabelEncoder
#Select the object-dtype (string) columns; these are the categorical variables
categorical = data.select_dtypes(include=['object'])
print(categorical.keys())
#Apply Label Encoding to each column.
#A fresh encoder is fitted per column and stored in `label_encoders`, keyed by column
#name, so every integer code can later be decoded with
#label_encoders[col].inverse_transform(...). Re-fitting one shared encoder would keep
#only the mapping of the last column.
label_encoders = {}
for col in categorical:
    #`labelencoder` stays bound to the most recently fitted encoder after the loop
    labelencoder = LabelEncoder()
    data[col] = labelencoder.fit_transform(data[col])
    label_encoders[col] = labelencoder

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')


In [83]:
#Training and Test split
from sklearn.model_selection import train_test_split
#The last column is the target; every column before it, minus 'fnlwgt', is a feature
feature_columns = data.columns[:-1]
target_column = data.columns[-1]
x = data[feature_columns].drop(columns=['fnlwgt'])
y = data[target_column]

#Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=0,
)


In [84]:
#Import the models from sklearn
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

In [85]:
#Create a variety of models to compare the performance of each model
#Seed for the scikit-learn estimators that draw random numbers while fitting, so that
#re-running the notebook reproduces the same scores
RANDOM_STATE = 0
models = {
    'CatBoost': CatBoostClassifier(verbose=False, task_type='GPU'),
    #'XGBoost': XGBClassifier(tree_method='gpu_hist'),
    #'LGBM': LGBMClassifier(device='gpu'),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'RandomForestClassifier': RandomForestClassifier(random_state=RANDOM_STATE),
    'DecisionTreeClassifier': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'KNeighborsClassifier': KNeighborsClassifier(),
    #'LogisticRegression': LogisticRegression(),
    'GaussianNB': GaussianNB(),
    'SVC': SVC(),
}

#Fit every model on the training split and record (name, test accuracy) in `scores`
scores = []
predictions = []
for model_name, model in models.items():
    #perf_counter is a monotonic clock meant for measuring intervals
    start = time.perf_counter()
    model.fit(x_train, y_train)
    #predict() returns class labels, not probabilities
    y_pred = model.predict(x_test)
    #predictions.append(y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    scores.append((model_name, accuracy))
    end = time.perf_counter()
    print(f'{model_name} Accuracy: {accuracy} Time: {end - start}')

#Ensemble the models using a voting classifier (hard voting = majority vote on the labels)
start = time.perf_counter()
voting_clf = VotingClassifier(estimators=list(models.items()), voting='hard')
voting_clf.fit(x_train, y_train)
#Score once and reuse the value: scoring runs prediction for every member model, so a
#second call in the print would repeat that work outside the timed region
voting_accuracy = voting_clf.score(x_test, y_test)
end = time.perf_counter()
print(f'Voting Classifier Accuracy: {voting_accuracy} Time: {end - start}')
    


CatBoost Accuracy: 0.8736939151813153 Time: 32.52740144729614
Gradient Boosting Accuracy: 0.8666256914566687 Time: 1.3440580368041992
RandomForestClassifier Accuracy: 0.8526941200573653 Time: 1.7893614768981934
DecisionTreeClassifier Accuracy: 0.8121286621593936 Time: 0.060515642166137695
KNeighborsClassifier Accuracy: 0.8396844908830158 Time: 1.1137478351593018
GaussianNB Accuracy: 0.8029092399098545 Time: 0.011319875717163086
SVC Accuracy: 0.7999385371850031 Time: 17.72130298614502
Voting Classifier Accuracy: 0.8637574267568121 Time: 53.48845982551575


In [88]:
#Show how much each feature contributes to the CatBoost model fitted in the comparison loop
catboost_model = models['CatBoost']
feature_importance = catboost_model.get_feature_importance(prettified=True)
print(feature_importance)

        Feature Id  Importances
0     capital-gain    22.794864
1     relationship    18.780052
2              age    16.003635
3    education-num     8.715774
4     capital-loss     8.683193
5   hours-per-week     7.978368
6       occupation     5.994970
7   marital-status     5.389662
8              sex     2.209831
9        education     1.812240
10       workclass     1.027981
11  native-country     0.315103
12            race     0.294326
