# Lab 3 - Naive Bayes and Neural Networks

## Data

In [132]:
#| echo: False
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix
from xgboost import XGBClassifier

In [3]:
data = pd.read_csv("/Users/Bnkes/Desktop/GitHub/AdvancedMachineLearning/Data/IncomeClassificationData/income_evaluation.csv")
data.columns = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"] # Remove leading spaces from existing column names
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


Seeing as the data is already clean, I am going through it further and removing values that occur in only a very small sample of the cases (<1%) as these may be biased and will also complicate parameter tuning through grid searching.

In [5]:
data = data[(data["workclass"] != " Without-pay") & (data["workclass"] != " Never-worked")] # Remove these two sets of values due to them being a very small portion of the data set (21/32561 values)
data = data[data["education"] != " Preschool"] # Removed these values due to it making up a small and likely very biased sample (51/32561)
data = data[data["marital-status"] != " Married-AF-spouse"] # Removed these values due to small number of rows (23/32561)
data = data[data["occupation"] != " Armed-Forces"] # Removed these values due to small number of rows (23/32561)
data["region-us"] = np.where(np.isin(data["native-country"], [" United-States", " Outlying-US(Guam-USVI-etc)", " Canada"]), 1, 0) # The majority of the rows (29101/32561) are from the US, due to the wide spread of other values I am going to create a column for each major geographical region
data["region-americas"] = np.where(np.isin(data["native-country"], [" Mexico", " Puerto-Rico", " El-Salvador", " Cuba", " Jamaica", " Columbia", " Haiti", " Dominican-Republiic", " Guatemala", " Nicaragua", " Peru", " Ecuador", " Trinidad&Tobago", " Honduras"]), 1, 0)
data["region-asia-oceania"] = np.where(np.isin(data["native-country"], [" Philippines", " India", " China", " Vietnam", " Japan", " Taiwan", " Iran", " Hong", " Thailand", " Cambodia", " Laos"]), 1, 0)
data["region-europe"] = np.where(np.isin(data["native-country"], [" Germany", " England", " Italy", " Poland", " Portugal", " France", " Greece", " Iceland", " Hungary", " Scotland", " Holand-Netherlands"]), 1, 0)
data = data.drop("native-country", axis = 1)
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,income,region-us,region-americas,region-asia-oceania,region-europe
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,<=50K,1,0,0,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,<=50K,1,0,0,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,<=50K,1,0,0,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,<=50K,1,0,0,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,<=50K,0,1,0,0


Now that the data is cleaned and some features have been added, we can create the training and test splits and attempt to classify income.

In [7]:
X = data.drop("income", axis = 1)
y = data["income"]
y = LabelEncoder().fit_transform(y) # Label encoding y turns the numeric values into zeroes and ones

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

## Naive Bayes
The first model we will try will be a Naive Bayes model. It is an easy to fit model that has good baseline performance, therefore we will be comparing the performance of the following models to the Naive Bayes model until one of them performs better.

In [11]:
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
        # ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number)) # Naive bayes does not like negative numbers
    ]
)

In [13]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("naive-bayes", CategoricalNB())
    ]
)

parameters = {
    "naive-bayes__alpha": [.1, .2, .3, .4, .5, .6, .7, .8, .9, 1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='f1', n_jobs=-1, verbose = 2)
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [15]:
# For when the dropdown menu doesn't work
gscv_fitted.best_params_

{'naive-bayes__alpha': 0.3}

In [17]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("naive-bayes", CategoricalNB())
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

In [19]:
y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual >= $50k", "Actual < $50k"], columns=["Predicted >= $50k", "Predicted < $50k"])

cm_df

Unnamed: 0,Predicted >= $50k,Predicted < $50k
Actual >= $50k,19311,5316
Actual < $50k,1832,5998


In [21]:
f1 = f1_score(y_true=y, y_pred = y_pred)
print(f"F1 Score for Naive Bayes Classifier: {f1}")

F1 Score for Naive Bayes Classifier: 0.626619306310071


The Naive Bayes model has started off strong with an F1 score of .63. This is a good score and I believe it may be difficult for the Neural Network that we are trying next to compete with it.

## Neural Network
Now we will attempt to use a Neural Network for our classification problem. Neural Networks are much more computationally challenging to compute and do not always perform well. Desptite these issues, they are worth attempting as Neural Networks can be extremely accurate classifiers as well. I will also be grid searching for each activation function, as some of them tend to fail when the networks get larger and I would prefer to be able to remove those as they occur.

In [11]:
ct = ColumnTransformer(
    [
        ("dummify", OneHotEncoder(sparse_output = False, handle_unknown='ignore', drop="first"), make_column_selector(dtype_include=object)),
        ("standardize", StandardScaler(), make_column_selector(dtype_include=np.number))
    ]
)

In [66]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("neural-network", MLPClassifier(solver="lbfgs", max_iter=5000, early_stopping=True, activation = "identity"))
    ]
)

parameters = {
    "neural-network__hidden_layer_sizes": [(2, ), (4, ), (6, ), (8, ), (10, ), (20, ), (30, ), (40, ), (50, ), (60, ), (80, ), (100, ), (200, )],
    "neural-network__alpha": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='f1', n_jobs=1, verbose = 1)
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

Fitting 5 folds for each of 91 candidates, totalling 455 fits


In [80]:
pd.DataFrame(gscv_fitted.cv_results_).sort_values(by = "rank_test_score", ascending = True).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_neural-network__alpha,param_neural-network__hidden_layer_sizes,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
38,4.270849,1.216873,0.019903,0.002458,0.0001,"(200,)","{'neural-network__alpha': 0.0001, 'neural-netw...",0.653738,0.652646,0.667832,0.671383,0.667363,0.662592,0.007808,1
27,0.22654,0.009992,0.013302,0.001123,0.0001,"(4,)","{'neural-network__alpha': 0.0001, 'neural-netw...",0.653751,0.652189,0.667366,0.671388,0.668058,0.662551,0.007955,2
68,0.330158,0.02381,0.013002,0.000316,0.1,"(8,)","{'neural-network__alpha': 0.1, 'neural-network...",0.653047,0.65289,0.6676,0.672091,0.666434,0.662412,0.007939,3
5,0.545295,0.020438,0.013102,0.0002,1e-06,"(20,)","{'neural-network__alpha': 1e-06, 'neural-netwo...",0.652817,0.652174,0.668064,0.670913,0.668057,0.662405,0.00816,4
10,1.492162,0.065158,0.016203,0.001806,1e-06,"(80,)","{'neural-network__alpha': 1e-06, 'neural-netwo...",0.652342,0.652632,0.668066,0.671621,0.667363,0.662405,0.008226,5
75,1.457555,0.099129,0.015503,0.001817,0.1,"(80,)","{'neural-network__alpha': 0.1, 'neural-network...",0.651622,0.652189,0.668064,0.671853,0.668291,0.662404,0.008678,6
40,0.278549,0.018892,0.017003,0.001924,0.001,"(4,)","{'neural-network__alpha': 0.001, 'neural-netwo...",0.651868,0.652174,0.668298,0.671383,0.668291,0.662403,0.008552,7
61,1.043483,0.034326,0.013803,0.000679,0.01,"(60,)","{'neural-network__alpha': 0.01, 'neural-networ...",0.652327,0.652661,0.667831,0.671621,0.667362,0.66236,0.008191,8
41,0.337159,0.010596,0.016103,0.002417,0.001,"(6,)","{'neural-network__alpha': 0.001, 'neural-netwo...",0.652817,0.651945,0.6669,0.671383,0.668754,0.66236,0.008276,9
85,1.458156,0.028734,0.014503,0.002026,1.0,"(40,)","{'neural-network__alpha': 1.0, 'neural-network...",0.653061,0.65193,0.6676,0.672798,0.666203,0.662318,0.008324,10


In [82]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("neural-network", MLPClassifier(solver="lbfgs", max_iter=5000, early_stopping=True, activation = "logistic"))
    ]
)

parameters = {
    "neural-network__hidden_layer_sizes": [(2, ), (4, ), (6, ), (8, ), (10, ), (20, ), (30, ), (40, ), (50, ), (60, ), (80, ), (100, ), (200, )],
    "neural-network__alpha": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='f1', n_jobs=1, verbose = 1)
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

Fitting 5 folds for each of 91 candidates, totalling 455 fits


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("

In [83]:
pd.DataFrame(gscv_fitted.cv_results_).sort_values(by = "rank_test_score", ascending = True).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_neural-network__alpha,param_neural-network__hidden_layer_sizes,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
27,3.170556,2.083517,0.013202,0.000245,0.0001,"(4,)","{'neural-network__alpha': 0.0001, 'neural-netw...",0.685343,0.680704,0.692519,0.690215,0.699353,0.689627,0.006345,1
41,3.489012,2.38877,0.015202,0.002089,0.001,"(6,)","{'neural-network__alpha': 0.001, 'neural-netwo...",0.669655,0.688559,0.700269,0.686247,0.695064,0.687959,0.010396,2
68,1.681195,0.362548,0.014402,0.002311,0.1,"(8,)","{'neural-network__alpha': 0.1, 'neural-network...",0.670556,0.682165,0.693089,0.692254,0.698574,0.687328,0.009919,3
79,0.753132,0.184625,0.014602,0.00201,1.0,"(4,)","{'neural-network__alpha': 1.0, 'neural-network...",0.681943,0.683902,0.690884,0.686275,0.692962,0.687193,0.004151,4
54,2.241293,0.600087,0.013102,0.000375,0.01,"(6,)","{'neural-network__alpha': 0.01, 'neural-networ...",0.664581,0.686848,0.690859,0.693408,0.697548,0.686649,0.011569,5
67,1.859326,0.501048,0.013702,0.000748,0.1,"(6,)","{'neural-network__alpha': 0.1, 'neural-network...",0.670356,0.687761,0.690223,0.686557,0.697849,0.686549,0.008998,6
16,2.594155,1.255486,0.014102,0.002223,1e-05,"(8,)","{'neural-network__alpha': 1e-05, 'neural-netwo...",0.671843,0.691046,0.68296,0.68858,0.697674,0.686421,0.008685,7
2,2.976322,1.798874,0.013502,0.000707,1e-06,"(6,)","{'neural-network__alpha': 1e-06, 'neural-netwo...",0.67284,0.683403,0.686639,0.681235,0.705163,0.685856,0.010679,8
82,2.392319,0.430839,0.013302,0.0004,1.0,"(10,)","{'neural-network__alpha': 1.0, 'neural-network...",0.671503,0.684046,0.687755,0.685172,0.700171,0.685729,0.009146,9
42,2.56755,0.445211,0.015103,0.002577,0.001,"(8,)","{'neural-network__alpha': 0.001, 'neural-netwo...",0.665984,0.675396,0.702576,0.68388,0.699391,0.685445,0.013929,10


Of the two sets of Neural Network cross validations, the network using the logistic activation function performed better in the cross validations. Therefore, I will use that for the final Neural Network model below.

In [107]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("neural-network", MLPClassifier(solver="lbfgs", max_iter=5000, early_stopping=True, activation = "logistic", alpha = .0001, hidden_layer_sizes = (4,)))
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

In [109]:
y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual >= $50k", "Actual < $50k"], columns=["Predicted >= $50k", "Predicted < $50k"])

cm_df

Unnamed: 0,Predicted >= $50k,Predicted < $50k
Actual >= $50k,22909,1718
Actual < $50k,2849,4981


In [114]:
f1 = f1_score(y_true=y, y_pred = y_pred)
print(f"F1 Score for Neural Network Classifier: {f1}")

F1 Score for Neural Network Classifier: 0.6856631564457293


Comparing the Neural Network to the Naive Bayes classifier, the Neural Network performed slightly better but at the cost of a large amount of computational resources and time. As the increase was only marginal (~.06), I don't believe that using the Neural Network would be the best solution for scalability in a real world application.

## Random Forest

In [98]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("forest", RandomForestClassifier(n_jobs = -1))
    ]
)

parameters = {
    "forest__min_samples_leaf": [1, 2, 3, 4, 5, 10, 15, 25],
    "forest__min_samples_split": [2, 3, 4, 5, 10, 15, 25],
    "forest__ccp_alpha": [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='f1', n_jobs=1, verbose = 1)
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

Fitting 5 folds for each of 392 candidates, totalling 1960 fits


In [99]:
pd.DataFrame(gscv_fitted.cv_results_).sort_values(by = "rank_test_score", ascending = True).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_forest__ccp_alpha,param_forest__min_samples_leaf,param_forest__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
68,0.360282,0.021863,0.044408,0.001908,1e-06,2,15,"{'forest__ccp_alpha': 1e-06, 'forest__min_samp...",0.674197,0.690806,0.689487,0.700972,0.701926,0.691477,0.010023,1
117,0.363464,0.007167,0.042907,0.000663,1e-05,1,15,"{'forest__ccp_alpha': 1e-05, 'forest__min_samp...",0.675361,0.691228,0.689727,0.699964,0.700069,0.69127,0.009039,2
69,0.376837,0.020732,0.103318,0.118922,1e-06,2,25,"{'forest__ccp_alpha': 1e-06, 'forest__min_samp...",0.678014,0.686717,0.696198,0.69697,0.697756,0.691131,0.007684,3
118,0.360957,0.017879,0.042907,0.002084,1e-05,1,25,"{'forest__ccp_alpha': 1e-05, 'forest__min_samp...",0.677065,0.688432,0.690757,0.695245,0.702061,0.690712,0.008256,4
63,0.37488,0.013314,0.042807,0.000749,1e-06,2,2,"{'forest__ccp_alpha': 1e-06, 'forest__min_samp...",0.681738,0.684435,0.69191,0.696307,0.698501,0.690578,0.006531,5
121,0.37624,0.015792,0.042507,0.000316,1e-05,2,4,"{'forest__ccp_alpha': 1e-05, 'forest__min_samp...",0.679392,0.683688,0.700207,0.696307,0.692308,0.69038,0.007758,6
6,0.377846,0.018212,0.106318,0.126632,0.0,1,25,"{'forest__ccp_alpha': 1e-07, 'forest__min_samp...",0.672322,0.685633,0.693324,0.703187,0.696747,0.690243,0.010599,7
7,0.396005,0.023395,0.044408,0.001985,0.0,2,2,"{'forest__ccp_alpha': 1e-07, 'forest__min_samp...",0.675656,0.687478,0.694962,0.698378,0.694715,0.690238,0.00811,8
18,0.377552,0.026581,0.043307,0.001504,0.0,3,10,"{'forest__ccp_alpha': 1e-07, 'forest__min_samp...",0.673288,0.682093,0.69909,0.696499,0.699374,0.690069,0.010531,9
122,0.365566,0.019646,0.043507,0.000548,1e-05,2,5,"{'forest__ccp_alpha': 1e-05, 'forest__min_samp...",0.681706,0.68533,0.693878,0.698242,0.690642,0.689959,0.005899,10


In [123]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("forest", RandomForestClassifier(n_jobs = -1, ccp_alpha = 1e-6, min_samples_leaf=2, min_samples_split=15))
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

In [125]:
y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual >= $50k", "Actual < $50k"], columns=["Predicted >= $50k", "Predicted < $50k"])

cm_df

Unnamed: 0,Predicted >= $50k,Predicted < $50k
Actual >= $50k,23569,1058
Actual < $50k,2466,5364


In [127]:
f1 = f1_score(y_true=y, y_pred = y_pred)
print(f"F1 Score for Random Forest Classifier: {f1}")

F1 Score for Random Forest Classifier: 0.7527364580409767


The Random Forest model has outperformed both the Neural Network and the Naive Bayes Classifier by a large margin. It was also much less computationally expensive to run and much easier to tune. This makes it a great choice for scalability and it shows impressive values for its metric. For one last model we will attempt to use XGBoost.

## XGBoost

In [139]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("xgb", XGBClassifier())
    ]
)

parameters = {
    "xgb__booster": ["gbtree", "gblinear", "dart"],
    "xgb__reg_alpha": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0],
    "xgb__reg_lambda": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0]
}

gscv = GridSearchCV(my_pipeline, parameters, cv = 5, scoring='f1', n_jobs=1, verbose = 1)
gscv_fitted = gscv.fit(X, y)
test_scores = gscv_fitted.cv_results_["mean_test_score"]
gscv_fitted.best_estimator_

Fitting 5 folds for each of 147 candidates, totalling 735 fits


In [141]:
pd.DataFrame(gscv_fitted.cv_results_).sort_values(by = "rank_test_score", ascending = True).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_xgb__booster,param_xgb__reg_alpha,param_xgb__reg_lambda,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
31,0.183232,0.001721,0.021304,0.00051,gbtree,0.01,0.001,"{'xgb__booster': 'gbtree', 'xgb__reg_alpha': 0...",0.708007,0.707209,0.714765,0.715834,0.713263,0.711815,0.00354,1
129,8.378569,0.253187,0.060561,0.002985,dart,0.01,0.001,"{'xgb__booster': 'dart', 'xgb__reg_alpha': 0.0...",0.708007,0.707209,0.714765,0.715834,0.713263,0.711815,0.00354,1
28,0.185433,0.004924,0.020904,0.0002,gbtree,0.01,1e-06,"{'xgb__booster': 'gbtree', 'xgb__reg_alpha': 0...",0.709874,0.705115,0.712513,0.714683,0.714286,0.711294,0.003525,3
34,0.183132,0.003185,0.021404,0.000584,gbtree,0.01,0.0,"{'xgb__booster': 'gbtree', 'xgb__reg_alpha': 0...",0.709874,0.705115,0.712513,0.714683,0.714286,0.711294,0.003525,3
29,0.183232,0.004457,0.021104,0.0002,gbtree,0.01,1e-05,"{'xgb__booster': 'gbtree', 'xgb__reg_alpha': 0...",0.709874,0.705115,0.712513,0.714683,0.714286,0.711294,0.003525,3
126,8.430578,0.2362,0.062111,0.001882,dart,0.01,1e-06,"{'xgb__booster': 'dart', 'xgb__reg_alpha': 0.0...",0.709874,0.705115,0.712513,0.714683,0.714286,0.711294,0.003525,3
127,8.492789,0.217443,0.061311,0.001601,dart,0.01,1e-05,"{'xgb__booster': 'dart', 'xgb__reg_alpha': 0.0...",0.709874,0.705115,0.712513,0.714683,0.714286,0.711294,0.003525,3
132,8.539897,0.437811,0.061611,0.002889,dart,0.01,0.0,"{'xgb__booster': 'dart', 'xgb__reg_alpha': 0.0...",0.709874,0.705115,0.712513,0.714683,0.714286,0.711294,0.003525,3
39,0.193834,0.004803,0.021704,0.00051,gbtree,0.1,0.01,"{'xgb__booster': 'gbtree', 'xgb__reg_alpha': 0...",0.702997,0.70332,0.710615,0.721593,0.715065,0.710718,0.007092,9
137,8.484887,0.344231,0.061111,0.001828,dart,0.1,0.01,"{'xgb__booster': 'dart', 'xgb__reg_alpha': 0.1...",0.702997,0.70332,0.710615,0.721593,0.715065,0.710718,0.007092,9


In [145]:
my_pipeline = Pipeline(
    [
        ("preprocessing", ct),
        ("xgb", XGBClassifier(booster = "gbtree", reg_alpha = .01, reg_lambda = .001))
    ]
)

fitted_pipeline = my_pipeline.fit(X_train, y_train)

In [147]:
y_pred = fitted_pipeline.predict(X)

cm = confusion_matrix(y_true = y, y_pred = y_pred)

cm_df = pd.DataFrame(cm, index=["Actual >= $50k", "Actual < $50k"], columns=["Predicted >= $50k", "Predicted < $50k"])

cm_df

Unnamed: 0,Predicted >= $50k,Predicted < $50k
Actual >= $50k,23553,1074
Actual < $50k,2062,5768


In [149]:
f1 = f1_score(y_true=y, y_pred = y_pred)
print(f"F1 Score for XGBoost Classifier: {f1}")

F1 Score for XGBoost Classifier: 0.7862595419847328


In the end the best model of all was the XGBoost model. It also had the huge benefit of being extremely fast to run and easy to tune.

## Final Thoughts

While Neural Networks and Naive Bayes are interesting machine learning models, they are both fairly intensive computationally. As can be seen above, both are bested easily on this dataset by Random Forests and XGBoost. This makes them less practical to run and while they definitely have their use cases, those cases tend to be for more complex problems than simple classification.