In [50]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_curve, auc, precision_recall_curve, f1_score

In [51]:
df_adult_data = pd.read_csv('adult.csv')
df_adult_data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [52]:
X = df_adult_data.drop(columns=['fnlwgt'])
y = df_adult_data['income']

In [53]:
X

Unnamed: 0,age,workclass,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [54]:
y

0        <=50K
1        <=50K
2        <=50K
3        <=50K
4        <=50K
         ...  
32556    <=50K
32557    <=50K
32558     >50K
32559    <=50K
32560    <=50K
Name: income, Length: 32561, dtype: object

In [55]:
categorical_onehot = ['relationship', 'race', 'sex']
categorical_binary = ['workclass', 'marital.status', 'occupation', 'native.country']
categorical_ordinal = ['education']

In [56]:
one_hot_encode_pipline = Pipeline(
    steps=[
        (
            'Imputer', SimpleImputer(strategy='constant', fill_value=np.NaN).fit(pd.DataFrame(df_adult_data[categorical_onehot]))
        ),
        (
            'OneHotEncoder', OneHotEncoder()
        )
    ]
)

In [57]:
binary_encode_pipeline = Pipeline(
    steps=[
        (
            'Imputer', SimpleImputer(strategy='constant', fill_value=np.NaN).fit(pd.DataFrame(df_adult_data[categorical_binary]))
        ),
        (
            'BinaryEncoder', OrdinalEncoder()
        )
    ]
)

In [58]:
ordinal_pipeline = Pipeline(
    steps=[
        (
            'Imputer', SimpleImputer(strategy='constant', fill_value=np.NaN).fit(pd.DataFrame(df_adult_data[categorical_ordinal]))
        ),
        (
            'OrdinaEncoder', OneHotEncoder()
        )
    ]
)

In [59]:
transformer = ColumnTransformer([
    ('onehot', one_hot_encode_pipline, categorical_onehot),
    ('binary', binary_encode_pipeline, categorical_binary),
    ('ordinal', ordinal_pipeline, categorical_ordinal)
])

In [60]:
LR_pipe = make_pipeline(transformer, LogisticRegression(solver='liblinear', random_state=10))

In [61]:
LR_pipe

In [62]:
DTC_pipe = make_pipeline(transformer, DecisionTreeClassifier(max_depth=5, criterion='entropy', random_state=10))

In [63]:
DTC_pipe

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10, stratify=y)

In [65]:
LR_model = LR_pipe.fit(X_train, y_train)

In [66]:
LR_model.score(X_train, y_train)

0.8203316953316954

In [67]:
DTC_model = DTC_pipe.fit(X_train, y_train)

In [68]:
DTC_model.score(X_train, y_train)

0.8191470691470691

In [69]:
y_pred_LR = LR_pipe.predict(X_test)

In [70]:
y_pred_DTC = DTC_pipe.predict(X_test)

In [73]:
print("Logistic Regression Classification Report:\n", classification_report(y_test, y_pred_LR))

Logistic Regression Classification Report:
               precision    recall  f1-score   support

       <=50K       0.84      0.95      0.89      7417
        >50K       0.73      0.42      0.54      2352

    accuracy                           0.82      9769
   macro avg       0.78      0.69      0.71      9769
weighted avg       0.81      0.82      0.81      9769



In [72]:
print("Decision Tree Classification Report:\n", classification_report(y_test, y_pred_DTC))

Decision Tree Classification Report:
               precision    recall  f1-score   support

       <=50K       0.83      0.96      0.89      7417
        >50K       0.74      0.40      0.52      2352

    accuracy                           0.82      9769
   macro avg       0.79      0.68      0.70      9769
weighted avg       0.81      0.82      0.80      9769



In [None]:
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'solver': ['liblinear', 'saga']
}

In [90]:
grid_search = GridSearchCV(
    estimator=LR_pipe, 
    param_grid=param_grid)
grid_search.fit(X_train, y_train)

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 415, in __call__
    return estimator.score(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/pipeline.py", line 993, in score
    Xt = transform.transform(Xt)
         ^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/utils/_set_output.py", line 295, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/compose/_column_transformer.py", line 1014, in transform
    Xs = self._call_func_on_transformers(
         ^^^^^^^^^^

In [91]:
grid_search.best_params_

{'logisticregression__C': 0.01, 'logisticregression__solver': 'liblinear'}

In [92]:
grid_search.score(X_train, y_train)

0.8194980694980695

In [93]:
LR_model.score(X_test, y_test)

0.8237281195618794

In [94]:
grid_search.score(X_test, y_test)

0.8214760978605794