In [221]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [222]:
df = pd.read_csv("../datasets/adult_income.csv")
# df.shape # (48842, 15) gave me this
# df.info() # shows no null columns but can be filled with non-null values
# df.isin(['?', 'NA', '']).sum() # found null values replaced with characters so hidden values
df.replace("?", np.nan, inplace=True)
# df.isin(['?', 'NA', '']).sum() # no we can see the data has no values that are hidden
# df.info() #info now shows the missing values

feature engineering

In [223]:
df["capital_net"] = df['capital-gain'] - df["capital-loss"]
df["hours_category"] = pd.cut(
    x=df["hours-per-week"],
    bins=(0, 20, 40, 60, 100),
    labels=(["part-time", "full-time", "over-time", "extreme"]),
    include_lowest=True
)

df.drop(columns= ["fnlwgt", "education", "capital-loss", 'capital-gain', "hours-per-week"], inplace=True)

In [224]:
df.isna().sum() # now after feature engineering we have to fill up the empty spaces in work class, occupation, native-country

age                   0
workclass          2799
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
native-country      857
income                0
capital_net           0
hours_category        0
dtype: int64

In [225]:
df["native-country"].describe()
df["native-country"].unique()  #this shows that max country is USA more than 97% so can apply imputer with most freq

array(['United-States', nan, 'Peru', 'Guatemala', 'Mexico',
       'Dominican-Republic', 'Ireland', 'Germany', 'Philippines',
       'Thailand', 'Haiti', 'El-Salvador', 'Puerto-Rico', 'Vietnam',
       'South', 'Columbia', 'Japan', 'India', 'Cambodia', 'Poland',
       'Laos', 'England', 'Cuba', 'Taiwan', 'Italy', 'Canada', 'Portugal',
       'China', 'Nicaragua', 'Honduras', 'Iran', 'Scotland', 'Jamaica',
       'Ecuador', 'Yugoslavia', 'Hungary', 'Hong', 'Greece',
       'Trinadad&Tobago', 'Outlying-US(Guam-USVI-etc)', 'France',
       'Holand-Netherlands'], dtype=object)

In [226]:
df["native-country"] = (
    df["native-country"]
      .fillna(df["native-country"]
                .mode()
                .iloc[0])
) # extraxts the mode(max freq) and replaces it where it was null

In [227]:
df["educational-num"].describe()
df["occupation"].unique()

array(['Machine-op-inspct', 'Farming-fishing', 'Protective-serv', nan,
       'Other-service', 'Prof-specialty', 'Craft-repair', 'Adm-clerical',
       'Exec-managerial', 'Tech-support', 'Sales', 'Priv-house-serv',
       'Transport-moving', 'Handlers-cleaners', 'Armed-Forces'],
      dtype=object)

In [228]:
 ## using this to compare values of occupation according to educational-num to figure out the best possible i settled on binning using the values
# 1–6, 7–8, 9, 10–12, 13–16; occupations: Other-service, Craft-repair, Craft-repair, Adm-clerical, Prof-specialty
df.groupby("educational-num")["occupation"].value_counts()

educational-num  occupation       
1                Other-service        22
                 Farming-fishing      17
                 Machine-op-inspct    12
                 Craft-repair          6
                 Handlers-cleaners     5
                                      ..
16               Transport-moving      2
                 Farming-fishing       1
                 Machine-op-inspct     1
                 Priv-house-serv       1
                 Protective-serv       1
Name: count, Length: 209, dtype: int64

In [229]:
# Custom imputation for occupation based on education-num bins
def impute_occupation(row):
    if pd.isna(row["occupation"]):
        edu = row["educational-num"]
        if 1 <= edu <= 6:
            return "Other-service"  # Dominant for low education
        elif 7 <= edu <= 8:
            return "Other-service"  # Dominant for mid-low
        elif edu == 9:
            return "Craft-repair"  # Dominant for high school
        elif 10 <= edu <= 12:
            return "Adm-clerical"  # Strong for some college to associate’s
        elif 13 <= edu <= 16:
            return "Prof-specialty"  # Dominant for bachelor’s to doctorate
    return row["occupation"]

# Apply imputation
df["occupation"] = df.apply(impute_occupation, axis=1)

In [230]:
df["occupation"].describe()

count            48842
unique              14
top       Craft-repair
freq              6924
Name: occupation, dtype: object

In [231]:
df.isna().sum() #left to fill in workclass

age                   0
workclass          2799
educational-num       0
marital-status        0
occupation            0
relationship          0
race                  0
gender                0
native-country        0
income                0
capital_net           0
hours_category        0
dtype: int64

In [232]:
df.groupby("occupation")["workclass"].value_counts()

occupation        workclass       
Adm-clerical      Private             4208
                  Federal-gov          487
                  Local-gov            421
                  State-gov            375
                  Self-emp-not-inc      70
                                      ... 
Transport-moving  Local-gov            156
                  State-gov             60
                  Self-emp-inc          38
                  Federal-gov           37
                  Without-pay            1
Name: count, Length: 86, dtype: int64

In [233]:
# Apply imputation
df["workclass"] = df["workclass"].fillna("Unknown")

In [234]:
df.isna().sum() # no more null values

age                0
workclass          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
native-country     0
income             0
capital_net        0
hours_category     0
dtype: int64

In [235]:
df.sample(5)

Unnamed: 0,age,workclass,educational-num,marital-status,occupation,relationship,race,gender,native-country,income,capital_net,hours_category
17329,20,Private,10,Never-married,Sales,Not-in-family,White,Female,United-States,<=50K,0,over-time
33173,25,Private,9,Never-married,Prof-specialty,Own-child,White,Male,United-States,<=50K,0,full-time
25309,57,Unknown,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,United-States,<=50K,0,part-time
4305,29,Private,13,Never-married,Prof-specialty,Not-in-family,Asian-Pac-Islander,Male,Taiwan,<=50K,0,full-time
30067,53,Private,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,United-States,>50K,0,over-time


In [236]:
num_columns = [0, 2, 9]  # age, educational-num, capital_net
char_columns = [1, 3, 4, 5, 6, 7, 8, 10]  # workclass, marital-status, occupation, relationship, race, sex, native-country, hours_category
X = df.drop(columns=["income"])
Y = (df["income"] == ">50K").astype(int)  # Binary: 0 for <=50K, 1 for >50K

transform_x = ColumnTransformer([
    ("scaler", StandardScaler(), num_columns),
    ("encoder", OneHotEncoder(drop="first", sparse_output=False), char_columns)
], remainder="passthrough")

pipeline = Pipeline([
    ("preprocessor", transform_x),
    ("classifier", LogisticRegression())
])

In [237]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

In [238]:
pipeline.fit(X_train, Y_train)
Y_pred = pipeline.predict(X_test)

In [239]:
# Evaluation
print(f"\nAccuracy: {accuracy_score(Y_test, Y_pred):.2f}")
print("Classification Report:")
print(classification_report(Y_test, Y_pred, target_names=["<=50K", ">50K"]))
print(f"ROC-AUC: {roc_auc_score(Y_test, pipeline.predict_proba(X_test)[:, 1]):.2f}")

# Feature importance
coef = pipeline.named_steps["classifier"].coef_[0]
feature_names = pipeline.named_steps["preprocessor"].get_feature_names_out()
print("\nFeature Importance:")
print(pd.DataFrame({"Feature": feature_names, "Coefficient": coef}).sort_values(by="Coefficient", ascending=False))

# Validate distributions
print("\nWorkclass distribution after imputation:")
print(df["workclass"].value_counts())
print("\nOccupation distribution after imputation:")
print(df["occupation"].value_counts())
print("\nNative-country distribution after imputation:")
print(df["native-country"].value_counts().head())


Accuracy: 0.85
Classification Report:
              precision    recall  f1-score   support

       <=50K       0.88      0.93      0.90      9259
        >50K       0.73      0.59      0.65      2952

    accuracy                           0.85     12211
   macro avg       0.80      0.76      0.78     12211
weighted avg       0.84      0.85      0.84     12211

ROC-AUC: 0.91

Feature Importance:
                                       Feature  Coefficient
12  encoder__marital-status_Married-civ-spouse     1.839071
2                          scaler__capital_net     1.823047
11   encoder__marital-status_Married-AF-spouse     1.696351
59             encoder__native-country_Ireland     1.216025
34                  encoder__relationship_Wife     1.064380
..                                         ...          ...
45             encoder__native-country_Ecuador    -0.939375
9                   encoder__workclass_Unknown    -1.299052
42            encoder__native-country_Columbia    -1.400546