In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Load the adult income CSV and convert its "?" placeholders into real NaN values.
# Exploration notes recorded by the author while inspecting the frame:
#   - df.shape reported (48842, 15).
#   - df.info() initially showed no null columns, because missing entries were
#     stored as placeholder strings instead of NaN.
#   - df.isin(['?', 'NA', '']).sum() revealed those hidden placeholders; after the
#     replacement it finds none, and df.info() then shows the missing values.
raw_csv_path = "../datasets/adult_income.csv"
df = pd.read_csv(raw_csv_path).replace("?", np.nan)

## Feature engineering

In [None]:
# Derive two features and drop the raw columns they replace (plus fnlwgt and
# education, which are not used further in this notebook).
#   capital_net    - capital gain minus capital loss.
#   hours_category - weekly hours bucketed into four labelled bands; the edges are
#                    right-inclusive and include_lowest=True also admits the lowest
#                    edge (0) into the first band.
hour_band_edges = (0, 20, 40, 60, 100)
hour_band_labels = ["part-time", "full-time", "over-time", "extreme"]

df = df.assign(
    capital_net=df["capital-gain"] - df["capital-loss"],
    hours_category=pd.cut(
        x=df["hours-per-week"],
        bins=hour_band_edges,
        labels=hour_band_labels,
        include_lowest=True,
    ),
).drop(columns=["fnlwgt", "education", "capital-loss", "capital-gain", "hours-per-week"])

In [None]:
# Missing values per column after feature engineering. Per the author's note, the
# columns still to be filled are workclass, occupation and native-country.
df.isnull().sum()

In [None]:
# Inspect native-country before imputing it. A notebook cell renders only its last
# expression, so the summary (count / unique / top / freq) is printed explicitly;
# otherwise it would be computed and silently discarded.
print(df["native-country"].describe())
# Author's reading of this output: United-States dominates the column (noted as more
# than 97% - TODO confirm the figure), so most-frequent imputation is appropriate.
df["native-country"].unique()

In [None]:
# Replace missing native-country entries with the column's most frequent value.
# mode() returns a Series of the most common value(s); .iloc[0] takes the first one.
most_common_country = df["native-country"].mode().iloc[0]
df["native-country"] = df["native-country"].fillna(most_common_country)

In [None]:
# Look at the education levels and the occupation labels side by side. Only the last
# expression of a cell is rendered, so the numeric summary is printed explicitly
# instead of being computed and thrown away.
print(df["educational-num"].describe())
df["occupation"].unique()

In [None]:
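# Compare occupation counts across educational-num levels to decide how to impute
# missing occupations. The bins settled on are 1–6, 7–8, 9, 10–12 and 13–16, with
# these occupations (in the same order): Other-service, Craft-repair, Craft-repair,
# Adm-clerical, Prof-specialty.
# NOTE(review): impute_occupation below returns "Other-service" for 7–8, which
# disagrees with this list — confirm which one is intended.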
# Occupation frequencies within each educational-num level (shown as the cell output).
occupation_by_education = df.groupby("educational-num")["occupation"]
occupation_by_education.value_counts()

In [None]:
# Custom imputation for occupation based on education-num bins
def impute_occupation(row):
    """Return the occupation for ``row``, filling a missing one from its education level.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys ``"occupation"`` and ``"educational-num"``.

    Returns
    -------
    The existing occupation when it is not missing. Otherwise the fallback
    occupation of the inclusive ``educational-num`` range the row falls into.
    If no range matches, the (missing) occupation is returned unchanged.
    """
    current = row["occupation"]
    if not pd.isna(current):
        return current

    level = row["educational-num"]
    # (lowest level, highest level, fallback occupation); both ends are inclusive.
    # NOTE(review): the exploration comment earlier in the notebook lists
    # Craft-repair for levels 7-8, while this table uses Other-service - confirm.
    fallback_by_level = (
        (1, 6, "Other-service"),     # low education
        (7, 8, "Other-service"),     # mid-low
        (9, 9, "Craft-repair"),      # high school
        (10, 12, "Adm-clerical"),    # some college to associate's
        (13, 16, "Prof-specialty"),  # bachelor's to doctorate
    )
    for lowest, highest, occupation in fallback_by_level:
        if lowest <= level <= highest:
            return occupation

    # Level outside every range: leave the value missing.
    return current

# Apply imputation only to rows whose occupation is missing. impute_occupation
# returns every other row's occupation unchanged, so running it over the whole
# frame row by row is wasted work.
occupation_missing = df["occupation"].isna()
if occupation_missing.any():  # skip entirely so apply() never runs on an empty selection
    df.loc[occupation_missing, "occupation"] = df.loc[occupation_missing].apply(
        impute_occupation, axis=1
    )

In [None]:
# Summary of the occupation column after imputation (count / unique / top / freq).
df.loc[:, "occupation"].describe()

In [None]:
# Missing values per column; per the author's note only workclass is left to fill.
df.isnull().sum()

In [None]:
# Workclass frequencies within each occupation (shown as the cell output).
workclass_by_occupation = df.groupby("occupation")["workclass"]
workclass_by_occupation.value_counts()

In [None]:
# Give missing workclass entries an explicit "Unknown" label; present values are kept.
workclass_values = df["workclass"]
df["workclass"] = workclass_values.where(workclass_values.notna(), "Unknown")

In [None]:
# Final missing-value count per column; the author notes that none remain at this point.
df.isnull().sum()

In [None]:
# Preview five randomly chosen rows of the cleaned frame.
df.sample(n=5)

In [None]:
# Separate the predictors from the binary target.
X = df.drop(columns=["income"])
Y = (df["income"] == ">50K").astype(int)  # Binary: 0 for <=50K, 1 for >50K

# Choose the column groups by dtype instead of hard-coded positions, so they stay
# correct if a column is added, dropped or reordered earlier in the notebook.
# Expected per the author's original note:
#   numeric     - age, educational-num, capital_net
#   categorical - workclass, marital-status, occupation, relationship, race, sex,
#                 native-country, hours_category
num_columns = X.select_dtypes(include="number").columns.tolist()
char_columns = [column for column in X.columns if column not in num_columns]

transform_x = ColumnTransformer([
    # Standardise numeric features so their coefficients are on a comparable scale.
    ("scaler", StandardScaler(), num_columns),
    # handle_unknown="ignore": a rare category can land only in the test split; without
    # this the encoder raises at predict time. Unknown values are encoded as all zeros.
    (
        "encoder",
        OneHotEncoder(drop="first", sparse_output=False, handle_unknown="ignore"),
        char_columns,
    ),
], remainder="passthrough")

pipeline = Pipeline([
    ("preprocessor", transform_x),
    # A higher iteration cap than the default gives the solver room to converge
    # on the wide one-hot encoded feature matrix.
    ("classifier", LogisticRegression(max_iter=1000)),
])

In [None]:
# Hold out 25% of the rows for testing.
#   random_state - fixes the shuffle so the split, and every metric computed from it,
#                  is identical on each re-run of the notebook.
#   stratify=Y   - keeps the share of >50K rows the same in the train and test parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42, stratify=Y
)

In [None]:
# Fit the preprocessing + classifier pipeline on the training split, then predict
# labels for the held-out rows. fit() returns the pipeline itself, so the calls chain.
Y_pred = pipeline.fit(X_train, Y_train).predict(X_test)

In [None]:
# Evaluation: accuracy and per-class report from the predicted labels, ROC-AUC from
# the predicted probability of the positive class (column 1 of predict_proba).
test_accuracy = accuracy_score(Y_test, Y_pred)
print(f"\nAccuracy: {test_accuracy:.2f}")
print("Classification Report:")
class_labels = ["<=50K", ">50K"]
print(classification_report(Y_test, Y_pred, target_names=class_labels))
positive_class_scores = pipeline.predict_proba(X_test)[:, 1]
print(f"ROC-AUC: {roc_auc_score(Y_test, positive_class_scores):.2f}")

# Feature importance: pair each transformed feature name with its logistic-regression
# coefficient and list them from most positive to most negative.
fitted_steps = pipeline.named_steps
coef = fitted_steps["classifier"].coef_[0]
feature_names = fitted_steps["preprocessor"].get_feature_names_out()
coefficient_table = pd.DataFrame({"Feature": feature_names, "Coefficient": coef})
print("\nFeature Importance:")
print(coefficient_table.sort_values(by="Coefficient", ascending=False))

# Validate distributions of the three imputed columns (native-country: top rows only).
distribution_checks = (
    ("\nWorkclass distribution after imputation:", df["workclass"].value_counts()),
    ("\nOccupation distribution after imputation:", df["occupation"].value_counts()),
    ("\nNative-country distribution after imputation:", df["native-country"].value_counts().head()),
)
for heading, counts in distribution_checks:
    print(heading)
    print(counts)