In [132]:
from sys import hash_info
from unittest.mock import inplace

import pandas as pd
import numpy as np
from pygments.lexer import include
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

In [133]:
# Load adult_income.csv. Notes from the initial exploration:
#   * df.shape reported (48842, 15).
#   * df.info() showed no null entries at first, because missing values are
#     stored as placeholder strings (checked with df.isin(['?', 'NA', '']).sum()).
# Turning the "?" placeholder into NaN makes those hidden gaps visible to
# df.info() and df.isna().
df = pd.read_csv("../datasets/adult_income.csv").replace("?", np.nan)

In [134]:
# Feature engineering:
#   * capital_net    - capital gain minus capital loss, as one signed column.
#   * hours_category - weekly hours bucketed into [0, 20], (20, 40], (40, 60]
#                      and (60, 100].
# The columns the new features were derived from are dropped afterwards, along
# with fnlwgt and education (presumably because educational-num already encodes
# the latter - confirm).
df = df.assign(
    capital_net=df["capital-gain"] - df["capital-loss"],
    hours_category=pd.cut(
        x=df["hours-per-week"],
        bins=[0, 20, 40, 60, 100],
        labels=["part-time", "full-time", "over-time", "extreme"],
        include_lowest=True,
    ),
).drop(
    columns=[
        "fnlwgt",
        "education",
        "capital-loss",
        "capital-gain",
        "hours-per-week",
    ]
)

In [135]:
# Count missing values per column after feature engineering; workclass,
# occupation and native-country still have gaps that need to be filled.
df.isnull().sum()

age                   0
workclass          2799
educational-num       0
marital-status        0
occupation         2809
relationship          0
race                  0
gender                0
native-country      857
income                0
capital_net           0
hours_category        0
dtype: int64

In [136]:
# df["native-country"].describe()
# df["native-country"].unique()  # the most frequent country is the USA (more than 97% of rows), so most-frequent imputation can be applied

In [137]:
# Fill missing native-country entries with the column's mode, i.e. its most
# frequent value.
df["native-country"] = df["native-country"].pipe(
    lambda country: country.fillna(country.mode().iloc[0])
)

In [138]:
# Inspect the education-level summary and the distinct occupation labels.
# Only the last expression of a cell is rendered, so just the unique
# occupations show up in the output below.
df.loc[:, "educational-num"].describe()
df.loc[:, "occupation"].unique()

array(['Machine-op-inspct', 'Farming-fishing', 'Protective-serv', nan,
       'Other-service', 'Prof-specialty', 'Craft-repair', 'Adm-clerical',
       'Exec-managerial', 'Tech-support', 'Sales', 'Priv-house-serv',
       'Transport-moving', 'Handlers-cleaners', 'Armed-Forces'],
      dtype=object)

In [139]:
# Count occupations within each education level to pick a fill-in occupation
# per education bin. Bins settled on from these counts
# (education level -> occupation):
#   1-6 -> Other-service, 7-8 -> Craft-repair, 9 -> Craft-repair,
#   10-12 -> Adm-clerical, 13-16 -> Prof-specialty
# NOTE(review): impute_occupation returns "Other-service" for levels 7-8, not
# "Craft-repair" as listed here - confirm which one is intended.
(
    df.groupby("educational-num")["occupation"]
    .value_counts()
)

educational-num  occupation       
1                Other-service        22
                 Farming-fishing      17
                 Machine-op-inspct    12
                 Craft-repair          6
                 Handlers-cleaners     5
                                      ..
16               Transport-moving      2
                 Farming-fishing       1
                 Machine-op-inspct     1
                 Priv-house-serv       1
                 Protective-serv       1
Name: count, Length: 209, dtype: int64

In [140]:
# Education-driven fallback for missing occupation values.
def impute_occupation(row):
    """Return the occupation for ``row``, filling it in when it is missing.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            "occupation" and "educational-num".

    Returns:
        The existing occupation when it is not null. Otherwise the fallback
        label for the inclusive education range that contains
        "educational-num". If no range matches (for example a level below 1,
        above 16, or between two integer ranges), the original missing value
        is returned unchanged.
    """
    current = row["occupation"]
    if not pd.isna(current):
        return current

    level = row["educational-num"]
    # (lowest level, highest level, fallback occupation); bounds are inclusive.
    # NOTE(review): the analysis comment in the preceding cell lists
    # Craft-repair for levels 7-8, while this table uses Other-service -
    # confirm which is intended.
    fallbacks = (
        (1, 6, "Other-service"),     # low education
        (7, 8, "Other-service"),     # mid-low
        (9, 9, "Craft-repair"),      # high school
        (10, 12, "Adm-clerical"),    # some college to associate's
        (13, 16, "Prof-specialty"),  # bachelor's to doctorate
    )
    for lowest, highest, occupation in fallbacks:
        if lowest <= level <= highest:
            return occupation
    return current

# Run the row-wise imputation over every record and store the result back
# into the occupation column.
df["occupation"] = df.apply(impute_occupation, axis="columns")

In [141]:
# Summarise the occupation column after imputation.
df.loc[:, "occupation"].describe()

count            48842
unique              14
top       Craft-repair
freq              6924
Name: occupation, dtype: object

In [142]:
# Re-check the per-column missing-value counts after the occupation imputation.
df.isnull().sum()

age                   0
workclass          2799
educational-num       0
marital-status        0
occupation            0
relationship          0
race                  0
gender                0
native-country        0
income                0
capital_net           0
hours_category        0
dtype: int64