# Import all the necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score,classification_report,ConfusionMatrixDisplay

## Read the data

In [2]:
df = pd.read_csv("adult.csv")

In [3]:
df.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K


## Data Cleaning 

In [4]:
df.drop_duplicates(inplace=True)
df.index = range(0,df.shape[0],1)

In [5]:
df.rename(columns = {
    "education.num" : "education_num","marital.status" : "marital_status","capital.gain" : "capital_gain",
    "capital.loss" : "capital_loss","hours.per.week" : "hours_per_week","native.country" : "native_country"
},inplace = True)

## Deal with Outliers

In [6]:
cat = []
con = []
for i in df.columns:
    if df[i].dtype == 'object':
        cat.append(i)
    else:
        con.append(i)

In [7]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
Q = pd.DataFrame(sc.fit_transform(df[con]),columns=con)

In [8]:
W = []
for i in Q.columns:
    W.extend(list(Q[(Q[i]>3)|(Q[i]<-3)].index))

In [9]:
from numpy import unique
outliers = list(unique(W))

In [10]:
df = df.drop(index=outliers,axis=0)

In [11]:
df.index = range(0,df.shape[0],1)

In [12]:
df.shape

(29809, 15)

In [13]:
df.drop(labels=["education_num","fnlwgt"],axis = 1,inplace=True)

## Encoding dependent variable

In [14]:
from sklearn.preprocessing import LabelEncoder
# create a LabelEncoder object
le = LabelEncoder()
df["income"] = le.fit_transform(df["income"])

In [15]:
df.head(3)

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,62,Local-gov,Bachelors,Married-civ-spouse,Adm-clerical,Wife,White,Female,0,1258,38,United-States,0
1,72,Local-gov,Doctorate,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1258,40,United-States,0
2,76,?,Prof-school,Married-civ-spouse,?,Husband,White,Male,0,1258,20,United-States,0


In [16]:
df["workclass"] = df["workclass"].replace("?",np.nan)
df["occupation"] = df["occupation"].replace("?",np.nan)
df["native_country"] = df["native_country"].replace("?",np.nan)

In [17]:
X = df.iloc[:,:-1]
y = df["income"]

## Split Data into trai test set

In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [241]:
X_train.head(3)

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
24908,34,,HS-grad,Married-civ-spouse,,Husband,Black,Male,0,0,10,United-States
6777,73,,HS-grad,Widowed,,Not-in-family,White,Female,0,0,8,United-States
18137,31,Private,HS-grad,Divorced,Craft-repair,Not-in-family,White,Male,0,0,45,United-States


# Create Machine Learning Pipeline

### Imputer

In [242]:
imputer = ColumnTransformer(
        [("imputer",SimpleImputer(strategy="most_frequent"),[1,4,11])],
    remainder="passthrough" 
)

### One Hot Encoding

In [244]:
encoder = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[0,1,2,4,5,6,7,8])
],remainder='passthrough')

### Scaling numerical features in between 0 to 1

In [245]:
scaler = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,100))
])

### create Logistic Regression Model

In [246]:
lr_model = LogisticRegression(C=100, class_weight='balanced', penalty='l1', solver='saga',max_iter=1000)

# Final Pipeline

In [247]:
from sklearn import set_config
set_config(display='diagram')

In [248]:
pipe = Pipeline([
    ('imputer', imputer),        # Impute missing values
    ('encoder', encoder),        # One-hot encode specified columns
    ('scaler', scaler),          # Scale specified columns
    ('model', lr_model)          # Logistic Regression model
])

### Pipeline Overview

In [249]:
pipe.fit(X_train,y_train)

## Explore Pipeline

In [250]:
pipe.named_steps.imputer.named_transformers_

{'imputer': SimpleImputer(strategy='most_frequent'),
 'remainder': 'passthrough'}

In [251]:
pipe.named_steps.encoder.transformers_

[('ohe_sex_embarked',
  OneHotEncoder(handle_unknown='ignore', sparse_output=False),
  [0, 1, 2, 4, 5, 6, 7, 8]),
 ('remainder', 'passthrough', [3, 9, 10, 11])]

In [252]:
pipe.named_steps.scaler.get_params()

{'n_jobs': None,
 'remainder': 'drop',
 'sparse_threshold': 0.3,
 'transformer_weights': None,
 'transformers': [('scale', MinMaxScaler(), slice(0, 100, None))],
 'verbose': False,
 'verbose_feature_names_out': True,
 'scale': MinMaxScaler(),
 'scale__clip': False,
 'scale__copy': True,
 'scale__feature_range': (0, 1)}

In [253]:
y_pred = pipe.predict(X_test)

In [254]:
accuracy_score(y_test,y_pred)

0.8061053337806106

## Exporting Pipeline

In [255]:
import pickle
pickle.dump(pipe,open('pipe.pkl','wb'))