## Importing Libraries

In [61]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV,RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE

In [37]:
df = pd.read_csv('/Users/arfatshaikh/Documents/Machine-Learning-1/Income Classificaion/income_evaluation.csv')

## Data Checks

In [6]:
df.shape

(32561, 15)

In [38]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlwgt          32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [9]:
df.isna().sum()

age                0
 workclass         0
 fnlwgt            0
 education         0
 education-num     0
 marital-status    0
 occupation        0
 relationship      0
 race              0
 sex               0
 capital-gain      0
 capital-loss      0
 hours-per-week    0
 native-country    0
 income            0
dtype: int64

In [14]:
df.columns.str.strip()

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [15]:
df.columns = df.columns.str.strip()

In [16]:
df.drop(['fnlwgt'], axis=1)

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [11]:
for i in df.columns:
    if df[i].dtype == 'object':
        print(i)
        print(df[i].value_counts())
        print('\n')
        print('********'*10)

workclass
workclass
Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: count, dtype: int64


********************************************************************************
education
education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: count, dtype: int64


********************************************************************************
marital-status
marital-status
Married-civ-spouse       14976
Never-married            10683
Divorced                  4443
Separated                 1025
Wi

In [12]:
df.select_dtypes(include=['int64']).describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [13]:
df.drop(columns=['fnlwgt'], inplace=True)

In [14]:
df[df['capital-gain'] == 0].shape[0]

29849

In [15]:
df[df['capital-loss'] == 0].shape[0]

31042

In [16]:
df['workclass'] = df['workclass'].str.strip()  # Remove leading/trailing whitespaces
df['workclass'].replace({'?': "Not Given", "Self-emp-not-inc": "No Income", "Without-pay": "No Income", "Never-worked": "No Income"}, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['workclass'].replace({'?': "Not Given", "Self-emp-not-inc": "No Income", "Without-pay": "No Income", "Never-worked": "No Income"}, inplace=True)


In [17]:
df['workclass'].value_counts()

workclass
Private         22696
No Income        2562
Local-gov        2093
Not Given        1836
State-gov        1298
Self-emp-inc     1116
Federal-gov       960
Name: count, dtype: int64

In [18]:
df[['education', 'education-num']].drop_duplicates().sort_values(by=['education-num'], ascending=True)

Unnamed: 0,education,education-num
224,Preschool,1
160,1st-4th,2
56,5th-6th,3
15,7th-8th,4
6,9th,5
77,10th,6
3,11th,7
415,12th,8
2,HS-grad,9
10,Some-college,10


In [19]:
df['marital-status'] = df['marital-status'].str.strip()  # Remove leading/trailing whitespaces
df['marital-status'].replace({'?': "Not Given", "Married-civ-spouse": "Married", "Married-AF-spouse": "Married", "Separated": "Married","Never-married":"Not Married",'Separeted':"Divorced","Married-spouse-absent":"Divorced"},inplace=True)
df['marital-status'].unique()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['marital-status'].replace({'?': "Not Given", "Married-civ-spouse": "Married", "Married-AF-spouse": "Married", "Separated": "Married","Never-married":"Not Married",'Separeted':"Divorced","Married-spouse-absent":"Divorced"},inplace=True)


array(['Not Married', 'Married', 'Divorced', 'Widowed'], dtype=object)

In [20]:
df['marital-status'].value_counts()

marital-status
Married        16024
Not Married    10683
Divorced        4861
Widowed          993
Name: count, dtype: int64

In [21]:
df['occupation'] = df['occupation'].str.strip()  # Remove leading/trailing whitespaces
df['occupation'].replace({'?': "Not Given"},inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['occupation'].replace({'?': "Not Given"},inplace=True)


In [22]:
df['native-country'] = df['native-country'].str.strip()  # Remove leading/trailing whitespaces
df['native-country'].replace({'?': "Not Given"},inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['native-country'].replace({'?': "Not Given"},inplace=True)


In [23]:
df['relationship'] = df['relationship'].str.strip() 
relationship_mapping = {
    'Husband': 'Spouse',
    'Wife': 'Spouse',
    'Own-child': 'Dependent',
    'Other-relative': 'Extended Family',
    'Unmarried': 'Adult Non-Spouse',
    'Not-in-family': 'Non-family'
}
df['relationship_grouped'] = df['relationship'].map(relationship_mapping)

In [24]:
df['relationship_grouped'].value_counts()

relationship_grouped
Spouse              14761
Non-family           8305
Dependent            5068
Adult Non-Spouse     3446
Extended Family       981
Name: count, dtype: int64

In [25]:
for i in df.columns:
    if df[i].dtype == 'object':
        print(i)
        print(df[i].value_counts())
        print('\n')
        print('********'*10)

workclass
workclass
Private         22696
No Income        2562
Local-gov        2093
Not Given        1836
State-gov        1298
Self-emp-inc     1116
Federal-gov       960
Name: count, dtype: int64


********************************************************************************
education
education
HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
Assoc-voc        1382
11th             1175
Assoc-acdm       1067
10th              933
7th-8th           646
Prof-school       576
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: count, dtype: int64


********************************************************************************
marital-status
marital-status
Married        16024
Not Married    10683
Divorced        4861
Widowed          993
Name: count, dtype: int64


********************************************************************************
occupation
oc

In [27]:
df.select_dtypes('object').nunique().sort_values(ascending=True)

sex                      2
income                   2
marital-status           4
race                     5
relationship_grouped     5
relationship             6
workclass                7
occupation              15
education               16
native-country          42
dtype: int64

In [28]:
eduction_mapping = df[['education', 'education-num']].drop_duplicates().sort_values(by=['education-num'], ascending=True).to_dict(orient='records')

In [29]:
df.drop(columns=['education','relationship'], inplace=True)

In [88]:
df['income'] = df['income'].str.strip()  # Remove leading/trailing whitespaces

In [106]:
X = df.copy()

In [107]:
X.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,relationship_grouped
0,39,State-gov,13,Not Married,Adm-clerical,White,Male,2174,0,40,United-States,<=50K,Non-family
1,50,No Income,13,Married,Exec-managerial,White,Male,0,0,13,United-States,<=50K,Spouse
2,38,Private,9,Divorced,Handlers-cleaners,White,Male,0,0,40,United-States,<=50K,Non-family
3,53,Private,7,Married,Handlers-cleaners,Black,Male,0,0,40,United-States,<=50K,Spouse
4,28,Private,13,Married,Prof-specialty,Black,Female,0,0,40,Cuba,<=50K,Spouse


In [108]:
sex = pd.get_dummies(X[['sex']],prefix='sex').astype(int)
martial = pd.get_dummies(X[['marital-status']],prefix='marital-status').astype(int)
race = pd.get_dummies(X[['race']],prefix='race').astype(int)
relation = pd.get_dummies(X[['relationship_grouped']],prefix='relationship').astype(int)
workclass = pd.get_dummies(X[['workclass']],prefix='workclass').astype(int)
X = pd.concat([X,sex,martial,race,relation,workclass],axis=1)

In [109]:
le = LabelEncoder()
X['occupation'] = le.fit_transform(X['occupation'])
X['native-country'] = le.fit_transform(X['native-country'])

In [110]:
X.drop(columns=['sex', 'marital-status', 'relationship_grouped','race', 'workclass'], inplace=True)

In [111]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   age                            32561 non-null  int64 
 1   education-num                  32561 non-null  int64 
 2   occupation                     32561 non-null  int64 
 3   capital-gain                   32561 non-null  int64 
 4   capital-loss                   32561 non-null  int64 
 5   hours-per-week                 32561 non-null  int64 
 6   native-country                 32561 non-null  int64 
 7   income                         32561 non-null  object
 8   sex_ Female                    32561 non-null  int64 
 9   sex_ Male                      32561 non-null  int64 
 10  marital-status_Divorced        32561 non-null  int64 
 11  marital-status_Married         32561 non-null  int64 
 12  marital-status_Not Married     32561 non-null  int64 
 13  m

In [112]:
X['income']

0        <=50K
1        <=50K
2        <=50K
3        <=50K
4        <=50K
         ...  
32556    <=50K
32557     >50K
32558    <=50K
32559    <=50K
32560     >50K
Name: income, Length: 32561, dtype: object

In [113]:
X['income'].replace({'<=50K': 0, '>50K': 1}, inplace=True)

  X['income'].replace({'<=50K': 0, '>50K': 1}, inplace=True)


In [114]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype
---  ------                         --------------  -----
 0   age                            32561 non-null  int64
 1   education-num                  32561 non-null  int64
 2   occupation                     32561 non-null  int64
 3   capital-gain                   32561 non-null  int64
 4   capital-loss                   32561 non-null  int64
 5   hours-per-week                 32561 non-null  int64
 6   native-country                 32561 non-null  int64
 7   income                         32561 non-null  int64
 8   sex_ Female                    32561 non-null  int64
 9   sex_ Male                      32561 non-null  int64
 10  marital-status_Divorced        32561 non-null  int64
 11  marital-status_Married         32561 non-null  int64
 12  marital-status_Not Married     32561 non-null  int64
 13  marital-status_W

In [115]:
scaler = StandardScaler()
X[['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']] = scaler.fit_transform(X[['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']])

In [116]:
sm = SMOTE(random_state=42)

In [117]:
X.head()

Unnamed: 0,age,education-num,occupation,capital-gain,capital-loss,hours-per-week,native-country,income,sex_ Female,sex_ Male,...,relationship_Extended Family,relationship_Non-family,relationship_Spouse,workclass_Federal-gov,workclass_Local-gov,workclass_No Income,workclass_Not Given,workclass_Private,workclass_Self-emp-inc,workclass_State-gov
0,0.030671,1.134739,0,0.148453,-0.21666,-0.035429,39,0,0,1,...,0,1,0,0,0,0,0,0,0,1
1,0.837109,1.134739,3,-0.14592,-0.21666,-2.222153,39,0,0,1,...,0,0,1,0,0,1,0,0,0,0
2,-0.042642,-0.42006,5,-0.14592,-0.21666,-0.035429,39,0,0,1,...,0,1,0,0,0,0,0,1,0,0
3,1.057047,-1.197459,5,-0.14592,-0.21666,-0.035429,39,0,0,1,...,0,0,1,0,0,0,0,1,0,0
4,-0.775768,1.134739,10,-0.14592,-0.21666,-0.035429,4,0,1,0,...,0,0,1,0,0,0,0,1,0,0


In [118]:
X['income'].value_counts()

income
0    24720
1     7841
Name: count, dtype: int64

In [134]:
X.shape

(32561, 31)

In [120]:
X_sm = sm.fit_resample(x,y)

In [132]:
X_sm[0].shape, X_sm[1].shape

((49440, 30), (49440,))

In [133]:
X_sm[1].value_counts()

income
0    24720
1    24720
Name: count, dtype: int64

In [135]:
# Split data into training and testing sets
X_train, X_temp, y_train, y_temp = train_test_split(X_sm[0], X_sm[1], test_size=0.3, random_state=42)

# Split the temporary set into validation and testing sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Print the shapes of the splits
print("Training set:", X_train.shape, y_train.shape)
print("Validation set:", X_val.shape, y_val.shape)
print("Testing set:", X_test.shape, y_test.shape)

Training set: (34608, 30) (34608,)
Validation set: (7416, 30) (7416,)
Testing set: (7416, 30) (7416,)


In [175]:
RV = RandomizedSearchCV(
    estimator=log1,
    param_distributions={
        'C': [0.001, 0.01, 0.1, 1, 10, 100,1000,10000],
        'penalty': ['l1', 'l2','elasticnet'],
        'max_iter': [100, 500, 1000, 5000, 10000],
        'solver': ['saga']
    },
    n_iter=10,
    cv=3,
    random_state=42,
    scoring= 'accuracy'
)

In [176]:
RV.fit(X_train, y_train)
print("Best parameters found: ", RV.best_params_)
print("Best score found: ", RV.best_score_)
y_val_pred = RV.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
y_test_pred = RV.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

12 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py", line 1203, in fit
    raise

Best parameters found:  {'solver': 'saga', 'penalty': 'l2', 'max_iter': 10000, 'C': 10}
Best score found:  0.840210355987055
Validation Accuracy: 0.8310409924487594
Test Accuracy: 0.8375134843581445


In [177]:
log1 = LogisticRegression(max_iter=10000, random_state=42, penalty='l2', C=10,solver='saga')
log1.fit(X_train, y_train)
y_pred = log1.predict(X_val)
print("Training Accuracy:", accuracy_score(y_train, log1.predict(X_train)))
print("Validation Accuracy:", accuracy_score(y_val, y_pred))
# Evaluate the model on the test set
y_test_pred = log1.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))  

Training Accuracy: 0.8406437817845585
Validation Accuracy: 0.8310409924487594
Test Accuracy: 0.8375134843581445


In [204]:
confusion_matrix = confusion_matrix(y_test, y_test_pred)
print("Confusion Matrix:\n", confusion_matrix)
print("Classification Report:\n", classification_report(y_test, y_test_pred))

Confusion Matrix:
 [[2940  697]
 [ 333 3446]]
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.81      0.85      3637
           1       0.83      0.91      0.87      3779

    accuracy                           0.86      7416
   macro avg       0.87      0.86      0.86      7416
weighted avg       0.86      0.86      0.86      7416



In [206]:
from sklearn.metrics import confusion_matrix, classification_report
confusion_matrix_val = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:\n", confusion_matrix_val)
print("Classification Report:\n", classification_report(y_val, y_pred))

Confusion Matrix:
 [[2988  766]
 [ 487 3175]]
Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.80      0.83      3754
           1       0.81      0.87      0.84      3662

    accuracy                           0.83      7416
   macro avg       0.83      0.83      0.83      7416
weighted avg       0.83      0.83      0.83      7416



In [197]:
DT = DecisionTreeClassifier(random_state=42, max_depth=10, min_samples_split = 2, min_samples_leaf=5,criterion='entropy')
DT.fit(X_train, y_train)
print("Training Accuracy:", accuracy_score(y_train, DT.predict(X_train)))
y_val_pred = DT.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
y_test_pred = DT.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

Training Accuracy: 0.8564204808136847
Validation Accuracy: 0.8356256742179072
Test Accuracy: 0.8450647249190939


In [203]:
RF = RandomForestClassifier(random_state=42, n_estimators=1000, max_depth=15, min_samples_split=2, min_samples_leaf=5)
RF.fit(X_train, y_train)
print("Training Accuracy:", accuracy_score(y_train, RF.predict(X_train)))
y_val_pred = RF.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
y_test_pred = RF.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

Training Accuracy: 0.8749711049468331
Validation Accuracy: 0.8532901833872708
Test Accuracy: 0.8611111111111112


In [209]:
GBDT = GradientBoostingClassifier(random_state=42, n_estimators=1000, max_depth = 5, min_samples_split=2, min_samples_leaf=5)
GBDT.fit(X_train, y_train)
print("Training Accuracy:", accuracy_score(y_train, GBDT.predict(X_train)))
y_val_pred = GBDT.predict(X_val)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
y_test_pred = GBDT.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_test_pred))

Training Accuracy: 0.9471509477577439
Validation Accuracy: 0.9087108953613808
Test Accuracy: 0.9083063646170443


## Wrapping Code into Functions

In [62]:
def data_cleaning(df):
    df.columns = df.columns.str.strip()
    df.drop(columns=['fnlwgt'], axis = 1,inplace=True)
    strip_columns = ['workclass','marital-status','occupation','native-country','relationship','income']
    for col in strip_columns:
        df[col] = df[col].str.strip()
    
    ## Data Cleaning
    df['workclass'].replace({'?': "Not Given", "Self-emp-not-inc": "No Income", "Without-pay": "No Income", "Never-worked": "No Income"}, inplace=True)
    df['marital-status'].replace({'?': "Not Given", "Married-civ-spouse": "Married", "Married-AF-spouse": "Married", "Separated": "Married","Never-married":"Not Married",'Separeted':"Divorced","Married-spouse-absent":"Divorced"},inplace=True)
    df['occupation'].replace({'?': "Not Given"},inplace=True)
    df['native-country'].replace({'?': "Not Given"},inplace=True)

    relationship_mapping = {
    'Husband': 'Spouse',
    'Wife': 'Spouse',
    'Own-child': 'Dependent',
    'Other-relative': 'Extended Family',
    'Unmarried': 'Adult Non-Spouse',
    'Not-in-family': 'Non-family'
    }
    df['relationship_grouped'] = df['relationship'].map(relationship_mapping)

    eduction_mapping = df[['education', 'education-num']].drop_duplicates().sort_values(by=['education-num'], ascending=True).to_dict(orient='records')
    df.drop(columns=['education','relationship'], inplace=True)

    return df


def pre_processing(X):
    X = data_cleaning(X)

    # Encoiding categorical variables
    sex = pd.get_dummies(X[['sex']],prefix='sex').astype(int)
    martial = pd.get_dummies(X[['marital-status']],prefix='marital-status').astype(int)
    race = pd.get_dummies(X[['race']],prefix='race').astype(int)
    relation = pd.get_dummies(X[['relationship_grouped']],prefix='relationship').astype(int)
    workclass = pd.get_dummies(X[['workclass']],prefix='workclass').astype(int)
    X = pd.concat([X,sex,martial,race,relation,workclass],axis=1)
    le = LabelEncoder()
    X['occupation'] = le.fit_transform(X['occupation'])
    X['native-country'] = le.fit_transform(X['native-country'])
    X.drop(columns=['sex', 'marital-status', 'relationship_grouped','race', 'workclass'], inplace=True)
    X['income'].replace({'<=50K': 0, '>50K': 1}, inplace=True)

    # Scaling numerical features
    scaler = StandardScaler()
    X[['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']] = scaler.fit_transform(X[['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']])

    sm = SMOTE(random_state=42)

    X_sm = sm.fit_resample(X.drop(columns=['income']),X['income'])

    return X,X_sm[0], X_sm[1]

## Searching best model using RandomSearchCV

In [54]:
model = {
    'LogisticRegression' : {
        "model" : LogisticRegression(),
        "params": {
            'C' : [0.01,0.1,1,10,100,1000,10000],
            'penalty': ['l1', 'l2','elasticnet'],
            'max_iter': [100, 500, 1000,5000]
        }

    },
    'DecisionTreeClassifier' : {
        "model" : DecisionTreeClassifier(),
        "params": {
            'max_depth': [5, 10, 15, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 5]
        }
    },
    'RandomForestClassifier' : {
        "model" : RandomForestClassifier(),
        "params": {
            'n_estimators': [100, 200, 500, 1000],
            'max_depth': [5, 10, 15, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 5]
        }
    },
    'GradientBoostingClassifier' : {
        "model" : GradientBoostingClassifier(),
        "params": {
            'n_estimators': [100, 200, 500, 1000],
            'max_depth': [3, 5, 7, 10],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 5],
            'learning_rate': [0.01, 0.1, 0.2, 0.5, 1.0]
        }
    },
    'SVC' : {
        "model" : SVC(),
        "params": {
            'C': [0.01, 0.1, 1, 10, 100, 1000],
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'degree': [2, 3, 4, 5],
            'gamma': ['scale', 'auto']
        }
    }
}




In [58]:
for name, mp in model.items():
    print(f"\nRunning RandomizedSearchCV for {name}...")
    search = RandomizedSearchCV(
        estimator=mp['model'],
        param_distributions=mp['params'],
        n_iter=10,
        scoring='accuracy',
        cv=5,
        random_state=42,
    )
    search.fit(X_train, y_train)
    best_model[name] = search.best_estimator_
    print(f"Best Score for {name}: {search.best_score_:.4f}")
    print(f"Best Params for {name}: {search.best_params_}")


Running RandomizedSearchCV for LogisticRegression...


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best Score for LogisticRegression: 0.8402
Best Params for LogisticRegression: {'penalty': 'l2', 'max_iter': 5000, 'C': 100}

Running RandomizedSearchCV for DecisionTreeClassifier...
Best Score for DecisionTreeClassifier: 0.8546
Best Params for DecisionTreeClassifier: {'min_samples_split': 10, 'min_samples_leaf': 5, 'max_depth': 20}

Running RandomizedSearchCV for RandomForestClassifier...
Best Score for RandomForestClassifier: 0.8759
Best Params for RandomForestClassifier: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 20}

Running RandomizedSearchCV for GradientBoostingClassifier...
Best Score for GradientBoostingClassifier: 0.9057
Best Params for GradientBoostingClassifier: {'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 5, 'learning_rate': 0.1}

Running RandomizedSearchCV for SVC...
Best Score for SVC: 0.8434
Best Params for SVC: {'kernel': 'poly', 'gamma': 'auto', 'degree': 3, 'C': 0.1}
