In [1]:
import pandas as pd

adult_df = pd.read_csv("adult.csv")

In [2]:
adult_df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K


In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

def create_pension_target(df):
    pension_score = (
        (df['age'] >= 65) * 3 +
        (df['age'].between(55, 64)) * 2 +
        (df['workclass'].isin(['Federal-gov', 'State-gov', 'Local-gov'])) * 3 +
        (df['occupation'].isin(['Prof-specialty', 'Exec-managerial', 'Tech-support'])) * 2 +
        (df['income'] == '>50K') * 2 +
        (df['education.num'] >= 16) * 1 +
        (df['capital.gain'] > 10000) * 1
    )
    
    df['pension_category'] = pd.cut(pension_score, 
                                   bins=[-1, 3, 6, 20], 
                                   labels=['Low', 'Medium', 'High'])
    
    return df, pension_score

df, pension_scores = create_pension_target(adult_df)
print("Pension Category Distribution:")
print(df['pension_category'].value_counts())

Pension Category Distribution:
pension_category
Low       25430
Medium     5886
High       1245
Name: count, dtype: int64


In [4]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income,pension_category
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K,Low
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K,Medium
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K,Low
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K,Low
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22,Private,310152,Some-college,10,Never-married,Protective-serv,Not-in-family,White,Male,0,0,40,United-States,<=50K,Low
32557,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K,Low
32558,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K,Low
32559,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K,Low


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant',fill_value='Unknown')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

transformer = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols)
])

transformed_array = transformer.fit_transform(df)

feature_names = transformer.get_feature_names_out()

df_transformed = pd.DataFrame(transformed_array, columns=feature_names)

In [6]:
import pandas as pd

cat_pipeline = transformer.named_transformers_['cat']
encoder = cat_pipeline.named_steps['encoder']

encoding_maps = []
for i, col_name in enumerate(categorical_cols):
    for code, category in enumerate(encoder.categories_[i]):
        encoding_maps.append({
            'column': col_name,
            'category': category,
            'encoded_value': code
        })

encoding_df = pd.DataFrame(encoding_maps)
print(encoding_df)

               column      category  encoded_value
0           workclass             ?              0
1           workclass   Federal-gov              1
2           workclass     Local-gov              2
3           workclass  Never-worked              3
4           workclass       Private              4
..                ...           ...            ...
102            income         <=50K              0
103            income          >50K              1
104  pension_category          High              0
105  pension_category           Low              1
106  pension_category        Medium              2

[107 rows x 3 columns]


In [7]:
encoding_df.to_csv("mapping.csv")

In [38]:
df_transformed

Unnamed: 0,num__age,num__fnlwgt,num__education.num,num__capital.gain,num__capital.loss,num__hours.per.week,cat__workclass,cat__education,cat__marital.status,cat__occupation,cat__relationship,cat__race,cat__sex,cat__native.country,cat__income,cat__pension_category
0,90.0,77053.0,9.0,0.0,4356.0,40.0,0.0,11.0,6.0,0.0,1.0,4.0,0.0,39.0,0.0,1.0
1,82.0,132870.0,9.0,0.0,4356.0,18.0,4.0,11.0,6.0,4.0,1.0,4.0,0.0,39.0,0.0,2.0
2,66.0,186061.0,10.0,0.0,4356.0,40.0,0.0,15.0,6.0,0.0,4.0,2.0,0.0,39.0,0.0,1.0
3,54.0,140359.0,4.0,0.0,3900.0,40.0,4.0,5.0,0.0,7.0,4.0,4.0,0.0,39.0,0.0,1.0
4,41.0,264663.0,10.0,0.0,3900.0,40.0,4.0,15.0,5.0,10.0,3.0,4.0,0.0,39.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,22.0,310152.0,10.0,0.0,0.0,40.0,4.0,15.0,4.0,11.0,1.0,4.0,1.0,39.0,0.0,1.0
32557,27.0,257302.0,12.0,0.0,0.0,38.0,4.0,7.0,2.0,13.0,5.0,4.0,0.0,39.0,0.0,1.0
32558,40.0,154374.0,9.0,0.0,0.0,40.0,4.0,11.0,2.0,7.0,0.0,4.0,1.0,39.0,1.0,1.0
32559,58.0,151910.0,9.0,0.0,0.0,40.0,4.0,11.0,6.0,1.0,4.0,4.0,0.0,39.0,0.0,1.0


In [40]:
from sklearn.model_selection import train_test_split

X = df_transformed.drop(['cat__pension_category'], axis=1)
y = df_transformed['cat__pension_category']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)



In [41]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd
import numpy as np

def evaluate_model(model, X_test, y_test, model_name="Model"):
    """
    Basic model evaluation function
    """
    # Predictions
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    
    # Print results
    print(f"╔{'═'*50}╗")
    print(f"║ {model_name:^48} ║")
    print(f"╠{'═'*50}╣")
    print(f"║ {'Accuracy:':<20} {accuracy:.4f} {'│':<5} {' ':20} ║")
    print(f"║ {'Precision:':<20} {precision:.4f} {'│':<5} {' ':20} ║")
    print(f"║ {'Recall:':<20} {recall:.4f} {'│':<5} {' ':20} ║")
    print(f"║ {'F1-Score:':<20} {f1:.4f} {'│':<5} {' ':20} ║")
    print(f"╚{'═'*50}╝")
    
    # Classification report
    print(f"\n📊 Classification Report:")
    print("─" * 60)
    print(classification_report(y_test, y_pred, zero_division=0))
    
    # Confusion matrix
    print(f"\n🎯 Confusion Matrix:")
    print("─" * 40)
    cm = confusion_matrix(y_test, y_pred)
    print(cm)
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1_score': f1,
        'predictions': y_pred,
        'probabilities': y_pred_proba
    }

In [32]:
X_train

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
937,28,Private,303954,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,1848,42,United-States,>50K
24931,65,Private,93318,Some-college,10,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,45,United-States,<=50K
9643,52,Local-gov,74784,Masters,14,Married-civ-spouse,Prof-specialty,Wife,White,Female,0,0,40,United-States,>50K
32303,20,?,99891,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
541,36,Private,156667,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,1902,50,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8138,20,Self-emp-inc,182200,HS-grad,9,Never-married,Machine-op-inspct,Own-child,White,Female,0,0,30,United-States,<=50K
28315,52,Local-gov,153064,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
29542,39,Private,290922,Masters,14,Never-married,Prof-specialty,Own-child,White,Female,0,0,40,United-States,<=50K
24057,19,?,184308,Some-college,10,Married-civ-spouse,?,Wife,White,Female,0,0,30,United-States,<=50K


In [42]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf_classifier = RandomForestClassifier(random_state=0, n_jobs=-1)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

grid_search = GridSearchCV(
    estimator=rf_classifier,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)




Fitting 5 folds for each of 162 candidates, totalling 810 fits


0,1,2
,estimator,RandomForestC...andom_state=0)
,param_grid,"{'max_depth': [10, 20, ...], 'max_features': ['sqrt', 'log2'], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...], ...}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,200
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [43]:
evaluate_model(grid_search, X_test, y_test, 'Random Forest')

╔══════════════════════════════════════════════════╗
║                  Random Forest                   ║
╠══════════════════════════════════════════════════╣
║ Accuracy:            0.9802 │                          ║
║ Precision:           0.9800 │                          ║
║ Recall:              0.9802 │                          ║
║ F1-Score:            0.9799 │                          ║
╚══════════════════════════════════════════════════╝

📊 Classification Report:
────────────────────────────────────────────────────────────
              precision    recall  f1-score   support

         0.0       0.97      0.88      0.92       249
         1.0       0.99      1.00      0.99      5087
         2.0       0.96      0.93      0.94      1177

    accuracy                           0.98      6513
   macro avg       0.97      0.93      0.95      6513
weighted avg       0.98      0.98      0.98      6513


🎯 Confusion Matrix:
────────────────────────────────────────
[[ 218    0   31]
 [  

{'accuracy': 0.9801934592353754,
 'precision': 0.9800078647025647,
 'recall': 0.9801934592353754,
 'f1_score': 0.9799425591442534,
 'predictions': array([1., 2., 2., ..., 2., 1., 1.], shape=(6513,)),
 'probabilities': array([[0.   , 1.   , 0.   ],
        [0.   , 0.48 , 0.52 ],
        [0.   , 0.085, 0.915],
        ...,
        [0.015, 0.015, 0.97 ],
        [0.   , 1.   , 0.   ],
        [0.   , 1.   , 0.   ]], shape=(6513, 3))}

In [46]:
import joblib

joblib.dump(grid_search.best_estimator_, "rf_estimator")


['rf_estimator']