In [1]:
# Import necessary libraries
import pickle

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read the raw census CSV from the working directory and preview it
csv_path = "adult 3.csv"  # adjust the path if needed
df = pd.read_csv(csv_path)
print("Initial data shape:", df.shape)
display(df.head(5))

Initial data shape: (48842, 15)


Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [14]:
# Count NaN entries per column.
# NOTE(review): this cell sits above the cleaning cells, but the output saved with the
# notebook has no 'education' column, so it was captured from the processed frame;
# a top-to-bottom run will report on the raw frame instead.
df.isnull().sum()

age                0
workclass          0
fnlwgt             0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [15]:
# Show the frame's dimensions as a (rows, columns) tuple.
df.shape

(45222, 14)

In [16]:
# Number of rows per distinct 'occupation' value.
# NOTE(review): the saved output lists integer codes, so it was produced after the
# label-encoding cell further down had already run.
df["occupation"].value_counts()

occupation
2     6020
9     6008
3     5984
0     5540
11    5408
7     4808
6     2970
13    2316
5     2046
4     1480
12    1420
10     976
8      232
1       14
Name: count, dtype: int64

In [17]:
# Number of rows per distinct 'gender' value.
df["gender"].value_counts()

gender
1    30527
0    14695
Name: count, dtype: int64

In [None]:
# Clean and preprocess the data
# Trim leading/trailing whitespace from every text (object-dtype) column
text_columns = df.select_dtypes(include=['object']).columns
for col in text_columns:
    df[col] = df[col].str.strip()

In [5]:
# Treat the '?' placeholder as missing (NaN) and discard every row that contains one.
# Written as a single reassigned chain rather than two inplace=True calls: the resulting
# contents of `df` are the same, but the cell no longer mutates a frame that an earlier
# cell has already displayed.
df = df.replace('?', np.nan).dropna()
print("After removing missing:", df.shape)

After removing missing: (45222, 15)


In [6]:
# Optionally drop columns that are not needed.
# 'education' is redundant if 'educational-num' exists. errors='ignore' makes the drop a
# no-op when the column is already gone, so the cell stays safe to re-run without a
# separate membership check or an in-place mutation.
df = df.drop(columns=['education'], errors='ignore')

In [None]:
# Label-encode every text column except the target 'income'
object_columns = df.select_dtypes(include=["object"]).columns
cat_cols = object_columns.drop(['income'])
le_dict = {col: LabelEncoder() for col in cat_cols}  # one encoder per column, kept for reuse
for col, le in le_dict.items():
    # fit_transform learns the column's categories and replaces them with integer codes
    df[col] = le.fit_transform(df[col])

In [None]:
# Encoding the target (output) column: '<=50K' -> 0, '>50K' -> 1
income_labels = {'<=50K': 0, '>50K': 1}
income_codes = df['income'].map(income_labels)
# Series.map yields NaN for any value missing from the dict. Validate before assigning so
# an unexpected label (or an accidental second run of this cell, when the column already
# holds 0/1) raises a clear error instead of silently replacing the target with NaN.
unmapped = df.loc[income_codes.isna(), 'income'].unique()
if len(unmapped) > 0:
    raise ValueError(f"Unexpected income labels: {list(unmapped)}")
df['income'] = income_codes.astype('int64')

In [9]:
# Separate the target vector from the feature matrix (all columns except 'income')
y = df['income']
X = df.drop(columns=['income'])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Train a 100-tree random forest; fit() returns the estimator itself, so the call can be chained
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
clf  # last expression of the cell: display the fitted estimator

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [None]:
# Score the held-out rows and report accuracy plus per-class precision/recall/F1
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f"Test accuracy: {acc:.4f}")
print("Classification Report:\n", report)

Test accuracy: 0.8568
Classification Report:
               precision    recall  f1-score   support

           0       0.88      0.93      0.91      6842
           1       0.75      0.62      0.68      2203

    accuracy                           0.86      9045
   macro avg       0.82      0.78      0.79      9045
weighted avg       0.85      0.86      0.85      9045



In [None]:
# Persist the fitted model and the per-column label encoders as pickle (binary) files.
# `pickle` is imported in the notebook's import cell at the top rather than here.
# NOTE: only load these files back from a trusted location; unpickling can execute
# arbitrary code.
artifacts = {"rf_model.pkl": clf, "encoders.pkl": le_dict}
for filename, artifact in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(artifact, f)
