In [1]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# Step 2: Load the dataset
df = pd.read_csv("Adult.csv")

# Display basic information
print(df.info())
print(df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education.num   32561 non-null  int64 
 5   marital.status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital.gain    32561 non-null  int64 
 11  capital.loss    32561 non-null  int64 
 12  hours.per.week  32561 non-null  int64 
 13  native.country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None
   age workclass  fnlwgt     education  education.num marital.status  \
0   90         ?

In [3]:
# Step 3: Explore the dataset
print(df.describe(include='all'))
print("\nMissing values per column:\n", df.isnull().sum())
print("\nData types:\n", df.dtypes)


                 age workclass        fnlwgt education  education.num  \
count   32561.000000     32561  3.256100e+04     32561   32561.000000   
unique           NaN         9           NaN        16            NaN   
top              NaN   Private           NaN   HS-grad            NaN   
freq             NaN     22696           NaN     10501            NaN   
mean       38.581647       NaN  1.897784e+05       NaN      10.080679   
std        13.640433       NaN  1.055500e+05       NaN       2.572720   
min        17.000000       NaN  1.228500e+04       NaN       1.000000   
25%        28.000000       NaN  1.178270e+05       NaN       9.000000   
50%        37.000000       NaN  1.783560e+05       NaN      10.000000   
75%        48.000000       NaN  2.370510e+05       NaN      12.000000   
max        90.000000       NaN  1.484705e+06       NaN      16.000000   

            marital.status      occupation relationship   race    sex  \
count                32561           32561        

In [4]:
# Replace '?' with NaN
df = df.replace('?', np.nan)

# Check missing values again
print(df.isnull().sum())

# Option 1: Impute missing categorical columns with mode
for col in df.select_dtypes(include='object').columns:
    df[col].fillna(df[col].mode()[0], inplace=True)

# Option 2: Impute numerical columns with mean
for col in df.select_dtypes(include=['int64', 'float64']).columns:
    df[col].fillna(df[col].mean(), inplace=True)


age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
income               0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [5]:
# Identify numerical columns
num_cols = df.select_dtypes(include=['int64', 'float64']).columns

# Apply StandardScaler
scaler_standard = StandardScaler()
df_standard_scaled = df.copy()
df_standard_scaled[num_cols] = scaler_standard.fit_transform(df[num_cols])

# Apply MinMaxScaler
scaler_minmax = MinMaxScaler()
df_minmax_scaled = df.copy()
df_minmax_scaled[num_cols] = scaler_minmax.fit_transform(df[num_cols])


When to use which:

StandardScaler – when features follow a normal distribution (e.g., regression, PCA).

MinMaxScaler – when features have varying scales or need normalization (e.g., neural networks, KNN, SVM).

In [6]:
# Identify categorical columns
cat_cols = df.select_dtypes(include='object').columns

# Columns with less than 5 unique categories → One-Hot Encoding
one_hot_cols = [col for col in cat_cols if df[col].nunique() < 5]

# Columns with 5 or more categories → Label Encoding
label_cols = [col for col in cat_cols if df[col].nunique() >= 5]

# Apply One-Hot Encoding
df_encoded = pd.get_dummies(df, columns=one_hot_cols, drop_first=True)

# Apply Label Encoding
labelencoder = LabelEncoder()
for col in label_cols:
    df_encoded[col] = labelencoder.fit_transform(df_encoded[col])



✅ Pros & Cons:

Encoding Type	   Pros	                                    Cons
One-Hot	       * No ordinal relationship assumed	  *Increasesdimensionality
Label	        * Simple, efficient                    *Can mislead ML models                                                           into thinking                                                                categories are                                                                    ordinal

In [12]:
print(df.columns.tolist())


['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'capital.gain', 'capital.loss', 'hours.per.week', 'native.country', 'income']


In [13]:
# Use the correct column names as per your dataset
df_encoded['age_Group'] = pd.cut(df['age'], bins=[0, 25, 45, 65, 100],
                                 labels=['Young', 'Adult', 'Middle_Aged', 'Senior'])

df_encoded['Work_Hours_Type'] = pd.cut(df['hours.per.week'], bins=[0, 25, 40, 60, 100],
                                       labels=['Part_Time', 'Full_Time', 'Over_Time', 'Extreme'])


✅ Rationale:

Age_Group: Converts continuous variable into meaningful categories

Work_Hours_Type: Captures work intensity (helpful in predicting income)

In [16]:
# Check skewness
print(df[num_cols].skew())

# Example: log-transform a skewed column (like 'capital-gain')
df_encoded['capital.gain_log'] = np.log1p(df_encoded['capital.gain'])



age                0.558743
fnlwgt             1.446980
education.num     -0.311676
capital.gain      11.953848
capital.loss       4.594629
hours.per.week     0.227643
dtype: float64


✅ Why log-transform?

*Reduces the effect of extreme outliers

*Makes the feature distribution more normal, improving model performance



In [17]:
print(df_encoded.head())
print(df_encoded.shape)


   age  workclass  fnlwgt  education  education.num  marital.status  \
0   90          3   77053         11              9               6   
1   82          3  132870         11              9               6   
2   66          3  186061         15             10               6   
3   54          3  140359          5              4               0   
4   41          3  264663         15             10               5   

   occupation  relationship  race  capital.gain  capital.loss  hours.per.week  \
0           9             1     4             0          4356              40   
1           3             1     4             0          4356              18   
2           9             4     2             0          4356              40   
3           6             4     4             0          3900              40   
4           9             3     4             0          3900              40   

   native.country  sex_Male  income_>50K    Age_Group    age_Group  \
0              3