In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler , OneHotEncoder, LabelEncoder

# Load the raw Adult census data.
df = pd.read_csv('adult_with_headers.csv')

# --- Basic Data Exploration ---
print("--- Dataset Info ---")
df.info()

# --- Handle Missing Values ---
# ' ?' is a common missing value representation in this dataset
df = df.replace(' ?', np.nan)

# Impute missing categorical values with the mode.
# Assign the result back instead of calling fillna(inplace=True) on the
# df[col] selection: in-place chained assignment is deprecated in pandas 2.x
# and does not modify the frame when Copy-on-Write is enabled.
for col in df.select_dtypes(include='object').columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

# Impute missing numerical values with the mean (same strategy as the
# feature-engineering cell further down). Without this step the numeric
# NaNs survive and are carried through both scalers.
for col in df.select_dtypes(include=np.number).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mean())

print(f"\nTotal missing values after imputation: {df.isnull().sum().sum()}")

# Identify numerical features (excluding 'fnlwgt' which is a sampling weight)
numerical_cols = df.select_dtypes(include=np.number).columns.drop('fnlwgt')

# --- Apply Scaling Techniques ---
# Each scaler writes into its own copy so the unscaled `df` stays available
# for the encoding cell.

# Standard Scaling: zero mean, unit variance per column
scaler_standard = StandardScaler()
df_scaled = df.copy()
df_scaled[numerical_cols] = scaler_standard.fit_transform(df[numerical_cols])
print("\n--- Numerical features after Standard Scaling (first 5 rows) ---")
print(df_scaled[numerical_cols].head())

# Min-Max Scaling: each column rescaled to [0, 1]
scaler_minmax = MinMaxScaler()
df_minmax_scaled = df.copy()
df_minmax_scaled[numerical_cols] = scaler_minmax.fit_transform(df[numerical_cols])
print("\n--- Numerical features after Min-Max Scaling (first 5 rows) ---")
print(df_minmax_scaled[numerical_cols].head())


--- Dataset Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26875 entries, 0 to 26874
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             26875 non-null  int64  
 1   workclass       26875 non-null  object 
 2   fnlwgt          26875 non-null  int64  
 3   education       26874 non-null  object 
 4   education_num   26874 non-null  float64
 5   marital_status  26874 non-null  object 
 6   occupation      26874 non-null  object 
 7   relationship    26874 non-null  object 
 8   race            26874 non-null  object 
 9   sex             26874 non-null  object 
 10  capital_gain    26874 non-null  float64
 11  capital_loss    26874 non-null  float64
 12  hours_per_week  26874 non-null  float64
 13  native_country  26874 non-null  object 
 14  income          26874 non-null  object 
dtypes: float64(4), int64(2), object(9)
memory usage: 3.1+ MB

Total missing values after imputation: 4



In [None]:
# --- Apply Encoding Techniques ---
# Identify categorical (object-dtype) features
categorical_cols = df.select_dtypes(include='object').columns

# One-Hot Encoding for categorical variables with less than 5 categories
# NOTE(review): the recorded output lists 'income' here; if it is the
# prediction target, consider excluding it and encoding it separately.
one_hot_cols = [col for col in categorical_cols if df[col].nunique() < 5]
if one_hot_cols:
    encoder_onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    one_hot_encoded = encoder_onehot.fit_transform(df[one_hot_cols])
    one_hot_df = pd.DataFrame(
        one_hot_encoded,
        columns=encoder_onehot.get_feature_names_out(one_hot_cols),
        index=df.index,  # keep row alignment for the concat below
    )
    df = pd.concat([df.drop(columns=one_hot_cols), one_hot_df], axis=1)
    print(f"\nOne-Hot Encoded columns: {one_hot_cols}")

# Label Encoding for categorical variables with 5 or more categories
# Update categorical_cols after one-hot encoding
categorical_cols = df.select_dtypes(include='object').columns
label_cols = [col for col in categorical_cols if df[col].nunique() >= 5]

# One fitted encoder per column. Refitting a single shared LabelEncoder
# overwrites its classes_ on every iteration, so only the last column could
# ever be decoded with inverse_transform.
label_encoders = {}
if label_cols:
    for col in label_cols:
        encoder_label = LabelEncoder()
        df[col] = encoder_label.fit_transform(df[col])
        label_encoders[col] = encoder_label
    print(f"Label Encoded columns: {label_cols}")


print("\n--- Final Processed DataFrame (first 5 rows) ---")
print(df.head())


One-Hot Encoded columns: ['sex', 'income']
Label Encoded columns: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'native_country']

--- Final Processed DataFrame (first 5 rows) ---
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          6   77516          9           13.0               4   
1   50          5   83311          9           13.0               2   
2   38          3  215646         11            9.0               0   
3   53          3  234721          1            7.0               2   
4   28          3  338409          9           13.0               2   

   occupation  relationship  race  capital_gain  capital_loss  hours_per_week  \
0           0             1     4        2174.0           0.0            40.0   
1           3             0     4           0.0           0.0            13.0   
2           5             1     4           0.0           0.0            40.0   
3           5             0    

In [None]:
!pip install ppscore



In [None]:
import pandas as pd
import numpy as np
import ppscore as pps
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv('adult_with_headers.csv')

# --- Pre-processing  ---
# ' ?' marks a missing value in this dataset
df = df.replace(' ?', np.nan)

# Impute missing categorical values with the mode.
# The result is assigned back rather than using fillna(inplace=True) on the
# df[col] selection: that chained in-place form is deprecated in pandas 2.x
# and does not modify the frame when Copy-on-Write is enabled.
for col in df.select_dtypes(include='object').columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mode()[0])

# Impute missing numerical values with the mean
for col in df.select_dtypes(include=np.number).columns:
    if df[col].isnull().any():
        df[col] = df[col].fillna(df[col].mean())

print(f"\nTotal missing values after imputation: {df.isnull().sum().sum()}")

# --- 1. Feature Engineering ---

# Create a new feature: Net Capital Gain
# This combines capital_gain and capital_loss into a single, more meaningful feature.
df['capital_net'] = df['capital_gain'] - df['capital_loss']
print("\n--- New feature 'capital_net' created ---")
print(df[['capital_gain', 'capital_loss', 'capital_net']].head())

# Create a new feature: Age Group (Categorical)
# This bins the continuous 'age' feature into meaningful groups.
# With right=False every bin is [lower, upper), so a finite last edge of 90
# would leave age == 90 outside all bins and produce NaN. An open-ended last
# edge keeps every age >= 65 in 'Senior'.
age_bins = [17, 25, 45, 65, np.inf]
age_labels = ['Young Adult', 'Adult', 'Middle-Aged', 'Senior']
df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=age_labels, right=False)
print("\n--- New feature 'age_group' created ---")
print(df[['age', 'age_group']].head())

# Apply Log Transformation to a skewed feature
# 'capital_gain' is highly skewed (many zeros). log1p is used because it is
# defined at zero: log1p(0) == 0.
df['capital_gain_log'] = np.log1p(df['capital_gain']) # np.log1p(x) is log(1+x)
print("\n--- Skewed feature 'capital_gain' after log transformation ---")
print(df[['capital_gain', 'capital_gain_log']].head())

# --- 2. Feature Selection ---

# Identify and Remove Outliers using Isolation Forest
print("\n--- Outlier Detection with Isolation Forest ---")
# Use only numerical features for outlier detection
# (this includes the engineered 'capital_net' and 'capital_gain_log' columns)
numerical_cols_for_isolation = df.select_dtypes(include=np.number).columns
iso_forest = IsolationForest(random_state=42)
# fit_predict labels each row -1 (outlier) or 1 (inlier)
outlier_labels = iso_forest.fit_predict(df[numerical_cols_for_isolation])

outliers = df[outlier_labels == -1]
print(f"Found {len(outliers)} outliers. Removing them...")
df_no_outliers = df[outlier_labels == 1].copy()
print(f"Original dataset size: {len(df)}")
print(f"Dataset size after outlier removal: {len(df_no_outliers)}")

# Apply PPS
print("\n--- Predictive Power Score (PPS) Analysis ---")
# PPS is typically used to find relationships with a target variable (e.g., 'income').

pps_income = pps.predictors(df_no_outliers, "income")

print("Strongest predictors of 'income' (PPS > 0.1):")
strong_predictors = pps_income[pps_income['ppscore'] > 0.1]
print(strong_predictors[['x', 'ppscore']].sort_values(by='ppscore', ascending=False))

# For comparison, let's look at a standard correlation matrix
print("\n--- Standard Correlation Matrix ---")
# Note: This is now run on the df_no_outliers DataFrame and covers numeric
# columns only, so the categorical 'income' target is not part of it.
correlation_matrix = df_no_outliers[numerical_cols_for_isolation].corr()
print(correlation_matrix)



Total missing values after imputation: 0

--- New feature 'capital_net' created ---
   capital_gain  capital_loss  capital_net
0        2174.0           0.0       2174.0
1           0.0           0.0          0.0
2           0.0           0.0          0.0
3           0.0           0.0          0.0
4           0.0           0.0          0.0

--- New feature 'age_group' created ---
   age    age_group
0   39        Adult
1   50  Middle-Aged
2   38        Adult
3   53  Middle-Aged
4   28        Adult

--- Skewed feature 'capital_gain' after log transformation ---
   capital_gain  capital_gain_log
0        2174.0          7.684784
1           0.0          0.000000
2           0.0          0.000000
3           0.0          0.000000
4           0.0          0.000000

--- Outlier Detection with Isolation Forest ---
Found 3919 outliers. Removing them...
Original dataset size: 26875
Dataset size after outlier removal: 22956

--- Predictive Power Score (PPS) Analysis ---
Strongest predictors of

**Discuss the scenarios where each scaling technique is preferred and why.**

StandardScaler is preferred when the data is roughly normally (Gaussian) distributed, or when the algorithm assumes or benefits from features centered around zero with unit variance (e.g., linear regression, logistic regression, SVM, PCA). It transforms each feature to have mean = 0 and standard deviation = 1, making it suitable for models sensitive to variance. MinMaxScaler, on the other hand, is useful when you need to scale features to a fixed range, usually [0, 1], for example when features have different units but bounded values are required. Because it uses the observed minimum and maximum, a single extreme value compresses all other values into a narrow band, so it is a poor choice for features with strong outliers. Algorithms that rely on distance metrics (such as k-NN or k-means) need one of the two so that no feature dominates the distance purely because of its scale. In short, use StandardScaler for approximately normally distributed data and variance-sensitive models, and MinMaxScaler when you need normalized, bounded values and the data has no extreme outliers.

**Discuss the pros and cons of One-Hot Encoding and Label Encoding.**

**One-Hot Encoding:**

Pros:

No ordinal relationship is introduced between categories.

Works well with algorithms that would otherwise interpret integer codes as ordered magnitudes (like linear regression, logistic regression, SVM).

Cons:

Increases dimensionality if there are many unique categories (sparse matrix).

Can make models slower and more memory-intensive.

**Label Encoding:**

Pros:

Simple and memory-efficient (assigns a unique integer to each category).

Useful for algorithms that can handle categorical integers directly (like tree-based models).

Cons:

Imposes an artificial ordinal relationship between categories, which may mislead algorithms that assume numeric meaning (like linear regression, SVM).

**Apply the PPS (Predictive Power Score) to find and discuss the relationships between features. Compare its findings with the correlation matrix.**

The Predictive Power Score (PPS) shows that features like education, occupation, marital_status, hours_per_week, and capital_gain are strong predictors of income, because PPS can capture both categorical and non-linear relationships. This makes it more flexible and informative for classification problems.

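In contrast, the correlation matrix only measures linear relationships among numerical features. Because income is a categorical column, it does not appear in the matrix computed above; the matrix only shows how numeric features such as age, hours_per_week, and capital_gain relate to one another. Measuring their correlation with income would first require encoding income as 0/1, and even then the matrix would completely ignore categorical predictors such as education or occupation.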

In short: PPS provides a broader and more realistic view of feature importance, while the correlation matrix gives only a limited numeric snapshot.