In [1]:
# DATA PREPROCESSING AND FEATURE ENGINEERING IN MACHINE LEARNING

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [7]:
file_path = '/content/adult_with_headers.csv'
df = pd.read_csv(file_path)

In [None]:
# Task 1: Data Exploration and Preprocessing

In [8]:
print("Summary Statistics:")
print(df.describe())

Summary Statistics:
                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  


In [9]:
print("\nMissing Values:")
print(df.isnull().sum())


Missing Values:
age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [10]:
print("\nData Types:")
print(df.dtypes)


Data Types:
age                int64
workclass         object
fnlwgt             int64
education         object
education_num      int64
marital_status    object
occupation        object
relationship      object
race              object
sex               object
capital_gain       int64
capital_loss       int64
hours_per_week     int64
native_country    object
income            object
dtype: object


In [None]:
#  Handle Missing Values:

In [11]:
# Handling missing values: Dropping rows with missing values
# we can choose to impute based on the distribution of each column, here we will drop
df_cleaned = df.dropna()

In [12]:
# Check for remaining missing values
print(df_cleaned.isnull().sum())

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [None]:
#  Scaling Techniques:

In [13]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [14]:
# Separate numerical columns
numerical_cols = df_cleaned.select_dtypes(include=['int64', 'float64']).columns

In [15]:
# Standard Scaling
scaler_standard = StandardScaler()
df_standard_scaled = df_cleaned.copy()
df_standard_scaled[numerical_cols] = scaler_standard.fit_transform(df_cleaned[numerical_cols])

In [16]:
# Min-Max Scaling
scaler_min_max = MinMaxScaler()
df_min_max_scaled = df_cleaned.copy()
df_min_max_scaled[numerical_cols] = scaler_min_max.fit_transform(df_cleaned[numerical_cols])

In [17]:
# Displaying first few rows after scaling
print(df_standard_scaled.head())
print(df_min_max_scaled.head())

        age          workclass    fnlwgt   education  education_num  \
0  0.030671          State-gov -1.063611   Bachelors       1.134739   
1  0.837109   Self-emp-not-inc -1.008707   Bachelors       1.134739   
2 -0.042642            Private  0.245079     HS-grad      -0.420060   
3  1.057047            Private  0.425801        11th      -1.197459   
4 -0.775768            Private  1.408176   Bachelors       1.134739   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0      0.148453      -0.21

**Discussion:**

1.Standard Scaling: Best for data where the features follow a normal distribution (Gaussian), as it centers the data around zero.

2.Min-Max Scaling: Useful when features have a predefined range, such as image data or where distribution boundaries matter (e.g., 0 to 1).


In [None]:
# Task 2: Encoding Techniques

In [None]:
# One-Hot Encoding & Label Encoding:


In [18]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


In [19]:
# Identify categorical columns
categorical_cols = df_cleaned.select_dtypes(include=['object']).columns

In [20]:
# Apply One-Hot Encoding to categorical variables with less than 5 categories
one_hot_cols = [col for col in categorical_cols if df_cleaned[col].nunique() < 5]
df_one_hot_encoded = pd.get_dummies(df_cleaned, columns=one_hot_cols)


In [21]:
# Apply Label Encoding to categorical variables with more than 5 categories
label_cols = [col for col in categorical_cols if df_cleaned[col].nunique() >= 5]
label_encoders = {}
df_label_encoded = df_one_hot_encoded.copy()

for col in label_cols:
    label_encoders[col] = LabelEncoder()
    df_label_encoded[col] = label_encoders[col].fit_transform(df_cleaned[col])


In [22]:
# Display encoded dataframe
print(df_label_encoded.head())

   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  race  capital_gain  capital_loss  hours_per_week  \
0           1             1     4          2174             0              40   
1           4             0     4             0             0              13   
2           6             1     4             0             0              40   
3           6             0     2             0             0              40   
4          10             5     2             0             0              40   

   native_country  sex_ Female  sex_ Male  income_ <=50K  income_ >50K  
0            

Discussion:

1.One-Hot Encoding Pros: Keeps the categorical feature information intact and is useful for non-ordinal categories.

2.One-Hot Encoding Cons: Can increase dimensionality significantly when dealing with high cardinality.

3.Label Encoding Pros: Efficient for ordinal categories or when there’s a natural ranking.

4.Label Encoding Cons: Can introduce spurious relationships when categories are non-ordinal.


In [None]:
# Task 3: Feature Engineering
 # Creating new features:

In [23]:
df_label_encoded['age_bins'] = pd.cut(df_cleaned['age'], bins=[0, 30, 60, 90], labels=['young', 'middle-aged', 'senior'])

# Check for typos and correct the column name
df_label_encoded['capital_gain_loss'] = df_cleaned['capital_gain'] - df_cleaned['capital_loss']

In [24]:
# Display the newly engineered features
print(df_label_encoded[['age_bins', 'capital_gain_loss']].head())

      age_bins  capital_gain_loss
0  middle-aged               2174
1  middle-aged                  0
2  middle-aged                  0
3  middle-aged                  0
4        young                  0


In [None]:
#Applying transformation (e.g., log transformation):

In [25]:
# Apply log transformation to the 'capital_gain' column
df_label_encoded['log_capital_gain'] = np.log1p(df_cleaned['capital_gain'])


In [26]:
# Display the result of the log transformation
print(df_label_encoded[['capital_gain', 'log_capital_gain']].head())

   capital_gain  log_capital_gain
0          2174          7.684784
1             0          0.000000
2             0          0.000000
3             0          0.000000
4             0          0.000000


# Justification:
 Log transformation helps to normalize right-skewed features, which stabilizes variance and reduces the impact of outliers.

In [None]:
# Task 4: Feature Selection

In [27]:
from sklearn.ensemble import IsolationForest


In [None]:
# Using Isolation Forest for outlier detection:

In [28]:
# Apply Isolation Forest to detect outliers
iso_forest = IsolationForest(contamination=0.05)  # Adjust contamination as necessary
outliers = iso_forest.fit_predict(df_label_encoded.select_dtypes(include=['float64', 'int64']))


In [29]:
# Filter out outliers
df_no_outliers = df_label_encoded[outliers != -1]

In [30]:
# Display result
print(f"Outliers removed: {len(df_label_encoded) - len(df_no_outliers)}")
print(df_no_outliers.head())

Outliers removed: 1628
   age  workclass  fnlwgt  education  education_num  marital_status  \
0   39          7   77516          9             13               4   
1   50          6   83311          9             13               2   
2   38          4  215646         11              9               0   
3   53          4  234721          1              7               2   
4   28          4  338409          9             13               2   

   occupation  relationship  race  capital_gain  capital_loss  hours_per_week  \
0           1             1     4          2174             0              40   
1           4             0     4             0             0              13   
2           6             1     4             0             0              40   
3           6             0     2             0             0              40   
4          10             5     2             0             0              40   

   native_country  sex_ Female  sex_ Male  income_ <=50K  incom

**Discussion:**

Outliers can distort model learning by creating noise that can skew parameter estimations, particularly for algorithms sensitive to distance or variance (e.g., linear models, SVMs).

In [None]:
# Applying Predictive Power Score (PPS):

In [2]:
import ppscore as pps

In [32]:
# Calculate the PPS matrix
pps_matrix = pps.matrix(df_no_outliers)


In [34]:
# Display PPS matrix
print(pps_matrix[['x', 'y', 'ppscore']].sort_values(by='ppscore', ascending=False).head()) # Changed 'pps' to 'ppscore'

                x              y  ppscore
0             age            age      1.0
42         fnlwgt         fnlwgt      1.0
64      education  education_num      1.0
83  education_num      education      1.0
84  education_num  education_num      1.0


In [35]:
# Correlation Matrix for comparison
corr_matrix = df_no_outliers.corr()
print("\nCorrelation Matrix:")
print(corr_matrix)


Correlation Matrix:
                        age  workclass    fnlwgt  education  education_num  \
age                1.000000   0.022234 -0.079447  -0.000959       0.037441   
workclass          0.022234   1.000000 -0.015588   0.014918       0.037665   
fnlwgt            -0.079447  -0.015588  1.000000  -0.029791      -0.047529   
education         -0.000959   0.014918 -0.029791   1.000000       0.347304   
education_num      0.037441   0.037665 -0.047529   0.347304       1.000000   
marital_status    -0.298138  -0.057110  0.030886  -0.032019      -0.054944   
occupation        -0.010420   0.237303  0.000392  -0.027133       0.101933   
relationship      -0.276943  -0.090680  0.009681  -0.007257      -0.090892   
race               0.037814   0.045574 -0.026982   0.004023       0.029194   
capital_gain       0.064762   0.018673 -0.023541   0.023313       0.094676   
capital_loss       0.032532   0.015306 -0.011656   0.021292       0.077664   
hours_per_week     0.092138   0.126678 -0.0

  corr_matrix = df_no_outliers.corr()


**Discussion:**

1.PPS: Measures the strength of a predictive relationship between two features, and is better suited for capturing non-linear relationships compared to correlation.

2.Correlation: Captures only linear relationships, making it less informative in many cases.
