# DATA PREPROCESSING AND FEATURE ENGINEERING IN MACHINE LEARNING

Task 1: Data Exploration and Preprocessing

In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
# Read the CSV (located next to the notebook) into a DataFrame
csv_path = "adult_with_headers.csv"
data = pd.read_csv(csv_path)

In [4]:
# First look at the data: show the first five rows
preview_rows = data.head(n=5)
print(preview_rows)

   age          workclass  fnlwgt   education  education_num  \
0   39          State-gov   77516   Bachelors             13   
1   50   Self-emp-not-inc   83311   Bachelors             13   
2   38            Private  215646     HS-grad              9   
3   53            Private  234721        11th              7   
4   28            Private  338409   Bachelors             13   

        marital_status          occupation    relationship    race      sex  \
0        Never-married        Adm-clerical   Not-in-family   White     Male   
1   Married-civ-spouse     Exec-managerial         Husband   White     Male   
2             Divorced   Handlers-cleaners   Not-in-family   White     Male   
3   Married-civ-spouse   Handlers-cleaners         Husband   Black     Male   
4   Married-civ-spouse      Prof-specialty            Wife   Black   Female   

   capital_gain  capital_loss  hours_per_week  native_country  income  
0          2174             0              40   United-States   <=50

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
numeric_summary = data.describe()
print(numeric_summary)

                age        fnlwgt  education_num  capital_gain  capital_loss  \
count  32561.000000  3.256100e+04   32561.000000  32561.000000  32561.000000   
mean      38.581647  1.897784e+05      10.080679   1077.648844     87.303830   
std       13.640433  1.055500e+05       2.572720   7385.292085    402.960219   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.178270e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.783560e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.370510e+05      12.000000      0.000000      0.000000   
max       90.000000  1.484705e+06      16.000000  99999.000000   4356.000000   

       hours_per_week  
count    32561.000000  
mean        40.437456  
std         12.347429  
min          1.000000  
25%         40.000000  
50%         40.000000  
75%         45.000000  
max         99.000000  


In [41]:
# Number of missing values in each column
missing_per_column = data.isna().sum()
print(missing_per_column)

age                        0
workclass                  0
fnlwgt                     0
education                  0
education_num              0
marital_status             0
occupation                 0
relationship               0
race                       0
sex                        0
capital_gain               0
capital_loss               0
hours_per_week             0
native_country             0
income                     0
age_scaled_standard        0
age_scaled_minmax          0
capital-gain-minus-loss    0
age_squared                0
capital-gain_log           0
dtype: int64


In [6]:
# Column dtypes, non-null counts and memory usage.
# DataFrame.info() writes its report directly and returns None, so it must not
# be wrapped in print() (that only appends a stray "None" line to the output).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB
None


In [42]:
# Show the first five rows as the cell's rich output
data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income,age_scaled_standard,age_scaled_minmax,capital-gain-minus-loss,age_squared,capital-gain_log
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,0.030671,0.30137,2174,1521,7.684784
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,0.837109,0.452055,0,2500,0.0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,-0.042642,0.287671,0,1444,0.0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,1.057047,0.493151,0,2809,0.0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,-0.775768,0.150685,0,784,0.0


In [51]:
# Missing-value handling: discard every row that contains at least one null
data = data.dropna(axis=0, how="any")

In [52]:
# Numeric feature columns that the scaling cells operate on
numerical_cols = [
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'hours_per_week',
]

In [53]:
# Standard scaling: fit on the numeric columns and store the result in a
# separate copy so the original frame stays untouched
scaler_std = StandardScaler()
standardized_values = scaler_std.fit_transform(data[numerical_cols])
data_std = data.copy()
data_std[numerical_cols] = standardized_values

In [54]:
# Min-Max scaling: fit on the numeric columns and store the result in a
# separate copy so the original frame stays untouched
scaler_mm = MinMaxScaler()
minmax_values = scaler_mm.fit_transform(data[numerical_cols])
data_mm = data.copy()
data_mm[numerical_cols] = minmax_values

In [9]:
# Add standardized and min-max scaled versions of 'age' as new columns.
# Dedicated scaler instances are created here: the names previously used in
# this cell were never defined in the notebook (NameError on a fresh kernel),
# and re-fitting the multi-column scalers on 'age' alone would overwrite their
# fitted state.
age_values = data[['age']]
# fit_transform returns an (n_rows, 1) array; flatten it to a 1-D column
data['age_scaled_standard'] = StandardScaler().fit_transform(age_values).ravel()
data['age_scaled_minmax'] = MinMaxScaler().fit_transform(age_values).ravel()

Task 2: Encoding Techniques

In [10]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [11]:
# One-Hot Encoding
# Each candidate column is expanded only when it has fewer than 5 distinct
# values; the first category is dropped to avoid a redundant indicator column.
# NOTE(review): the saved outputs show no encoded columns in `data`, so none of
# the listed columns appears to pass this threshold — confirm the intended cutoff.
onehot_cols = ['workclass', 'marital_status', 'occupation', 'relationship', 'race']
for col in onehot_cols:
    if len(data[col].unique()) < 5:
        onehot_encoder = OneHotEncoder(drop='first')
        encoded_values = onehot_encoder.fit_transform(data[[col]]).toarray()
        encoded_cols = pd.DataFrame(
            encoded_values,
            # Take the column count from the encoder output so the names can
            # never disagree with the number of produced indicator columns
            columns=[col + '_' + str(i) for i in range(1, encoded_values.shape[1] + 1)],
            # Reuse data's index: a fresh RangeIndex would misalign rows in the
            # concat below whenever rows have been dropped from `data`
            index=data.index,
        )
        data = pd.concat([data, encoded_cols], axis=1)

In [12]:
# Display the list of columns that were considered for one-hot encoding
onehot_cols

['workclass', 'marital_status', 'occupation', 'relationship', 'race']

Task 3: Feature Engineering

In [55]:
# Derive new features from existing columns
# Net capital change: gain minus loss
data['capital-gain-minus-loss'] = data['capital_gain'].sub(data['capital_loss'])
# Square of age
data['age_squared'] = data['age'].pow(2)

In [56]:
# Log-transform the skewed 'capital_gain' column.
# log1p(x) = log(1 + x), so the many zero values map to 0 instead of -inf.
import numpy as np
capital_gain_values = data['capital_gain']
data['capital-gain_log'] = np.log1p(capital_gain_values)

Task 4: Feature Selection

In [57]:
from sklearn.ensemble import IsolationForest

# IsolationForest needs numeric input, so restrict to int64/float64 columns
numeric_cols = data.select_dtypes(include=['int64', 'float64']).columns
iso_forest = IsolationForest(random_state=42)  # fixed seed for reproducible results
outlier_pred = iso_forest.fit_predict(data[numeric_cols])
# Keep only the rows whose prediction equals 1
inlier_mask = outlier_pred == 1
data = data[inlier_mask]


In [60]:
!pip install ppscore



Collecting ppscore
  Downloading ppscore-1.3.0.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: ppscore
  Building wheel for ppscore (setup.py): started
  Building wheel for ppscore (setup.py): finished with status 'done'
  Created wheel for ppscore: filename=ppscore-1.3.0-py2.py3-none-any.whl size=13201 sha256=bfd3b679c11e659a65664e59f602eee30d7571b7acafc65cc79422e56cf8e51f
  Stored in directory: c:\users\hp\appdata\local\pip\cache\wheels\7e\45\08\bb444e1bca6a2bc3795707de9edd87ec1976dd8b3570fa7abf
Successfully built ppscore
Installing collected packages: ppscore
Successfully installed ppscore-1.3.0



[notice] A new release of pip is available: 23.3.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(cl

                    x                        y   ppscore
0                 age                      age  1.000000
1                 age                workclass  0.009353
2                 age                   fnlwgt  0.000000
3                 age                education  0.073215
4                 age            education_num  0.000000
..                ...                      ...       ...
395  capital-gain_log      age_scaled_standard  0.000000
396  capital-gain_log        age_scaled_minmax  0.000000
397  capital-gain_log  capital-gain-minus-loss  0.606687
398  capital-gain_log              age_squared  0.000000
399  capital-gain_log         capital-gain_log  1.000000

[400 rows x 3 columns]
                              age    fnlwgt  education_num  capital_gain  \
age                      1.000000 -0.082404       0.031487      0.031532   
fnlwgt                  -0.082404  1.000000      -0.039761     -0.019393   
education_num            0.031487 -0.039761       1.000000      

In [64]:
import ppscore as pps

# PPS matrix: one row per ordered (x, y) column pair with its predictive power score
pps_matrix = pps.matrix(data)[['x', 'y', 'ppscore']]
print(pps_matrix)

# Correlation matrix
# Select the numeric columns explicitly: `data` still contains object columns,
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0 instead of
# silently dropping those columns.
corr_matrix = data.select_dtypes(include='number').corr()
print(corr_matrix)

  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)
  target_type = type_of_target(classes)


                    x                        y   ppscore
0                 age                      age  1.000000
1                 age                workclass  0.009353
2                 age                   fnlwgt  0.000000
3                 age                education  0.073215
4                 age            education_num  0.000000
..                ...                      ...       ...
395  capital-gain_log      age_scaled_standard  0.000000
396  capital-gain_log        age_scaled_minmax  0.000000
397  capital-gain_log  capital-gain-minus-loss  0.606687
398  capital-gain_log              age_squared  0.000000
399  capital-gain_log         capital-gain_log  1.000000

[400 rows x 3 columns]
                              age    fnlwgt  education_num  capital_gain  \
age                      1.000000 -0.082404       0.031487      0.031532   
fnlwgt                  -0.082404  1.000000      -0.039761     -0.019393   
education_num            0.031487 -0.039761       1.000000      