In [None]:
#1. Data Exploration and Preprocessing:
#Load the dataset and conduct basic data exploration (summary statistics, missing values, data types).
#Handle missing values as per the best practices (imputation, removal, etc.).
#Apply scaling techniques to numerical features:
#Standard Scaling
#Discuss the scenarios where each scaling technique is preferred and why.
#2. Encoding Techniques:
#Apply One-Hot Encoding to categorical variables with less than 5 categories.
#Discuss the pros and cons of One-Hot Encoding and Label Encoding.
#3. Feature Engineering:
#Create at least 2 new features that could be beneficial for the model. Explain the rationale behind your choices.
#Apply a transformation (e.g., log transformation) to at least one skewed numerical feature and justify your choice.
#4. Feature Selection:
#Use the Isolation Forest algorithm to identify and remove outliers. Discuss how outliers can affect model performance.
#Apply the PPS (Predictive Power Score) to find and discuss the relationships between features. Compare its findings with the correlation matrix.

In [1]:
import pandas as pd

# Load the dataset
data = pd.read_csv('/content/adult_with_headers.csv')  # Update the file path as necessary

# Display the first few rows of the dataset
data.head()

# Get basic information about the dataset
#print(data.info())

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
# Print summary statistics for numerical features
print(data.describe(include='all'))

# Check unique value counts for categorical variables
for column in data.select_dtypes(include=['object']).columns:
    print(f"{column}: {data[column].unique()[:10]}")  # Output first 10 unique values

                 age workclass        fnlwgt education  education_num  \
count   32561.000000     32561  3.256100e+04     32561   32561.000000   
unique           NaN         9           NaN        16            NaN   
top              NaN   Private           NaN   HS-grad            NaN   
freq             NaN     22696           NaN     10501            NaN   
mean       38.581647       NaN  1.897784e+05       NaN      10.080679   
std        13.640433       NaN  1.055500e+05       NaN       2.572720   
min        17.000000       NaN  1.228500e+04       NaN       1.000000   
25%        28.000000       NaN  1.178270e+05       NaN       9.000000   
50%        37.000000       NaN  1.783560e+05       NaN      10.000000   
75%        48.000000       NaN  2.370510e+05       NaN      12.000000   
max        90.000000       NaN  1.484705e+06       NaN      16.000000   

             marital_status       occupation relationship    race    sex  \
count                 32561            32561   

In [3]:
# Check for missing values
missing_values = data.isnull().sum()
print("Missing values per column:\n", missing_values[missing_values > 0])

Missing values per column:
 Series([], dtype: int64)


In [4]:
# Replace ' ?' with NaN
data.replace(' ?', pd.NA, inplace=True)

# Remove rows with missing values if fewer than a certain threshold
data.dropna(inplace=True)  # Alternatively, you could also use imputation

# Verify missing values after handling
print("Missing values after handling:\n", data.isnull().sum())

Missing values after handling:
 age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64


In [5]:
X = data.drop('income', axis=1)  # Feature set
y = data['income']  # Target variable

In [6]:
# Identify numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print("Numerical features:", numerical_features)
print("Categorical features:", categorical_features)

Numerical features: ['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week']
Categorical features: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country']


In [7]:
# One-hot encoding for categorical variables
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [8]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Initialize scalers
standard_scaler = StandardScaler()
minmax_scaler = MinMaxScaler()

# Apply Standard Scaling
X_standard_scaled = X.copy()
X_standard_scaled[numerical_features] = standard_scaler.fit_transform(X_standard_scaled[numerical_features])

# Apply Min-Max Scaling
X_minmax_scaled = X.copy()
X_minmax_scaled[numerical_features] = minmax_scaler.fit_transform(X_minmax_scaled[numerical_features])

In [10]:
# (Assuming `data` is loaded and cleaned previously)
categorical_features = data.select_dtypes(include=['object']).columns.tolist()
print("Categorical features:", categorical_features)

Categorical features: ['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'race', 'sex', 'native_country', 'income']


In [11]:
# Create a dictionary to hold the number of unique values for each categorical feature
unique_counts = {feature: data[feature].nunique() for feature in categorical_features}
print("Unique counts for categorical features:", unique_counts)

Unique counts for categorical features: {'workclass': 7, 'education': 16, 'marital_status': 7, 'occupation': 14, 'relationship': 6, 'race': 5, 'sex': 2, 'native_country': 41, 'income': 2}


In [14]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Initialize encoders
one_hot_encoder = OneHotEncoder(drop='first')  # drop='first' to avoid dummy variable trap
label_encoder = LabelEncoder()

# Prepare lists to separate features for each encoding technique
one_hot_encoded_features = []
label_encoded_features = []
# Apply encoding based on the number of unique categories
for feature, count in unique_counts.items():
    if count < 5:
        # One-Hot Encoding
        one_hot_encoded = one_hot_encoder.fit_transform(data[[feature]])
        one_hot_col_names = [f"{feature}_{int(i)}" for i in range(one_hot_encoded.shape[1])]
        one_hot_encoded_df = pd.DataFrame(one_hot_encoded, columns=one_hot_col_names, index=data.index)
        one_hot_encoded_features.append(one_hot_encoded_df)
    else:
        # Label Encoding
        data[feature] = label_encoder.fit_transform(data[feature])
        label_encoded_features.append(feature)
# Concatenate all one-hot encoded features to the original dataframe
if one_hot_encoded_features:
    one_hot_encoded_df = pd.concat(one_hot_encoded_features, axis=1)
    data = pd.concat([data.drop(categorical_features, axis=1), one_hot_encoded_df], axis=1)

# Display the transformed data
print(data.head())#

   age  fnlwgt  education_num  capital_gain  capital_loss  hours_per_week  \
0   39   77516             13          2174             0              40   
1   50   83311             13             0             0              13   
2   38  215646              9             0             0              40   
3   53  234721              7             0             0              40   
4   28  338409             13             0             0              40   

           sex_0 income_0  
0    (0, 0)\t1.0           
1    (0, 0)\t1.0           
2    (0, 0)\t1.0           
3    (0, 0)\t1.0           
4                          


In [15]:
def age_group(age):
    if age < 25:
        return 'Young'
    elif 25 <= age < 45:
        return 'Middle_Aged'
    else:
        return 'Senior'

# Assuming 'age' is a feature in the dataset
data['age_group'] = data['age'].apply(age_group)

In [16]:
# Creating a capital gain ratio as new feature
data['capital_gain_ratio'] = data['capital_gain'] / (data['capital_gain'] + data['capital_loss'] + 1)  # Adding 1 to avoid division by zero

In [17]:
# Check for skewness
skewed_features = data[numerical_features].apply(lambda x: x.skew()).sort_values(ascending=False)
print("Skewed features:\n", skewed_features[abs(skewed_features) > 0.5])

Skewed features:
 capital_gain    11.902682
capital_loss     4.526380
fnlwgt           1.459220
age              0.530228
dtype: float64


In [18]:
import numpy as np

# Apply log transformation
data['log_capital_gain'] = np.log1p(data['capital_gain'])  # Using log1p to avoid log(0)

In [19]:
from sklearn.ensemble import IsolationForest

# Assuming `data` holds the dataset and numerical features are identified
numerical_features = data.select_dtypes(include=[np.number]).columns.tolist()

# Initialize Isolation Forest
iso_forest = IsolationForest(contamination=0.05, random_state=42)  # Contamination parameter is set to identify the expected proportion of outliers

# Fit and predict
data['outlier'] = iso_forest.fit_predict(data[numerical_features])

# Identify outliers: -1 indicates an outlier, and 1 indicates a normal point
outliers = data[data['outlier'] == -1]
print("Number of outliers detected:", len(outliers))

# Remove outliers from the dataset
cleaned_data = data[data['outlier'] != -1].drop(columns=['outlier'])

Number of outliers detected: 1509
