In [2]:
# Data Exploration and Preprocessing:
# Load the dataset and conduct basic data exploration
import pandas as pd
df=pd.read_csv("adult_with_headers-12.csv")
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
# Handle missing values as per the best practices 
df.describe()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
count,32561.0,32561.0,32561.0,32561.0,32561.0,32561.0
mean,38.581647,189778.4,10.080679,1077.648844,87.30383,40.437456
std,13.640433,105550.0,2.57272,7385.292085,402.960219,12.347429
min,17.0,12285.0,1.0,0.0,0.0,1.0
25%,28.0,117827.0,9.0,0.0,0.0,40.0
50%,37.0,178356.0,10.0,0.0,0.0,40.0
75%,48.0,237051.0,12.0,0.0,0.0,45.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0


In [5]:
df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [7]:
#Apply scaling techniques to numerical features :

from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Standard Scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['age', 'hours_per_week']])

# Min-Max Scaling
scaler = MinMaxScaler()
minmax_features = scaler.fit_transform(df[['age', 'hours_per_week']])

In [None]:
# 	Discuss the scenarios where each scaling technique is preferred and why
 Standard scaling is preferred when:
- The data is normally distributed.
- The features have different units of measurement.
- The goal is to standardize the data so that it has a mean of 0 and a standard deviation of 1.

 Min-max scaling is preferred when:
- The data is not normally distributed.
- The features have different ranges of values.
- The goal is to scale the data so that it is between 0 and 1.

# Example:
 Let's say we have two features: age and hours_per_week. Age is normally distributed with a mean of 38 and a standard deviation of 13. 
Hours_per_week is not normally distributed with a mean of 40 and a range of 1 to 99.

 If we were to use standard scaling, age would be scaled to have a mean of 0 and a standard deviation of 1. Hours_per_week would also be 
scaled to have a mean of 0 and a standard deviation of 1. However, this would not be very useful because the two features would now have the same
range of values.

 If we were to use min-max scaling, age would be scaled to be between 0 and 1. Hours_per_week would also be scaled to be between 0 and 1.
 This would be more useful because the two features would now have the same range of values and we could compare them more easily.

In [18]:
# Encoding Techniques:
# Apply One-Hot Encoding to categorical variables with less than 5 categories
# Use Label Encoding for categorical variables with more than 5 categories.
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Identify categorical variables
categorical_cols = df.select_dtypes(include=["object"]).columns

# Apply One-Hot Encoding to categorical variables with less than 5 categories
low_cardinality_cols = [cname for cname in categorical_cols if df[cname].nunique() < 5]
OH_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(df[low_cardinality_cols]))
OH_cols_train.index = df.index
df = pd.concat([df, OH_cols_train], axis=1)

# Apply Label Encoding to categorical variables with more than 5 categories
high_cardinality_cols = [cname for cname in categorical_cols if df[cname].nunique() >= 5]
label_encoder = LabelEncoder()
for col in high_cardinality_cols:
  df[col] = label_encoder.fit_transform(df[col])

# Drop the original categorical columns
df.drop(categorical_cols, axis=1, inplace=True)

df

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,0,1,2,3
0,39,77516,13,2174,0,40,0.0,1.0,1.0,0.0
1,50,83311,13,0,0,13,0.0,1.0,1.0,0.0
2,38,215646,9,0,0,40,0.0,1.0,1.0,0.0
3,53,234721,7,0,0,40,0.0,1.0,1.0,0.0
4,28,338409,13,0,0,40,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
32556,27,257302,12,0,0,38,1.0,0.0,1.0,0.0
32557,40,154374,9,0,0,40,0.0,1.0,0.0,1.0
32558,58,151910,9,0,0,40,1.0,0.0,1.0,0.0
32559,22,201490,9,0,0,20,0.0,1.0,1.0,0.0


In [None]:
# Pros of One-Hot Encoding:

1. Easy to interpret: One-hot encoded features are binary and can be easily interpreted as the presence or absence of a category.
2. Works well with machine learning algorithms: Many machine learning algorithms, such as linear regression and logistic regression,
can directly handle one-hot encoded features.
3. Captures all possible categories: One-hot encoding creates a new binary feature for each possible category, ensuring that all
categories are represented in the data.

# Cons of One-Hot Encoding:

1. High dimensionality: One-hot encoding can significantly increase the dimensionality of the data, especially when there are many categories.
2. Can lead to overfitting: The increased dimensionality can lead to overfitting, especially when the number of samples is small.
3. Not suitable for categorical variables with a large number of categories: One-hot encoding is not suitable for categorical variables with
a large number of categories because it can create a very high-dimensional feature space.

# Pros of Label Encoding:

1. Simple to implement: Label encoding is a simple and straightforward technique that can be easily implemented.
2. Does not increase dimensionality: Label encoding does not increase the dimensionality of the data.
3. Suitable for categorical variables with a large number of categories: Label encoding can be used for categorical variables with a large
number of categories without creating a very high-dimensional feature space.

# Cons of Label Encoding:

1. Can introduce bias: Label encoding can introduce bias into the data if the categories are not ordered.
2. Can be difficult to interpret: Label encoded features are not as easy to interpret as one-hot encoded features.
3. Not suitable for machine learning algorithms that require binary features: Some machine learning algorithms, such as linear regression 
and logistic regression, require binary features and cannot directly handle label encoded features.

In [None]:
# Feature Engineering:
# Create at least 2 new features that could be beneficial for the model. Explain the rationale behind your choices.
1.Interaction Features:
Rationale: Creating features that represent the interaction between existing features can help capture more complex relationships in the data.
For example, if you have age and education_level as features, you could create an interaction feature like age * education_level.
2.Polynomial Features:
Rationale: Polynomial features are derived by raising existing features to a power, which can help capture non-linear relationships.
    For instance, if income is a feature, you might add a new feature like income^2 to capture potential non-linear effects.

In [None]:
# 	Apply a transformation (e.g., log transformation) to at least one skewed numerical feature and justify your choice
import numpy as np

# Create new features
df['age_group'] = pd.cut(df['age'], bins=[0, 25, 35, 45, 55, 65], labels=['Young', 'Middle-aged', 'Older Middle-aged', 'Older', 'Elderly'])
df['hours_per_week_group'] = pd.cut(df['hours_per_week'], bins=[0, 20, 40, 60, 80, 100], labels=['Part-time', 'Full-time', 'Overtime', 'Excessive Overtime', 'Extreme Overtime'])

# Apply log transformation to 'hours_per_week'
df['log_hours_per_week'] = np.log(df['hours_per_week'] + 1)

# Rationale:
- Age group: This feature captures the different stages of life that people go through, which could have an impact on their income.
- Hours per week group: This feature captures the different levels of work commitment that people have, which could also have an impact on their
income.
- Log transformation: The 'hours_per_week' feature is skewed to the right, with a long tail of people who work long hours. Taking the log of this 
feature can help to normalize its distribution and make it more suitable for use in a machine learning model.


In [None]:
# Feature Selection:
# Use the Isolation Forest algorithm to identify and remove outliers. Discuss how outliers can affect model performance.
from sklearn.ensemble import IsolationForest

# Define the model
model = IsolationForest(contamination='auto')

# Fit the model to the data
model.fit(df[['age', 'hours_per_week']])

# Get the outlier scores
outlier_scores = model.decision_function(df[['age', 'hours_per_week']])

# Get the outlier labels
outlier_labels = model.predict(df[['age', 'hours_per_week']])

# Remove the outliers
df = df[outlier_labels == 1]

# Outliers can affect model performance in a number of ways:

- They can bias the model: Outliers can cause the model to learn the wrong relationships between the features and the target variable. This can lead 
to the model making inaccurate predictions on new data.
- They can increase the variance of the model: Outliers can make the model more sensitive to noise in the data. This can lead to the model making
    unstable predictions on new data.
- They can slow down the training process: Outliers can slow down the training process of the model. This is because the model has to spend more
time trying to learn the relationships between the features and the target variable in the presence of outliers.

-> By removing outliers, we can improve the performance of our model. This is because we are reducing the amount of bias and variance in the model. 
    We are also making the training process faster.

In [11]:
# Apply the PPS (Predictive Power Score) to find and discuss the relationships between features. Compare its findings with the correlation matrix.
!pip install ppscore

Collecting ppscore
  Downloading ppscore-1.3.0.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pandas<2.0.0,>=1.0.0 (from ppscore)
  Downloading pandas-1.5.3-cp311-cp311-win_amd64.whl.metadata (12 kB)
Downloading pandas-1.5.3-cp311-cp311-win_amd64.whl (10.3 MB)
   ---------------------------------------- 0.0/10.3 MB ? eta -:--:--
   ---------------------------------------- 0.1/10.3 MB 1.7 MB/s eta 0:00:07
   - -------------------------------------- 0.4/10.3 MB 5.3 MB/s eta 0:00:02
   ---- ----------------------------------- 1.1/10.3 MB 8.5 MB/s eta 0:00:02
   -------- ------------------------------- 2.1/10.3 MB 12.3 MB/s eta 0:00:01
   ----------- ---------------------------- 2.9/10.3 MB 13.2 MB/s eta 0:00:01
   ----------------- ---------------------- 4.4/10.3 MB 16.5 MB/s eta 0:00:01
   -------------------- ------------------- 5.4/10.3 MB 17.2 MB/s eta 0:00:01
   ----------------------- ---------------- 6

In [15]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import ppscore as pps

# Calculate the PPS matrix
pps_matrix = pps.matrix(df)

# Print the PPS matrix
print(pps_matrix)

          x               y   ppscore            case  is_valid_score  \
0       age             age  1.000000  predict_itself            True   
1       age       workclass  0.011232  classification            True   
2       age          fnlwgt  0.000000      regression            True   
3       age       education  0.052315  classification            True   
4       age   education_num  0.000000      regression            True   
..      ...             ...       ...             ...             ...   
220  income    capital_gain  0.000000      regression            True   
221  income    capital_loss  0.000000      regression            True   
222  income  hours_per_week  0.000000      regression            True   
223  income  native_country  0.000000  classification            True   
224  income          income  1.000000  predict_itself            True   

                  metric  baseline_score   model_score  \
0                   None        0.000000      1.000000   
1      