In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv("/content/adult_with_headers.csv")

In [3]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
data.shape

(32561, 15)

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education_num   32561 non-null  int64 
 5   marital_status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital_gain    32561 non-null  int64 
 11  capital_loss    32561 non-null  int64 
 12  hours_per_week  32561 non-null  int64 
 13  native_country  32561 non-null  object
 14  income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
data.isnull().sum()

Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education_num,0
marital_status,0
occupation,0
relationship,0
race,0
sex,0


**There are no missing values preent in our dataset**

In [7]:
data.nunique()

Unnamed: 0,0
age,73
workclass,9
fnlwgt,21648
education,16
education_num,16
marital_status,7
occupation,15
relationship,6
race,5
sex,2


# 1. Data Exploration and Preprocessing:

In [8]:
int_data = data.select_dtypes(include = ["int64"])

In [9]:
int_data

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40
...,...,...,...,...,...,...
32556,27,257302,12,0,0,38
32557,40,154374,9,0,0,40
32558,58,151910,9,0,0,40
32559,22,201490,9,0,0,20


In [10]:
cat_data = data.select_dtypes(include = ["object"])

In [11]:
cat_data

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,income
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States,<=50K
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States,>50K
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States,<=50K
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States,<=50K


In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
int_data = scaler.fit_transform(int_data)
int_data = pd.DataFrame(int_data)

In [13]:
int_data

Unnamed: 0,0,1,2,3,4,5
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.837109,-1.008707,1.134739,-0.145920,-0.21666,-2.222153
2,-0.042642,0.245079,-0.420060,-0.145920,-0.21666,-0.035429
3,1.057047,0.425801,-1.197459,-0.145920,-0.21666,-0.035429
4,-0.775768,1.408176,1.134739,-0.145920,-0.21666,-0.035429
...,...,...,...,...,...,...
32556,-0.849080,0.639741,0.746039,-0.145920,-0.21666,-0.197409
32557,0.103983,-0.335433,-0.420060,-0.145920,-0.21666,-0.035429
32558,1.423610,-0.358777,-0.420060,-0.145920,-0.21666,-0.035429
32559,-1.215643,0.110960,-0.420060,-0.145920,-0.21666,-1.655225


**Discuss the scenarios where each scaling technique is preferred and why**

### Min-Max Scaling (Normalization):
**Scenario:** When the data has a well-defined range, and you want to ensure that all features contribute equally to the model.

**Preferred When:**
1. Uniform Distribution: Data is uniformly distributed without outliers.

2. Sensitive Algorithms: Algorithms like k-Nearest Neighbors (k-NN), Neural Networks, and Support Vector Machines (SVM) are sensitive to the scale of features and perform better with normalized data.

3. Bounded Features: When you want features to be within a specific range, typically [0, 1], which can be particularly useful for algorithms that assume bounded input.


### Standard Scaling (Z-score Normalization):

**Scenario:** When the data follows a normal distribution (or approximately normal) and you want to maintain the effects of outliers while ensuring features have a mean of 0 and a standard deviation of 1.

**Preferred When:**

1. Gaussian Distribution: The data is approximately normally distributed, making it easier to model with algorithms like Linear Regression, Logistic Regression, and Principal Component Analysis (PCA).

2. Presence of Outliers: When outliers are important and should not be scaled down excessively. Standard Scaling centers the data around the mean and accounts for variance, preserving the impact of outliers.

3. Linear Models: Algorithms that assume or benefit from normally distributed data (e.g., linear models) often perform better with standardized features.

# 2. Encoding Techniques:

In [14]:
cat_data.nunique()

Unnamed: 0,0
workclass,9
education,16
marital_status,7
occupation,15
relationship,6
race,5
sex,2
native_country,42
income,2


In [15]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
cat_data["workclass"] = le.fit_transform(cat_data["workclass"])
cat_data["education"] = le.fit_transform(cat_data["education"])
cat_data["marital_status"] = le.fit_transform(cat_data["marital_status"])
cat_data["occupation"] = le.fit_transform(cat_data["occupation"])
cat_data["relationship"] = le.fit_transform(cat_data["relationship"])
cat_data["native_country"] = le.fit_transform(cat_data["native_country"])
cat_data["income"] = le.fit_transform(cat_data["income"])

In [16]:
cat_data = pd.get_dummies(cat_data, columns=['race', 'sex'])
#cat_data.drop(columns = ["race", 'sex'], inplace = True)
#Convert True/False to 1/0
cat_data = cat_data.astype(int)
cat_data

Unnamed: 0,workclass,education,marital_status,occupation,relationship,native_country,income,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White,sex_ Female,sex_ Male
0,7,9,4,1,1,39,0,0,0,0,0,1,0,1
1,6,9,2,4,0,39,0,0,0,0,0,1,0,1
2,4,11,0,6,1,39,0,0,0,0,0,1,0,1
3,4,1,2,6,0,39,0,0,0,1,0,0,0,1
4,4,9,2,10,5,5,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,4,7,2,13,5,39,0,0,0,0,0,1,1,0
32557,4,11,2,7,0,39,1,0,0,0,0,1,0,1
32558,4,11,6,1,4,39,0,0,0,0,0,1,1,0
32559,4,11,4,1,3,39,0,0,0,0,0,1,0,1


In [17]:
cat_data.shape

(32561, 14)

### One-Hot Encoding
**Pros:**

1. No Ordinal Relationships:
Avoids assigning any ordinal relationship between categories, which is useful for categorical data where no natural order exists.
2. Compatibility with ML Algorithms: Works well with algorithms that do not assume any specific order in the categorical features, like most tree-based models.

**Cons:**

1. High Dimensionality: Increases the dimensionality of the dataset, especially with features having many unique categories, leading to potential inefficiency and higher computational costs.
2. Sparsity: Results in sparse matrices, which can be less efficient to store and process, particularly with large datasets.

### Label Encoding
**Pros:**

1. Low Dimensionality: Does not increase the dimensionality of the dataset, keeping it compact and manageable.
2. Simplicity: Easy to implement and interpret, especially when categories have a meaningful ordinal relationship.

**Cons:**

1. Implied Ordinality: Assigns a numerical value to each category, which can unintentionally introduce a false sense of order, potentially misleading algorithms that interpret the labels as ordinal.
2. Bias in ML Models: Some models may be biased by the numerical order of the labels, leading to inaccurate results if no true order exists in the categories.

# 3. Feature Engineering:

In [18]:
#Feature 1: Total Capital
data['total_capital'] = data['capital_gain'] - data['capital_loss']

# Display the new features
print(data[[ 'capital_gain', 'capital_loss', 'total_capital']].head(10))

   capital_gain  capital_loss  total_capital
0          2174             0           2174
1             0             0              0
2             0             0              0
3             0             0              0
4             0             0              0
5             0             0              0
6             0             0              0
7             0             0              0
8         14084             0          14084
9          5178             0           5178


In [19]:
print("Unique education levels:")
print(data['education'].unique())

Unique education levels:
[' Bachelors' ' HS-grad' ' 11th' ' Masters' ' 9th' ' Some-college'
 ' Assoc-acdm' ' Assoc-voc' ' 7th-8th' ' Doctorate' ' Prof-school'
 ' 5th-6th' ' 10th' ' 1st-4th' ' Preschool' ' 12th']


In [20]:
#Feature 2 : Education Level
# Now, let's create a feature that combines education level and work hours
def education_work_category(row):
    if row['education'] in [' Bachelors', ' Masters', ' Doctorate']:
        education_level = 'Higher'
    elif row['education'] in [' HS-grad', ' Some-college', ' Assoc-acdm', ' Assoc-voc']:
        education_level = 'Secondary'
    else:
        education_level = 'Primary'

    if row['hours_per_week'] < 40:
        work_intensity = 'Part-time'
    elif row['hours_per_week'] == 40:
        work_intensity = 'Full-time'
    else:
        work_intensity = 'Overtime'

    return f"{education_level}-{work_intensity}"

data['edu_work_category'] = data.apply(education_work_category, axis=1)


In [21]:
# Display the new feature along with relevant columns
print("\
New feature: Education-Work Category")
print(data[['education', 'hours_per_week', 'edu_work_category']].head(15))

New feature: Education-Work Category
        education  hours_per_week    edu_work_category
0       Bachelors              40     Higher-Full-time
1       Bachelors              13     Higher-Part-time
2         HS-grad              40  Secondary-Full-time
3            11th              40    Primary-Full-time
4       Bachelors              40     Higher-Full-time
5         Masters              40     Higher-Full-time
6             9th              16    Primary-Part-time
7         HS-grad              45   Secondary-Overtime
8         Masters              50      Higher-Overtime
9       Bachelors              40     Higher-Full-time
10   Some-college              80   Secondary-Overtime
11      Bachelors              40     Higher-Full-time
12      Bachelors              30     Higher-Part-time
13     Assoc-acdm              50   Secondary-Overtime
14      Assoc-voc              40  Secondary-Full-time


### Education levels are categorized into:

**Higher:** Bachelors, Masters, Doctorate

**Secondary:** HS-grad, Some-college, Assoc-acdm, Assoc-voc

**Primary:** All other education levels

In [22]:
# Apply log transformation to 'capital_gain' (adding 1 to avoid log(0))
data['log_capital_gain'] = np.log1p(data['capital_gain'])

# Display the original and transformed 'capital_gain'
print(data[['capital_gain', 'log_capital_gain']].describe())


       capital_gain  log_capital_gain
count  32561.000000      32561.000000
mean    1077.648844          0.734621
std     7385.292085          2.454738
min        0.000000          0.000000
25%        0.000000          0.000000
50%        0.000000          0.000000
75%        0.000000          0.000000
max    99999.000000         11.512925


In [23]:
# Calculate skewness of the transformed feature
print("Skewness of log_capital_gain:", data['log_capital_gain'].skew())


Skewness of log_capital_gain: 3.096143524467517


# 4. Feature Selection:

In [24]:
# Importing necessary libraries for Isolation Forest
from sklearn.ensemble import IsolationForest
import numpy as np

# Selecting relevant features for outlier detection
# Here, we will use numerical features like hours_per_week and income_numeric
features = data[['hours_per_week', 'education_num']].copy()

# Initializing the Isolation Forest model
iso_forest = IsolationForest(contamination=0.05, random_state=42)

# Fitting the model and predicting outliers
outliers = iso_forest.fit_predict(features)

# The predictions are -1 for outliers and 1 for inliers
# We will filter out the outliers
df_cleaned = data[outliers != -1]

# Display the shape of the original and cleaned dataframe
print(f"Original dataframe shape: {data.shape}")
print(f"Cleaned dataframe shape: {df_cleaned.shape}")

Original dataframe shape: (32561, 18)
Cleaned dataframe shape: (30935, 18)


**Outliers can distort the model's training process by skewing the data distribution, leading to inaccurate model parameters, especially in sensitive models like linear regression. They can cause the model to overfit, as it tries to accommodate extreme values, which reduces its generalization to unseen data. Additionally, outliers may disproportionately influence performance metrics, making the model appear less effective or reliable.**

In [25]:
!pip install ppscore



In [26]:
import ppscore as pps

# Calculate the PPS matrix
matrix_df = pps.matrix(df_cleaned)

# Display the PPS matrix
print(matrix_df)



                    x                  y   ppscore            case  \
0                 age                age  1.000000  predict_itself   
1                 age          workclass  0.011765  classification   
2                 age             fnlwgt  0.000000      regression   
3                 age          education  0.036024  classification   
4                 age      education_num  0.000000      regression   
..                ...                ...       ...             ...   
319  log_capital_gain     native_country  0.000000  classification   
320  log_capital_gain             income  0.283044  classification   
321  log_capital_gain      total_capital  0.816181      regression   
322  log_capital_gain  edu_work_category  0.000000  classification   
323  log_capital_gain   log_capital_gain  1.000000  predict_itself   

     is_valid_score               metric  baseline_score   model_score  \
0              True                 None        0.000000      1.000000   
1          

