# 7. Feature Selection with RFE

In [None]:
%pip install Boruta

Collecting Boruta
  Downloading Boruta-0.3-py3-none-any.whl (56 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.6/56.6 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Boruta
Successfully installed Boruta-0.3
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Show every column when displaying wide DataFrames instead of truncating them.
pd.set_option('display.max_columns', None)
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None  # default='warn'

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

In [2]:
# Load the imputed training set written by the preprocessing step.
imputed_train_data = pd.read_parquet(
    path="./preprocessed_data/imputed_train_data.parquet",
)

In [5]:
# Sanity check: dimensions of the loaded frame as (rows, columns).
imputed_train_data.shape

(12229978, 59)

In [3]:
def calculate_target_1001(row):
    """Return the "case 1001" target for a single row.

    The four events and the binary label each one receives:

        A: recommended     + activated      -> 1
        B: NOT recommended + activated      -> 0
        C: recommended     + NOT activated  -> 0
        D: NOT recommended + NOT activated  -> 1

    In other words the label is 1 exactly when the recommendation flag and
    the activation flag agree.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys "ind_recommended" and "activation"
        (for example a row passed in by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    int
        1 if ``row["ind_recommended"] == row["activation"]``, otherwise 0.
    """
    recommended = row["ind_recommended"]
    activated = row["activation"]
    return 1 if recommended == activated else 0

In [4]:
# Calculate new target columns.
# Whole-column comparisons are used instead of DataFrame.apply(..., axis=1):
# the row-wise apply invokes a Python function once per row, which is very slow
# on a frame of this size, while the comparisons below yield the same 0/1 values.

# target_1000: 1 only for event A (recommended AND activated), else 0.
imputed_train_data["target_1000"] = (
    (imputed_train_data["ind_recommended"] == 1)
    & (imputed_train_data["activation"] == 1)
).astype("int64")

# target_1001: 1 when the recommendation and activation flags agree
# (events A and D), else 0 -- the same rule as calculate_target_1001.
imputed_train_data["target_1001"] = (
    imputed_train_data["ind_recommended"] == imputed_train_data["activation"]
).astype("int64")

In [9]:
# Preview the first 10 rows of the frame.
imputed_train_data.head(10)

Unnamed: 0,customer,merchant,ind_recommended,activation,customer_profile_01,customer_profile_02,customer_profile_03,customer_profile_04,customer_spend_01,customer_spend_02,customer_spend_03,customer_spend_04,customer_spend_05,customer_spend_06,customer_spend_07,customer_spend_13,customer_spend_16,customer_spend_18,customer_spend_19,customer_digital_activity_01,customer_digital_activity_02,customer_digital_activity_03,customer_digital_activity_05,customer_digital_activity_06,customer_digital_activity_10,customer_digital_activity_11,customer_digital_activity_12,customer_digital_activity_13,customer_digital_activity_14,customer_digital_activity_15,customer_digital_activity_16,customer_digital_activity_17,customer_digital_activity_20,customer_digital_activity_21,customer_digital_activity_22,customer_industry_spend_01,customer_industry_spend_02,customer_industry_spend_03,customer_industry_spend_04,customer_industry_spend_05,customer_merchant_03,distance_01,distance_02,distance_03,distance_04,distance_05,merchant_profile_01,merchant_profile_02,merchant_profile_03,merchant_spend_01,merchant_spend_02,merchant_spend_03,merchant_spend_04,merchant_spend_05,merchant_spend_06,merchant_spend_07,merchant_spend_08,merchant_spend_09,merchant_spend_10,target_1000,target_1001
0,168972,152285,0,0,5466.06,1700.0,58.434969,86.0,107.215862,14.0,133.0,4477.0,29719.09,782.0,306.0,3.0,1.0,0.714531,20.85,0.0,32.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.444444,0.017921,0.000468,26.686594,74.0,3682.75,138.0,111.0,0.90551,0.307692,1.219756,13.0,4.0,15.856826,101.0,0.157534,65923.0,29.781042,43.0,0.0,0.0,0.0,32.0,1429.49,48.0,49466.0,29.18,0,1
1,212404,39032,0,0,781.56,597.41,5.392089,125.0,35.552,2.0,8.0,17577.0,1051.4,52.0,43.0,1.0,1.0,0.871597,24.81,0.419355,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.641575,0.005845,0.000123,50.928261,3.0,1171.35,23.0,17.0,0.961583,3.808333,6.998555,1.0,3.808333,6.998555,403.0,0.084416,7801.0,34.643313,97.0,0.0,0.0,0.0,15.0,5646.86,163.0,3638.0,28.465,0,1
2,225178,7439,0,0,1457.84,1200.0,33.780445,180.0,31.623103,11.0,62.0,49494.0,4695.22,196.0,136.0,1.0,2.0,0.076536,32.26,0.836364,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.837872,19.0,2295.38,47.0,42.0,0.327672,11.352941,0.129853,13.5,30.822421,1.753009,406.0,0.24,12868.0,1731.0,2.0,1731.0,2.0,2.0,8.0,3462.0,2.0,3912.0,421.5,0,1
3,183948,485069,0,0,351.22,500.0,37.340085,134.0,112.277391,16.0,33.0,147211.0,5190.94,167.0,112.0,10.285714,7.0,0.173581,21.016154,0.952381,28.666667,0.0,1.0,4.0,20.0,19.0,19.0,13.0,19.0,7.0,3.0,0.0,0.786111,0.0,0.0,22.22144,4.25,323.92125,14.125,13.6875,0.769936,11.352941,1.948002,139.002695,2.0,9.000063,326.0,0.1875,23553.0,54.8,4.0,0.0,0.0,0.0,62.0,274.0,5.0,28919.0,50.0,0,1
4,210107,536004,1,0,831.67,99.0,77.794164,114.0,448.427273,5.0,8.0,45.0,11713.96,33.0,28.0,7.75,4.5,0.275255,90.645,0.754386,15.0,0.0,1.0,2.0,15.0,15.0,15.0,7.0,15.0,4.0,3.0,10.0,0.801169,0.0,0.0,368.501048,5.5,11415.2825,23.5,19.75,0.651198,11.352941,1.948002,139.002695,6.5,1.767939,326.0,0.428571,308.0,166.0,1.0,0.0,0.0,0.0,74.0,166.0,1.0,1086.0,69.509,0,0
5,227362,419583,0,0,1612.4,1451.73,73.403888,80.0,81.858,9.0,72.0,85208.35,7850.53,302.0,184.0,4.125,3.333333,0.467583,29.358333,0.212766,1.166667,0.0,1.0,1.0,5.0,5.0,5.0,4.0,5.0,3.0,3.0,0.0,1.0,0.0,0.000408,46.676478,25.272727,2240.605455,58.772727,48.954545,0.733332,11.352941,1.948002,139.002695,1.0,1.366169,307.0,0.2,16692.0,208.414211,19.0,0.0,0.0,0.0,14.0,3959.87,19.0,20952.0,147.7,0,1
6,401342,343074,0,0,0.0,0.0,0.0,193.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.581818,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.00184,1.2e-05,0.0,0.0,0.0,0.0,0.0,0.651198,11.352941,1.948002,139.002695,6.625,18.914194,406.0,0.0,24708.0,83.753333,4.0,0.0,0.0,0.0,63.0,502.52,6.0,26611.0,23.01,0,1
7,308043,266191,0,0,4830.01,1009.0,98.155236,142.0,45.241765,7.0,133.0,71610.0,16591.15,587.0,253.0,3.0,1.0,0.29872,12.14,0.243243,46.833333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.833333,0.012821,0.000266,17.01443,69.0,2535.15,149.0,109.0,0.980527,1.44,0.015816,4.0,5.76,0.063262,101.0,0.138387,73439.0,49.956705,158.0,0.0,0.0,0.0,10.0,8642.51,173.0,26181.0,40.64,0,1
8,174487,522438,0,0,423.04,57.0,24.00621,186.0,18.223538,10.0,43.0,193068.0,3721.56,166.0,95.0,6.0,1.0,0.041135,14.42,0.0,7.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.665179,0.0,0.0,29.103333,7.0,261.93,9.0,6.0,0.278287,0.320652,0.067064,23.0,7.375,1.542463,406.0,0.232704,13314.0,431.058462,13.0,0.0,0.0,0.0,15.0,5603.76,13.0,4691.0,350.555556,0,1
9,414842,143402,0,0,0.0,0.0,18.565891,79.0,1400.0,1.0,1.0,0.0,2070.25,12.0,12.0,1.0,1.0,1.190497,670.25,0.238095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.847368,0.001615,0.000557,1035.125,1.0,2070.25,2.0,2.0,-0.20637,11.352941,1.948002,139.002695,30.822421,2.830609,406.0,0.588235,1212.0,825.5,3.0,825.5,3.0,4.0,8.0,3302.0,4.0,2894.0,563.0,0,1


In [5]:
# Build the model inputs by removing the identifier columns and every
# label-related column, then keep each candidate label as its own Series.
features = imputed_train_data.drop(
    columns=[
        "customer",
        "merchant",
        "ind_recommended",
        "activation",
        "target_1000",
        "target_1001",
    ]
)
target_1000 = imputed_train_data["target_1000"]
target_1001 = imputed_train_data["target_1001"]

In [6]:
# Feature selection with Recursive Feature Elimination (RFE)
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier # Use RandomForestClassifier for classification tasks
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 30% of the rows; RFE is fitted on the training portion only.
X_train, X_test, y_train, y_test = train_test_split(features, target_1000, test_size=0.3, random_state=42)

# Base estimator; RFE repeatedly refits it and drops the least important features.
estimator = RandomForestClassifier(n_estimators=10, random_state=10)

# n_features_to_select can be adjusted to choose how many features to retain;
# step=2 removes two features per elimination round.
rfe = RFE(estimator, n_features_to_select=37, step=2) # roughly two thirds of the candidate columns

# Fit RFE on the unscaled training data.
# The previous version of this cell fitted a StandardScaler and built scaled
# copies of X_train / X_test, but RFE was always fitted on the unscaled frame,
# so the scaled arrays were never used. That dead work has been removed;
# tree-based estimators split on thresholds and do not need standardised inputs.
rfe.fit(X_train, y_train)

# Ranking of each feature: rank 1 means selected, higher ranks were eliminated earlier.
feature_ranking = rfe.ranking_

# support_ is a boolean mask aligned with X_train's columns; use it to pick the kept names.
selected_features = X_train.columns[rfe.support_].tolist()

print("Selected Features:")
print(selected_features)

Selected Features:
['customer_profile_01', 'customer_profile_02', 'customer_profile_03', 'customer_profile_04', 'customer_spend_01', 'customer_spend_02', 'customer_spend_03', 'customer_spend_04', 'customer_spend_05', 'customer_spend_06', 'customer_spend_07', 'customer_spend_18', 'customer_spend_19', 'customer_digital_activity_01', 'customer_digital_activity_02', 'customer_digital_activity_11', 'customer_digital_activity_20', 'customer_digital_activity_21', 'customer_digital_activity_22', 'customer_industry_spend_01', 'customer_industry_spend_02', 'customer_industry_spend_03', 'customer_industry_spend_04', 'customer_industry_spend_05', 'customer_merchant_03', 'distance_01', 'distance_02', 'distance_04', 'distance_05', 'merchant_profile_02', 'merchant_profile_03', 'merchant_spend_01', 'merchant_spend_02', 'merchant_spend_07', 'merchant_spend_08', 'merchant_spend_09', 'merchant_spend_10']


In [22]:
# Inspect the container type of `selected_features`.
# NOTE(review): the RFE cell in this notebook assigns a plain list, so a fresh
# run would report `list` here; the recorded DataFrame output presumably comes
# from a cell that is no longer part of the notebook -- confirm.
type(selected_features)

pandas.core.frame.DataFrame

In [23]:
# Save selected features as csv
# The RFE cell produces `selected_features` as a plain list of column names,
# and a list has no `.to_csv`, so calling it directly fails on a fresh
# Restart & Run All. Wrap the names in a one-column DataFrame before writing;
# if the variable already holds a DataFrame it is written unchanged.
if isinstance(selected_features, pd.DataFrame):
    selected_features_df = selected_features
else:
    selected_features_df = pd.DataFrame({"feature": list(selected_features)})
selected_features_df.to_csv('./preprocessed_data/selected_features.csv', index=False)