This notebook performs Feature Selection on the cleaned dataset by applying various statistical feature selection techniques. A simple voting mechanism is implemented at the end to shortlist the features.

In [None]:
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import (
    RFECV,
    SequentialFeatureSelector,
    VarianceThreshold,
    f_regression,
    mutual_info_regression,
)
from sklearn.linear_model import LinearRegression, RidgeCV

In [None]:
# Read the cleaned dataset from disk; the bare name on the last line makes the
# notebook render a preview of the loaded frame.
clean_df = pd.read_csv(filepath_or_buffer='data_versions/clean_data.csv')
clean_df

In [None]:
# 1. Filter Methods
# Variance Threshold
# Keep only the columns whose variance is above the 0.1 threshold. The selector
# is fitted on the candidate predictors only: the target column 'risk' is
# excluded so that it can never appear in the feature list (or in the vote
# count computed at the end of the notebook).
vt_candidates = clean_df.drop(columns='risk')
vt = VarianceThreshold(threshold=0.1)
vt.fit(vt_candidates)
vt_features = vt_candidates.columns[vt.get_support()]
print(f"Filter Method (Variance Threshold) {len(vt_features)} features: \n", vt_features)


In [None]:
# Number of features that passed the variance filter, shown as a 1-tuple.
(len(vt_features),)

In [None]:
# Supervised selectors need the predictors and the target as separate objects:
# 'risk' is the target, every remaining column is treated as a predictor.
# NOTE(review): 'event_id' stays in X although it is re-attached alongside the
# target before the export — confirm it is really meant to be scored as a feature.
y = clean_df['risk']
X = clean_df.drop(columns='risk')

In [None]:
# ANOVA F-test
# f_regression returns one F-statistic and one p-value per column of X.
f_scores, p_values = f_regression(X, y)
f_pvalues = pd.Series(p_values, index=X.columns)

# Most features clear the usual p < 0.05 cut, so a stricter filter is applied:
# keep only the features whose p-value is exactly 0.0 in floating point.
# The result is stored as an Index of column names (like the other *_features
# variables) rather than as a Series of p-values.
anova_features = f_pvalues[f_pvalues == 0].index
print(f"Filter Method (ANOVA F-test) {len(anova_features)}: \n", anova_features)

In [None]:
# Names of the features whose F-test p-value is exactly zero, followed by
# how many of them there are.
zero_p_mask = f_pvalues.eq(0)
anova_features = f_pvalues.loc[zero_p_mask].index
anova_features.shape

The gradient of the sorted feature scores is plotted for each feature selection technique to help find the optimal number of features to keep. An example using the ANOVA F-test scores is shown below:

In [None]:
# Absolute gradient of the F-scores sorted from largest to smallest; only
# positions 2..14 of that curve are drawn (the x axis is the position inside
# this window, not the original feature rank).
f_score_gradient = np.abs(np.gradient(sorted(f_scores)[::-1]))

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(f_score_gradient[2:15])

# Light dashed grid on both major and minor ticks
ax.grid(True, which='both')
ax.grid(color='lightgray', linestyle='--', linewidth=0.5)

# Axis labels and title
ax.set_xlabel('Feature Index')
ax.set_ylabel('Absolute Gradient')
ax.set_title('Absolute Gradient of Sorted Feature Importance Scores')

plt.show()

In [None]:
# Mutual Information
# mutual_info_regression adds small random noise to continuous variables, so
# the seed is fixed to make the scores (and the ranking derived from them)
# reproducible across kernel restarts.
mi_scores = mutual_info_regression(X, y, random_state=42)
# NOTE: despite its name, mi_pvalues holds the raw mutual-information scores,
# not p-values; the name is kept because later cells refer to it.
mi_pvalues = pd.Series(mi_scores, index=X.columns)
# Feature names ordered from the highest to the lowest MI score.
mi_features = mi_pvalues.sort_values(ascending=False).index
print("Filter Method (Mutual Information):\n", mi_features)

In [None]:
# Keep the 34 features with the highest mutual-information score.
mi_features = mi_pvalues.sort_values(ascending=False).head(34).index

In [None]:
# Absolute gradient of the MI scores sorted from largest to smallest, zoomed
# in on positions 30..34 of that curve.
mi_gradient = np.abs(np.gradient(sorted(mi_scores, reverse=True)))
ax = plt.gca()
ax.plot(mi_gradient[30:35])

# Light dashed grid on both major and minor ticks
ax.grid(True, which='both')
ax.grid(color='lightgray', linestyle='--', linewidth=0.5)

# Axis labels and title
ax.set_xlabel('Feature Index')
ax.set_ylabel('Absolute Gradient')
ax.set_title('Absolute Gradient of Sorted Feature Importance Scores')

In [None]:

# 2. Wrapper Methods
# Recursive feature elimination with cross-validation (RFECV) wrapped around
# a linear regression; the fitted selector's support mask marks the columns
# of X that were kept.
model = LinearRegression()
rfe = RFECV(estimator=model).fit(X, y)
wrapper_features = X.columns[rfe.support_]
print("Wrapper Method (RFE):", wrapper_features)

In [None]:
# Sequential Feature selector using Ridge Regression
# Backward selection: start from every column of X and drop features step by
# step. The estimator is handed over unfitted on purpose —
# SequentialFeatureSelector clones and refits it internally, so fitting it on
# (X, y) beforehand would only be wasted work.
# SequentialFeatureSelector and RidgeCV are imported in the notebook's import cell.
model = RidgeCV()
sfs_backward = SequentialFeatureSelector(model, direction="backward").fit(X, y)

In [None]:
# Column names retained by the backward sequential selector.
sfs_features = X.columns[sfs_backward.get_support()]

In [None]:
# 3. Embedded Methods
# Random Forest
# The forest is seeded so that the feature importances — and therefore the
# ranking and the cut-off derived from it below — are reproducible.
rf = RandomForestRegressor(random_state=42)
rf.fit(X, y)
# Feature names ordered from the most to the least important.
embedded_features = X.columns[rf.feature_importances_.argsort()[::-1]]
print("Embedded Method (Random Forest):", embedded_features)

In [None]:
# Absolute gradient of the random-forest importances sorted from largest to
# smallest, restricted to positions 10..19; the cell output is the position
# (inside that window) of the steepest drop.
rf_gradient_window = np.abs(np.gradient(sorted(rf.feature_importances_, reverse=True)))[10:20]
plt.plot(rf_gradient_window)
rf_gradient_window.argmax()

In [None]:
# Keep the 14 features with the highest random-forest importance.
top_rf_positions = np.argsort(rf.feature_importances_)[::-1][:14]
embedded_features = X.columns[top_rf_positions]

In [None]:
# Combine all the feature lists into a single list: every selection method
# contributes one vote for each feature it kept.
# (Counter is imported in the notebook's import cell.)
feature_lists = [
    vt_features,
    anova_features,
    mi_features,
    wrapper_features,
    embedded_features,
    sfs_features,
]
all_features = [feature for features in feature_lists for feature in features]

# Count the occurrences of each feature
feature_counts = Counter(all_features)

# Sort the features by their vote count (highest first; ties keep the order
# in which the features were first seen)
sorted_features = feature_counts.most_common()

# Print the selected features based on a voting threshold: a feature is kept
# when at least `voting_threshold` of the six methods above voted for it.
voting_threshold = 5
selected_features = [feature for feature, votes in sorted_features if votes >= voting_threshold]
print("Selected Features:", selected_features)

In [None]:
# Always carry the 'risk' and 'event_id' columns into the exported dataset.
# A new list is built instead of extending in place, and dict.fromkeys drops
# repeats while preserving order, so re-running this cell — or 'event_id'
# already having been voted in as a feature — cannot produce duplicate columns.
selected_features = list(dict.fromkeys(selected_features + ['risk', 'event_id']))

In [None]:
# Restrict the cleaned data to the shortlisted columns and write the result
# to CSV without the row index.
train_df = clean_df.loc[:, selected_features]
train_df.to_csv(path_or_buf='selected_data.csv', index=False)