# Feature Selection Workflow for High-Dimensional Datasets
This notebook outlines a step-by-step approach to reduce the number of features from 500+ to between 80 and 200, using a combination of tree-based model feature importance, filter methods, and recursive feature elimination.

## PRELUDE
- import packages 
- collect dataset

In [58]:
import sqlite3
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

import sys
import os

sys.path.insert(1, os.path.join(sys.path[0], '../..'))
from include_columns import include_columns_corr_ml

In [59]:
dataset = "games_2018-present"
db = sqlite3.connect(f"/Users/aidenflynn/ML_Python/python-nfl/Data/v2.sqlite")
df = pd.read_sql_query(f"select * from \"{dataset}\"", db, index_col="index")
df.head()

Unnamed: 0_level_0,SEASON,AWAY_TEAM_NAME,AWAY_TEAM_PREV_RANK,HOME_TEAM_NAME,HOME_TEAM_PREV_RANK,AWAY_SCORE,HOME_SCORE,WEEK,AWAY_UNIQ_STARTERS_QB,AWAY_UNIQ_STARTERS_DEFENSE,...,DIV_MATCH,SCORE,OU,OU_COVER,SPREAD,SPREAD_COVER,WIND_SPEED,TEMP,SURFACE,ROOF
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,2018,ARI,18.0,ATL,7.0,14,40,15,2,21,...,0.0,54,43.5,1.0,10.0,0.0,084,72,72084,Cardinals (deferred)
0,2018,ARI,18.0,GNB,19.0,20,17,13,2,19,...,0.0,37,41.0,0.0,13.5,0.0,wind 20 mph,34 degrees,"34 degrees, wind 20 mph",outdoors
0,2018,ARI,18.0,KAN,8.0,14,26,10,2,17,...,0.0,40,49.5,0.0,16.5,0.0,wind 5 mph,43 degrees,"43 degrees, wind 5 mph",outdoors
0,2018,ARI,18.0,LAC,15.0,10,45,12,2,18,...,0.0,55,43.0,1.0,14.0,0.0,wind 3 mph,74 degrees,"74 degrees, wind 3 mph",outdoors
0,2018,ARI,18.0,LAR,10.0,0,34,2,1,11,...,1.0,34,43.5,0.0,13.5,0.0,wind 2 mph,86 degrees,"86 degrees, wind 2 mph",outdoors


In [60]:
# fill scrm RELATED column Null with 0.0
df['AWAY_PASS_YDSSCRM_MAX'] = df['AWAY_PASS_YDSSCRM_MAX'].fillna(0.0)
df['AWAY_PASS_YDSSCRM_MEAN'] = df['AWAY_PASS_YDSSCRM_MEAN'].fillna(0.0)
df['AWAY_PASS_YDSSCRM_MIN'] = df['AWAY_PASS_YDSSCRM_MIN'].fillna(0.0)
df['AWAY_PASS_YDSSCRM_STD'] = df['AWAY_PASS_YDSSCRM_STD'].fillna(0.0)
df['HOME_PASS_YDSSCRM_MAX'] = df['HOME_PASS_YDSSCRM_MAX'].fillna(0.0)
df['HOME_PASS_YDSSCRM_MEAN'] = df['HOME_PASS_YDSSCRM_MEAN'].fillna(0.0)
df['HOME_PASS_YDSSCRM_MIN'] = df['HOME_PASS_YDSSCRM_MIN'].fillna(0.0)
df['HOME_PASS_YDSSCRM_STD'] = df['HOME_PASS_YDSSCRM_STD'].fillna(0.0)
# All other null values can be safely filled with -1: unavailable
df.fillna(-1, inplace=True) 
df.isnull().any(axis=1).sum()

df['HOME_PASS_PLAYOFF_WINPCT'] = np.where(
    (df['HOME_PASS_PLAYOFF_W'] + df['HOME_PASS_PLAYOFF_L']) > 0,
    df['HOME_PASS_PLAYOFF_W'] / (df['HOME_PASS_PLAYOFF_W'] + df['HOME_PASS_PLAYOFF_L']),
    -1
)
df['HOME_PASS_PLAYOFF_GAMES'] = df['HOME_PASS_PLAYOFF_W'] + df['HOME_PASS_PLAYOFF_L']

df['HOME_PASS_BIG_GAME_WINPCT'] = np.where(
    (df['HOME_PASS_BIG_GAME_W'] + df['HOME_PASS_BIG_GAME_L']) > 0,
    df['HOME_PASS_BIG_GAME_W'] / (df['HOME_PASS_BIG_GAME_W'] + df['HOME_PASS_BIG_GAME_L']),
    -1
)
df['HOME_PASS_BIG_GAMES'] = df['HOME_PASS_BIG_GAME_W'] + df['HOME_PASS_BIG_GAME_L']

# target home team win; drop dud and standard columns
TARGET = df['Home-Team-Win']
df = df.drop(
    [
        'ROOF', 'SURFACE', 'TEMP', 'WIND_SPEED', 'SPREAD_COVER', 'SPREAD', 'OU_COVER', 'OU', 'SCORE', 'Home-Team-Win', 'HOME_SCORE', 
        'AWAY_SCORE', 'HOME_TEAM_NAME', 'AWAY_TEAM_NAME', 'SEASON', 'AWAY_PASS_PLAYOFF_L', 'HOME_PASS_PLAYOFF_W', 'HOME_PASS_PLAYOFF_L',
        'AWAY_PASS_PLAYOFF_W','AWAY_PASS_BIG_GAME_W', 'AWAY_PASS_BIG_GAME_L', 'HOME_PASS_BIG_GAME_W', 'HOME_PASS_BIG_GAME_L'
    ], 
    axis=1
)

# split data for training
x_train, x_test, y_train, y_test = train_test_split(df, TARGET, test_size=.3)

## Step 1: Initial Pruning
- Remove constant or near-constant features.
- Remove duplicate features.
- Remove highly correlated features (> 0.95 correlation).

In [61]:
# Example: Remove constant features
selector = VarianceThreshold(threshold=0.0)
X_reduced = selector.fit_transform(x_train)

# Remove duplicates
X_reduced = pd.DataFrame(X_reduced, columns=x_train.columns[selector.get_support()])
X_reduced = X_reduced.loc[:, ~X_reduced.T.duplicated()]

# Remove highly correlated features
corr_matrix = X_reduced.corr().abs()
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
X_reduced = X_reduced.drop(columns=to_drop)


## Step 2: Model-Based Feature Importance
Use XGBoost, RandomForest, or GradientBoosting to rank feature importances.

In [69]:

import xgboost as xgb

model = xgb.XGBClassifier()
model.fit(X_reduced, y_train)

# Get feature importance
importances = model.feature_importances_
importance_df = pd.DataFrame({'feature': X_reduced.columns, 'importance': importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Select top 250 features
top_features = importance_df['feature'].head(250).tolist()
X_selected = X_reduced[top_features]

model.fit(X_selected, y_train)
X_test_selected = x_test[top_features]
y_pred = model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Accuracy: {accuracy:.4f}")
# pd.set_option('display.max_rows', 500)
# importance_df.head(250)


XGBoost Accuracy: 0.6329


In [80]:
model = RandomForestClassifier()
model.fit(X_reduced, y_train)

# Get feature importance
importances = model.feature_importances_
importance_df = pd.DataFrame({'feature': X_reduced.columns, 'importance': importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Select top 250 features
top_features = importance_df['feature'].head(250).tolist()
X_selected = X_reduced[top_features]

model.fit(X_selected, y_train)
X_test_selected = x_test[top_features]
y_pred = model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f"RandomForestClassifier Accuracy: {accuracy:.4f}")

RandomForestClassifier Accuracy: 0.6244


In [82]:
model = GradientBoostingClassifier()
model.fit(X_reduced, y_train)

# Get feature importance
importances = model.feature_importances_
importance_df = pd.DataFrame({'feature': X_reduced.columns, 'importance': importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Select top 250 features
top_features = importance_df['feature'].head(250).tolist()
X_selected = X_reduced[top_features]

model.fit(X_selected, y_train)
X_test_selected = x_test[top_features]
y_pred = model.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f"GradientBoostingClassifier Accuracy: {accuracy:.4f}")

GradientBoostingClassifier Accuracy: 0.6158


## Step 3: Recursive Feature Elimination (Optional)
Apply RFE with a smaller model if further pruning is needed.

In [84]:

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

rfe_model = RandomForestClassifier()
selector = RFE(rfe_model, n_features_to_select=150, step=10)
selector = selector.fit(X_selected, y_train)

rfe_features = X_selected.columns[selector.support_].tolist()
X_rfe = X_selected[rfe_features]
X_rfe


Unnamed: 0,AWAY_FD_MEAN,AWAY_TEAM_PREV_RANK,HOME_FD_MEAN,HOME_PASS_1DPCT_MAX,AWAY_PASS_1DPCT_MAX,HOME_SACKS_MEAN,HOME_PASS_YACCMP_CAREER_STD,HOME_PASS_SK_CAREER_MEAN,HOME_PASS_YACCMP_CAREER_MEAN,HOME_PASS_1D_CAREER_MAX,...,AWAY_PASS_IAY_CAREER_MIN,AWAY_PASS_IAY_CAREER_STD,HOME_PASS_PRSS_CAREER_MEAN,AWAY_PASS_BADTH_MEAN,AWAY_PENALTIES_STD,HOME_PASS_CAYCMP_CAREER_MEAN,HOME_PASS_1DPCT_CAREER_STD,HOME_SACKS_AGAINST_STD,HOME_PASS_IAY_CAREER_MAX,HOME_PENALTIES_MEAN
0,-1.000000,30.0,-1.000000,-1.000,-1.000,-1.000000,-1.000000,3.000000,6.800000,6.0,...,187.0,106.773124,4.000000,-1.000000,-1.000000,8.500000,-1.000000,-1.000000,160.0,-1.000000
1,21.357143,26.0,20.428571,0.354,0.500,3.142857,1.595353,3.500000,5.616667,17.0,...,95.0,59.091722,10.000000,4.153846,2.848751,5.316667,0.051289,2.413333,386.0,5.500000
2,21.200000,12.0,23.666667,0.500,0.548,1.555556,1.565183,2.072727,4.927273,22.0,...,112.0,97.052343,9.327273,6.100000,2.616189,6.845455,0.087447,1.691482,518.0,7.555556
3,21.600000,19.0,23.000000,0.450,0.513,2.800000,1.828881,2.604651,5.374419,18.0,...,25.0,81.853564,9.767442,5.800000,3.507136,5.881395,0.094158,2.280351,490.0,6.800000
4,17.500000,20.0,20.800000,0.344,0.333,3.200000,1.378686,2.333333,5.436667,21.0,...,74.0,90.179716,7.655556,4.000000,3.204164,6.378889,0.078427,0.836660,571.0,5.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1354,18.818182,13.0,20.727273,0.452,0.400,2.363636,1.342909,2.900000,5.291429,26.0,...,100.0,88.801116,8.642857,5.200000,2.110579,5.611429,0.083412,1.618080,512.0,4.727273
1355,16.111111,7.0,21.375000,0.471,0.308,2.875000,1.423342,2.179487,5.014103,29.0,...,-7.0,118.086832,8.346154,1.500000,1.964971,6.356410,0.085901,1.922610,567.0,7.500000
1356,24.000000,26.0,15.333333,0.200,0.406,5.000000,3.889087,5.500000,5.250000,3.0,...,114.0,88.576118,8.500000,7.000000,2.000000,5.400000,0.092631,2.309401,140.0,5.333333
1357,20.571429,19.0,15.928571,0.435,0.411,2.214286,1.683611,2.181818,5.536364,17.0,...,193.0,81.818212,8.545455,6.857143,2.560263,6.590909,0.095574,1.869360,490.0,6.714286


## Step 4: Univariate Feature Selection (Optional)
Use mutual information or chi-squared scores to select features based on univariate statistics.

In [65]:

from sklearn.feature_selection import SelectKBest, mutual_info_classif

selector = SelectKBest(mutual_info_classif, k=100)
X_uni = selector.fit_transform(X_rfe, y_train)

univariate_features = [f for f, s in zip(rfe_features, selector.get_support()) if s]


## Step 5: Validate with Permutation Importance
Check feature relevance on your validation set.

In [66]:

from sklearn.inspection import permutation_importance

model.fit(X_rfe, y_train)
result = permutation_importance(model, X_val, y_val, n_repeats=10, random_state=42, n_jobs=-1)

perm_importances = pd.Series(result.importances_mean, index=X_rfe.columns).sort_values(ascending=False)


NameError: name 'X_val' is not defined