In [13]:
import pandas as pd

from sklearn.model_selection import train_test_split

In [64]:
baseball = pd.read_csv('data/baseball.csv')
baseball = baseball.drop(['Name', 'Age', 'Name-additional'], axis = 1)
baseball['Salary'] = baseball['Salary'].str.replace('$', '').astype(float)

baseball['C'] = baseball['Position'].apply(lambda x: 1 if 'C' in x else 0)
baseball['1B'] = baseball['Position'].apply(lambda x: 1 if '1B' in x else 0)
baseball['2B'] = baseball['Position'].apply(lambda x: 1 if '2B' in x else 0)
baseball['3B'] = baseball['Position'].apply(lambda x: 1 if '3B' in x else 0)
baseball['SS'] = baseball['Position'].apply(lambda x: 1 if 'SS' in x else 0)
baseball['OF'] = baseball['Position'].apply(lambda x: 1 if 'OF' in x else 0)

baseball['Num_Pos'] = baseball[['C', '1B', '2B', '3B', 'SS', 'OF']].sum(axis = 1)
baseball = baseball.drop(['Position'], axis = 1)

## Feature Selection Using SelectKBest

In [27]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import chi2

In [66]:
X = baseball.drop(['Salary'], axis = 1)
y = baseball['Salary']

cat_columns = ['Tm', 'Lg', 'Acquired', 'Bat']
num_columns = [col for col in X.columns if col not in cat_columns + ['C', '1B', '2B', '3B', 'SS', 'OF']]

X = pd.get_dummies(X, columns = cat_columns)

cat_columns = [col for col in X.columns if col not in num_columns]


In [67]:
# k is the number of features we want to select
cont = SelectKBest(score_func = f_classif, k = 10)
fit = cont.fit_transform(X[num_columns], y)
selected = X[num_columns].columns[cont.get_support()]
print(selected)

# k is the number of features we want
cat = SelectKBest(score_func = chi2, k = 15)
fit = cat.fit_transform(X[cat_columns], y)
selected = X[cat_columns].columns[cat.get_support()]
print(selected)

Index(['Season', 'PA', 'AB', 'R', 'H', 'HR', 'RBI', 'BB', 'TB', 'GDP'], dtype='object')
Index(['Tm_ARI', 'Tm_CHC', 'Tm_CLE', 'Tm_HOU', 'Tm_KCR', 'Tm_NYM', 'Tm_NYY',
       'Tm_SDP', 'Tm_SEA', 'Tm_TBR', 'Tm_TEX', 'Tm_TOR',
       'Acquired_Amateur Free Agent', 'Acquired_Rule 5 returned',
       'Acquired_Waivers'],
      dtype='object')


## Feature Selection Using RFE

In [77]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [69]:
X = baseball.drop(['Salary'], axis = 1)
y = baseball['Salary']

cat_columns = ['Tm', 'Lg', 'Acquired', 'Bat']
num_columns = [col for col in X.columns if col not in cat_columns + ['C', '1B', '2B', '3B', 'SS', 'OF']]

In [83]:
cat_transformer = Pipeline(
    steps = [
        ('onehot', OneHotEncoder(handle_unknown = 'ignore'))
    ]
)

num_transformer = Pipeline(
    steps = [
        ('scale', StandardScaler())
    ]
)

preprocessor = ColumnTransformer(
    transformers = [
        ('cont', num_transformer, num_columns),
        ('cat', cat_transformer, cat_columns)
    ], remainder = 'passthrough'
)

X_transform = preprocessor.fit_transform(X)

In [84]:
rfecv = RFE(estimator = RandomForestRegressor(random_state = 621), step = 1, n_features_to_select = None)
rfecv.fit(X_transform, y)

In [95]:
selected_features = np.concatenate([
    np.array(num_columns),
    np.array(preprocessor.transformers_[1][1]['onehot'].get_feature_names_out(cat_columns)),
    np.array(['C', '1B', '2B', '3B', 'SS', 'OF'])
])

In [97]:
selected_features[rfecv.support_]

array(['Def-Inn', 'PO', 'A', 'E', 'DP', 'Fld%', 'Rdrs', 'Season', 'RAA',
       'WAA', 'RAR', 'WAR', 'PA', 'AB', 'R', 'H', 'HR', 'RBI', 'SB', 'CS',
       'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+', 'TB', 'GDP', 'HBP',
       'SH', 'SF', 'IBB', 'Num_Pos', 'Tm_LAA', 'Tm_LAD', 'Tm_MULTIPLE',
       'Tm_NYM', 'Tm_NYY', 'Tm_SFG', 'Tm_STL', 'Acquired_Free Agency',
       'Acquired_Traded'], dtype=object)