In [5]:
import pandas as pd
from sklearn.feature_selection import SelectKBest, f_regression
#from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Preprocessing
data = pd.read_csv('data/train.csv', index_col = 0, dtype={'multiple_full_time_jobs': 'boolean', 'combined_multiple_jobs': 'boolean'}).reset_index(drop=True)
data = data.fillna(True)
data['multiple_full_time_jobs'] = data['multiple_full_time_jobs'].astype(int)
data['combined_multiple_jobs'] = data['combined_multiple_jobs'].astype(int)

# Feature engineering
data['daysexperience'] = (pd.to_datetime('10/7/23', format = "%m/%d/%y") - pd.to_datetime(data['HIREDT'], format = "%m/%d/%y")).dt.days
data.drop(['AGY', 'HIREDT'], axis=1, inplace=True)

# Preparing the data
X = data.drop('ANNUAL', axis=1)
y = data['ANNUAL']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

numerical_features = ['RATE', 'HRSWKD', 'daysexperience']
categorical_features = ['NAME', 'MI', 'JOBCLASS', 'JC.TITLE', 'RACE', 'SEX', 'EMPTYPE', 'STATENUM', 'multiple_full_time_jobs', 'combined_multiple_jobs']
# Column transformer to handle categorical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Create a pipeline
estimator = LinearRegression()
pipe = Pipeline([('preprocessor', preprocessor), ('estimator', estimator)])

# Fit the pipeline on the entire dataset
pipe.fit(X_train, y_train)

# Feature selection with SelectKBest
kbest_selector = SelectKBest(f_regression, k=10)
kbest_selector.fit(pipe.named_steps['preprocessor'].transform(X_train), y_train)

# Get the indices of the most important features
important_feature_indices = kbest_selector.get_support(indices=True)
# Get the original column names of the important features
important_features = pipe.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(categorical_features)

print("Important features:")
print(important_features[important_feature_indices])

Important features:
['NAME_BOARD OF CHIROPRACTIC EXAMINERS                   '
 'NAME_CREDIT UNION DEPARTMENT                           '
 'NAME_LEGISLATIVE BUDGET BOARD                          '
 'NAME_TEXAS DEPARTMENT OF TRANSPORTATION                ' 'JOBCLASS_4483'
 'JOBCLASS_E003    '
 'JC.TITLE_DOC PROC TECH (SESS)                              '
 'JC.TITLE_PSYCHOLOGIST I                                    '
 'EMPTYPE_CRF - CLASSIFIED REGULAR FULL-TIME      '
 'EMPTYPE_UTF - UNCLASSIFIED TEMPORARY FULL-TIME  ']


In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance


# Calculate permutation importances on the validation set
result = permutation_importance(pipe, X_val, y_val, n_repeats=10, random_state=0)

# Sort the features by importance
sorted_idx = result.importances_mean.argsort()

# Print the sorted features and their importances
for i in sorted_idx[::-1]:
    print(f"{X.columns[i]}: {result.importances_mean[i]:.4f} +/- {result.importances_std[i]:.4f}")


JC.TITLE: 0.3700 +/- 0.0029
JOBCLASS: 0.3544 +/- 0.0031
EMPTYPE: 0.1701 +/- 0.0014
NAME: 0.1071 +/- 0.0008
HRSWKD: 0.0224 +/- 0.0004
daysexperience: 0.0038 +/- 0.0001
RATE: 0.0010 +/- 0.0001
RACE: 0.0004 +/- 0.0000
STATENUM: 0.0001 +/- 0.0000
MI: 0.0001 +/- 0.0000
SEX: 0.0000 +/- 0.0000
combined_multiple_jobs: 0.0000 +/- 0.0000
multiple_full_time_jobs: 0.0000 +/- 0.0000
