**RFE with Cross Validation**

| Model | Features                               | macro-F1 |
| ----- | -------------------------------------- | -------- |
| RF    | final_num + final_categ                | 0.38     |
| RF    | final_num + final_categ + [try_num[0]] | 0.39     |
| RF    | final_num + final_categ + [try_num[1]] | 0.39     |
| RF    | final_num + final_categ + [try_num[2]] | 0.39     |
|       |                                        |          |
|       |                                        |          |

In [None]:
# experiment with RFECV

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold

def rfe_cv(X, y, model=None, cv = 5):
    """Select features with cross-validated recursive feature elimination.

    Fits a single ``RFECV`` (one feature removed per step, stratified
    k-fold, macro-F1 scoring), prints a classification report and the
    macro-F1 of the refitted estimator, and returns the kept columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels are used to name the selection.
    y : array-like
        Target labels.
    model : estimator, optional
        Estimator handed to ``RFECV``. Falls back to
        ``LogisticRegression()`` when omitted, because ``RFECV`` cannot
        run without an estimator.
    cv : int, default 5
        Number of folds passed to ``StratifiedKFold``.

    Returns
    -------
    list
        Labels of the columns kept by ``RFECV``.
    """
    if model is None:
        model = LogisticRegression()

    # RFECV already searches over every feature count internally, so one fit
    # is enough; the previous loop over an undefined `n_features` either
    # raised NameError or repeated this identical fit.
    selector = RFECV(estimator=model, step=1, cv=StratifiedKFold(cv), scoring='f1_macro')
    selector.fit(X, y)

    best_features = X.columns[selector.support_].tolist()
    n_selected = len(best_features)

    # NOTE: these predictions are made on the same data the selector was
    # fitted on, so the figures below are training-set scores.
    y_pred = selector.predict(X)
    print(f"Classification Report for {n_selected} features:\n")
    print(classification_report(y, y_pred))

    macro_f1 = f1_score(y, y_pred, average='macro')
    print(f"Macro Avg F1 Score for {n_selected} features: {macro_f1:.4f}\n")

    return best_features


In [None]:
# Run the RFECV experiment with a default logistic regression and 5 folds.
cv = 5
model = LogisticRegression()
rfe_cv(X_train_RS, y_train, model=model, cv=cv)

## 4.5 Hybrid Methods

<a href="#top">Top &#129033;</a>

**Boruta**

In [None]:
from boruta import BorutaPy

In [None]:
def boruta(X_num, X_categ, y, n_estimators=250, random_state=42, threshold=5):
    """Run Boruta feature selection and plot the confirmed features' importance.

    The numeric and categorical frames are concatenated column-wise, Boruta
    is fitted with a random forest, the confirmed feature names are printed,
    and a horizontal bar chart of their relative importance is shown.

    Parameters
    ----------
    X_num : pandas.DataFrame
        Numeric feature columns.
    X_categ : pandas.DataFrame
        Categorical feature columns (must already be numeric-encoded for the
        random forest -- TODO confirm against the caller).
    y : pandas.Series
        Target labels; ``.values`` is passed to the estimators.
    n_estimators : int, default 250
        Number of trees in the forest used to rank the confirmed features.
        Boruta itself is run with ``n_estimators='auto'``.
    random_state : int, default 42
        Seed shared by Boruta and the forests.
    threshold : float, default 5
        Position (in percent of the top importance) of the reference line
        drawn on the plot.
    """
    # Concatenate numeric and categorical features
    X_comb = pd.concat([X_num, X_categ], axis=1)

    # Define and fit Boruta with RandomForest
    rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    boruta_selector = BorutaPy(rf_model, n_estimators='auto', random_state=random_state)
    boruta_selector.fit(X_comb.values, y.values)

    # Get selected features based on Boruta selection
    support = boruta_selector.support_
    selected_features = X_comb.columns[support].tolist()
    print("Selected features:", selected_features)

    if not selected_features:
        # Nothing was confirmed: there is nothing to rank or plot, and
        # normalising an empty importance array would raise.
        return

    # Boruta refits the estimator it is given on matrices that include shadow
    # features, so `rf_model.feature_importances_` does not line up with the
    # columns of X_comb. Fit a separate forest on the confirmed columns so
    # that every importance maps to exactly one selected feature.
    importance_model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    importance_model.fit(X_comb.values[:, support], y.values)
    feature_importance = importance_model.feature_importances_
    feature_importance = 100.0 * (feature_importance / feature_importance.max())

    # Sort indices of features based on importance
    sorted_idx = np.argsort(feature_importance)
    pos = np.arange(sorted_idx.shape[0]) + 0.5

    # Plot feature importances
    plt.figure(figsize=(12, 8))
    plt.barh(pos, feature_importance[sorted_idx], align='center')
    plt.yticks(pos, np.array(selected_features)[sorted_idx])
    plt.xlabel('Relative Importance')
    plt.title('Feature Importance of Selected Features Using Boruta')

    # Draw a line at the importance threshold
    plt.axvline(x=threshold, color='red', linestyle='--', label=f'{threshold}% Importance Threshold')
    plt.legend()
    plt.show()

In [None]:
# Run Boruta on X_train_RS plus the categorical columns, with n_estimators=10.
boruta(X_num=X_train_RS, X_categ=X_train[categ], y=y_train, n_estimators=10)

In [None]:
# rfe_LR = ['Age at Injury',
#  'Average Weekly Wage',
#  'Birth Year',
#  'IME-4 Count',
#  'IME-4 Count Log',
#  'Accident Year',
#  'Accident Month',
#  'Assembly Year',
#  'Assembly Month',
#  'Assembly Day',
#  'C-2 Year',
#  'C-2 Month',
#  'C-2 Day',
#  'First Hearing Year']


# rfe_RF = ['Age at Injury',
#  'Average Weekly Wage',
#  'Birth Year',
#  'IME-4 Count',
#  'IME-4 Count Log',
#  'Number of Dependents',
#  'Accident Year',
#  'Accident Month',
#  'Accident Day',
#  'Assembly Year',
#  'Assembly Month',
#  'Assembly Day',
#  'C-2 Year',
#  'C-2 Month',
#  'C-2 Day',
#  'First Hearing Year']




**Exhaustive search: try every combination of the candidate features**

In [None]:
import itertools
from sklearn.metrics import f1_score

# Maps "<numeric subset> + <categorical subset>" to its validation macro-F1.
results = {}

# Visit every (numeric subset size, categorical subset size) pair, then every
# pair of subsets of those sizes, in the same order as nested loops would.
for num_combination_size, categ_combination_size in itertools.product(
    range(len(try_num) + 1), range(len(try_categ) + 1)
):
    num_combinations = list(itertools.combinations(try_num, num_combination_size))
    categ_combinations = list(itertools.combinations(try_categ, categ_combination_size))

    for num_subset, categ_subset in itertools.product(num_combinations, categ_combinations):
        # Fixed final features plus the candidate features under test
        selected_features = final_num + final_categ + [*num_subset, *categ_subset]

        X_train_subset = X_train[selected_features]
        X_val_subset = X_val[selected_features]

        # Fit a random forest on the training split and predict both splits
        rf = RandomForestClassifier(random_state=1)
        rf.fit(X_train_subset, y_train)
        train_pred_rf = rf.predict(X_train_subset)
        val_pred_rf = rf.predict(X_val_subset)

        # Project helper `m.metrics`; its return value is kept but not used here
        class_report = m.metrics(y_train, train_pred_rf, y_val, val_pred_rf)

        # Macro F1 on the validation split is the score being compared
        f1_macro = f1_score(y_val, val_pred_rf, average="macro")
        results[f"{num_subset} + {categ_subset}"] = f1_macro

        print(f"Combination: {num_subset} + {categ_subset} | Macro F1 Score: {f1_macro}")


In [None]:
# Pick the combination key with the highest stored macro F1 (first one wins ties).
best_features = max(results.items(), key=lambda item: item[1])[0]
print("Best feature combination:", best_features)
print("Best macro F1 score:", results[best_features])