In [1]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

import play_song as song
import feature_selection as fs
import NA_outliers as n

In [2]:
# Read the project CSV and use the 'Claim Identifier' column as the row index.
df = pd.read_csv(
    './project_data/out_eda1.csv',
    index_col='Claim Identifier',
)

In [4]:
#df = df.dropna()

In [3]:
# Feature/target split, identical to the one evaluate_features performs internally:
# the target is 'Claim Injury Type', the features are every other column.
y = df['Claim Injury Type']
X = df.drop(columns=['Claim Injury Type'])

In [4]:
# Column names handed to the numerical feature-selection helpers.
num = [
    'Age at Injury',
    'Average Weekly Wage',
    'Birth Year',
    'IME-4 Count',
    'Number of Dependents',
    'Accident Year',
    'Accident Month',
    'Accident Day',
    'Assembly Year',
    'Assembly Month',
    'Assembly Day',
    'C-2 Year',
    'C-2 Month',
    'C-2 Day',
    'First Hearing Year',
]

# Column names handed to the categorical feature-selection helpers.
categ = [
    'Alternative Dispute Resolution',
    'Attorney/Representative',
    'Carrier Name',
    'Carrier Type',
    'County of Injury',
    'COVID-19 Indicator',
    'District Name',
    'Gender',
    'Industry Code',
    'Medical Fee Region',
    'WCIO Cause of Injury Code',
    'WCIO Nature of Injury Code',
    'WCIO Part Of Body Code',
    'Zip Code',
    'C-3 Date Binary',
]

In [8]:
## EVAL function

In [9]:
from sklearn.metrics import classification_report
from collections import Counter
import time
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline


def evaluate_features(df, num, categ,
                      rfe_features, rfe_model, k=5):
    """Run several feature-selection methods inside k-fold CV and keep the stable features.

    For every fold the function calls the project-local ``fs`` helpers
    (lasso, rfe, correlation_matrix, var on the numerical columns; chi_squared
    and mutual_information on the categorical columns), takes the union of
    what they return, fits a ``LogisticRegression`` on those columns and
    prints a classification report for the validation part of the fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column 'Claim Injury Type'; all remaining
        columns are treated as candidate features.
    num : list
        Column names forwarded to the numerical selectors.
    categ : list
        Column names forwarded to the categorical selectors.
    rfe_features :
        Forwarded unchanged to ``fs.rfe``.
    rfe_model :
        Forwarded unchanged to ``fs.rfe``.
    k : int, default 5
        Number of ``KFold`` splits.

    Returns
    -------
    list
        Features that were selected in every one of the ``k`` folds.
    """
    # Separate timers: the total must not be reset by the per-fold timer,
    # otherwise the final figure only covers the last fold.
    total_start = time.perf_counter()

    X, y = df.drop(columns=['Claim Injury Type']), df['Claim Injury Type']
    kf = KFold(n_splits=k)
    selected_features = []

    for train_idx, val_idx in kf.split(X):

        print('------------ FOLD ------------ \n')
        fold_start = time.perf_counter()

        # train & val
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Construct the Pipeline
        pipeline = Pipeline([
            ('imputer', FunctionTransformer(n.custom_impute, validate=False)),
            ('log_transform', FunctionTransformer(n.log_transform, validate=False)),
        ])

        # Presumably an audible progress cue — TODO confirm what play_ does.
        song.play_('audio.mp3')

        # Fit the pipeline on the training data
        # NOTE(review): the fitted pipeline's output is never applied to
        # X_train / X_val below, and the notebook's recorded run fails here with
        # "custom_impute() missing 1 required positional argument: 'var_name'".
        # The missing argument has to be bound (e.g. via kw_args) once its
        # meaning is confirmed against NA_outliers.custom_impute.
        pipeline.fit(X_train, y_train)

        # Numerical
        print('----LASSO----')
        lasso = fs.lasso(X_train, y_train, num)
        print('----RFE----')
        # for model in rfe_model: ## FUTURE
        rfe = fs.rfe(X_train, y_train, num, rfe_features, rfe_model)
        print('----CORR----')
        corr = fs.correlation_matrix(X_train, num)
        #print('----VAR----')
        var = fs.var(X_train, num)

        # Categorical
        print('----CHI2----')
        chi2 = fs.chi_squared(X_train, y_train, categ)
        print('----MI----')
        mutual_information = fs.mutual_information(X_train, y_train, categ)

        # Combine selected features from each method
        # (assumes every selector returns an iterable of column names — TODO confirm).
        num_selected = set(lasso).union(rfe, corr, var)
        categ_selected = set(chi2).union(mutual_information)

        # Keep only string entries among the numerical picks, then build the
        # column list used to filter the training and validation sets.
        num_selected = [feature for feature in num_selected if isinstance(feature, str)]
        categ_selected = list(categ_selected)
        fold_features = num_selected + categ_selected

        X_train_selected = X_train[fold_features]
        X_val_selected = X_val[fold_features]

        # Train model and record performance
        model = LogisticRegression()
        model.fit(X_train_selected, y_train)
        y_pred = model.predict(X_val_selected)

        # Print classification report for current fold
        print('CLASSIFICATION REPORT \n')
        print(classification_report(y_val, y_pred))

        # Save selected features for counting occurrences
        selected_features.append(fold_features)

        # Elapsed time for this fold only
        print(time.perf_counter() - fold_start)

    # Count in how many folds each feature was selected. dict.fromkeys removes
    # duplicates inside a fold (order-preserving), so a count of k really means
    # "present in every fold"; plain iteration also copes with an empty fold.
    feature_counts = Counter(
        feature
        for fold in selected_features
        for feature in dict.fromkeys(fold)
    )

    song.play_('audio.mp3')

    # Elapsed time for the whole run, across all folds
    print(time.perf_counter() - total_start)

    # Select only features that appear in all folds
    final_features = [feature for feature, count in feature_counts.items() if count == k]
    return final_features


In [10]:
# Value forwarded to evaluate_features as `rfe_features`.
rfe_features = [12]

# Two-fold run using a default RandomForestClassifier as the RFE model;
# the returned feature list is the cell's displayed output.
evaluate_features(
    df,
    num,
    categ,
    rfe_features=rfe_features,
    rfe_model=RandomForestClassifier(),
    k=2,
)

------------ FOLD ------------ 



TypeError: custom_impute() missing 1 required positional argument: 'var_name'

O RFE ainda não está a selecionar um número de features (por enquanto); está apenas a mostrar o classification report para cada número de features fornecido.

__*5 fold train: LogReg, rfe: LogReg, features=[12,13,14,15]*__

['Accident Year',
 'Number of Dependents',
 'C-2 Day',
 'Assembly Day',
 'First Hearing Year',
 'C-2 Month',
 'IME-4 Count',
 'Assembly Year',
 'Age at Injury',
 'Birth Year',
 'Accident Month',
 'Average Weekly Wage',
 'Accident Day',
 'Assembly Month',
 'C-2 Year',
 'County of Injury',
 'Carrier Name',
 'C-3 Date Binary',
 'Carrier Type',
 'District Name',
 'Zip Code',
 'Alternative Dispute Resolution',
 'Industry Code',
 'Gender',
 'Attorney/Representative',
 'WCIO Nature of Injury Code',
 'WCIO Part Of Body Code',
 'WCIO Cause of Injury Code',
 'COVID-19 Indicator',
 'Medical Fee Region']
 
  TIME: 
 
 __*5 fold train: LogReg, rfe: RF, features=[12]*__
 
 ['Assembly Day',
 'Assembly Month',
 'Assembly Year',
 'First Hearing Year',
 'Age at Injury',
 'C-2 Month',
 'Birth Year',
 'Accident Month',
 'Average Weekly Wage',
 'Accident Year',
 'IME-4 Count',
 'Number of Dependents',
 'Accident Day',
 'C-2 Day',
 'C-2 Year',
 'County of Injury',
 'Carrier Name',
 'C-3 Date Binary',
 'Carrier Type',
 'District Name',
 'Zip Code',
 'Alternative Dispute Resolution',
 'Industry Code',
 'Gender',
 'Attorney/Representative',
 'WCIO Nature of Injury Code',
 'WCIO Part Of Body Code',
 'WCIO Cause of Injury Code',
 'COVID-19 Indicator',
 'Medical Fee Region']
 
 TIME: 28min

In [None]:
# Calls play_ from the project-local play_song module with 'audio.mp3';
# presumably an audible "cell finished" cue — TODO confirm.
song.play_('audio.mp3')

Numerical Features: Lasso, RFE, Variance Threshold, Correlation Matrix, Statistical Tests. <br> <BR>
Categorical Features: Chi-Squared Test, Information Gain, Tree-based Methods, Mutual Information.