In [1]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
def MinMaxScale(df, df_test):
    """Min-max scale a training frame and apply the same scaling to a test frame.

    The scaler is fit on ``df`` only and then used to transform both frames,
    so no information from ``df_test`` influences the scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Training features; its column labels are used for both outputs.
    df_test : pandas.DataFrame
        Test features with the same columns as ``df``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        Scaled training frame and scaled test frame. Each output keeps the
        row index of its input and carries the column labels of ``df``.
    """
    scaler = MinMaxScaler()
    scaler.fit(df)

    # Carry the original row labels over. Rebuilding the frame without
    # ``index=`` would reset it to 0..n-1, so the scaled rows would no longer
    # line up with labels/targets that still use the original index
    # (e.g. the shuffled index produced by a train/test split).
    df_scaled = pd.DataFrame(scaler.transform(df),
                             columns = df.columns,
                             index = df.index)
    # ``getattr`` keeps inputs without an ``index`` attribute working as
    # before (they simply get the default RangeIndex).
    df_test_scaled = pd.DataFrame(scaler.transform(df_test),
                                  columns = df.columns,
                                  index = getattr(df_test, 'index', None))

    return df_scaled, df_test_scaled

def Compare_Features_Accuracy(X_List, y, Kfold = 5):
    """Compare the hold-out accuracy of four classifiers across feature sets.

    For each of ``Kfold`` repetitions, every frame in ``X_List`` is split
    80/20 with ``train_test_split`` (the same ``random_state`` is used for
    every frame within a repetition). Each model is fit on the training part
    and scored with ``accuracy_score`` on the test part; the scores are then
    averaged over the repetitions.

    Despite the parameter name, this is repeated random hold-out rather than
    K-fold cross-validation: every repetition draws its own 80/20 split.

    Parameters
    ----------
    X_List : sequence of pandas.DataFrame
        Candidate feature matrices. Frame ``k`` is reported in column
        ``'List{k}'``.
    y : array-like
        Target labels, passed to ``train_test_split`` alongside each frame.
    Kfold : int, default 5
        Number of repetitions to average over. Must be at least 1.

    Returns
    -------
    pandas.DataFrame
        Mean accuracy with one row per model ('Naive Bayes',
        'Logistic Regression', 'LightGBM', 'CatBoost') and one column per
        entry of ``X_List``.

    Raises
    ------
    ValueError
        If ``Kfold`` is smaller than 1.
    """
    if Kfold < 1:
        raise ValueError(f'Kfold must be a positive integer, got {Kfold!r}')

    # One factory per model: a single source of truth for the row labels and
    # a fresh, unfitted estimator for every (repetition, feature set) pair.
    model_factories = {
        'Naive Bayes': lambda: GaussianNB(),
        'Logistic Regression': lambda: LogisticRegression(penalty = 'l1', solver = 'liblinear'),
        'LightGBM': lambda: LGBMClassifier(learning_rate = 0.01, num_leaves = 15, verbose = -1),
        'CatBoost': lambda: CatBoostClassifier(learning_rate = 0.1, iterations = 100, verbose = False),
    }
    # Models whose inputs are min-max scaled before fitting.
    models_needing_scaling = {'Naive Bayes', 'Logistic Regression'}

    cols = [f'List{k}' for k in range(len(X_List))]
    # Start from float zeros. An empty (object) frame followed by fillna(0)
    # yields integer columns on pandas versions that downcast, and adding
    # float accuracies to those triggers pandas' incompatible-dtype
    # deprecation path.
    Results = pd.DataFrame(0.0, index = list(model_factories), columns = cols)

    def evaluate_model(X_train, y_train, X_test, y_test, model, scaling = False):
        """Fit ``model`` on the training data and return its test accuracy."""
        if scaling:
            X_train, X_test = MinMaxScale(X_train, X_test)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        return accuracy_score(y_test, y_pred)

    for i in range(Kfold):
        # Seed formula kept unchanged so existing results stay reproducible.
        # NOTE: integer truncation makes it repeat once Kfold >= 10 (i = 8 and
        # i = 9 both give 48), so such runs re-use some splits.
        random_state = int(60/(i + 1)) + 42
        for k, X in enumerate(X_List):
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size = 0.2, random_state = random_state)

            for name, make_model in model_factories.items():
                score = evaluate_model(X_train, y_train, X_test, y_test,
                                       make_model(),
                                       scaling = name in models_needing_scaling)
                Results.loc[name, f'List{k}'] += score

    # Turn the accumulated sums into means over the repetitions.
    Results /= Kfold
    return Results

In [3]:
def highlight_max(s, props = ''):
    """Return ``props`` for every entry of ``s`` equal to its NaN-ignoring
    maximum and an empty string for all other entries.

    Intended as a callback for ``Styler.apply``: the result is an array of
    CSS strings with one element per entry of ``s``.
    """
    peak = np.nanmax(s.values)
    is_peak = s == peak
    return np.where(is_peak, props, '')

def style_results(Results):
    """Build a Styler for ``Results`` that shows 4 decimal places and colours
    the maximum of every row (``axis = 1``) via ``highlight_max``.
    """
    return (
        Results.style
        .format(precision = 4)
        .apply(highlight_max, props = 'color:#C41E3A', axis = 1)
    )

def Compare_Ordered_Feature_Lists(df, target, basic_list, features_list, Kfold = 5):
    """Score a baseline feature set and its cumulative extensions.

    'List0' is ``basic_list``; 'List{n}' is ``basic_list`` plus the first
    ``n`` entries of ``features_list``. Every list is printed, the matching
    column subsets of ``df`` are passed to ``Compare_Features_Accuracy``
    together with ``df[target]``, and the resulting table is returned through
    ``style_results``.
    """
    y = df[target]

    X_list = [df[basic_list].copy()]
    print('List0: ', basic_list)

    for n_extra in range(1, len(features_list) + 1):
        selected = basic_list + features_list[:n_extra]
        print(f'List{n_extra}: ', selected)
        X_list.append(df[selected].copy())

    return style_results(Compare_Features_Accuracy(X_list, y, Kfold = Kfold))