In [17]:
import numpy as np
import pandas as pd
from selection_methods import *
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Read the predictor table and the label table from the project's data directory.
# (The file names indicate the encoded, feature-engineered Telco churn set.)
data, target = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Telco-Customer-Churn-encoded-data-FE.csv',
        '../data/Telco-Customer-Churn-encoded-label.csv',
    )
)

In [4]:
# Selector classes compared in this notebook. The names are expected to come from
# the star-import of `selection_methods`; each one is later called as
# `cls(data, target)`.
selection_methods_list = [
    MRMR, Xgb_Selection, GBM_Selection,
    Rf_Selection, Lasso_Selection, Catboost_Selection,
    RFE_Selection, PCA_Selection, Shap_Selection,
]

In [7]:
# Sanity check: for each selector class, print its name and class object, then
# construct it on the full data and print the estimator it exposes as `used_model`.
# Note that a selector instance is built on every iteration solely for this printout.
for method in selection_methods_list:
    print(f'{method.__name__}: {method}')
    # The constructor call is evaluated inside the f-string, i.e. after the line
    # above has already been printed.
    print(f"*****\n {method(data, target).used_model}\n *****")
    

MRMR: <class 'selection_methods.MRMR'>
*****
 <catboost.core.CatBoostClassifier object at 0x000001B045104810>
 *****
Xgb_Selection: <class 'selection_methods.Xgb_Selection'>
*****
 XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=True, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=None, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
 *****
GBM_Selection: <class 'selection_methods.GBM_Selection'>


In [8]:
# Fit every selector once and gather its per-feature importances into a single
# table: one row per selector (indexed by class name), one column per feature.
importance_rows = []  # one {feature: importance} mapping per selector
row_labels = []       # selector class names, in the same order as importance_rows
for method in selection_methods_list:
    method_name = method.__name__
    print(f'Method: {method_name}')
    model = method(data, target=target)
    model.fit()
    method_importances = model.get_importances()
    print(f'Feature importances:\n {method_importances}')
    importance_rows.append(method_importances)
    row_labels.append(method_name)

# Build the frame in one call rather than re-concatenating inside the loop, which
# copies the accumulated table on every iteration. Columns keep first-seen order
# and features missing for a selector become NaN, as with the row-wise concat.
importances = pd.DataFrame(importance_rows, index=row_labels)
importances

Method: MRMR


100%|██████████| 43/43 [00:06<00:00,  6.36it/s]
100%|██████████| 43/43 [00:05<00:00,  7.68it/s]
100%|██████████| 43/43 [00:04<00:00,  9.37it/s]


Feature importances:
 {'Contract_Month-to-month': 45215649.56346652, 'PaymentMethod_Mailed check': 7855.558962648914, 'OnlineSecurity_No': 1020.9929217155377, 'TechSupport_No': 982.017277079501, 'Monthly/Total_Charges': 926.9725469093004, 'tenure': 871.8858517862745, 'InternetService_Fiber optic': 847.8012618463655, 'MonthlyCharges': 807.6219620719759, 'PaymentMethod_Electronic check': 767.7975773077275, 'tenure_group': 751.200484745284, 'Contract_Two year': 680.6194131342791, 'OnlineBackup_No': 592.6867298022129, 'DeviceProtection_No': 556.988083903697, 'PaperlessBilling_Yes': 539.0847133601134, 'InternetService_DSL': 472.8045464428833, 'Contract_One year': 465.7465838934567, 'InternetService_No': 447.80600409925023, 'Dependents_Yes': 440.73396711109416, 'SeniorCitizen_1': 432.29073528817486, 'OnlineSecurity_No internet service': 403.3233779666405, 'OnlineBackup_No internet service': 368.8538981358959, 'OnlineSecurity_Yes': 351.11157681275154, 'DeviceProtection_No internet service': 3

Unnamed: 0,Contract_Month-to-month,PaymentMethod_Mailed check,OnlineSecurity_No,TechSupport_No,Monthly/Total_Charges,tenure,InternetService_Fiber optic,MonthlyCharges,PaymentMethod_Electronic check,tenure_group,...,StreamingTV_No,gender_Male,StreamingTV_Yes,OnlineBackup_Yes,DeviceProtection_Yes,StreamingMovies_Yes,MultipleLines_Yes,MultipleLines_No,MultipleLines_No phone service,PhoneService_Yes
MRMR,45215650.0,7855.559,1020.993,982.0173,926.9725,871.8859,847.8013,807.622,767.7976,751.2005,...,175.0151,42.884944,40.83073,40.43422,38.78863,28.54155,28.37159,15.84645,7.145129,6.332235
Xgb_Selection,0.33347,0.006196208,0.02464148,0.02277734,0.01467568,0.008393361,0.3236578,0.009158291,0.01149953,0.0,...,0.007758318,0.007556,0.003924814,0.008620501,0.005777318,0.01200656,0.007256907,0.0101448,0.0,0.01704112
GBM_Selection,0.3793457,0.001083684,0.06636738,0.04319286,0.1778635,0.009150428,0.083914,0.04078097,0.03893921,1.425225e-05,...,0.0007451878,0.001852,0.001330079,0.000309979,0.0004574179,0.004761454,0.0002474744,0.008117147,0.0006756656,0.002202913
Rf_Selection,0.04916054,0.009534494,0.02338999,0.02057635,0.1238771,0.08300387,0.02565892,0.1045853,0.02201728,0.03350097,...,0.008579502,0.021131,0.00849343,0.01007849,0.009322163,0.009038437,0.009736172,0.01025883,0.002855195,0.002439516
Lasso_Selection,0.0,-0.0,0.0,0.0,0.0,-0.002120989,0.0,0.003218145,0.0,0.0,...,0.0,-0.0,0.0,-0.0,-0.0,0.0,0.0,-0.0,0.0,-0.0
Catboost_Selection,5.485722,1.413549,2.602295,2.162866,10.07272,6.336968,2.586299,10.39317,2.425483,3.228826,...,0.4145521,2.490836,0.9515785,0.8399146,0.5788174,0.9005274,1.104332,1.669513,0.5259948,0.4456609
RFE_Selection,1.0,1.0,9.0,8.0,11.0,19.0,1.0,21.0,1.0,14.0,...,1.0,17.0,1.0,1.0,20.0,1.0,4.0,3.0,1.0,1.0
PCA_Selection,9.148514e-33,9.148514e-33,5.022974e-08,2.550682e-08,9.148514e-33,9.148514e-33,5.677609e-08,9.148514e-33,9.148514e-33,9.148514e-33,...,1.72556e-08,0.999757,2.049738e-10,4.168289e-08,2.776109e-08,9.148514e-33,6.523863e-08,1.059217e-07,7.846455e-08,2.064476e-07
Shap_Selection,0.8437142,0.05439763,0.2565397,0.1651233,0.5205219,0.2831978,0.1279191,0.3771491,0.1914869,0.0,...,0.04104113,0.065575,0.04938637,0.03838541,0.01593113,0.06432658,0.03743131,0.1175077,0.0,0.03616121


In [11]:
# Persist the importance table. The row index holds the selector names and is the
# only place they are stored, so it is written as a labelled first column instead
# of being dropped (previously the saved rows could not be told apart).
# Read it back with `pd.read_csv(path, index_col='method')`.
importances.to_csv('../data/feature_importances_new.csv', index_label='method')

In [9]:
# Display the table with features as rows and selectors as columns.
importances.transpose()

Unnamed: 0,MRMR,Xgb_Selection,GBM_Selection,Rf_Selection,Lasso_Selection,Catboost_Selection,RFE_Selection,PCA_Selection,Shap_Selection
Contract_Month-to-month,45215650.0,0.33347,0.379346,0.049161,0.0,5.485722,1.0,9.148514e-33,0.843714
PaymentMethod_Mailed check,7855.559,0.006196,0.001084,0.009534,-0.0,1.413549,1.0,9.148514e-33,0.054398
OnlineSecurity_No,1020.993,0.024641,0.066367,0.02339,0.0,2.602295,9.0,5.022974e-08,0.25654
TechSupport_No,982.0173,0.022777,0.043193,0.020576,0.0,2.162866,8.0,2.550682e-08,0.165123
Monthly/Total_Charges,926.9725,0.014676,0.177864,0.123877,0.0,10.07272,11.0,9.148514e-33,0.520522
tenure,871.8859,0.008393,0.00915,0.083004,-0.002121,6.336968,19.0,9.148514e-33,0.283198
InternetService_Fiber optic,847.8013,0.323658,0.083914,0.025659,0.0,2.586299,1.0,5.677609e-08,0.127919
MonthlyCharges,807.622,0.009158,0.040781,0.104585,0.003218,10.393168,21.0,9.148514e-33,0.377149
PaymentMethod_Electronic check,767.7976,0.0115,0.038939,0.022017,0.0,2.425483,1.0,9.148514e-33,0.191487
tenure_group,751.2005,0.0,1.4e-05,0.033501,0.0,3.228826,14.0,9.148514e-33,0.0


In [32]:

# Incremental-feature evaluation.
# For every selector: rank the features by that selector's importance, start from
# the top-ranked feature, add the remaining ones one at a time, refit the
# selector's own estimator (`used_model`) after each addition and record the
# macro-F1 on a held-out split. The resulting curve is printed, plotted and saved
# to `<SelectorName>_new.png`.

# The split does not depend on the selector, so compute it once.
X_train, X_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=42
)

for method in selection_methods_list:
    accuracies = []

    # Rank by magnitude so that signed scores (e.g. the negative Lasso coefficients
    # visible in the importance table) are not pushed to the bottom of the ranking.
    # NOTE(review): the RFE_Selection values look like ranks where 1 is best; if so,
    # this descending sort puts its weakest features first - confirm against
    # RFE_Selection.get_importances().
    features = importances.T[method.__name__].abs().sort_values(ascending=False)

    model = method(data, target).used_model

    # `scores` holds the names of the features selected so far (seeded with the
    # top-ranked one); `accuracies[k]` is the score after adding `scores[k + 1]`.
    scores = [features.index[0]]

    for i in features.index:
        if i in scores:
            continue
        candidate_columns = [*scores, i]
        model.fit(X_train[candidate_columns], y_train)
        y_pred = model.predict(X_test[candidate_columns])
        accuracies.append(f1_score(y_test, y_pred, average='macro'))
        scores.append(i)

    print(scores, method)
    print(accuracies, 'acc')

    # Label each point with the feature whose addition produced it. The seed
    # feature has no point of its own, so it is excluded from the tick labels;
    # using the actual lengths also removes the hard-coded feature count.
    added_features = scores[1:]
    fig, ax = plt.subplots(figsize=(15, 12))
    ax.plot(accuracies)
    ax.set_xticks(range(len(added_features)))
    ax.set_xticklabels(added_features, rotation='vertical')
    ax.set_xlabel('feature added (in ranked order)')
    ax.set_ylabel('macro F1 on the test split')
    ax.set_title(f'{method.__name__} (starting from {scores[0]})')
    ax.grid()
    fig.savefig(f'{method.__name__}_new.png')
    plt.show()
    # Release the figure; otherwise one open figure per selector accumulates.
    plt.close(fig)

KeyboardInterrupt: 