In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from mrmr import mrmr_classif
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import Lasso
from sklearn.feature_selection import RFE
from catboost import CatBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.decomposition import PCA
from typing import Dict
import selection_methods as smtd

In [6]:
subclasses = set()
work = [smtd.Feature_Selection]
while work:
    parent = work.pop()
    for child in parent.__subclasses__():
        if child not in subclasses:
            subclasses.add(child)
            work.append(child)
# METHODS = [x.__name__ for x in subclasses]
METHODS = list(subclasses)

In [7]:
METHODS

[selection_methods.Rf_Selection,
 selection_methods.PCA_Selection,
 selection_methods.MRMR,
 selection_methods.RFE_Selection,
 selection_methods.Lasso_Selection,
 selection_methods.Shap_Selection,
 selection_methods.GBM_Selection,
 selection_methods.Catboost_Selection,
 selection_methods.Xgb_Selection]

In [8]:
df = pd.read_csv('../data/encoded.csv')

In [9]:
df.fillna(0, inplace=True)

In [10]:
df.replace([np.inf, -np.inf], 0, inplace=True)
df.max()

gender_Male                                   1.00
SeniorCitizen_1                               1.00
Partner_Yes                                   1.00
Dependents_Yes                                1.00
PhoneService_Yes                              1.00
MultipleLines_No                              1.00
MultipleLines_No phone service                1.00
MultipleLines_Yes                             1.00
InternetService_DSL                           1.00
InternetService_Fiber optic                   1.00
InternetService_No                            1.00
OnlineSecurity_No                             1.00
OnlineSecurity_No internet service            1.00
OnlineSecurity_Yes                            1.00
OnlineBackup_No                               1.00
OnlineBackup_No internet service              1.00
OnlineBackup_Yes                              1.00
DeviceProtection_No                           1.00
DeviceProtection_No internet service          1.00
DeviceProtection_Yes           

In [11]:
data = df.drop('Churn', axis=1)
target = df['Churn']

In [12]:
importances = pd.DataFrame()

for _ in METHODS:
    method_name = _.__name__
    print(f'Method: {method_name}')
    model = _(data, target=target)
    model.fit()
    print(f'Feature importances:\n {model.get_importances()}')
    new_df = pd.DataFrame(model.get_importances(), index=[method_name])
    importances = pd.concat([importances, new_df])

Method: Rf_Selection
Feature importances:
 {'Monthly/Total_Charges': 0.12380580223378398, 'TotalCharges': 0.1227228728291543, 'TotalCharges/tenure': 0.11266508841867542, 'MonthlyCharges': 0.10890622428554292, 'Contract_Month-to-month': 0.05425600047422571, 'OnlineSecurity_No': 0.027863652807136364, 'PaymentMethod_Electronic check': 0.025814504847218433, 'InternetService_Fiber optic': 0.02537796185279866, 'TechSupport_No': 0.023887424074743504, 'gender_Male': 0.022532303539937563, 'tenure_group_0 - 11': 0.02241918399263338, 'PaperlessBilling_Yes': 0.0216184800087288, 'Partner_Yes': 0.019057908827773872, 'Contract_Two year': 0.018596059403125823, 'SeniorCitizen_1': 0.01688694055427311, 'Dependents_Yes': 0.015449944525661482, 'DeviceProtection_No': 0.014677202174384747, 'OnlineBackup_No': 0.013068338352853892, 'tenure': 0.011692109231880892, 'MultipleLines_No': 0.0111007404102539, 'OnlineBackup_Yes': 0.010727723787044873, 'PaymentMethod_Credit card (automatic)': 0.010635717545630141, 'Mul

100%|██████████| 48/48 [00:07<00:00,  6.43it/s]
100%|██████████| 48/48 [00:06<00:00,  7.05it/s]
100%|██████████| 48/48 [00:06<00:00,  7.49it/s]


Feature importances:
 {'Contract_Month-to-month': 46077019.68552643, 'PaymentMethod_Mailed check': 5965.308157861305, 'TechSupport_No': 1110.1773929484095, 'Monthly/Total_Charges': 1032.1816817053239, 'OnlineSecurity_No': 1029.7267584949716, 'PaymentMethod_Electronic check': 992.6904873312561, 'TotalCharges/tenure': 960.5523803505322, 'tenure_group_0 - 11': 840.7688953810897, 'InternetService_Fiber optic': 789.3753494014081, 'Contract_Two year': 741.9916194191692, 'OnlineBackup_No': 713.5399809592983, 'MonthlyCharges': 699.8181315325883, 'tenure': 643.7665483430388, 'DeviceProtection_No': 612.8175029387617, 'PaperlessBilling_Yes': 564.3355232628396, 'tenure_group_26 - 37': 534.8282754858046, 'Contract_One year': 520.0410786967361, 'InternetService_DSL': 502.5075407900563, 'tenure_group_65 - 76': 488.22098255343, 'SeniorCitizen_1': 465.1534954026511, 'InternetService_No': 448.7586587532255, 'Dependents_Yes': 442.66368190640566, 'OnlineSecurity_No internet service': 428.63702556069006, '

In [13]:
importances

Unnamed: 0,Monthly/Total_Charges,TotalCharges,TotalCharges/tenure,MonthlyCharges,Contract_Month-to-month,OnlineSecurity_No,PaymentMethod_Electronic check,InternetService_Fiber optic,TechSupport_No,gender_Male,...,InternetService_No,StreamingMovies_No internet service,tenure_group_65 - 76,MultipleLines_No phone service,PhoneService_Yes,DeviceProtection_No internet service,StreamingTV_No internet service,OnlineSecurity_No internet service,TechSupport_No internet service,OnlineBackup_No internet service
Rf_Selection,0.1238058,0.1227229,0.1126651,0.1089062,0.054256,0.02786365,0.0258145,0.02537796,0.02388742,0.022532,...,0.004892698,0.004018591,0.003502278,0.002893143,0.002809149,0.002796525,0.002380476,0.001800147,0.001426213,0.001122256
PCA_Selection,9.111932000000001e-33,9.111932000000001e-33,9.111932000000001e-33,9.111932000000001e-33,2.046062e-10,5.022483e-08,9.111932000000001e-33,5.44599e-08,2.699725e-08,0.999794,...,5.269234e-08,4.997204e-09,9.111932000000001e-33,6.94701e-08,1.069592e-07,3.657477e-08,2.29869e-08,4.861269e-08,2.564714e-08,4.203763e-08
MRMR,1032.182,296.0894,960.5524,699.8181,46077020.0,1029.727,992.6905,789.3753,1110.177,26.063422,...,448.7587,320.924,488.221,13.53051,11.0921,375.1804,335.0048,428.637,357.5197,398.6753
RFE_Selection,16.0,25.0,24.0,22.0,1.0,1.0,1.0,9.0,5.0,20.0,...,14.0,1.0,1.0,1.0,11.0,7.0,1.0,10.0,15.0,1.0
Lasso_Selection,0.0,-0.00010835,0.004218691,0.003821651,0.0,0.0,0.0,0.0,0.0,-0.0,...,-0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
Shap_Selection,0.5242043,0.3966729,0.3711375,0.3178379,0.8569837,0.2682331,0.1745113,0.1936614,0.1906011,0.068919,...,0.0,0.0,0.03304113,0.0,0.03426257,0.0,0.0,0.0,0.0,0.0
GBM_Selection,0.1860693,0.04372712,0.04422682,0.04671501,0.3838191,0.06823581,0.03582709,0.08157069,0.04158225,0.000926,...,0.0,0.0,0.0,0.0003543818,0.002472887,0.0,0.0,0.0,0.0,0.0
Catboost_Selection,12.6902,9.807683,10.92686,10.11001,6.448553,2.671594,2.576723,2.476236,1.919217,2.388539,...,0.1249704,0.05951262,0.311193,0.4719126,0.2919373,0.2064113,0.5130164,0.2774702,0.2147723,0.01154388
Xgb_Selection,0.01196427,0.007342538,0.008144485,0.00754316,0.3331451,0.02275703,0.009797708,0.2914353,0.01841668,0.00671,...,0.0,0.0,0.004644521,0.0,0.0173445,0.0,0.0,0.0,0.0,0.0


In [14]:
importances.index

Index(['Rf_Selection', 'PCA_Selection', 'MRMR', 'RFE_Selection',
       'Lasso_Selection', 'Shap_Selection', 'GBM_Selection',
       'Catboost_Selection', 'Xgb_Selection'],
      dtype='object')

In [15]:
for ind in importances.index:
    values_list = importances.loc[ind].tolist()
    non_zero_abs_values = [abs(value) for value in values_list if value != 0]
    smallest_non_zero = min(non_zero_abs_values)
    for col in importances.columns:
        if importances.loc[ind, col] == 0.0:
            importances.loc[ind, col] = smallest_non_zero / 100

In [16]:
importances

Unnamed: 0,Monthly/Total_Charges,TotalCharges,TotalCharges/tenure,MonthlyCharges,Contract_Month-to-month,OnlineSecurity_No,PaymentMethod_Electronic check,InternetService_Fiber optic,TechSupport_No,gender_Male,...,InternetService_No,StreamingMovies_No internet service,tenure_group_65 - 76,MultipleLines_No phone service,PhoneService_Yes,DeviceProtection_No internet service,StreamingTV_No internet service,OnlineSecurity_No internet service,TechSupport_No internet service,OnlineBackup_No internet service
Rf_Selection,0.1238058,0.1227229,0.1126651,0.1089062,0.054256,0.02786365,0.0258145,0.02537796,0.02388742,0.022532,...,0.004892698,0.004018591,0.003502278,0.002893143,0.002809149,0.002796525,0.002380476,0.001800147,0.001426213,0.001122256
PCA_Selection,9.111932000000001e-33,9.111932000000001e-33,9.111932000000001e-33,9.111932000000001e-33,2.046062e-10,5.022483e-08,9.111932000000001e-33,5.44599e-08,2.699725e-08,0.999794,...,5.269234e-08,4.997204e-09,9.111932000000001e-33,6.94701e-08,1.069592e-07,3.657477e-08,2.29869e-08,4.861269e-08,2.564714e-08,4.203763e-08
MRMR,1032.182,296.0894,960.5524,699.8181,46077020.0,1029.727,992.6905,789.3753,1110.177,26.063422,...,448.7587,320.924,488.221,13.53051,11.0921,375.1804,335.0048,428.637,357.5197,398.6753
RFE_Selection,16.0,25.0,24.0,22.0,1.0,1.0,1.0,9.0,5.0,20.0,...,14.0,1.0,1.0,1.0,11.0,7.0,1.0,10.0,15.0,1.0
Lasso_Selection,1.0835e-06,-0.00010835,0.004218691,0.003821651,1.0835e-06,1.0835e-06,1.0835e-06,1.0835e-06,1.0835e-06,1e-06,...,1.0835e-06,1.0835e-06,1.0835e-06,1.0835e-06,1.0835e-06,1.0835e-06,1.0835e-06,1.0835e-06,1.0835e-06,1.0835e-06
Shap_Selection,0.5242043,0.3966729,0.3711375,0.3178379,0.8569837,0.2682331,0.1745113,0.1936614,0.1906011,0.068919,...,9.445263e-06,9.445263e-06,0.03304113,9.445263e-06,0.03426257,9.445263e-06,9.445263e-06,9.445263e-06,9.445263e-06,9.445263e-06
GBM_Selection,0.1860693,0.04372712,0.04422682,0.04671501,0.3838191,0.06823581,0.03582709,0.08157069,0.04158225,0.000926,...,8.779772e-07,8.779772e-07,8.779772e-07,0.0003543818,0.002472887,8.779772e-07,8.779772e-07,8.779772e-07,8.779772e-07,8.779772e-07
Catboost_Selection,12.6902,9.807683,10.92686,10.11001,6.448553,2.671594,2.576723,2.476236,1.919217,2.388539,...,0.1249704,0.05951262,0.311193,0.4719126,0.2919373,0.2064113,0.5130164,0.2774702,0.2147723,0.01154388
Xgb_Selection,0.01196427,0.007342538,0.008144485,0.00754316,0.3331451,0.02275703,0.009797708,0.2914353,0.01841668,0.00671,...,3.734523e-05,3.734523e-05,0.004644521,3.734523e-05,0.0173445,3.734523e-05,3.734523e-05,3.734523e-05,3.734523e-05,3.734523e-05


In [17]:
zeros_count = (importances == 0).sum().sum()
zeros_count

0

In [18]:
importances.drop('PCA_Selection', axis='index')

multiplications = importances.apply(lambda x: abs(x).prod(), axis=0)

# Sort the values in descending order
multiplications_sorted = multiplications.sort_values(ascending=False)

# Create a DataFrame from the sorted values
multiplications_df = pd.DataFrame({'Column': multiplications_sorted.index, 'Multiplication': multiplications_sorted.values})

# Save the DataFrame as a CSV file
multiplications_df.to_csv('importance_values.csv', index=False)

# Print the DataFrame
print(multiplications_df)

                                     Column  Multiplication
0                   Contract_Month-to-month    3.916290e-10
1                               gender_Male    1.301266e-11
2                           SeniorCitizen_1    2.958674e-13
3               InternetService_Fiber optic    1.212830e-13
4                         OnlineSecurity_No    1.737474e-15
5                            TechSupport_No    1.086548e-15
6                            Dependents_Yes    5.538222e-17
7                           OnlineBackup_No    3.422399e-17
8                       InternetService_DSL    9.358941e-19
9                          MultipleLines_No    2.149437e-19
10                              Partner_Yes    4.392301e-20
11                         PhoneService_Yes    1.704134e-20
12                      DeviceProtection_No    1.212274e-20
13                         OnlineBackup_Yes    1.067402e-20
14                          TechSupport_Yes    6.595645e-21
15                           StreamingTV