In [1]:
import numpy as np
import pandas as pd
from typing import Dict
import selection_methods as smtd

In [2]:
subclasses = set()
work = [smtd.Feature_Selection]
while work:
    parent = work.pop()
    for child in parent.__subclasses__():
        if child not in subclasses:
            subclasses.add(child)
            work.append(child)
METHODS = list(subclasses)

In [3]:
METHODS

[selection_methods.RFE_Selection,
 selection_methods.Lasso_Selection,
 selection_methods.Xgb_Selection,
 selection_methods.Shap_Selection,
 selection_methods.GBM_Selection,
 selection_methods.Catboost_Selection,
 selection_methods.Rf_Selection,
 selection_methods.MRMR,
 selection_methods.PCA_Selection]

In [4]:
df = pd.concat((pd.read_csv('../data/Telco-Customer-Churn-encoded-data-FE.csv'), pd.read_csv('../data/Telco-Customer-Churn-encoded-label.csv')), axis=1)

In [5]:
df.replace([np.inf, -np.inf], 0, inplace=True)


In [6]:
df.columns

Index(['gender_Male', 'SeniorCitizen_1', 'Partner_Yes', 'Dependents_Yes',
       'PhoneService_Yes', 'MultipleLines_No',
       'MultipleLines_No phone service', 'MultipleLines_Yes',
       'InternetService_DSL', 'InternetService_Fiber optic',
       'InternetService_No', 'OnlineSecurity_No',
       'OnlineSecurity_No internet service', 'OnlineSecurity_Yes',
       'OnlineBackup_No', 'OnlineBackup_No internet service',
       'OnlineBackup_Yes', 'DeviceProtection_No',
       'DeviceProtection_No internet service', 'DeviceProtection_Yes',
       'TechSupport_No', 'TechSupport_No internet service', 'TechSupport_Yes',
       'StreamingTV_No', 'StreamingTV_No internet service', 'StreamingTV_Yes',
       'StreamingMovies_No', 'StreamingMovies_No internet service',
       'StreamingMovies_Yes', 'Contract_Month-to-month', 'Contract_One year',
       'Contract_Two year', 'PaperlessBilling_Yes',
       'PaymentMethod_Bank transfer (automatic)',
       'PaymentMethod_Credit card (automatic)',
  

In [7]:
data = df.drop('Churn', axis=1)
target = df['Churn']

In [8]:
importances = pd.DataFrame()

for _ in METHODS:
    method_name = _.__name__
    print(f'Method: {method_name}')
    model = _(data, target=target)
    model.fit()
    print(f'Feature importances:\n {model.get_importances()}')
    new_df = pd.DataFrame(model.get_importances(), index=[method_name])
    importances = pd.concat([importances, new_df])

Method: RFE_Selection
Feature importances:
 {'TotalCharges': 23, 'TotalCharges/tenure': 22, 'MonthlyCharges': 21, 'DeviceProtection_Yes': 20, 'tenure': 19, 'gender_Male': 18, 'Partner_Yes': 17, 'Dependents_Yes': 16, 'DeviceProtection_No': 15, 'SeniorCitizen_1': 14, 'tenure_group': 13, 'PaperlessBilling_Yes': 12, 'Monthly/Total_Charges': 11, 'InternetService_No': 10, 'OnlineBackup_No': 9, 'OnlineBackup_Yes': 8, 'TechSupport_Yes': 7, 'TechSupport_No': 6, 'StreamingTV_Yes': 5, 'StreamingTV_No': 4, 'MultipleLines_No phone service': 3, 'OnlineSecurity_No internet service': 2, 'PhoneService_Yes': 1, 'MultipleLines_No': 1, 'MultipleLines_Yes': 1, 'InternetService_DSL': 1, 'InternetService_Fiber optic': 1, 'OnlineSecurity_No': 1, 'OnlineSecurity_Yes': 1, 'OnlineBackup_No internet service': 1, 'DeviceProtection_No internet service': 1, 'TechSupport_No internet service': 1, 'StreamingTV_No internet service': 1, 'StreamingMovies_No': 1, 'StreamingMovies_No internet service': 1, 'StreamingMovies_Y

100%|██████████| 43/43 [00:07<00:00,  5.81it/s]
100%|██████████| 43/43 [00:07<00:00,  5.75it/s]
100%|██████████| 43/43 [00:07<00:00,  5.58it/s]

Feature importances:
 {'Contract_Month-to-month': 44867693.57400776, 'PaymentMethod_Mailed check': 1837.5254123490213, 'MonthlyCharges': 1411.6780745684919, 'Monthly/Total_Charges': 957.3017786689212, 'tenure': 949.7403718978271, 'OnlineSecurity_No': 948.9391860971717, 'TechSupport_No': 929.3885761034352, 'InternetService_Fiber optic': 908.9493186361709, 'PaymentMethod_Electronic check': 862.2263658816254, 'tenure_group': 796.4821944497849, 'Contract_Two year': 683.9243337226302, 'OnlineBackup_No': 678.9734214030833, 'SeniorCitizen_1': 549.4572788429726, 'DeviceProtection_No': 528.3549477914346, 'PaperlessBilling_Yes': 527.4132227908558, 'InternetService_No': 477.35689337200455, 'Contract_One year': 442.2081325139821, 'Dependents_Yes': 429.3188144462161, 'OnlineSecurity_No internet service': 414.9617656878607, 'OnlineBackup_No internet service': 394.29273451834837, 'Partner_Yes': 382.0462011256756, 'DeviceProtection_No internet service': 371.67176208815954, 'OnlineSecurity_Yes': 358.62




In [9]:
importances

Unnamed: 0,TotalCharges,TotalCharges/tenure,MonthlyCharges,DeviceProtection_Yes,tenure,gender_Male,Partner_Yes,Dependents_Yes,DeviceProtection_No,SeniorCitizen_1,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
RFE_Selection,23.0,22.0,21.0,20.0,19.0,18.0,17.0,16.0,15.0,14.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Lasso_Selection,-8.180496e-05,0.003949209,0.003218145,-0.0,-0.002120989,-0.0,-0.0,-0.0,0.0,0.0,...,0.0,-0.0,0.0,0.0,-0.0,-0.0,-0.0,-0.0,0.0,-0.0
Xgb_Selection,0.009501263,0.009169737,0.009158291,0.005777318,0.008393361,0.007556,0.005589,0.008352,0.005083259,0.006929,...,0.01019804,0.0,0.01200656,0.33347,0.0143575,0.01638632,0.006945182,0.006517187,0.01149953,0.006196208
Shap_Selection,0.2975569,0.3245621,0.3771491,0.01593113,0.2831978,0.065575,0.04023,0.045734,0.03005718,0.097508,...,0.04240838,0.0,0.06432658,0.8437142,0.1293914,0.2006349,0.05737625,0.03466719,0.1914869,0.05439763
GBM_Selection,0.04208893,0.04638008,0.04124097,0.0004574179,0.009181371,0.001737,0.0,0.002904,0.0,0.007842,...,0.0,4.716326e-05,0.004761454,0.3792985,0.005203307,0.00608709,0.002211407,0.0006784975,0.03894626,0.001083515
Catboost_Selection,10.06677,9.637562,10.39317,0.5788174,6.336968,2.490836,1.565176,1.750581,0.485416,2.225095,...,0.5655437,0.5717536,0.9005274,5.485722,1.586317,3.189817,1.519586,1.361056,2.425483,1.413549
Rf_Selection,0.1091628,0.1087447,0.1045853,0.009322163,0.08300387,0.021131,0.016881,0.014303,0.01166995,0.015071,...,0.00835713,0.004532442,0.009038437,0.04916054,0.007872967,0.017484,0.009733994,0.01008756,0.02201728,0.009534494
MRMR,299.8323,275.7412,1411.678,28.29668,949.7404,95.489931,382.046201,429.318814,528.3549,549.457279,...,169.4544,319.1564,57.58458,44867690.0,442.2081,683.9243,327.3879,338.019,862.2264,1837.525
PCA_Selection,9.148514e-33,9.148514e-33,9.148514e-33,2.776109e-08,9.148514e-33,0.999757,1.8e-05,1e-06,3.917533e-08,0.000222,...,9.148514e-33,9.148514e-33,9.148514e-33,9.148514e-33,9.148514e-33,9.148514e-33,9.148514e-33,9.148514e-33,9.148514e-33,9.148514e-33


In [10]:
importances.index

Index(['RFE_Selection', 'Lasso_Selection', 'Xgb_Selection', 'Shap_Selection',
       'GBM_Selection', 'Catboost_Selection', 'Rf_Selection', 'MRMR',
       'PCA_Selection'],
      dtype='object')

In [11]:
for ind in importances.index:
    values_list = importances.loc[ind].tolist()
    non_zero_abs_values = [abs(value) for value in values_list if value != 0]
    smallest_non_zero = min(non_zero_abs_values)
    for col in importances.columns:
        if importances.loc[ind, col] == 0.0:
            importances.loc[ind, col] = smallest_non_zero / 100

In [12]:
importances

Unnamed: 0,TotalCharges,TotalCharges/tenure,MonthlyCharges,DeviceProtection_Yes,tenure,gender_Male,Partner_Yes,Dependents_Yes,DeviceProtection_No,SeniorCitizen_1,...,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
RFE_Selection,23.0,22.0,21.0,20.0,19.0,18.0,17.0,16.0,15.0,14.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Lasso_Selection,-8.180496e-05,0.003949209,0.003218145,8.180496e-07,-0.002120989,8.180496e-07,8.180496e-07,8.180496e-07,8.180496e-07,8.180496e-07,...,8.180496e-07,8.180496e-07,8.180496e-07,8.180496e-07,8.180496e-07,8.180496e-07,8.180496e-07,8.180496e-07,8.180496e-07,8.180496e-07
Xgb_Selection,0.009501263,0.009169737,0.009158291,0.005777318,0.008393361,0.007555894,0.00558914,0.008352404,0.005083259,0.006928868,...,0.01019804,3.924814e-05,0.01200656,0.33347,0.0143575,0.01638632,0.006945182,0.006517187,0.01149953,0.006196208
Shap_Selection,0.2975569,0.3245621,0.3771491,0.01593113,0.2831978,0.0655745,0.04022958,0.04573368,0.03005718,0.09750839,...,0.04240838,8.343856e-05,0.06432658,0.8437142,0.1293914,0.2006349,0.05737625,0.03466719,0.1914869,0.05439763
GBM_Selection,0.04208893,0.04638008,0.04124097,0.0004574179,0.009181371,0.001736675,3.848828e-07,0.002904435,3.848828e-07,0.007841896,...,3.848828e-07,4.716326e-05,0.004761454,0.3792985,0.005203307,0.00608709,0.002211407,0.0006784975,0.03894626,0.001083515
Catboost_Selection,10.06677,9.637562,10.39317,0.5788174,6.336968,2.490836,1.565176,1.750581,0.485416,2.225095,...,0.5655437,0.5717536,0.9005274,5.485722,1.586317,3.189817,1.519586,1.361056,2.425483,1.413549
Rf_Selection,0.1091628,0.1087447,0.1045853,0.009322163,0.08300387,0.02113092,0.01688149,0.01430315,0.01166995,0.01507133,...,0.00835713,0.004532442,0.009038437,0.04916054,0.007872967,0.017484,0.009733994,0.01008756,0.02201728,0.009534494
MRMR,299.8323,275.7412,1411.678,28.29668,949.7404,95.48993,382.0462,429.3188,528.3549,549.4573,...,169.4544,319.1564,57.58458,44867690.0,442.2081,683.9243,327.3879,338.019,862.2264,1837.525
PCA_Selection,9.148514e-33,9.148514e-33,9.148514e-33,2.776109e-08,9.148514e-33,0.9997572,1.847671e-05,1.000797e-06,3.917533e-08,0.0002222596,...,9.148514e-33,9.148514e-33,9.148514e-33,9.148514e-33,9.148514e-33,9.148514e-33,9.148514e-33,9.148514e-33,9.148514e-33,9.148514e-33


In [13]:
zeros_count = (importances == 0).sum().sum()
zeros_count

0

In [14]:
importances.drop('PCA_Selection', axis='index')

multiplications = importances.apply(lambda x: abs(x).prod(), axis=0)

# Sort the values in descending order
multiplications_sorted = multiplications.sort_values(ascending=False)

# Create a DataFrame from the sorted values
multiplications_df = pd.DataFrame({'Column': multiplications_sorted.index, 'Multiplication': multiplications_sorted.values})

# Save the DataFrame as a CSV file
multiplications_df.to_csv('importance_values.csv', index=False)

# Print the DataFrame
print(multiplications_df)

                                     Column  Multiplication
0                               gender_Male    6.366593e-11
1                           SeniorCitizen_1    2.485007e-13
2               InternetService_Fiber optic    9.878054e-15
3                         OnlineSecurity_No    9.971156e-16
4                            TechSupport_No    8.432137e-16
5                            Dependents_Yes    1.562243e-16
6                           OnlineBackup_No    1.379537e-17
7                          MultipleLines_No    5.023242e-19
8                               Partner_Yes    2.244718e-19
9                          OnlineBackup_Yes    4.049447e-20
10                           StreamingTV_No    7.521617e-21
11                          TechSupport_Yes    4.945972e-21
12                     DeviceProtection_Yes    2.919617e-21
13                         PhoneService_Yes    1.991751e-21
14                        MultipleLines_Yes    1.657404e-21
15                       InternetService