In [1]:
import pandas as pd
import math
from sklearn.metrics import mutual_info_score
import numpy as np
import math
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from scipy.stats import entropy
from sklearn.model_selection import cross_val_score
import importlib

In [2]:
name_df = "mushrooms"
# Read the dataset named by `name_df` and cast every column to str.
df = pd.read_csv(f"./datasets/{name_df}.csv").astype(str)
df.dtypes

Cap-shape                   object
Cap-surface                 object
Cap-color                   object
Bruises                     object
Odor                        object
Gill-attachment             object
Gill-spacing                object
Gill-size                   object
Gill-color                  object
Stalk-shape                 object
Stalk-surface-above-ring    object
Stalk-surface-below-ring    object
Stalk-color-above-ring      object
Stalk-color-below-ring      object
Veil-type                   object
Veil-color                  object
Ring-number                 object
Ring-type                   object
Spore-print-color           object
Population                  object
Habitat                     object
Class                       object
dtype: object

In [3]:
def transform_df(df):
    """Return an integer-encoded copy of ``df``.

    Rows containing missing values are dropped, then every column is
    re-coded so that its distinct values become 1, 2, 3, ... in order of
    first appearance. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are to be encoded.

    Returns
    -------
    pandas.DataFrame
        Frame with the same columns and (remaining) index, holding the
        integer codes.
    """
    df_t = df.dropna().copy()
    for column in df_t.columns:
        # factorize numbers values 0..k-1 by first appearance; shift to 1..k.
        # Assigning the result back to the frame (instead of an in-place
        # replace on a column selection) is what makes the change stick.
        codes, _ = pd.factorize(df_t[column])
        df_t[column] = codes + 1
    return df_t

In [4]:
df_processed = transform_df(df)
instances = len(df_processed)  # number of rows in the encoded table
# Separate the class column from the remaining (feature) columns.
target = df_processed["Class"]
features = df_processed.drop(columns="Class")
# Feature name -> positional index of that column within `features`.
dict_features = dict(zip(features.columns, range(features.shape[1])))
dict_features

{'Cap-shape': 0,
 'Cap-surface': 1,
 'Cap-color': 2,
 'Bruises': 3,
 'Odor': 4,
 'Gill-attachment': 5,
 'Gill-spacing': 6,
 'Gill-size': 7,
 'Gill-color': 8,
 'Stalk-shape': 9,
 'Stalk-surface-above-ring': 10,
 'Stalk-surface-below-ring': 11,
 'Stalk-color-above-ring': 12,
 'Stalk-color-below-ring': 13,
 'Veil-type': 14,
 'Veil-color': 15,
 'Ring-number': 16,
 'Ring-type': 17,
 'Spore-print-color': 18,
 'Population': 19,
 'Habitat': 20}

In [9]:
def calculate_entropy(probs):
    """Compute the entropy (natural logarithm) of a probability distribution.

    Parameters
    ----------
    probs : iterable of float
        Probabilities of each outcome.

    Returns
    -------
    float
        ``-sum(p * log(p))`` over the given probabilities. Zero
        probabilities contribute nothing (the limit of ``p*log(p)`` as
        ``p -> 0`` is 0) instead of raising from ``math.log(0)``.

    Raises
    ------
    ValueError
        If a probability is negative (raised by ``math.log``).
    """
    return -sum(prob * math.log(prob) for prob in probs if prob != 0)

def calculate_SU(X,Y):
    """Compute the symmetrical uncertainty between two discrete variables.

    ``SU(X, Y) = 2 * I(X; Y) / (H(X) + H(Y))`` where ``I`` is the mutual
    information and ``H`` the entropy.

    Parameters
    ----------
    X, Y : pandas.Series
        Discrete variables of equal length (``value_counts`` is called on
        both).

    Returns
    -------
    float
        The symmetrical uncertainty. ``0.0`` is returned when both
        entropies are zero (both variables constant), where the ratio
        would otherwise be 0/0.
    """
    # scipy's entropy normalises its input to sum to 1, so the raw value
    # counts can be passed directly; no external row count is needed.
    entropy_feature = entropy(X.value_counts())
    entropy_target = entropy(Y.value_counts())

    # Mutual information between the feature and the target variable.
    mutual_info_features_target = mutual_info_score(X, Y)

    denominator = entropy_feature + entropy_target
    if denominator == 0:
        return 0.0
    return 2*mutual_info_features_target / denominator


### 1º paso: Eliminar aquellas variables cuyo SU(Xi, Y) sea menor que el umbral (threshold) calculado a partir de los valores de SU

In [10]:
# Symmetrical uncertainty of each feature column with respect to the target.
dict_SU_features_target = dict(
    (name, calculate_SU(df_processed[name], target))
    for name in features.columns
)
dict_SU_features_target

{'Cap-shape': 0.0368005254052444,
 'Cap-surface': 0.022209850307795632,
 'Cap-color': 0.020545522828280715,
 'Bruises': 0.19448033620905208,
 'Odor': 0.5460779258983215,
 'Gill-attachment': 0.024168343342506778,
 'Gill-spacing': 0.12325778240100983,
 'Gill-size': 0.24337908547768705,
 'Gill-color': 0.20696237381017535,
 'Stalk-shape': 0.0075697792076871405,
 'Stalk-surface-above-ring': 0.25646151328985767,
 'Stalk-surface-below-ring': 0.226748557331665,
 'Stalk-color-above-ring': 0.1729263153661067,
 'Stalk-color-below-ring': 0.16217457008666625,
 'Veil-type': 0.0,
 'Veil-color': 0.03985090391574468,
 'Ring-number': 0.05416830043027027,
 'Ring-type': 0.25098480442071186,
 'Spore-print-color': 0.30022524254940297,
 'Population': 0.13452809448251174,
 'Habitat': 0.09581091169526226}

In [11]:
max_SU = max(dict_SU_features_target.values())
number_features = features.shape[1]
# SU scores ranked from highest to lowest: the rank-based threshold must be
# read from this ordering, not from the column order of the dataset.
ranked_SU = sorted(dict_SU_features_target.values(), reverse=True)
# Rank D / log(D) of the feature whose SU acts as the second threshold
# candidate. log(1) == 0, so a single feature is handled explicitly, and the
# rank is clamped to [1, D] so that it always addresses an existing feature.
value = round(number_features / math.log(number_features)) if number_features > 1 else 1
value = max(1, min(value, number_features))
SU_th = ranked_SU[value - 1]  # `value` is a 1-based rank
# Keep the more permissive of the two criteria: 10% of the best SU, or the
# SU of the feature at rank `value`.
threshold_filter = min(0.1*max_SU,SU_th)
dict_SU_features_target_filter = {k:v for k,v in dict_SU_features_target.items() if v >= threshold_filter}
# Order the surviving features by decreasing SU.
dict_SU_features_target_filter = dict(sorted(dict_SU_features_target_filter.items(), key=lambda item: item[1], reverse=True))
dict_SU_features_target_filter

{'Odor': 0.5460779258983215,
 'Spore-print-color': 0.30022524254940297,
 'Stalk-surface-above-ring': 0.25646151328985767,
 'Ring-type': 0.25098480442071186,
 'Gill-size': 0.24337908547768705,
 'Stalk-surface-below-ring': 0.226748557331665,
 'Gill-color': 0.20696237381017535,
 'Bruises': 0.19448033620905208,
 'Stalk-color-above-ring': 0.1729263153661067,
 'Stalk-color-below-ring': 0.16217457008666625,
 'Population': 0.13452809448251174,
 'Gill-spacing': 0.12325778240100983,
 'Habitat': 0.09581091169526226}

### SUBGROUP DISCOVERY

In [20]:
from SD import algorithm as algorithm
from SD.utils import measures as measures
from SD.utils import subgroup as subgroup

In [21]:
# Columns to keep: the features that passed the SU filter, followed by the class column.
index_columns = [*dict_SU_features_target_filter, "Class"]
df = df.loc[:, index_columns]

In [23]:
# Reload the SD modules so that edits made to them on disk are picked up.
importlib.reload(algorithm)
importlib.reload(measures)
importlib.reload(subgroup)

searchspace = algorithm.create_nominal_selectors(df)
# Run one subgroup-discovery search per distinct class value and write each
# result to its own CSV file. The loop uses its own variable names so that
# the notebook-level `target` Series (the class column used by the SU cells)
# is not overwritten by this cell.
for class_value in df["Class"].unique():
    sd_target = ["Class", class_value]
    task = algorithm.SubgroupDiscoveryTask(
        df,
        sd_target,
        searchspace,
        name_df,
        depth=df.shape[1]-1,  # number of columns other than "Class"
        alpha=1,
    )
    df_aux = algorithm.SubgoupDiscoverySearch().execute(task)
    # NOTE: the ./results/<name_df>/FS/ directory must already exist.
    df_aux.to_csv("./results/"+str(name_df)+"/FS/igsd_FS_"+str(class_value)+".csv",index=False)
        

Iteration 0 of 13. Number of patterns: 1
Iteration 1 of 13. Number of patterns: 16
Iteration 2 of 13. Number of patterns: 63
Iteration 3 of 13. Number of patterns: 13
Iteration 4 of 13. Number of patterns: 7
Iteration 0 of 13. Number of patterns: 1
Iteration 1 of 13. Number of patterns: 16
Iteration 2 of 13. Number of patterns: 56
Iteration 3 of 13. Number of patterns: 37
Iteration 4 of 13. Number of patterns: 15
