# Algorithm

# Code

In [1]:
import numpy as np
import pandas as pd
   
class ReductSet:
    def __init__(self, cond_attributes, dec_attr):
        self.cond_attributes = cond_attributes
        self.dec_attr = dec_attr
        self.similarities = self.similarity(self.cond_attributes)
        self.decision_matrix = self.decisional_matrix(self.dec_attr)
    
    # Make the decision matrix
    def decisional_matrix(self, decision_attribute):
        self.decision_attribute = decision_attribute.tolist()
        self.decision_mat = [] # list []
        for i, data1 in enumerate(self.decision_attribute):
            self.temp = []
            for j, data2 in enumerate(self.decision_attribute):
                if(data1 == data2):
                    self.temp.append(1)
                else:
                    self.temp.append(0)
            self.decision_mat.append(self.temp)
        return self.decision_mat

    # Calculate the Jaccard similarity coefficient between two sets
    def jaccard_similarity(self, s1, s2):
        intersection = len(s1.intersection(s2))
        sqrt_cardinalities = np.sqrt(len(np.array(list(s1))) * len(np.array(list(s2))))
        return intersection/sqrt_cardinalities if intersection != 0 else 0

    # Calculate the similarity
    def calculate_similarity(self, c):
        ans = []
        for i in range(len(c)):
            temp = []
            for j in range(len(c)):
                j_s = self.jaccard_similarity(c[i], c[j])
                temp.append(float(format(j_s,'.2f')))
            ans.append(temp)
        return ans

    # Find similarity of each attribute
    def similarity(self, conditional_matrix):
        similarities = []
        for col in conditional_matrix.columns:
            c = conditional_matrix[col]
            s_c = self.calculate_similarity(c)
            similarities.append(s_c)
        # similarities
        return similarities

    # Finding the information entropy of each conditional attribute
    def find_information_entropy(self, sm):
        information_entropy = 0
        for i, row in enumerate(sm):
            information_entropy = information_entropy + np.log2(np.sum(row)/5)
        information_entropy = -information_entropy/5
        return information_entropy

    # Find decisional entropy
    def find_conditional_entropy(self, similarity_matrix, decision_matrix):
        decisional_entropy = 0
        for row1, row2 in zip(similarity_matrix, decision_matrix):
            r = np.sum([np.min((s1, dm1)) for s1, dm1 in zip(row1, row2)])
            decisional_entropy = decisional_entropy + np.log2(r/np.sum(row1))
        return -decisional_entropy/5

    # Find the mutual information gain ratio
    def gain_ratio(self, information_entropy, conditional_entropy, decisional_entropy):
        gainRatio = (decisional_entropy - conditional_entropy)/information_entropy
        return gainRatio

    def equivalence_class(self, similarities_matrix, indexes):
        min_values = [] 
        for row_index in range(len(similarities_matrix[0])):
            min_row_values = [min(similarities_matrix[idx][row_index][col_index] for idx in indexes) for col_index in range(len(similarities_matrix[0][0]))]
            min_values.append(min_row_values)
        return min_values
    
    def Reduct(self):
        max_gr = float('-inf')
        selected_attribute = None
        p = set()
        start = True
        while start:
            if p:
                """
                Calculate mutual information entropy:
                    GR(C,P,D) = G(C,P,D)/H({C})
                              = (H(D|P) - H(D|PU{C}))/H({C})
                """
                max_gr = float('-inf')
                selected_attribute = None
                for i, col in enumerate(self.cond_attributes.columns):
                    if col not in p:
                        new_p = p.copy()  
                        new_p.add(col)  
                        
                        # conditional entropy H(D|PU{C})
                        print("==============================================================================")
                        index_u = [self.cond_attributes.columns.get_loc(col) for col in new_p]
                        equiv_u = self.equivalence_class(self.similarities, index_u)
                        condi_entropy = float(format(self.find_conditional_entropy(equiv_u, self.decision_matrix),'.2f'))
                        # print(f"Matrix of {new_p}\n {equiv_u} ")
                        print(f"Conditional Entropy {new_p} {condi_entropy}")
                        
                        # infromaton entropy of the column H(C)
                        inf_entropy_col = float(format(self.find_information_entropy(self.calculate_similarity(list(self.cond_attributes[col]))),'.2f'))
                        print(f"Information Entropy {col} {inf_entropy_col}")
                        
                        # conditional entropy H(D|P)
                        index = [self.cond_attributes.columns.get_loc(col) for col in p]
                        equiv = self.equivalence_class(self.similarities, index)
                        cond_entropy_dec_reduct = float(format(self.find_conditional_entropy(equiv, self.decision_matrix),'.2f'))
                        print(f"Conditional Entorpy {p} {cond_entropy_dec_reduct}")

                        # gain ratio
                        gr = self.gain_ratio(decisional_entropy= cond_entropy_dec_reduct, conditional_entropy= condi_entropy, information_entropy = inf_entropy_col)
                        print(f"Gain Ratio {new_p} {gr}")
                        print("==============================================================================")
                        
                        # gr = self.gain_ratio(decisional_entropy=cond_entropy_dec_reduct, conditional_entropy=condi_entropy, information_entropy=inf_entropy_col)
                        
                        if gr > max_gr:
                            max_gr = gr
                            selected_attribute = col
            else:
                for i, sim in enumerate(self.cond_attributes.columns):
                    similarity = self.calculate_similarity(self.cond_attributes[sim].values)
                    print("==============================================================================")
                    # print(f"\nSimiliarity for {sim}\n{similarity}\n")
                    
                    dec_entropy = self.find_information_entropy(self.decision_matrix)
                    condi_entropy = self.find_conditional_entropy(similarity, self.decision_matrix)
                    print(f"\nConditional Entropy for {sim} : {condi_entropy}\n")
                    info_entropy = self.find_information_entropy(similarity)
                    print(f"\nInformation Entropy for {sim} : {info_entropy}")
                    
                    gr = self.gain_ratio(information_entropy=info_entropy, conditional_entropy=condi_entropy, decisional_entropy=dec_entropy)
                    print(f"\nGain Ratio for {sim} : {gr}\n")
                    print("==============================================================================")
                    if gr > max_gr:
                        max_gr = gr
                        selected_attribute = self.cond_attributes.columns[i]

            if max_gr > 0:
                p.add(selected_attribute)
            else:
                start = False
        return p



# Zoo data

In [2]:
data = pd.read_csv('zoo data/zoo.data')
data.head()

Unnamed: 0,aardvark,1,0,0.1,1.1,0.2,0.3,1.2,1.3,1.4,1.5,0.4,0.5,4,0.6,0.7,1.6,1.7
0,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
1,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
2,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
3,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
4,buffalo,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1


In [3]:
data.shape

(100, 18)

In [4]:
data.columns = ['animal_name', 'hair', 'feathers', 'eggs', 'milk', 'airborne', 'aquatic',' predator', 'toothed', 'backbone'
               , 'breathes', 'venomous', 'fins', 'legs', 'tail', 'domestic', 'catsize', 'type']

In [5]:
data.head()

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize,type
0,antelope,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1
1,bass,0,0,1,0,0,1,1,1,1,0,0,1,0,1,0,0,4
2,bear,1,0,0,1,0,0,1,1,1,1,0,0,4,0,0,1,1
3,boar,1,0,0,1,0,0,1,1,1,1,0,0,4,1,0,1,1
4,buffalo,1,0,0,1,0,0,0,1,1,1,0,0,4,1,0,1,1


In [6]:
X = data.drop('type', axis = 1)
y = data['type']

In [7]:
for col in X.columns:
    X[col] = X[col].apply(lambda x: {x})

In [8]:
X.head()

Unnamed: 0,animal_name,hair,feathers,eggs,milk,airborne,aquatic,predator,toothed,backbone,breathes,venomous,fins,legs,tail,domestic,catsize
0,{antelope},{1},{0},{0},{1},{0},{0},{0},{1},{1},{1},{0},{0},{4},{1},{0},{1}
1,{bass},{0},{0},{1},{0},{0},{1},{1},{1},{1},{0},{0},{1},{0},{1},{0},{0}
2,{bear},{1},{0},{0},{1},{0},{0},{1},{1},{1},{1},{0},{0},{4},{0},{0},{1}
3,{boar},{1},{0},{0},{1},{0},{0},{1},{1},{1},{1},{0},{0},{4},{1},{0},{1}
4,{buffalo},{1},{0},{0},{1},{0},{0},{0},{1},{1},{1},{0},{0},{4},{1},{0},{1}


In [9]:
red_set = ReductSet(X, y)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for animal_name : -0.0


Information Entropy for animal_name : 46.038561897747336

Gain Ratio for animal_name : -0.8343332460740805


Conditional Entropy for hair : 32.2891568612718


Information Entropy for hair : -66.8094839970743

Gain Ratio for hair : 1.0582428634997079


Conditional Entropy for feathers : 33.58849720726998


Information Entropy for feathers : -72.0

Gain Ratio for feathers : 1.0


Conditional Entropy for eggs : 31.506256463900037


Information Entropy for eggs : -66.90855252259077

Gain Ratio for eggs : 1.044974919058715


Conditional Entropy for milk : 28.608047215923875


Information Entropy for milk : -67.01955000865382

Gain Ratio for milk : 1.000000000000001


Conditional Entropy for airborne : 38.603186552938475


Information Entropy for airborne : -70.53775631005676

Gain Ratio for airborne : 1.0918222151430736


Conditional Entropy for aquatic : 40.241110638358066


Information Entropy for aquatic : -67.58489811263735

Gain Ratio for a

In [15]:
select_zoo_attribute = list(reduct_set)
zoo_selected_feature = data[select_zoo_attribute]
zoo_selected_feature['type'] = data['type']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zoo_selected_feature['type'] = data['type']


In [17]:
zoo_selected_feature.to_csv('selected_feature_zoo.csv')

In [19]:
data.to_csv('zoo.csv')

In [102]:
data.to_csv('zoo.csv')

# Hepatitis data

In [20]:
hepitie_data = pd.read_csv('hepititis/hepatitis.data')
hepitie_data.head()

Unnamed: 0,2,30,2.1,1,2.2,2.3,2.4,2.5,1.1,2.6,2.7,2.8,2.9,2.10,1.00,85,18,4.0,?,1.2
0,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,?,1
1,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,?,1
2,2,31,1,?,1,2,2,2,2,2,2,2,2,2,0.7,46,52,4.0,80,1
3,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.0,?,200,4.0,?,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.9,95,28,4.0,75,1


In [21]:
hepitie_data.columns = [
    'class', 'age', 'sex', 'steroid', 'antivirals', 'fatigue', 'malaise', 'anorexia', 'liver big', 'liver firm',
    'Spleen Palpable', 'Spiders', 'Ascites', 'Varices', 'Bilirubin', 'Alk Phosphate', 'Sgot', 'Albumin', 'Protime','histology'
]

In [25]:
hepitie_data.to_csv('hepitie.csv')

In [22]:
hepitie_data.head()

Unnamed: 0,class,age,sex,steroid,antivirals,fatigue,malaise,anorexia,liver big,liver firm,Spleen Palpable,Spiders,Ascites,Varices,Bilirubin,Alk Phosphate,Sgot,Albumin,Protime,histology
0,2,50,1,1,2,1,2,2,1,2,2,2,2,2,0.9,135,42,3.5,?,1
1,2,78,1,2,2,1,2,2,2,2,2,2,2,2,0.7,96,32,4.0,?,1
2,2,31,1,?,1,2,2,2,2,2,2,2,2,2,0.7,46,52,4.0,80,1
3,2,34,1,2,2,2,2,2,2,2,2,2,2,2,1.0,?,200,4.0,?,1
4,2,34,1,2,2,2,2,2,2,2,2,2,2,2,0.9,95,28,4.0,75,1


In [23]:
for col in hepitie_data.columns:
    s = set(hepitie_data[col].values.tolist())
    if '?' in s:
        s.remove('?')  # Remove the '?' value from the set
        hepitie_data[col] = hepitie_data[col].apply(lambda x: s if x == '?' else x)

In [26]:
X_hepitie = hepitie_data.drop('class', axis = 1)
y_hepitie = hepitie_data['class']

In [27]:
for col in X_hepitie.columns:
    X_hepitie[col] = X_hepitie[col].apply(lambda x: {x} if type(x) != set else x)

In [28]:
red_set = ReductSet(X_hepitie, y_hepitie)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for age : 14.8477661915055


Information Entropy for age : 11.458560464954893

Gain Ratio for age : -12.605629856147733


Conditional Entropy for sex : 21.639168312728973


Information Entropy for sex : -138.11184828600827

Gain Ratio for sex : 1.09500941377275


Conditional Entropy for steroid : 22.279497800046485


Information Entropy for steroid : -121.7323044322849

Gain Ratio for steroid : 1.2476072331197203


Conditional Entropy for antivirals : 22.24941665105165


Information Entropy for antivirals : -133.0739978196703

Gain Ratio for antivirals : 1.14104952772833


Conditional Entropy for fatigue : 20.150407007503826


Information Entropy for fatigue : -123.84063655044852

Gain Ratio for fatigue : 1.2091750889903303


Conditional Entropy for malaise : 20.232183629245746


Information Entropy for malaise : -122.6362058236278

Gain Ratio for malaise : 1.221717422982954


Conditional Entropy for anorexia : 22.356513619193937


Information Entropy for anorexia 


Conditional Entropy for liver firm : 22.637670135641812


Information Entropy for liver firm : -124.45622913450339

Gain Ratio for liver firm : 1.2231792406720279


Conditional Entropy for Spleen Palpable : 21.63366210541661


Information Entropy for Spleen Palpable : -130.79173854420986

Gain Ratio for Spleen Palpable : 1.1562524476139049


Conditional Entropy for Spiders : 19.466740267897777


Information Entropy for Spiders : -124.77978292468342

Gain Ratio for Spiders : 1.194595330173595


Conditional Entropy for Ascites : 18.8276017836834


Information Entropy for Ascites : -135.4070549420045

Gain Ratio for Ascites : 1.0961187181957195


Conditional Entropy for Varices : 20.393578226189756


Information Entropy for Varices : -136.50927882247754

Gain Ratio for Varices : 1.0987398456279642


Conditional Entropy for Bilirubin : 14.748308963488109


Information Entropy for Bilirubin : -32.925966456323835

Gain Ratio for Bilirubin : 4.383862653476318


Conditional Entropy for Alk Ph

In [30]:
selected_features_hepitie = list(reduct_set)
hepitie_selected_featue_df = hepitie_data[selected_features_hepitie]

In [31]:
hepitie_selected_featue_df['class'] = hepitie_data['class']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hepitie_selected_featue_df['class'] = hepitie_data['class']


In [32]:
hepitie_selected_featue_df.head()

Unnamed: 0,age,Sgot,Alk Phosphate,sex,class
0,50,42,135,1,2
1,78,32,96,1,2
2,31,52,46,1,2
3,34,200,"{34, 53, 56, 45, 175, 65, 295, 102, 125, 243, ...",1,2
4,34,28,95,1,2


In [34]:
hepitie_selected_featue_df.to_csv('hepitie_selected_feature_df.csv')

# Audiology

In [35]:
audiology_data = pd.read_csv('audiology/audiology.standardized.data')
audiology_data.head()

Unnamed: 0,f,mild,f.1,normal,normal.1,?,t,?.1,f.2,f.3,...,f.53,f.54,normal.4,t.1,a,f.55,f.56,f.57,p1,cochlear_unknown
0,f,moderate,f,normal,normal,?,t,?,f,f,...,f,f,normal,t,a,f,f,f,p2,cochlear_unknown
1,t,mild,t,?,absent,mild,t,?,f,f,...,f,f,normal,t,as,f,f,f,p3,mixed_cochlear_age_fixation
2,t,mild,t,?,absent,mild,f,?,f,f,...,f,f,normal,t,b,f,f,f,p4,mixed_cochlear_age_otitis_media
3,t,mild,f,normal,normal,mild,t,?,f,f,...,f,f,good,t,a,f,f,f,p5,cochlear_age
4,t,mild,f,normal,normal,mild,t,?,f,f,...,f,f,very_good,t,a,f,f,f,p6,cochlear_age


In [36]:
audiology_data.columns = np.arange(0,71)

In [37]:
audiology_data.rename(columns={70: 'class'}, inplace=True)

In [38]:
audiology_data['class'].unique()

array(['cochlear_unknown', 'mixed_cochlear_age_fixation',
       'mixed_cochlear_age_otitis_media', 'cochlear_age', 'normal_ear',
       'cochlear_poss_noise', 'cochlear_age_and_noise',
       'acoustic_neuroma', 'mixed_cochlear_unk_ser_om',
       'conductive_discontinuity', 'retrocochlear_unknown',
       'conductive_fixation', 'bells_palsy',
       'cochlear_noise_and_heredity', 'mixed_cochlear_unk_fixation',
       'mixed_poss_noise_om', 'otitis_media', 'possible_menieres',
       'possible_brainstem_disorder', 'cochlear_age_plus_poss_menieres',
       'mixed_cochlear_age_s_om', 'mixed_cochlear_unk_discontinuity',
       'mixed_poss_central_om', 'poss_central'], dtype=object)

In [39]:
X_audiology = audiology_data.drop('class', axis = 1)
y_audiology = audiology_data['class']

In [40]:
for col in X_audiology.columns:
    s = set(X_audiology[col].values.tolist())
    if '?' in s:
        s.remove('?')  # Remove the '?' value from the set
        X_audiology[col] = X_audiology[col].apply(lambda x: s if x == '?' else x)

In [41]:
for col in X_audiology.columns:
    X_audiology[col] = X_audiology[col].apply(lambda x: {x} if type(x) != set else x)

In [42]:
X_audiology.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
0,{f},{moderate},{f},{normal},{normal},"{moderate, mild, unmeasured, normal}",{t},"{normal, degraded}",{f},{f},...,{f},{f},{f},{normal},{t},{a},{f},{f},{f},{p2}
1,{t},{mild},{t},"{elevated, normal, absent}",{absent},{mild},{t},"{normal, degraded}",{f},{f},...,{f},{f},{f},{normal},{t},{as},{f},{f},{f},{p3}
2,{t},{mild},{t},"{elevated, normal, absent}",{absent},{mild},{f},"{normal, degraded}",{f},{f},...,{f},{f},{f},{normal},{t},{b},{f},{f},{f},{p4}
3,{t},{mild},{f},{normal},{normal},{mild},{t},"{normal, degraded}",{f},{f},...,{f},{f},{f},{good},{t},{a},{f},{f},{f},{p5}
4,{t},{mild},{f},{normal},{normal},{mild},{t},"{normal, degraded}",{f},{f},...,{f},{f},{f},{very_good},{t},{a},{f},{f},{f},{p6}


In [43]:
red_set = ReductSet(X_audiology, y_audiology)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for 0 : 102.42417422675821


Information Entropy for 0 : -173.34161685917087

Gain Ratio for 0 : 1.0183108213192127


Conditional Entropy for 1 : 113.48329155130315


Information Entropy for 1 : -149.5331978388768

Gain Ratio for 1 : 1.2544021278761177


Conditional Entropy for 2 : 118.72519229104098


Information Entropy for 2 : -190.9676397973785

Gain Ratio for 2 : 1.0096823865107656


Conditional Entropy for 3 : 118.63016140654585


Information Entropy for 3 : -158.76568079472733

Gain Ratio for 3 : 1.213874626101503


Conditional Entropy for 4 : 117.10687885337128


Information Entropy for 4 : -158.78951804204465

Gain Ratio for 4 : 1.2040993084231777


Conditional Entropy for 5 : 123.71658727709489


Information Entropy for 5 : -169.85889593105748

Gain Ratio for 5 : 1.1645434064477012


Conditional Entropy for 6 : 128.8302408549353


Information Entropy for 6 : -179.47597901068815

Gain Ratio for 6 : 1.1306343722396148


Conditional Entropy for 7 : 136.88314

In [44]:
audiology_features = audiology_data[list(reduct_set)]

In [45]:
audiology_features['class'] = audiology_data['class']
audiology_features.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  audiology_features['class'] = audiology_data['class']


Unnamed: 0,69,63,class
0,p2,normal,cochlear_unknown
1,p3,normal,mixed_cochlear_age_fixation
2,p4,normal,mixed_cochlear_age_otitis_media
3,p5,good,cochlear_age
4,p6,very_good,cochlear_age


In [46]:
audiology_data.to_csv('audiology_data.csv')
audiology_features.to_csv('audiology_selected_features.csv')

# Soyabean

In [47]:
soyabean_data = pd.read_csv('soybean+large/soybean-large.data')
soyabean_data.head()

Unnamed: 0,diaporthe-stem-canker,6,0,2,1,0.1,1.1,1.2,1.3,0.2,...,0.9,0.10,0.11,4,0.12,0.13,0.14,0.15,0.16,0.17
0,diaporthe-stem-canker,4,0,2,1,0,2,0,2,1,...,0,0,0,4,0,0,0,0,0,0
1,diaporthe-stem-canker,3,0,2,1,0,1,0,2,1,...,0,0,0,4,0,0,0,0,0,0
2,diaporthe-stem-canker,3,0,2,1,0,1,0,2,0,...,0,0,0,4,0,0,0,0,0,0
3,diaporthe-stem-canker,6,0,2,1,0,2,0,1,0,...,0,0,0,4,0,0,0,0,0,0
4,diaporthe-stem-canker,5,0,2,1,0,3,0,1,0,...,0,0,0,4,0,0,0,0,0,0


In [48]:
soyabean_data.columns = np.arange(0,36)

In [49]:
soyabean_data.rename(columns = {0: 'class'}, inplace = True)

In [50]:
X = soyabean_data.drop('class', axis = 1)
y = soyabean_data['class']

In [51]:
# for col in X.columns:
#     print(X[col].unique())

for col in X.columns:
    s = set(X[col].values.tolist())
    if '?' in s:
        s.remove('?')  # Remove the '?' value from the set
        X[col] = X[col].apply(lambda x: s if x == '?' else x)


In [52]:
for col in X.columns:
    X[col] = X[col].apply(lambda x: {x} if type(x) != set else x)

In [53]:
red_set = ReductSet(X, y)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for 1 : 183.9410782400513


Information Entropy for 1 : -199.32901423157867

Gain Ratio for 1 : 1.5721586160401393


Conditional Entropy for 2 : 214.3598707048705


Information Entropy for 2 : -304.016613291995

Gain Ratio for 2 : 1.1308448439479921


Conditional Entropy for 3 : 195.38192405046019


Information Entropy for 3 : -295.26826249323335

Gain Ratio for 3 : 1.100076487119291


Conditional Entropy for 4 : 208.66526876341058


Information Entropy for 4 : -283.0282396634486

Gain Ratio for 4 : 1.194584038950891


Conditional Entropy for 5 : 226.23551893670137


Information Entropy for 5 : -324.38776542510385

Gain Ratio for 5 : 1.0964386014420542


Conditional Entropy for 6 : 223.73483948414005


Information Entropy for 6 : -246.772137521883

Gain Ratio for 6 : 1.4311607134485944


Conditional Entropy for 7 : 196.28474204192028


Information Entropy for 7 : -244.4169228729134

Gain Ratio for 7 : 1.3326429574690049


Conditional Entropy for 8 : 207.65698749284

In [54]:
soyabean_data.to_csv('soyabean.csv')


In [55]:
soyabean_selected_features = soyabean_data[list(reduct_set)]
soyabean_selected_features['class'] = soyabean_data['class']
soyabean_selected_features.to_csv('soyabean_selected_features.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  soyabean_selected_features['class'] = soyabean_data['class']


In [56]:
soyabean_selected_features.head()

Unnamed: 0,1,class
0,4,diaporthe-stem-canker
1,3,diaporthe-stem-canker
2,3,diaporthe-stem-canker
3,6,diaporthe-stem-canker
4,5,diaporthe-stem-canker


# Dermatology

In [57]:
der = pd.read_csv('dermatology/dermatology.data')
der.head()

Unnamed: 0,2,2.1,0,3,0.1,0.2,0.3,0.4,1,0.5,...,0.19,0.20,3.2,0.21,0.22,0.23,1.1,0.24,55,2.3
0,3,3,3,2,1,0,0,0,1,1,...,0,0,0,0,0,0,1,0,8,1
1,2,1,2,3,1,3,0,3,0,0,...,0,2,3,2,0,0,2,3,26,3
2,2,2,2,0,0,0,0,0,3,2,...,3,0,0,0,0,0,3,0,40,1
3,2,3,2,2,2,2,0,2,0,0,...,2,3,2,3,0,0,2,3,45,3
4,2,3,2,0,0,0,0,0,0,0,...,0,0,2,0,0,0,1,0,41,2


In [58]:
der.columns = np.arange(0,35)

In [59]:
der.rename(columns = {34: "class"}, inplace = True)

In [60]:
der.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,class
0,3,3,3,2,1,0,0,0,1,1,...,0,0,0,0,0,0,1,0,8,1
1,2,1,2,3,1,3,0,3,0,0,...,0,2,3,2,0,0,2,3,26,3
2,2,2,2,0,0,0,0,0,3,2,...,3,0,0,0,0,0,3,0,40,1
3,2,3,2,2,2,2,0,2,0,0,...,2,3,2,3,0,0,2,3,45,3
4,2,3,2,0,0,0,0,0,0,0,...,0,0,2,0,0,0,1,0,41,2


In [61]:
X = der.drop('class', axis = 1)
y = der['class']

In [62]:
# for col in X.columns:
#     print(X[col].unique())

for col in X.columns:
    s = set(X[col].values.tolist())
    if '?' in s:
        s.remove('?')  # Remove the '?' value from the set
        X[col] = X[col].apply(lambda x: s if x == '?' else x)


In [63]:
for col in X.columns:
    X[col] = X[col].apply(lambda x: {x} if type(x) != set else x)

In [64]:
red_set = ReductSet(X, y)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for 0 : 167.21947357340446


Information Entropy for 0 : -346.78233798660557

Gain Ratio for 0 : 1.2732180207124275


Conditional Entropy for 1 : 157.86436310901223


Information Entropy for 1 : -340.2964726517235

Gain Ratio for 1 : 1.269993803218909


Conditional Entropy for 2 : 151.43789015494323


Information Entropy for 2 : -319.2788179044036

Gain Ratio for 2 : 1.333467535883785


Conditional Entropy for 3 : 155.98794335119337


Information Entropy for 3 : -308.0859492118634

Gain Ratio for 3 : 1.3966816496107621


Conditional Entropy for 4 : 148.10667942133315


Information Entropy for 4 : -341.3938618754543

Gain Ratio for 4 : 1.2373295920339855


Conditional Entropy for 5 : 130.08478242286273


Information Entropy for 5 : -386.1072072680088

Gain Ratio for 5 : 1.0473641082749576


Conditional Entropy for 6 : 155.55796565326568


Information Entropy for 6 : -410.11457187652513

Gain Ratio for 6 : 1.0481656677115518


Conditional Entropy for 7 : 132.57663640

In [65]:
der.to_csv('dermatology.csv')

In [69]:
selected_features_der = der[list(reduct_set)]
selected_features_der['class'] = der['class']

selected_features_der.to_csv('selected_features_dermatology.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_features_der['class'] = der['class']


In [68]:
selected_features_der.head()

Unnamed: 0,33,class
0,8,1
1,26,3
2,40,1
3,45,3
4,41,2


# Example 1

In [40]:
# data = pd.DataFrame({
#     'c1' : [{1,2,3,4},{2,3},{1,2,3,4},{2,3,4},{2,4}],
#     'c2' : [{0,1},{2,3},{1,2},{0,1,2,3},{0,1,2}],
#     'c3' : [{1,2},{1},{1,2},{0,1},{0,1}],
#     'd' : [0,0,1,0,1]
# })
# data
data = pd.DataFrame({
    'c1' : [{1,2,3,4}, {2,3}, {1,2,3,4}, {2,3,4}, {2,4}],
    'c2' : [{0,1}, {2, 3}, {1,2}, {0,1,2,3}, {0,1,2}],
    'c3' : [{1,2}, {1}, {1,2}, {0,1}, {0,1}],
    'c4' : [{0},{1}, {0}, {0,1}, {0,1}],
    'd' : [1,1,2,1,2]

})
data

Unnamed: 0,c1,c2,c3,c4,d
0,"{1, 2, 3, 4}","{0, 1}","{1, 2}",{0},1
1,"{2, 3}","{2, 3}",{1},{1},1
2,"{1, 2, 3, 4}","{1, 2}","{1, 2}",{0},2
3,"{2, 3, 4}","{0, 1, 2, 3}","{0, 1}","{0, 1}",1
4,"{2, 4}","{0, 1, 2}","{0, 1}","{0, 1}",2


In [41]:
cond_attributes = data.drop('d',axis = 1)
dec_attr = data['d'] #series

In [42]:
dec_attr.tolist()

[1, 1, 2, 1, 2]

In [43]:
red_set = ReductSet(cond_attributes, dec_attr)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for c1 : 0.8913710863859107


Information Entropy for c1 : 0.2941355537869794

Gain Ratio for c1 : 0.27055385533702414


Conditional Entropy for c2 : 0.8457059689455463


Information Entropy for c2 : 0.5660468062277841

Gain Ratio for c2 : 0.22126195949018804


Conditional Entropy for c3 : 0.9547516540310301


Information Entropy for c3 : 0.4205714832212132

Gain Ratio for c3 : 0.03851649736108736


Conditional Entropy for c4 : 0.908858182613419


Information Entropy for c4 : 0.538871442685031

Gain Ratio for c4 : 0.11522676267991148

Conditional Entropy {'c2', 'c1'} 0.84
Information Entropy c2 0.57
Conditional Entorpy {'c1'} 0.89
Gain Ratio {'c2', 'c1'} 0.08771929824561413
Conditional Entropy {'c3', 'c1'} 0.89
Information Entropy c3 0.42
Conditional Entorpy {'c1'} 0.89
Gain Ratio {'c3', 'c1'} 0.0
Conditional Entropy {'c1', 'c4'} 0.84
Information Entropy c4 0.54
Conditional Entorpy {'c1'} 0.89
Gain Ratio {'c1', 'c4'} 0.09259259259259267
Conditional Entropy {'c2', '

# Example 2

In [44]:
data = pd.DataFrame({
    'c1' : [{1,2},{2},{2},{2},{1},{2},{2},{1,2},{1},{2},{1,2},{1}],
    'c2' : [{1,2}, {1,3}, {1,2,3}, {1,2}, {1,2}, {1,3}, {1,2}, {1,2}, {1,2}, {1,3}, {1,2,3}, {1,3}],
    'c3' : [{1,2}, {2,3}, {2,3}, {1,2}, {1,2,3}, {2,3}, {2,3}, {1,2,3}, {1,2}, {2,3}, {1,2,3}, {1,2}],
    'c4' : [{1,2,3}, {3}, {1,2,3}, {2}, {2}, {1,2,3}, {2,3}, {2}, {1,2,3}, {2}, {3}, {1,2,3}],
    'c5' : [set(), {2,3}, set(), set(), set(), {2,3}, {2,3}, set(), set(), {2,3}, {2,3}, set()],
    'c6' : [{2}, {1,3}, {1,3}, {2}, {2}, {2}, {1,2,3}, {2}, {1,3}, {1,3}, {1,2,3}, {2}],
    'c7' : [{2,3}, {1,2,3}, {1,3}, {2,3}, {1,3}, {1,2,3}, {2,3}, {2,3}, {1,2,3}, {2,3},{2,3}, {1,2}],
    'c8' : [{1}, {1,2,3}, {2,3}, {2,3}, {2,3}, {2,3}, {1}, {1}, {2,3}, {1,2,3}, {1,2,3}, {2,3}],
    'd' : [1,2,2,3,2,1,3,1,2,1,2,3]
})
data

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,d
0,"{1, 2}","{1, 2}","{1, 2}","{1, 2, 3}",{},{2},"{2, 3}",{1},1
1,{2},"{1, 3}","{2, 3}",{3},"{2, 3}","{1, 3}","{1, 2, 3}","{1, 2, 3}",2
2,{2},"{1, 2, 3}","{2, 3}","{1, 2, 3}",{},"{1, 3}","{1, 3}","{2, 3}",2
3,{2},"{1, 2}","{1, 2}",{2},{},{2},"{2, 3}","{2, 3}",3
4,{1},"{1, 2}","{1, 2, 3}",{2},{},{2},"{1, 3}","{2, 3}",2
5,{2},"{1, 3}","{2, 3}","{1, 2, 3}","{2, 3}",{2},"{1, 2, 3}","{2, 3}",1
6,{2},"{1, 2}","{2, 3}","{2, 3}","{2, 3}","{1, 2, 3}","{2, 3}",{1},3
7,"{1, 2}","{1, 2}","{1, 2, 3}",{2},{},{2},"{2, 3}",{1},1
8,{1},"{1, 2}","{1, 2}","{1, 2, 3}",{},"{1, 3}","{1, 2, 3}","{2, 3}",2
9,{2},"{1, 3}","{2, 3}",{2},"{2, 3}","{1, 3}","{2, 3}","{1, 2, 3}",1


In [45]:
cond_attributes = data.drop('d',axis = 1)
dec_attr = data['d']

In [46]:
red_set = ReductSet(cond_attributes, dec_attr)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for c1 : 3.623904209445649


Information Entropy for c1 : -1.4093293852575215

Gain Ratio for c1 : 2.074875048817428


Conditional Entropy for c2 : 3.6996650047261967


Information Entropy for c2 : -2.179236296761263

Gain Ratio for c2 : 1.376602976361507


Conditional Entropy for c3 : 3.6640844366269625


Information Entropy for c3 : -2.225452464811198

Gain Ratio for c3 : 1.3320269253510377


Conditional Entropy for c4 : 3.465214975078202


Information Entropy for c4 : -1.6274224657694194

Gain Ratio for c4 : 1.6993086926332357


Conditional Entropy for c5 : nan


Information Entropy for c5 : inf

Gain Ratio for c5 : nan


Conditional Entropy for c6 : 3.241748240614915


Information Entropy for c6 : -1.0842525552876032

Gain Ratio for c6 : 2.344496580440169


Conditional Entropy for c7 : 3.4792314025887676


Information Entropy for c7 : -2.2091306845759244

Gain Ratio for c7 : 1.2581915545266722


Conditional Entropy for c8 : 3.380844395173386


Information Entro

  decisional_entropy = decisional_entropy + np.log2(r/np.sum(row1))
  information_entropy = information_entropy + np.log2(np.sum(row)/5)


# Example 3

In [47]:
cond_attributes = pd.DataFrame({
        'A1': [{1, 2}, {1, 3}, {1, 2}, {2, 3}, {1, 3}],
        'A2': [{1, 3}, {1, 2}, {1, 3}, {1, 2}, {1, 2}],
        'A3': [{1, 2}, {1, 2}, {1, 3}, {1, 2}, {1, 2}]
    })
dec_attr = np.array([1, 2, 3, 4, 5])


In [48]:
cond_attributes

Unnamed: 0,A1,A2,A3
0,"{1, 2}","{1, 3}","{1, 2}"
1,"{1, 3}","{1, 2}","{1, 2}"
2,"{1, 2}","{1, 3}","{1, 3}"
3,"{2, 3}","{1, 2}","{1, 2}"
4,"{1, 3}","{1, 2}","{1, 2}"


In [49]:
red_set = ReductSet(cond_attributes, dec_attr)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for A1 : 1.7628764377903146


Information Entropy for A1 : 0.5590516570970478

Gain Ratio for A1 : 0.9999999999999996


Conditional Entropy for A2 : 1.9229419688230416


Information Entropy for A2 : 0.3989861260643207

Gain Ratio for A2 : 0.9999999999999997


Conditional Entropy for A3 : 2.0529325012980815


Information Entropy for A3 : 0.26899559358928127

Gain Ratio for A3 : 0.9999999999999979

Conditional Entropy {'A2', 'A1'} 1.76
Information Entropy A1 0.56
Conditional Entorpy {'A2'} 1.92
Gain Ratio {'A2', 'A1'} 0.28571428571428553
Conditional Entropy {'A2', 'A3'} 1.83
Information Entropy A3 0.27
Conditional Entorpy {'A2'} 1.92
Gain Ratio {'A2', 'A3'} 0.33333333333333276
Conditional Entropy {'A2', 'A1', 'A3'} 1.67
Information Entropy A1 0.56
Conditional Entorpy {'A2', 'A3'} 1.83
Gain Ratio {'A2', 'A1', 'A3'} 0.2857142857142859
Reduct set: {'A2', 'A1', 'A3'}


# Example 4 

In [50]:
new_data = pd.DataFrame({
    'Audition' : [{'E'}, {'E','F','G'}, {'F','G'}, {'E', 'F'}, {'F','G'},{'F'}, {'E','F','G'},{'F','G'},{'E','G'},{'E','F'}],
    'Spoken Language' : [{'E'}, {'E','F','G'}, {'F'}, {'E','G'}, {'F','G'}, {'F'}, {'E','F','G'},{'F'}, {'G'}, {'E','G'}],
    'Reading' : [{'F', 'G'}, {'F','G'},{'F','G'}, {'F','G'},{'F','G'}, {'E','F'},{'E','G'},{"E",'F','G'}, {'F','G'},{'F','G'}],
    'Writing' : [{'F','G'}, {'E','F','G'}, {'F','G'}, {'F'}, {'F'}, {"E",'F'}, {'E','F','G'}, {'E','G'}, {'F','G'}, {'E','F'}],
    'Evaluation' : ['Poor', 'Good', 'Good', 'Poor','Poor','Poor','Good','Good','Poor','Good']
})
new_data

Unnamed: 0,Audition,Spoken Language,Reading,Writing,Evaluation
0,{E},{E},"{F, G}","{F, G}",Poor
1,"{F, E, G}","{F, E, G}","{F, G}","{F, E, G}",Good
2,"{F, G}",{F},"{F, G}","{F, G}",Good
3,"{F, E}","{G, E}","{F, G}",{F},Poor
4,"{F, G}","{F, G}","{F, G}",{F},Poor
5,{F},{F},"{F, E}","{F, E}",Poor
6,"{F, E, G}","{F, E, G}","{G, E}","{F, E, G}",Good
7,"{F, G}",{F},"{F, E, G}","{G, E}",Good
8,"{G, E}",{G},"{F, G}","{F, G}",Poor
9,"{F, E}","{G, E}","{F, G}","{F, E}",Good


In [51]:
# separate the conditional attribute and decisional attribute
cond_attributes = new_data.drop('Evaluation',axis = 1)
dec_attr = new_data['Evaluation']

In [52]:
red_set = ReductSet(cond_attributes, dec_attr)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for Audition : 1.9362760811483972


Information Entropy for Audition : -0.7879965972603803

Gain Ratio for Audition : 2.4572137593997594


Conditional Entropy for Spoken Language : 1.9154834639700222


Information Entropy for Spoken Language : -0.19579934296712453

Gain Ratio for Spoken Language : 9.782890151432424


Conditional Entropy for Reading : 1.9415777317632112


Information Entropy for Reading : -1.3830316383595387

Gain Ratio for Reading : 1.4038563384321296


Conditional Entropy for Writing : 1.712741915982302


Information Entropy for Writing : -1.0116637199365086

Gain Ratio for Writing : 1.6929952930305663

Conditional Entropy {'Audition', 'Spoken Language'} 1.86
Information Entropy Audition -0.79
Conditional Entorpy {'Spoken Language'} 1.92
Gain Ratio {'Audition', 'Spoken Language'} -0.07594936708860738
Conditional Entropy {'Reading', 'Spoken Language'} 1.85
Information Entropy Reading -1.38
Conditional Entorpy {'Spoken Language'} 1.92
Gain Ratio {'R

# Example 5 

In [53]:
ex5 = pd.DataFrame({
    'A1' : [{0},{0,1,2},{1,2},{0,1},{1,2},{1},{0},{1}],
    'A2' : [{0},{0,1,2},{0,1},{0,2},{1,2},{1,2},{0},{1,2}],
    'A3' : [{1,2},{1,2},{1,2},{1,2},{1,2},{0,1},{1,2},{0,1}],
    'A4' : [{1,2},{0,1,2},{1,2},{1},{1},{0,1},{1,2},{0,1}],
    'd' : [0,0,1,0,1,1,0,1]
})
ex5.head()

Unnamed: 0,A1,A2,A3,A4,d
0,{0},{0},"{1, 2}","{1, 2}",0
1,"{0, 1, 2}","{0, 1, 2}","{1, 2}","{0, 1, 2}",0
2,"{1, 2}","{0, 1}","{1, 2}","{1, 2}",1
3,"{0, 1}","{0, 2}","{1, 2}",{1},0
4,"{1, 2}","{1, 2}","{1, 2}",{1},1


In [54]:
cond_attr = ex5.drop('d',axis = 1).head(6)
dec_att = ex5['d'].head(6)

In [55]:
red_set = ReductSet(cond_attr, dec_att)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for A1 : 0.6902342878741656


Information Entropy for A1 : 0.5062569980439168

Gain Ratio for A1 : 0.38345035402047306


Conditional Entropy for A2 : 0.8626496379460844


Information Entropy for A2 : 0.4353889601486857

Gain Ratio for A2 : 0.049861335588181635


Conditional Entropy for A3 : 1.1411538646611796


Information Entropy for A3 : -0.034588889183983354

Gain Ratio for A3 : 7.424209268352079


Conditional Entropy for A4 : 1.1785132357088224


Information Entropy for A4 : 0.13567664299423615

Gain Ratio for A4 : -2.168055725861903

Conditional Entropy {'A1', 'A3'} 0.68
Information Entropy A1 0.51
Conditional Entorpy {'A3'} 1.14
Gain Ratio {'A1', 'A3'} 0.9019607843137252
Conditional Entropy {'A2', 'A3'} 0.86
Information Entropy A2 0.44
Conditional Entorpy {'A3'} 1.14
Gain Ratio {'A2', 'A3'} 0.6363636363636361
Conditional Entropy {'A3', 'A4'} 1.13
Information Entropy A4 0.14
Conditional Entorpy {'A3'} 1.14
Gain Ratio {'A3', 'A4'} 0.07142857142857148
Conditiona

# Example 6

In [56]:
df = pd.read_csv('set valued data.csv')
df.head()

Unnamed: 0,Student,Audition,Spoken Language,Reading,Writing,Evaluation
0,x1,{E},{E},"{F,G}","{F,G}",Poor
1,x2,"{E,F,G}","{E,F,G}","{F,G}","{E,F,G}",Good
2,x3,"{F,G}",{F},"{F,G}","{F,G}",Good
3,x4,"{E,F}","{E,G}","{F,G}",{F},Poor
4,x5,"{F,G}","{F,G}","{F,G}",{F},Poor


In [57]:
df.drop('Student', axis = 1, inplace = True)

In [58]:
cond_attr = df.drop('Evaluation',axis = 1)
dec_att = df['Evaluation']

In [59]:
cond_attr = cond_attr.map(lambda x: set(x.strip('{}').split(',')))

In [60]:
red_set = ReductSet(cond_attr, dec_att)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for Audition : 9.792122026166108


Information Entropy for Audition : -28.258061874399885

Gain Ratio for Audition : 1.1698473424600186


Conditional Entropy for Spoken Language : 9.226080091516362


Information Entropy for Spoken Language : -25.96260152976036

Gain Ratio for Spoken Language : 1.2514761517617312


Conditional Entropy for Reading  : 9.888638555145963


Information Entropy for Reading  : -29.49084405800997

Gain Ratio for Reading  : 1.1242179115186217


Conditional Entropy for Writing  : 9.34112788955389


Information Entropy for Writing  : -26.55940107164118

Gain Ratio for Writing  : 1.2276867374483456

Conditional Entropy {'Audition', 'Spoken Language'} 9.26
Information Entropy Audition -28.26
Conditional Entorpy {'Spoken Language'} 9.23
Gain Ratio {'Audition', 'Spoken Language'} 0.00106157112526537
Conditional Entropy {'Reading ', 'Spoken Language'} 9.29
Information Entropy Reading  -29.49
Conditional Entorpy {'Spoken Language'} 9.23
Gain Ratio {

# Example 7

In [61]:
ex7 = pd.DataFrame({
    'Products(C1)' : [{'earphones','tablet'},{'laptop','earphones'},{'earphones'}, {'earphones','tablet'},{'laptop'}],
    'Importer(C2)' : [{'IA1','IA2'}, {'IA2','IA3'}, {'IA1','IA2'}, {'IA1', 'IA2'}, {'IA3'}],
    'Exporters(C3)': [{'EB1'}, {'EB1','EB3'}, {'EB2'}, {'EB1', 'EB2'}, {'EB3'}],
    'Prospects(D)' : ['Good', 'Bad','Bad','Good','Bad']
})
ex7

Unnamed: 0,Products(C1),Importer(C2),Exporters(C3),Prospects(D)
0,"{tablet, earphones}","{IA2, IA1}",{EB1},Good
1,"{laptop, earphones}","{IA3, IA2}","{EB3, EB1}",Bad
2,{earphones},"{IA2, IA1}",{EB2},Bad
3,"{tablet, earphones}","{IA2, IA1}","{EB2, EB1}",Good
4,{laptop},{IA3},{EB3},Bad


In [62]:
cond_attr = ex7.drop('Prospects(D)',axis = 1)
dec_att = ex7['Prospects(D)']

In [63]:
red_set = ReductSet(cond_attr, dec_att)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for Products(C1) : 0.5472604408706362


Information Entropy for Products(C1) : 0.810067714472283

Gain Ratio for Products(C1) : 0.5230305393173761


Conditional Entropy for Importer(C2) : 0.6751258386365057


Information Entropy for Importer(C2) : 0.7462012171610496

Gain Ratio for Importer(C2) : 0.3964409987746191


Conditional Entropy for Exporters(C3) : 0.5637902270976218


Information Entropy for Exporters(C3) : 1.1389408077008025

Gain Ratio for Exporters(C3) : 0.35749036701827175

Conditional Entropy {'Importer(C2)', 'Products(C1)'} 0.57
Information Entropy Importer(C2) 0.75
Conditional Entorpy {'Products(C1)'} 0.55
Gain Ratio {'Importer(C2)', 'Products(C1)'} -0.026666666666666543
Conditional Entropy {'Exporters(C3)', 'Products(C1)'} 0.52
Information Entropy Exporters(C3) 1.14
Conditional Entorpy {'Products(C1)'} 0.55
Gain Ratio {'Exporters(C3)', 'Products(C1)'} 0.026315789473684237
Conditional Entropy {'Importer(C2)', 'Exporters(C3)', 'Products(C1)'} 0.52
In

# Example 8 Glass Data

In [70]:
glass_data = pd.read_csv("glass.csv")
glass_data.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type,Type_st
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1,one
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1,one
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1,one
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1,one
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1,one


In [66]:
glass_data.shape

(214, 10)

In [67]:
glass_data.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [79]:
X = glass_data.drop('Type', axis = 1)
y = glass_data['Type'].apply(lambda x: str(x))

In [85]:
glass_data['Type'] = glass_data['Type'].apply(lambda x: str(x))

In [89]:
glass_data.to_csv('glass2.csv')

In [70]:
for column in X.columns:
    X[column] = X[column].apply(lambda x: {x})

In [71]:
X.head()

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe
0,{1.52101},{13.64},{4.49},{1.1},{71.78},{0.06},{8.75},{0.0},{0.0}
1,{1.51761},{13.89},{3.6},{1.36},{72.73},{0.48},{7.83},{0.0},{0.0}
2,{1.51618},{13.53},{3.55},{1.54},{72.99},{0.39},{7.78},{0.0},{0.0}
3,{1.51766},{13.21},{3.69},{1.29},{72.61},{0.57},{8.22},{0.0},{0.0}
4,{1.51742},{13.27},{3.62},{1.24},{73.08},{0.55},{8.07},{0.0},{0.0}


In [72]:
red_set = ReductSet(X, y)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for RI : 6.150977500432694


Information Entropy for RI : 84.52558995988124

Gain Ratio for RI : -1.7148840635289089


Conditional Entropy for Na : 19.647766191505507


Information Entropy for Na : 65.20491817305565

Gain Ratio for Na : -2.430006513513688


Conditional Entropy for Mg : 36.185126352613985


Information Entropy for Mg : 10.419101118877222

Gain Ratio for Mg : -16.794705612149535


Conditional Entropy for Al : 26.002653693668982


Information Entropy for Al : 51.675170074706706

Gain Ratio for Al : -3.189215693679575


Conditional Entropy for Si : 24.362707505625032


Information Entropy for Si : 62.56288245425615

Gain Ratio for Si : -2.6079891268927913


Conditional Entropy for K : 43.113121503284376


Information Entropy for K : -5.225090822487282

Gain Ratio for K : 34.81541993538696


Conditional Entropy for Ca : 18.553631194101673


Information Entropy for Ca : 66.67391376664489

Gain Ratio for Ca : -2.3600570596017483


Conditional Entropy for 

In [72]:
glass_data.columns

Index(['RI', 'Na', 'Mg', 'Al', 'Si', 'K', 'Ca', 'Ba', 'Fe', 'Type', 'Type_st'], dtype='object')

In [74]:
glass_selected_features = glass_data[['Mg', 'Al', 'Si', 'K','Type']]
glass_selected_features.to_csv('glass_selected_features.csv')

# Annealing

In [75]:
annealing = pd.read_csv('annealing.csv')
annealing.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V30,V31,V32,V33,V34,V35,V36,V37,V38,Class
0,'?',C,A,8,0,'?',S,'?',0,'?',...,'?','?',COIL,0.7,610.0,0,'?',0,'?',3
1,'?',C,R,0,0,'?',S,2,0,'?',...,'?','?',COIL,3.2,610.0,0,'?',0,'?',3
2,'?',C,R,0,0,'?',S,2,0,'?',...,'?','?',SHEET,0.7,1300.0,762,'?',0,'?',3
3,'?',C,A,0,60,T,'?','?',0,'?',...,'?','?',COIL,2.801,385.1,0,'?',0,'?',3
4,'?',C,A,0,60,T,'?','?',0,'?',...,'?','?',SHEET,0.801,255.0,269,'?',0,'?',3


In [76]:
#printing unique class
annealing['Class'].unique()

array([3, 5, 1, 4, 2], dtype=int64)

In [77]:
# Separating X and Y variable
X = annealing.drop('Class', axis = 1)
y = annealing['Class']

In [78]:
# for col in X.columns:
#     print(X[col].unique())

for col in X.columns:
    s = set(X[col].values.tolist())
    if '?' in s:
        s.remove('?')  # Remove the '?' value from the set
        X[col] = X[col].apply(lambda x: s if x == '?' else x)


In [79]:
for col in X.columns:
    X[col] = X[col].apply(lambda x: {x} if type(x) != set else x)

In [80]:
red_set = ReductSet(X, y)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for V1 : 140.2216796133571


Information Entropy for V1 : -1214.7578638470425

Gain Ratio for V1 : 1.04670069372515


Conditional Entropy for V2 : 213.69416057430672


Information Entropy for V2 : -1344.960379757753

Gain Ratio for V2 : 0.9999999999999832


Conditional Entropy for V3 : 158.72730481127076


Information Entropy for V3 : -986.605554337797

Gain Ratio for V3 : 1.3075068534968155


Conditional Entropy for V4 : 204.4727622322226


Information Entropy for V4 : -1227.2848865266387

Gain Ratio for V4 : 1.0883691277222074


Conditional Entropy for V5 : 161.41579640186325


Information Entropy for V5 : -1133.3543171016531

Gain Ratio for V5 : 1.1405806605043736


Conditional Entropy for V6 : 187.27161290865237


Information Entropy for V6 : -1234.2885455725814

Gain Ratio for V6 : 1.0682573672272164


Conditional Entropy for V7 : 175.309531269571


Information Entropy for V7 : -1137.007662013153

Gain Ratio for V7 : 1.1491353964489641


Conditional Entropy fo

# Cardiotocography

In [81]:
card_df = pd.read_csv('cardiotocography.csv')
card_df.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V27,V28,V29,V30,V31,V32,V33,V34,V35,Class
0,23,240,357,120,120,0,0,0,73,0.5,...,0,0,0,0,0,0,0,1,0,9
1,45,5,632,132,132,4,0,4,17,2.1,...,0,0,0,0,1,0,0,0,0,6
2,45,177,779,133,133,2,0,5,16,2.1,...,0,0,0,0,1,0,0,0,0,6
3,45,411,1192,134,134,2,0,6,16,2.4,...,0,0,0,0,1,0,0,0,0,6
4,45,533,1147,132,132,4,0,5,16,2.4,...,1,0,0,0,0,0,0,0,0,2


In [82]:
X = card_df.drop('Class', axis = 1)
y = card_df['Class']

In [83]:
for col in X.columns:
    X[col] = X[col].apply(lambda x: {x} if type(x) != set else x)

In [84]:
red_set = ReductSet(X, y)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)

# Euclyptus

In [2]:
euclyptus_data = pd.read_csv('euclyptus/euclyptus.csv')
euclyptus_data.head()

Unnamed: 0,Abbrev,Rep,Locality,Map_Ref,Latitude,Altitude,Rainfall,Frosts,Year,Sp,PMCno,DBH,Ht,Surv,Vig,Ins_res,Stem_Fm,Crown_Fm,Brnch_Fm,Utility
0,Cra,1,Central_Hawkes_Bay,N135_382/137,39__38,100,850,-2,1980,co,1520,18.45,9.96,40,4.0,3.0,3.5,4.0,3.5,good
1,Cra,1,Central_Hawkes_Bay,N135_382/137,39__38,100,850,-2,1980,fr,1487,13.15,9.65,90,4.5,4.0,3.5,3.5,3.0,best
2,Cra,1,Central_Hawkes_Bay,N135_382/137,39__38,100,850,-2,1980,ma,1362,10.32,6.5,50,2.3,2.5,3.0,3.5,3.0,low
3,Cra,1,Central_Hawkes_Bay,N135_382/137,39__38,100,850,-2,1980,nd,1596,14.8,9.48,70,3.7,3.0,3.3,4.0,3.5,good
4,Cra,1,Central_Hawkes_Bay,N135_382/137,39__38,100,850,-2,1980,ni,2088,14.5,10.78,90,4.0,2.7,3.3,3.0,3.0,good


In [3]:
euclyptus_data.shape

(736, 20)

In [4]:
def make_set(X):
    for col in X.columns:
        X[col] = X[col].apply(lambda x: {x} if type(x) != set else x)

In [6]:
X = euclyptus_data.drop('Utility', axis = 1)
y = euclyptus_data['Utility']

In [7]:
make_set(X)

In [9]:
red_set = ReductSet(X, y)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for Abbrev : 270.5074406070441


Information Entropy for Abbrev : -503.35350950915256

Gain Ratio for Abbrev : 1.9819256590005605


Conditional Entropy for Rep : 329.7865234521322


Information Entropy for Rep : -824.9417431782215

Gain Ratio for Rep : 1.2811672189328511


Conditional Entropy for Locality : 296.6934468223384


Information Entropy for Locality : -680.1478579600614

Gain Ratio for Locality : 1.5052539389451172


Conditional Entropy for Map_Ref : 272.9723134109473


Information Entropy for Map_Ref : -548.4159372178794

Gain Ratio for Map_Ref : 1.8235686474055863


Conditional Entropy for Latitude : 274.0808115027498


Information Entropy for Latitude : -583.6443363630586

Gain Ratio for Latitude : 1.7153984791126073


Conditional Entropy for Altitude : 304.2593810916086


Information Entropy for Altitude : -646.3719491034797

Gain Ratio for Altitude : 1.5956156172297367


Conditional Entropy for Rainfall : 278.7022254441059


Information Entropy for R

In [10]:
selected_features_euclyptus = euclyptus_data[list(reduct_set)]
selected_features_euclyptus['Utility'] = euclyptus_data['Utility']
selected_features_euclyptus.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_features_euclyptus['Utility'] = euclyptus_data['Utility']


Unnamed: 0,PMCno,Ht,DBH,Utility
0,1520,9.96,18.45,good
1,1487,9.65,13.15,best
2,1362,6.5,10.32,low
3,1596,9.48,14.8,good
4,2088,10.78,14.5,good


In [12]:
selected_features_euclyptus.to_csv('euclyptus/euclyptus_selected_features.csv')

# Grub

In [13]:
grub_data = pd.read_csv('grub/grub_damage.csv')
grub_data.head()

Unnamed: 0,year_zone,year,strip,pdk,damage_rankRJT,damage_rankALL,dry_or_irr,zone,GG_new
0,6f,86,3,1,1,0,D,F,low
1,6f,86,3,2,0,0,D,F,high
2,6f,86,3,3,1,1,D,F,high
3,6f,86,3,4,1,0,D,F,high
4,6f,86,3,5,0,0,D,F,low


In [14]:
X = grub_data.drop('GG_new', axis = 1)
y = grub_data['GG_new']

In [16]:
make_set(X)

In [17]:
X.head()

Unnamed: 0,year_zone,year,strip,pdk,damage_rankRJT,damage_rankALL,dry_or_irr,zone
0,{6f},{86},{3},{1},{1},{0},{D},{F}
1,{6f},{86},{3},{2},{0},{0},{D},{F}
2,{6f},{86},{3},{3},{1},{1},{D},{F}
3,{6f},{86},{3},{4},{1},{0},{D},{F}
4,{6f},{86},{3},{5},{0},{0},{D},{F}


In [18]:
red_set = ReductSet(X, y)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for year_zone : 41.40281040678885


Information Entropy for year_zone : -19.979324353051243

Gain Ratio for year_zone : 6.773903588066989


Conditional Entropy for year : 54.28874331728915


Information Entropy for year : -67.29721858118454

Gain Ratio for year : 2.2025271319939788


Conditional Entropy for strip : 50.38487374303414


Information Entropy for strip : -57.11040136613731

Gain Ratio for strip : 2.5270367009555903


Conditional Entropy for pdk : 57.01516313491071


Information Entropy for pdk : -90.45659356400984

Gain Ratio for pdk : 1.6687602716721002


Conditional Entropy for damage_rankRJT : 52.01547174437353


Information Entropy for damage_rankRJT : -76.76882663172833

Gain Ratio for damage_rankRJT : 1.9011711480236646


Conditional Entropy for damage_rankALL : 53.71182925127009


Information Entropy for damage_rankALL : -78.85510787845696

Gain Ratio for damage_rankALL : 1.872383917023415


Conditional Entropy for dry_or_irr : 55.20455225339227


# Pasture

In [20]:
pasture_data = pd.read_csv('pasture/pasture.csv')
pasture_data.head()

Unnamed: 0,fertiliser,slope,aspect-dev-NW,OlsenP,MinN,TS,Ca-Mg,LOM,NFIX-mean,Eworms-main-3,...,Air-Perm,Porosity,HFRG-pct-mean,legume-yield,OSPP-pct-mean,Jan-Mar-mean-TDR,Annual-Mean-Runoff,root-surface-area,Leaf-P,pasture-prod-class
0,LL,25,37,8,235,235,3.64,2.11,0.061,129.9,...,0,0.188,2.63,298.3,8.63,19.7,615.7,269.3,1335,MED
1,LL,23,17,12,218,280,3.34,2.26,0.069,138.5,...,0,0.166,4.09,260.0,4.29,24.1,791.2,436.0,2161,MED
2,LL,20,18,9,243,285,3.34,1.99,0.062,109.5,...,0,0.167,13.06,374.8,6.88,27.5,711.6,294.7,1481,MED
3,LL,27,35,10,204,440,3.34,2.31,0.073,141.3,...,0,0.175,13.92,254.8,6.48,24.3,595.8,98.6,2063,MED
4,LL,8,105,8,327,455,3.64,1.3,0.067,128.0,...,0,0.153,23.71,221.2,6.79,30.6,721.0,187.8,1765,MED


In [21]:
X = pasture_data.drop('pasture-prod-class', axis = 1)
y = pasture_data['pasture-prod-class']

In [22]:
make_set(X)

In [23]:
X.head()

Unnamed: 0,fertiliser,slope,aspect-dev-NW,OlsenP,MinN,TS,Ca-Mg,LOM,NFIX-mean,Eworms-main-3,...,OM,Air-Perm,Porosity,HFRG-pct-mean,legume-yield,OSPP-pct-mean,Jan-Mar-mean-TDR,Annual-Mean-Runoff,root-surface-area,Leaf-P
0,{LL},{25},{37},{8},{235},{235},{3.64},{2.11},{0.061},{129.9},...,{6.8},{0},{0.188},{2.63},{298.3},{8.63},{19.7},{615.7},{269.3},{1335}
1,{LL},{23},{17},{12},{218},{280},{3.34},{2.26},{0.069},{138.5},...,{8.8},{0},{0.166},{4.09},{260.0},{4.29},{24.1},{791.2},{436.0},{2161}
2,{LL},{20},{18},{9},{243},{285},{3.34},{1.99},{0.062},{109.5},...,{10.1},{0},{0.167},{13.06},{374.8},{6.88},{27.5},{711.6},{294.7},{1481}
3,{LL},{27},{35},{10},{204},{440},{3.34},{2.31},{0.073},{141.3},...,{5.5},{0},{0.175},{13.92},{254.8},{6.48},{24.3},{595.8},{98.6},{2063}
4,{LL},{8},{105},{8},{327},{455},{3.64},{1.3},{0.067},{128.0},...,{9.3},{0},{0.153},{23.71},{221.2},{6.79},{30.6},{721.0},{187.8},{1765}


In [24]:
red_set = ReductSet(X, y)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for fertiliser : 5.749595485207253


Information Entropy for fertiliser : -6.169830096360441

Gain Ratio for fertiliser : 2.405810690956734


Conditional Entropy for slope : 5.032251891746034


Information Entropy for slope : 8.33465289101028

Gain Ratio for slope : -1.6948635772205574


Conditional Entropy for aspect-dev-NW : 1.2


Information Entropy for aspect-dev-NW : 14.717882283189013

Gain Ratio for aspect-dev-NW : -0.6994109290954921


Conditional Entropy for OlsenP : 4.301955000865388


Information Entropy for OlsenP : 7.064949781890927

Gain Ratio for OlsenP : -1.8960931268337091


Conditional Entropy for MinN : 0.4


Information Entropy for MinN : 16.317882283189014

Gain Ratio for MinN : -0.5818063617105541


Conditional Entropy for TS : 1.7509775004326937


Information Entropy for TS : 14.56690478275632

Gain Ratio for TS : -0.7444838408824945


Conditional Entropy for Ca-Mg : 2.472905595320056


Information Entropy for Ca-Mg : 10.24497668786895

Gain 

In [25]:
pasture_selected_features = pasture_data[list(reduct_set)]
pasture_selected_features['pasture-prod-class'] = pasture_data['pasture-prod-class']
pasture_selected_features.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pasture_selected_features['pasture-prod-class'] = pasture_data['pasture-prod-class']


Unnamed: 0,Ca-Mg,Eworms-No-species,slope,OlsenP,pasture-prod-class
0,3.64,5,25,8,MED
1,3.34,3,23,12,MED
2,3.34,4,20,9,MED
3,3.34,3,27,10,MED
4,3.64,5,8,8,MED


In [26]:
pasture_selected_features.to_csv('pasture/pasture_selected_features.csv')


# Squash Stored

In [3]:
squash_stored = pd.read_csv("squash stored/squash_stored.csv")
squash_stored.head()

Unnamed: 0,site,daf,fruit,weight,storewt,pene,solids,brix,a*,egdd,...,total,glucose+fructose,starch,sweetness,flavour,dry/moist,fibre,heat_input_emerg,heat_input_flower,Acceptability
0,P,30,1,1815,1717,8.3,22.3,10.6,22.5,651,...,57.65,37.64,92.77,667.0,748.6,610.1,376.9,847,458,excellent
1,P,30,2,1667,1590,8.4,21.6,11.3,19.9,651,...,60.98,38.6,85.1,823.6,817.8,670.9,308.4,721,458,excellent
2,P,30,9,1508,1437,7.0,19.8,11.2,20.2,651,...,58.94,40.31,70.29,740.5,837.8,693.7,331.5,847,458,ok
3,P,30,10,1508,1422,6.6,19.1,10.6,21.0,651,...,58.48,44.9,63.79,697.5,780.8,784.4,408.4,847,458,ok
4,P,40,2,1611,1504,8.0,16.3,10.1,23.9,772,...,57.87,37.82,27.87,740.5,746.5,744.5,590.5,968,568,ok


In [4]:
X = squash_stored.drop('Acceptability', axis = 1)
y = squash_stored['Acceptability']

In [7]:
make_set(X)

In [8]:
red_set = ReductSet(X, y)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for site : 12.489676203141325


Information Entropy for site : -18.739660192720883

Gain Ratio for site : 1.72883295943809


Conditional Entropy for daf : 12.736329439201631


Information Entropy for daf : -11.867587820094524

Gain Ratio for daf : 2.7507186734807005


Conditional Entropy for fruit : 8.701955000865388


Information Entropy for fruit : 8.993164684665098

Gain Ratio for fruit : -3.1813073585047227


Conditional Entropy for weight : 0.4


Information Entropy for weight : 23.348052186828596

Gain Ratio for weight : -0.8697970102260153


Conditional Entropy for storewt : -0.0


Information Entropy for storewt : 24.1480521868286

Gain Ratio for storewt : -0.8244170516396025


Conditional Entropy for pene : 1.3509775004326936


Information Entropy for pene : 17.695119685530504

Gain Ratio for pene : -1.2014071599934384


Conditional Entropy for solids : 2.5509775004326936


Information Entropy for solids : 20.397074686395896

Gain Ratio for solids : -1.101

In [9]:
squash_stored_reduced = squash_stored[list(reduct_set)]
squash_stored_reduced['Acceptability'] = squash_stored['Acceptability']
squash_stored_reduced.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  squash_stored_reduced['Acceptability'] = squash_stored['Acceptability']


Unnamed: 0,fruit,egdd,daf,Acceptability
0,1,651,30,excellent
1,2,651,30,excellent
2,9,651,30,ok
3,10,651,30,ok
4,2,772,40,ok


In [50]:
squash_stored_reduced.to_csv('squash stored/squash_stored_reduced.csv')

# Squash Unstored

In [18]:
squash_unstored_df = pd.read_csv('squash unstored/squash_unstored.csv')
squash_unstored_df.head()

Unnamed: 0,site,daf,fruit,weight,pene,solids,brix,a*,egdd,fgdd,...,total,glucose+fructose,starch,sweetness,flavour,dry/moist,fibre,heat_input_emerg,heat_input_flower,Acceptability
0,P,30,3,1712,6.9,21.2,8.4,12.2,651,262,...,?,?,?,505.5,542.9,597.4,226.1,721,332,ok
1,P,30,5,1674,7.7,24.9,8.7,11.3,651,262,...,?,?,?,339.4,393.5,264.2,312.6,721,332,not_suitable
2,P,30,6,1619,6.7,20.5,7.7,12.8,651,262,...,?,?,?,538.5,605.5,440.4,307.6,721,332,ok
3,P,30,16,2228,7.5,26.4,9.0,11.4,651,262,...,?,?,?,314.0,436.8,175.8,328.1,721,332,not_suitable
4,P,40,3,1722,7.4,22.8,9.6,17.6,772,372,...,47.77,31.17,108.98,604.3,621.6,538.1,312.6,842,442,ok


In [19]:
X = squash_unstored_df.drop('Acceptability', axis = 1)
y = squash_unstored_df['Acceptability']

In [20]:
for col in X.columns:
    s = set(X[col].values.tolist())
    if '?' in s:
        s.remove('?')  # Remove the '?' value from the set
        X[col] = X[col].apply(lambda x: s if x == '?' else x)

In [22]:
make_set(X)

In [23]:
red_set = ReductSet(X, y)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for site : 12.059502340649043


Information Entropy for site : -18.739660192720883

Gain Ratio for site : 1.7890980848076756


Conditional Entropy for daf : 11.77236562551481


Information Entropy for daf : -11.867587820094524

Gain Ratio for daf : 2.8009022515364523


Conditional Entropy for fruit : 7.003910001730778


Information Entropy for fruit : 9.189254682934324

Gain Ratio for fruit : -3.0983467978856525


Conditional Entropy for weight : 1.2


Information Entropy for weight : 22.948052186828598

Gain Ratio for weight : -0.9877782931444155


Conditional Entropy for pene : 2.452932501298081


Information Entropy for pene : 17.144142185097806

Gain Ratio for pene : -1.3952590956802156


Conditional Entropy for solids : 1.2


Information Entropy for solids : 21.748052186828595

Gain Ratio for solids : -1.0422812868649833


Conditional Entropy for brix : 2.9509775004326935


Information Entropy for brix : 17.446097185963193

Gain Ratio for brix : -1.39965776071

In [24]:
squash_unstored_selected_features = squash_unstored_df[list(reduct_set)]
squash_unstored_selected_features['Acceptability'] = squash_unstored_df['Acceptability']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  squash_unstored_selected_features['Acceptability'] = squash_unstored_df['Acceptability']


In [25]:
squash_unstored_selected_features.head()

Unnamed: 0,fruit,egdd,daf,Acceptability
0,3,651,30,ok
1,5,651,30,not_suitable
2,6,651,30,ok
3,16,651,30,not_suitable
4,3,772,40,ok


In [27]:
squash_unstored_selected_features.to_csv('squash unstored/squash_unstored_selected_features.csv')

# White Clover

In [28]:
white_clover_df = pd.read_csv('white clover/white_clover.csv')
white_clover_df.head()

Unnamed: 0,strata,plot,paddock,WhiteClover-91,BareGround-91,Cocksfoot-91,OtherGrasses-91,OtherLegumes-91,RyeGrass-91,Weeds-91,...,RyeGrass-93,Weeds-93,BareGround-94,Cocksfoot-94,OtherGrasses-94,OtherLegumes-94,RyeGrass-94,Weeds-94,strata-combined,WhiteClover-94
0,1_OldCamp,tahora,45,39.13,0.0,0.0,13.04,0.0,39.13,8.7,...,28.57,14.29,0.0,41.67,0.0,0.0,25.0,8.33,1,17.645<=WhiteClover-94<26.4675
1,6_OldEdge,prop,42,13.33,6.67,6.67,33.33,0.0,20.0,20.0,...,33.33,11.11,0.0,0.0,50.0,0.0,0.0,50.0,1,0<=WhiteClover-94<8.8225
2,6_OldEdge,prop,25,14.29,0.0,21.43,21.43,0.0,28.57,14.29,...,0.0,0.0,0.0,38.24,23.53,0.0,23.53,11.76,1,0<=WhiteClover-94<8.8225
3,7_NewEdge,tahora,42,26.67,0.0,33.33,20.0,0.0,13.33,6.67,...,0.0,0.0,0.0,16.67,50.0,16.67,0.0,16.67,1,0<=WhiteClover-94<8.8225
4,5_Steep,prop,42,24.02,5.41,14.41,31.23,2.7,12.61,9.61,...,13.58,15.4,1.64,15.57,32.51,7.65,4.64,30.33,4,0<=WhiteClover-94<8.8225


In [29]:
X = white_clover_df.drop('WhiteClover-94', axis = 1)
y = white_clover_df['WhiteClover-94']

In [30]:
make_set(X)

In [31]:
red_set = ReductSet(X, y)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for strata : 12.326830042310185


Information Entropy for strata : -10.684761022592356

Gain Ratio for strata : 3.916091098338687


Conditional Entropy for plot : 15.470380066609266


Information Entropy for plot : -26.08690553143166

Gain Ratio for plot : 1.7244685268839675


Conditional Entropy for paddock : 14.288320847575406


Information Entropy for paddock : -26.08690553143166

Gain Ratio for paddock : 1.6791561682521774


Conditional Entropy for WhiteClover-91 : 0.8


Information Entropy for WhiteClover-91 : 23.334365900693435

Gain Ratio for WhiteClover-91 : -1.2991853995586404


Conditional Entropy for BareGround-91 : 9.126349743983793


Information Entropy for BareGround-91 : -1.8597231288178633

Gain Ratio for BareGround-91 : 20.778371054989556


Conditional Entropy for Cocksfoot-91 : 4.550977500432694


Information Entropy for Cocksfoot-91 : 21.905316495148103

Gain Ratio for Cocksfoot-91 : -1.5551770271896397


Conditional Entropy for OtherGrasses-91 :

In [32]:
white_clover_selected_features = white_clover_df[list(reduct_set)]
white_clover_selected_features['Class'] = white_clover_df['WhiteClover-94']
white_clover_selected_features.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  white_clover_selected_features['Class'] = white_clover_df['WhiteClover-94']


Unnamed: 0,RyeGrass-94,OtherLegumes-91,OtherLegumes-94,OtherLegumes-92,Class
0,25.0,0.0,0.0,0.0,17.645<=WhiteClover-94<26.4675
1,0.0,0.0,0.0,0.0,0<=WhiteClover-94<8.8225
2,23.53,0.0,0.0,0.0,0<=WhiteClover-94<8.8225
3,0.0,0.0,16.67,2.22,0<=WhiteClover-94<8.8225
4,4.64,2.7,7.65,1.42,0<=WhiteClover-94<8.8225


In [48]:
white_clover_selected_features.to_csv('white clover/white_clover_selected_features')

In [2]:
df = pd.read_csv(r'C:\Users\Aditya Shakya\Downloads\NPHA-doctor-visits.csv')

In [3]:
df.head()

Unnamed: 0,Number of Doctors Visited,Age,Phyiscal Health,Mental Health,Dental Health,Employment,Stress Keeps Patient from Sleeping,Medication Keeps Patient from Sleeping,Pain Keeps Patient from Sleeping,Bathroom Needs Keeps Patient from Sleeping,Uknown Keeps Patient from Sleeping,Trouble Sleeping,Prescription Sleep Medication,Race,Gender
0,3,2,4,3,3,3,0,0,0,0,1,2,3,1,2
1,2,2,4,2,3,3,1,0,0,1,0,3,3,1,1
2,3,2,3,2,3,3,0,0,0,0,1,3,3,4,1
3,1,2,3,2,3,3,0,0,0,1,0,3,3,4,2
4,3,2,3,3,3,3,1,0,0,0,0,2,3,1,2


In [5]:
df.columns

Index(['Number of Doctors Visited', 'Age', 'Phyiscal Health', 'Mental Health',
       'Dental Health', 'Employment', 'Stress Keeps Patient from Sleeping',
       'Medication Keeps Patient from Sleeping',
       'Pain Keeps Patient from Sleeping',
       'Bathroom Needs Keeps Patient from Sleeping',
       'Uknown Keeps Patient from Sleeping', 'Trouble Sleeping',
       'Prescription Sleep Medication', 'Race', 'Gender'],
      dtype='object')

In [7]:
X = df.drop('Trouble Sleeping', axis = 1)
y = df['Trouble Sleeping']

In [8]:
make_set(X)

In [9]:
red_set = ReductSet(X, y)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for Number of Doctors Visited : 192.79616638620894


Information Entropy for Number of Doctors Visited : -813.848152000239

Gain Ratio for Number of Doctors Visited : 1.254833012228244


Conditional Entropy for Age : 193.69392806880296


Information Entropy for Age : -1022.1412897534398

Gain Ratio for Age : 1.000000000000004


Conditional Entropy for Phyiscal Health : 186.27850898392373


Information Entropy for Phyiscal Health : -753.9484936870871

Gain Ratio for Phyiscal Health : 1.3458822176382097


Conditional Entropy for Mental Health : 189.11325772042812


Information Entropy for Mental Health : -756.2893644148237

Gain Ratio for Mental Health : 1.3454646690587841


Conditional Entropy for Dental Health : 189.1195983015706


Information Entropy for Dental Health : -685.6589404325398

Gain Ratio for Dental Health : 1.4840715988393463


Conditional Entropy for Employment : 188.48993073030633


Information Entropy for Employment : -892.7588389473237

Gain Ratio

In [10]:
df_selected_features = df[list(reduct_set)]
df_selected_features['Class'] = df['Trouble Sleeping']
df_selected_features.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_features['Class'] = df['Trouble Sleeping']


Unnamed: 0,Dental Health,Class
0,3,2
1,3,3
2,3,3
3,3,3
4,3,2


In [11]:
df_selected_features.to_csv('df_selected_features.csv')

In [12]:
wdbc = pd.read_csv(r"C:\Users\Aditya Shakya\Downloads\wdbc.data")

In [13]:
wdbc.head()

Unnamed: 0,842302,M,17.99,10.38,122.8,1001,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
0,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [14]:
len(wdbc.columns)

32

In [18]:
wdbc.columns = np.arange(0,32)

In [19]:
wdbc.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22,23,24,25,26,27,28,29,30,31
0,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
1,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
2,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
3,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678
4,843786,M,12.45,15.7,82.57,477.1,0.1278,0.17,0.1578,0.08089,...,15.47,23.75,103.4,741.6,0.1791,0.5249,0.5355,0.1741,0.3985,0.1244


In [20]:
wdbc.shape

(568, 32)

In [21]:
X = wdbc.drop(1, axis = 1)
y = wdbc[1]

In [22]:
make_set(X)

In [23]:
red_set = ReductSet(X, y)
reduct_set = red_set.Reduct()
print("Reduct set:", reduct_set)


Conditional Entropy for 0 : -0.0


Information Entropy for 0 : 263.77103157920396

Gain Ratio for 0 : -2.5306632841350463


Conditional Entropy for 2 : 10.452932501298081


Information Entropy for 2 : 216.45734657314478

Gain Ratio for 2 : -3.1321117452029643


Conditional Entropy for 3 : 13.305865002596159


Information Entropy for 3 : 226.41223407530865

Gain Ratio for 3 : -3.0069997445988736


Conditional Entropy for 4 : 2.9509775004326935


Information Entropy for 4 : 244.51809907790533

Gain Ratio for 4 : -2.7419918814383317


Conditional Entropy for 5 : 2.8


Information Entropy for 5 : 251.62005407877092

Gain Ratio for 5 : -2.6639993679759386


Conditional Entropy for 6 : 20.080725598781612


Information Entropy for 6 : 221.4334634773923

Gain Ratio for 6 : -3.105205418533773


Conditional Entropy for 7 : 4.950977500432694


Information Entropy for 7 : 250.66907657833826

Gain Ratio for 7 : -2.682686878316213


Conditional Entropy for 8 : 2.0


Information Entropy for 8 : 245.