In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer

from scipy.stats import chi2_contingency
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('primary_data.csv',sep=';', on_bad_lines='skip')
df.head()

Unnamed: 0,family,name,class,cap-diameter,cap-shape,Cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,Spore-print-color,habitat,season
0,Amanita Family,Fly Agaric,p,"[10, 20]","[x, f]","[g, h]","[e, o]",[f],[e],,...,[s],[y],[w],[u],[w],[t],"[g, p]",,[d],"[u, a, w]"
1,Amanita Family,Panther Cap,p,"[5, 10]","[p, x]",[g],[n],[f],[e],,...,,[y],[w],[u],[w],[t],[p],,[d],"[u, a]"
2,Amanita Family,False Panther Cap,p,"[10, 15]","[x, f]",,"[g, n]",[f],[e],,...,,,[w],[u],[w],[t],"[e, g]",,[d],"[u, a]"
3,Amanita Family,The Blusher,e,"[5, 15]","[x, f]",,[n],[t],,,...,[b],,[w],[u],[w],[t],[g],,[d],"[u, a]"
4,Amanita Family,Death Cap,p,"[5, 12]","[x, f]",[h],[r],[f],,[c],...,,,[w],[u],[w],[t],"[g, p]",,[d],"[u, a]"


In [3]:
df.columns

Index(['family', 'name', 'class', 'cap-diameter', 'cap-shape', 'Cap-surface',
       'cap-color', 'does-bruise-or-bleed', 'gill-attachment', 'gill-spacing',
       'gill-color', 'stem-height', 'stem-width', 'stem-root', 'stem-surface',
       'stem-color', 'veil-type', 'veil-color', 'has-ring', 'ring-type',
       'Spore-print-color', 'habitat', 'season'],
      dtype='object')

In [4]:
# Keep naming consistent
df = df.rename(columns={"Cap-surface": "cap-surface"})

In [5]:
# Because of multicolinearity and name has no correlation
df = df.drop(columns=['Spore-print-color',
                     'has-ring','name'])

#Binary Encoding
binary_mapping = {'[t]': 1, '[f]': 0}
df['does-bruise-or-bleed'] = df['does-bruise-or-bleed'].map(binary_mapping)
binary_mapping = {'p': 1, 'e': 0}
df['class'] = df['class'].map(binary_mapping)
                                 

In [6]:
# Too many Nan values where imputation is not good to do
df = df.drop(columns=['stem-root','veil-type','veil-color','stem-surface'])

In [7]:
#Only small subset of families therefore used a label encoder
label_encoder = LabelEncoder()
df['family'] = label_encoder.fit_transform(df['family'])

In [8]:
df['gill-attachment'].fillna('[?]', inplace=True)
df['ring-type'].fillna('[?]', inplace=True)
df['cap-surface'].fillna('[?]', inplace=True)
df['gill-spacing'].fillna('[?]', inplace=True)
# As per the ? key used, we simply filled in the Nans as they are essentially unknown

In [9]:
def convert_dimensions(l):
    if len(l) ==2 :
        
        return (float(l[0])+float(l[1]))/2.0
    return float(l[0])

df['cap-diameter'] = df['cap-diameter'].apply(lambda x: convert_dimensions(x.strip('[]').split(', ')))
df['stem-height'] = df['stem-height'].apply(lambda x: convert_dimensions(x.strip('[]').split(', ')))
df['stem-width'] = df['stem-width'].apply(lambda x: convert_dimensions(x.strip('[]').split(', ')))

In [10]:
columns_to_apply_multilabeling = ['cap-shape','cap-surface','cap-color','gill-attachment', 'gill-spacing', 'gill-color','stem-color','ring-type','season']

for col in columns_to_apply_multilabeling:
    mlb = MultiLabelBinarizer()
    column = df[col].apply(lambda x: x.strip('[]').split(', '))
    
    encoded_data = mlb.fit_transform(column)
    encoded_df = pd.DataFrame(encoded_data, columns=mlb.classes_+"_"+col)
    df = df.drop(col, axis=1)
    df = pd.concat([df, encoded_df], axis=1)


In [11]:
df.head()

Unnamed: 0,family,class,cap-diameter,does-bruise-or-bleed,stem-height,stem-width,habitat,b_cap-shape,c_cap-shape,f_cap-shape,...,g_ring-type,l_ring-type,m_ring-type,p_ring-type,r_ring-type,z_ring-type,a_season,s_season,u_season,w_season
0,0,1,15.0,0,17.5,17.5,[d],0,0,1,...,1,0,0,1,0,0,1,0,1,1
1,0,1,7.5,0,8.0,15.0,[d],0,0,0,...,0,0,0,1,0,0,1,0,1,0
2,0,1,12.5,0,11.0,15.0,[d],0,0,1,...,1,0,0,0,0,0,1,0,1,0
3,0,0,10.0,1,11.0,17.5,[d],0,0,1,...,1,0,0,0,0,0,1,0,1,0
4,0,1,8.5,0,11.0,15.0,[d],0,0,1,...,1,0,0,1,0,0,1,0,1,0


In [12]:
X = df.drop(columns = {'habitat','cap-diameter','stem-height','stem-width'})

In [13]:
y = df['habitat']

In [14]:
#Feature Selection using Chi-Squared Test and SelectKBest
selector = SelectKBest(score_func=chi2, k=10)  # Specify the desired number of features
X_selected = selector.fit_transform(X, y)
selected_feature_indices = selector.get_support(indices=True)
selected_feature_names = X.columns[selected_feature_indices]
X_selected_df = pd.DataFrame(X_selected, columns=selected_feature_names)

In [15]:
X_selected_df.head(15)

Unnamed: 0,family,p_cap-shape,y_cap-surface,b_gill-color,k_gill-color,r_stem-color,e_ring-type,l_ring-type,m_ring-type,r_ring-type
0,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,1,1,0,0
7,0,0,0,0,0,0,0,0,0,0
8,12,1,1,0,0,0,0,0,1,0
9,12,0,1,0,0,0,0,0,0,0


In [16]:
#Encoding y
mlb = MultiLabelBinarizer()
y_transformed = mlb.fit_transform(y)

In [27]:
X = df
X = X.drop(columns = {'habitat'})


In [29]:
X.head()

Unnamed: 0,family,class,cap-diameter,does-bruise-or-bleed,stem-height,stem-width,b_cap-shape,c_cap-shape,f_cap-shape,o_cap-shape,...,g_ring-type,l_ring-type,m_ring-type,p_ring-type,r_ring-type,z_ring-type,a_season,s_season,u_season,w_season
0,0,1,15.0,0,17.5,17.5,0,0,1,0,...,1,0,0,1,0,0,1,0,1,1
1,0,1,7.5,0,8.0,15.0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
2,0,1,12.5,0,11.0,15.0,0,0,1,0,...,1,0,0,0,0,0,1,0,1,0
3,0,0,10.0,1,11.0,17.5,0,0,1,0,...,1,0,0,0,0,0,1,0,1,0
4,0,1,8.5,0,11.0,15.0,0,0,1,0,...,1,0,0,1,0,0,1,0,1,0


In [30]:
#Feature importance Using RandomForests
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X, y)
feature_importances = rf_classifier.feature_importances_
feature_importances_with_names = list(zip(X.columns, feature_importances))
for name, importance in feature_importances_with_names:
    print(f"Feature: {name}, Importance: {importance}")

Feature: family, Importance: 0.07126171893972132
Feature: class, Importance: 0.01573003915773229
Feature: cap-diameter, Importance: 0.0653548068308416
Feature: does-bruise-or-bleed, Importance: 0.00847594128786664
Feature: stem-height, Importance: 0.053380033211973724
Feature: stem-width, Importance: 0.06542689251108498
Feature: b_cap-shape, Importance: 0.012432696914615933
Feature: c_cap-shape, Importance: 0.013799406459122277
Feature: f_cap-shape, Importance: 0.015314325137511315
Feature: o_cap-shape, Importance: 0.004047084186206161
Feature: p_cap-shape, Importance: 0.015754722470163695
Feature: s_cap-shape, Importance: 0.009943324317751516
Feature: x_cap-shape, Importance: 0.01550139540479517
Feature: ?_cap-surface, Importance: 0.01145987717862232
Feature: d_cap-surface, Importance: 0.007536015178556566
Feature: e_cap-surface, Importance: 0.005891100596095576
Feature: g_cap-surface, Importance: 0.008078662911047219
Feature: h_cap-surface, Importance: 0.009123828889607324
Feature: i

In [31]:
#Top 10 important features
for i, (name, importance) in enumerate(feature_importances_with_names[:10], start=1):
    print(f"{i}. Feature: {name}, Importance: {importance}")

1. Feature: family, Importance: 0.07126171893972132
2. Feature: class, Importance: 0.01573003915773229
3. Feature: cap-diameter, Importance: 0.0653548068308416
4. Feature: does-bruise-or-bleed, Importance: 0.00847594128786664
5. Feature: stem-height, Importance: 0.053380033211973724
6. Feature: stem-width, Importance: 0.06542689251108498
7. Feature: b_cap-shape, Importance: 0.012432696914615933
8. Feature: c_cap-shape, Importance: 0.013799406459122277
9. Feature: f_cap-shape, Importance: 0.015314325137511315
10. Feature: o_cap-shape, Importance: 0.004047084186206161
