In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder,MultiLabelBinarizer
label_encoder = LabelEncoder()

In [2]:
df = pd.read_csv('/home/sxluong/151A/CSE151A_MushroomAnalysis/primary_data.csv',sep=';', on_bad_lines='skip')


FileNotFoundError: [Errno 2] No such file or directory: '/home/sxluong/151A/CSE151A_MushroomAnalysis/primary_data.csv'

### Basic Info
Number of Observations (Rows): 173
Number of Features (Columns): 23
Data Types: All columns are recognized as object type. Either strings or mixed types.

### Missing Data
Cap-surface: 40 missing values
Gill-attachment: 28 missing values
Gill-spacing: 71 missing values
Stem-root: 146 missing values (most missing)
Stem-surface: 108 missing values
Veil-type: 164 missing values (almost all missing)
Veil-color: 152 missing values
Spore-print-color: 155 missing values
Column Descriptions

### Column Descriptions
Categorical Variables: All columns contain categorical data, with some encoded as strings that represent ranges or lists (e.g., cap-diameter, cap-shape).
Fungal Characteristics: The dataset appears to describe various physical characteristics of fungi, such as cap diameter, shape, color, gill attachment, and more.
Classification: The class column, with two unique values (p for poisonous, e for edible), is used for classification purposes.
High Cardinality: Some columns have a high number of unique values, suggesting a wide variety of categories or ranges (e.g., cap-color, habitat).

In [324]:
# Keep naming consistent
df = df.rename(columns={"Cap-surface": "cap-surface"})

In [325]:
# Because of multicolinearity and name has no correlation
df = df.drop(columns=['Spore-print-color',
                     'has-ring','name'])

#Binary Encoding
binary_mapping = {'[t]': 1, '[f]': 0}
df['does-bruise-or-bleed'] = df['does-bruise-or-bleed'].map(binary_mapping)
binary_mapping = {'p': 1, 'e': 0}
df['class'] = df['class'].map(binary_mapping)
                                 

In [326]:
# Too many Nan values where imputation is not good to do
df = df.drop(columns=['stem-root','veil-type','veil-color','stem-surface'])

In [327]:
#Only small subset of families therefore used a label encoder
df['family'] = label_encoder.fit_transform(df['family'])

In [328]:
df['gill-attachment'].fillna('[?]', inplace=True)
df['ring-type'].fillna('[?]', inplace=True)
df['cap-surface'].fillna('[?]', inplace=True)
df['gill-spacing'].fillna('[?]', inplace=True)
# As per the ? key used, we simply filled in the Nans as they are essentially unknown

In [329]:
def convert_dimensions(l):
    
    if len(l) ==2 :
        
        return (float(l[0])+float(l[1]))/2.0
    return float(l[0])

df['cap-diameter'] = df['cap-diameter'].apply(lambda x: convert_dimensions(x.strip('[]').split(', ')))
df['stem-height'] = df['stem-height'].apply(lambda x: convert_dimensions(x.strip('[]').split(', ')))
df['stem-width'] = df['stem-width'].apply(lambda x: convert_dimensions(x.strip('[]').split(', ')))

In [330]:
columns_to_apply_multilabeling = ['cap-shape','cap-surface','cap-color','gill-attachment', 'gill-spacing', 'gill-color','stem-color','ring-type','season']

for col in columns_to_apply_multilabeling:
    mlb = MultiLabelBinarizer()
    column = df[col].apply(lambda x: x.strip('[]').split(', '))
    
    encoded_data = mlb.fit_transform(column)
    encoded_df = pd.DataFrame(encoded_data, columns=mlb.classes_+"_"+col)
    df = df.drop(col, axis=1)
    df = pd.concat([df, encoded_df], axis=1)
