In [1]:
from collections import Counter
from pathlib import Path

import pandas as pd
import numpy as np

In [2]:
# Read the mushroom CSV into a DataFrame; the bare `data` expression at the
# end makes the notebook render a preview of the table.
MUSHROOMS_CSV = "data/mushrooms.csv"
data = pd.read_csv(MUSHROOMS_CSV)
data

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [3]:
# Dimensions of the loaded table, displayed as (rows, columns).
n_rows, n_columns = data.shape
(n_rows, n_columns)

(8124, 23)

To convert the 22 features to numerical values, the method below is used.

In [4]:
# For every column, collect its distinct values in order of first appearance.
# dict.fromkeys keeps one key per distinct value and preserves insertion order.
possible_values = {
    column_name: list(dict.fromkeys(data[column_name]))
    for column_name in data.columns
}
possible_values

{'class': ['p', 'e'],
 'cap-shape': ['x', 'b', 's', 'f', 'k', 'c'],
 'cap-surface': ['s', 'y', 'f', 'g'],
 'cap-color': ['n', 'y', 'w', 'g', 'e', 'p', 'b', 'u', 'c', 'r'],
 'bruises': ['t', 'f'],
 'odor': ['p', 'a', 'l', 'n', 'f', 'c', 'y', 's', 'm'],
 'gill-attachment': ['f', 'a'],
 'gill-spacing': ['c', 'w'],
 'gill-size': ['n', 'b'],
 'gill-color': ['k', 'n', 'g', 'p', 'w', 'h', 'u', 'e', 'b', 'r', 'y', 'o'],
 'stalk-shape': ['e', 't'],
 'stalk-root': ['e', 'c', 'b', 'r', '?'],
 'stalk-surface-above-ring': ['s', 'f', 'k', 'y'],
 'stalk-surface-below-ring': ['s', 'f', 'y', 'k'],
 'stalk-color-above-ring': ['w', 'g', 'p', 'n', 'b', 'e', 'o', 'c', 'y'],
 'stalk-color-below-ring': ['w', 'p', 'g', 'b', 'n', 'e', 'y', 'o', 'c'],
 'veil-type': ['p'],
 'veil-color': ['w', 'n', 'o', 'y'],
 'ring-number': ['o', 't', 'n'],
 'ring-type': ['p', 'e', 'l', 'f', 'n'],
 'spore-print-color': ['k', 'n', 'u', 'h', 'w', 'r', 'o', 'y', 'b'],
 'population': ['s', 'n', 'a', 'v', 'y', 'c'],
 'habitat': ['u'

Creating labels for the class data

In [5]:
# Build the target array from the "class" column: "p" -> 0, "e" -> 1.
# The result is a column vector with one row per data point.
label_codes = {"p": 0, "e": 1}

# Create a list with all the data points labels
labels_data = list(data["class"])

# Fail loudly on any value outside the mapping. Silently skipping such rows
# would make `labels` shorter than `data` and misalign every following row.
unexpected_labels = set(labels_data) - label_codes.keys()
if unexpected_labels:
    raise ValueError(
        "Unexpected values in 'class' column: {}".format(unexpected_labels)
    )

# Convert to numbers (each label wrapped in a list -> shape (n_rows, 1))
labels = np.array([[label_codes[label]] for label in labels_data])
print("Labels array shape = {}".format(labels.shape))

# Delete "class" column from the data
# NOTE: this reassigns `data`, so re-running this cell without reloading the
# CSV first raises KeyError on data["class"].
data = data.drop(labels=["class"], axis=1)

Labels array shape = (8124, 1)


In [32]:
# One-hot encode every column still present in `data`.
# Each distinct value of a column gets its own 0/1 indicator column, ordered by
# the value's first appearance; the per-feature groups follow the order of
# data.columns.
indicator_rows = []
for feature_name in data.columns:
    feature_values = list(data[feature_name])

    # Map each distinct value to its indicator slot (first-appearance order).
    slot_of = {
        value: slot
        for slot, value in enumerate(dict.fromkeys(feature_values))
    }

    # One row per distinct value, one entry per data point.
    indicators = np.zeros((len(slot_of), len(feature_values)))
    for row, value in enumerate(feature_values):
        indicators[slot_of[value], row] = 1

    indicator_rows.extend(indicators)

# Rows were built per indicator; transpose so each row is one data point.
data_matrix = np.array(indicator_rows).transpose()
data_matrix.shape


(8124, 117)

Splitting the available data into training, testing and validation sets

In [36]:
# Row counts for the split: `split_instance` is the `data_split` fraction of
# all rows (rounded); two such chunks are held out and the remainder is the
# number of training rows.
# NOTE(review): the cells below take the splits as contiguous row ranges of
# data_matrix with no shuffling - confirm the CSV row order is not sorted in a
# way that biases the splits.
data_split = 0.1
total_instances = data_matrix.shape[0]
split_instance = round(total_instances * data_split)
train_instance = total_instances - 2 * split_instance


In [38]:
# Training split: the first `train_instance` rows of the features and labels.
training_rows = slice(None, train_instance)
training_data = data_matrix[training_rows, :]
training_labels = labels[training_rows, :]
print("Training data shape = {}".format(training_data.shape))

Training data shape = (6500, 117)


In [39]:
# Validation split: the `split_instance` rows right after the training rows.
validation_rows = slice(train_instance, train_instance + split_instance)
validation_data = data_matrix[validation_rows, :]
validation_labels = labels[validation_rows, :]
print("Validation data shape = {}".format(validation_data.shape))

Validation data shape = (812, 117)


In [40]:
# Test split: the `split_instance` rows that follow the validation rows.
test_start = train_instance + split_instance
test_rows = slice(test_start, test_start + split_instance)
test_data = data_matrix[test_rows, :]
test_labels = labels[test_rows, :]
print("Testing data shape = {}".format(test_data.shape))

Testing data shape = (812, 117)


In [41]:
# Write every split to its own .npy file under training_data/.
output_dir = Path("training_data")
output_dir.mkdir(exist_ok=True)

arrays_to_save = {
    "mushrooms_training_data.npy": training_data,
    "mushrooms_training_labels.npy": training_labels,
    "mushrooms_validation_data.npy": validation_data,
    "mushrooms_validation_labels.npy": validation_labels,
    "mushrooms_test_data.npy": test_data,
    "mushrooms_test_labels.npy": test_labels,
}

for file_name, array in arrays_to_save.items():
    # Pass the path rather than an open() handle: np.save then opens and
    # closes the file itself, so no file handle is left open or unflushed.
    np.save(output_dir / file_name, array)