# Assignment 2: Practical data mining project - Exploring Mushrooms.txt and Constructing ID3 #
by Luke Crawford (12617306) and Jonathan Rau (13112750)

## Importing and Exploring the data ##

In [2]:
import pandas as pd
import torch
import numpy as np
from skimage import io, transform
from math import log

In [3]:
#Load the data from Mushrooms.txt
shrooms = pd.read_csv('Data/Mushrooms.txt', header=None)
#Input column names from Mushroom Attributes.txt
columns=['class','cap-shape','cap-surface','cap-color','bruises','odor','gill-attachment','gill-spacing','gill-size','gill-color','stalk-shape','stalk-root','stalk-surface-above-ring','stalk-surface-below-ring','stalk-color-above-ring','stalk-color-below-ring','veil-type','veil-color','ring-number','ring-type','spore-print-color','population','habitat']

#Rename columns according to their real attributes
shrooms.set_axis(columns, axis='columns', inplace=True)

In [4]:
shrooms.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,9,9,1,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,w,w,p,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,4936,4464,4384,8124,7924,7488,3968,2388,4040,3148


In [5]:
print('Edible Mushrooms: {}'.format(shrooms['class'].value_counts()[0]))
print('Poisonous Mushrooms: {}'.format(shrooms['class'].value_counts()[1]))

Edible Mushrooms: 4208
Poisonous Mushrooms: 3916


# Preprocessing

In [6]:
#Remove columns with only 1 value, as this data is irrelevant 
nonuniques = []
for attr in shrooms:
    if shrooms[attr].value_counts()[0] == shrooms[attr].count(): #If there are no unique values
        nonuniques.append(attr)

#Remove nonunique columns
shrooms.drop(columns=nonuniques,inplace=True)

shrooms.describe()

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-color,ring-number,ring-type,spore-print-color,population,habitat
count,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124,...,8124,8124,8124,8124,8124,8124,8124,8124,8124,8124
unique,2,6,4,10,2,9,2,2,2,12,...,4,4,9,9,4,3,5,9,6,7
top,e,x,y,n,f,n,f,c,b,b,...,s,s,w,w,w,o,p,w,v,d
freq,4208,3656,3244,2284,4748,3528,7914,6812,5612,1728,...,5176,4936,4464,4384,7924,7488,3968,2388,4040,3148


In [7]:
#Check to see if there are any missing values
shrooms.isnull().sum()

class                       0
cap-shape                   0
cap-surface                 0
cap-color                   0
bruises                     0
odor                        0
gill-attachment             0
gill-spacing                0
gill-size                   0
gill-color                  0
stalk-shape                 0
stalk-root                  0
stalk-surface-above-ring    0
stalk-surface-below-ring    0
stalk-color-above-ring      0
stalk-color-below-ring      0
veil-color                  0
ring-number                 0
ring-type                   0
spore-print-color           0
population                  0
habitat                     0
dtype: int64

In [8]:
#Encode values to numerical data
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

for attr in shrooms:
    shrooms[attr] = encoder.fit_transform(shrooms[attr].astype('str'))


In [9]:
shrooms['class']
encoder.inverse_transform(shrooms['class'])

  if diff:


array(['g', 'd', 'd', ..., 'd', 'g', 'd'], dtype=object)

In [10]:
shrooms['population'].head()

0    3
1    2
2    2
3    3
4    0
Name: population, dtype: int64

In [11]:
#Split the data into training and test data
#y is our target class
y = shrooms.iloc[:,0]
#x is our attributes
x = shrooms.iloc[:,1:]


from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,train_size = 0.1)



## Test using a basic neural network ##

In [12]:
from sklearn.neural_network import MLPClassifier
mlpc = MLPClassifier()
mlpc.fit(x_train,y_train)

predictions = mlpc.predict(x_test)

In [13]:
print ("Your model accuracy is: " + str(np.sum(predictions == y_test) / float(predictions.size)))


Your model accuracy is: 0.9302516411378556


Compared to In[13] of Assignment2_Final:

|          | Exploration (NN) | Final (ID3) |
|----------|------------------|-------------|
| Accuracy | 93%              | 99.8%       |

# ID3 Algorithm #

# Training the Algorithm on the Mushroom Set #

In [14]:
#TODO

class DecisionTreeNode:
    def __init__(self, parent):
        self.parent=parent
        self.children= dict()
    
    def add_child(self, child_key, child_value):
        self.children[child_key] = child_value
        
    def get_children(self):
        return self.children
    
    def get_parent(self):
        return self.parent
    
    def get_attribute(self):
        return self.attribute
    
    def set_attribute(self, attribute):
        self.attribute = attribute
        
    def set_label(self, label):
        self.label=label
    
    def get_label(self):
        return self.label
    
    def __str__(self, level=1):
        text=""
        if not hasattr(self, 'parent'):
            text += "root: "
        if hasattr(self, 'label'):
            text += "leaf: label = {}".format(self.label)
        else:
            text += "split on {}, descendants (".format(self.attribute)
            for value, child in self.children.items():
                text += "\n"+"\t"*level+"branch value = {}, child node: {}".format(value, child.__str__(level+1))
            
            text += ")"
        return text
            
        

In [15]:
def entropy(target_):
    h = 0
    for label_ in target_.unique():
        h += -((target_[target_==label_].size / target_.size)* log(target_[target_==label_].size / target_.size, 2))
    return h

def determine_split_attribute(data_, target_, attributes_):
    best_attribute_ = None
    best_gain_ = 0
    base_entropy_ = entropy(target_)
    for attribute_ in attributes_:
        x_select_ = data_.loc[:,attribute_]
        information_gain_ = base_entropy_
        for value_ in x_select_.unique():
            split_ = pd.concat([x_select_[x_select_==value_], target_], axis=1, join='inner')
            split_entropy_ = entropy(split_['class'])
            information_gain_ -= split_entropy_ * (split_.size / data_.size)
        
        if information_gain_ >= best_gain_:
            best_attribute_ = attribute_
            
    return best_attribute_

In [16]:
#param attributes_ should be a set of attributes
#param target_ should be a series (like y_train)
#param data_ should be a dataframe (like x_train)
def build_decision_tree(data_, target_, attributes_):
    node_ = DecisionTreeNode(None)
    if target_.unique().size==1:
        node_.set_label(target_.iloc[0])
        
    if len(attributes_) == 0:
        node_.set_label(target_.value_counts().head(1).last_valid_index())
        
    else:
        split_attribute_ = determine_split_attribute(data_, target_, attributes_)
        #print('split attribute: {}'.format(split_attribute_))
        node_.set_attribute(split_attribute_)
        split_select_ = data_.loc[:,split_attribute_]
        for split_value_ in split_select_.unique():
            child_data_ = data_[data_[split_attribute_] == split_value_]
            child_target_ = pd.concat([child_data_, target_], axis=1, join='inner').loc[:,'class']
            child_attributes_ = attributes_
            child_attributes_.remove(split_attribute_)
            #print('child_attributes: {}'.format(child_attributes_))
            node_.add_child(split_value_, build_decision_tree(child_data_,child_target_,child_attributes_))
            #print('currend subtree: {}'.format(node_))
            child_attributes_.add(split_attribute_)
            
    return node_
            

In [17]:
#data should be a dataframe (like x_train)
#root should be a a DecisionTreeNode (returned from build_decision_tree)
def make_prediction(root, data):
    predictions = dict()
    for i, point in data.iterrows():
        current_node_ = root
        not_predicted = True
        while not_predicted:
            if(hasattr(current_node_, 'label')):
                predictions[i]=current_node_.get_label()
                not_predicted = False
            else:
                split_value = point[current_node_.get_attribute()]
                #missing one edgecase: what if the split_value is not a branch in the decision tree?
                current_node_ = current_node_.get_children()[split_value]
    result = pd.Series(predictions)
    return result
        
    

In [18]:
attributes = {'cap-shape','cap-surface'}


# testing some methods, can be removed
for attribute_ in attributes:
    x_select = x_train.loc[:,attribute_]
    for value_ in x_select.unique():
        split = pd.concat([x_select[x_select==value_], y_train], axis=1, join='inner')
        ##split = pd.merge(left = x_select[x_select==value_], right=y_train, left_index=True, right_index=True)
        split_entropy = entropy(split['class'])
        print(split_entropy)
        
determine_split_attribute(x_train, y_train, attributes)

0.99878651470097
0.9986210111652296
0.672294817075638
0.8951127210954559
0.0
0.9877446975059743
0.9867089261132442
0.9059282160429992


'cap-surface'

In [19]:
tree= build_decision_tree(x_train, y_train, attributes)
print(tree)

split on cap-surface, descendants (
	branch value = 3, child node: split on cap-shape, descendants (
		branch value = 2, child node: leaf: label = 0
		branch value = 5, child node: leaf: label = 1
		branch value = 3, child node: leaf: label = 1
		branch value = 0, child node: leaf: label = 0)
	branch value = 2, child node: split on cap-shape, descendants (
		branch value = 5, child node: leaf: label = 1
		branch value = 0, child node: leaf: label = 0
		branch value = 2, child node: leaf: label = 1
		branch value = 3, child node: leaf: label = 1)
	branch value = 0, child node: split on cap-shape, descendants (
		branch value = 2, child node: leaf: label = 0
		branch value = 5, child node: leaf: label = 0
		branch value = 3, child node: leaf: label = 0
		branch value = 0, child node: leaf: label = 0
		branch value = 4, child node: leaf: label = 0))


In [42]:
training_results = make_prediction(tree,x_train)

# Testing with an interactive example #

## Define a dictionary of dictionaries to map attribute descriptions ##

In [22]:
results = []
with open('Data/Mushroom Attributes.txt') as input:
    for line in input:
        x = line.expandtabs()
        x = x.replace(' ','')
        x = x.replace('\n','')
        results.append(x)

print(results)

['AttributeInformation:', '0.class:edible=e,poisonous=p)', '1.cap-shape:bell=b,conical=c,convex=x,flat=f,', 'knobbed=k,sunken=s', '2.cap-surface:fibrous=f,grooves=g,scaly=y,smooth=s', '3.cap-color:brown=n,buff=b,cinnamon=c,gray=g,green=r,', 'pink=p,purple=u,red=e,white=w,yellow=y', '4.bruises?:bruises=t,no=f', '5.odor:almond=a,anise=l,creosote=c,fishy=y,foul=f,', 'musty=m,none=n,pungent=p,spicy=s', '6.gill-attachment:attached=a,descending=d,free=f,notched=n', '7.gill-spacing:close=c,crowded=w,distant=d', '8.gill-size:broad=b,narrow=n', '9.gill-color:black=k,brown=n,buff=b,chocolate=h,gray=g,', 'green=r,orange=o,pink=p,purple=u,red=e,', 'white=w,yellow=y', '10.stalk-shape:enlarging=e,tapering=t', '11.stalk-root:bulbous=b,club=c,cup=u,equal=e,', 'rhizomorphs=z,rooted=r,missing=?', '12.stalk-surface-above-ring:fibrous=f,scaly=y,silky=k,smooth=s', '13.stalk-surface-below-ring:fibrous=f,scaly=y,silky=k,smooth=s', '14.stalk-color-above-ring:brown=n,buff=b,cinnamon=c,gray=g,orange=o,', 'pink=

In [23]:
del results[0] #Remove 'Attribute Information' text from results

In [24]:
def addDictionaryEntry(string, dictionary):
    #Creates a key and value based on the letter and actual value
    #E.G dictionary['cap-color']['brown'] = n
    x = string.split('=')
    dictionary[x[0]] = x[1]
    return dictionary



In [25]:
#Turn raw data into a dictionary of dictionaries as the data structure.

import re
dictionaryformat = re.compile(r'\d+.\w+[-\w]+:') #Split data by name formatting

dictofdict = {}
for attr in results:
    searchObj = re.search(dictionaryformat,attr) #Search for attribute names
    if searchObj: # If regex query returns anything
        x = attr.split(':') #Split into attribute name and data
        attrName = x[0]
        entries = x[1]
        currentDict = {}
        dictofdict[attrName.split('.')[1]] = currentDict #Create entry for attribute name in master dictionary
        for entry in entries.split(','):
            if (entry != ''):
                currentDict = addDictionaryEntry(entry,currentDict)
    else: #Moments where formatting errors occur such as /n creating a new entry
        for entry in attr.split(','):
            if (entry != ''):
                currentDict = addDictionaryEntry(entry,currentDict)
                
print(dictofdict['cap-color']['brown'])

n


In [28]:
def predictionAsString(prediction):
    if (str(prediction) == "1"):
        print("This Mushroom is Poisonous! Don't eat it!")
    else:
        print("This Mushroom is not Poisonous :)")

In [1]:
x = training_results.sample()
print(dictofdict)


NameError: name 'training_results' is not defined

In [57]:
print("This mushroom is classifed as: {}",predictionAsString(y_test(x)))
#predictionAsString(prediction)

TypeError: 'Series' object is not callable