In [None]:
# Import statements
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestClassifier

In [None]:
def get_data(predictor_columns, shuffle_data, train_size, random_seed=0):
    """Read the Iris dataset from 'iris_data.csv' and split it into a train set and a test set.

    Parameters
    ----------
    predictor_columns : list of strings
        The names of the columns used for predicting the classes (the Iris species).
    shuffle_data : bool
        Whether or not the rows are shuffled before they are split into train and test sets.
    train_size : int
        The number of rows used for training. Must lie between 0 and the number of rows in the
        file (inclusive). The remaining rows are used for testing.
    random_seed : int
        Seed for data shuffling.

    Returns
    -------
    iris_classes : array of strings
        The distinct values of the 'Species' column, in order of first appearance.
    x_testset : numpy array of floats
        Test dataset.
    x_trainset : numpy array of floats
        Train dataset.
    y_testset : numpy array of floats
        Labels test dataset, one hot encoded. Column i belongs to iris_classes[i].
    y_trainset : numpy array of floats
        Labels train dataset, one hot encoded. Column i belongs to iris_classes[i].

    Raises
    ------
    ValueError
        If train_size is negative or larger than the number of rows in the dataset.
    """

    # Load the dataset from the file
    dataset = pd.read_csv('iris_data.csv')
    iris_classes = dataset['Species'].unique()  # All Iris species, in order of first appearance
    dataset = pd.get_dummies(dataset, columns=['Species'])  # One Hot encoding

    # Classification columns, e.g. 'Species_Iris-setosa', 'Species_Iris-versicolor', 'Species_Iris-virginica'.
    # They are selected by name (get_dummies names them 'Species_<value>') and in the order of iris_classes, so
    # the label columns always line up with the returned class list. Missing species values get no dummy column.
    label_columns = ['Species_' + str(name) for name in iris_classes if pd.notna(name)]
    y = np.array(dataset[label_columns], dtype='float32')

    # Predictor columns: (a subset of) 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'
    x = np.array(dataset[predictor_columns], dtype='float32')

    # Validate against the real row count instead of assuming the file has exactly 150 rows
    n_rows = len(x)
    if not 0 <= train_size <= n_rows:
        raise ValueError(
            "train_size must be between 0 and " + str(n_rows) + " (the number of rows), got " + str(train_size)
        )

    # Shuffle the data; initially the data is ordered based on their class
    if shuffle_data:
        # A private generator yields the same permutation as seeding the global one,
        # without resetting numpy's global random state as a side effect.
        rng = np.random.RandomState(random_seed)
        indices = rng.choice(n_rows, n_rows, replace=False)
        x_values, y_values = x[indices], y[indices]
    else:
        x_values, y_values = x, y

    # Split from the front. Negative slicing (x[-test_size:]) breaks when the test set is empty,
    # because x[-0:] is the whole array and x[:-0] is empty.
    x_trainset, x_testset = x_values[:train_size], x_values[train_size:]
    y_trainset, y_testset = y_values[:train_size], y_values[train_size:]

    return iris_classes, x_testset, x_trainset, y_testset, y_trainset

In [None]:
# Parameter values
# Play with these parameters to improve results!
# These module-level names are read by the training cells below and passed on to get_data and the models.

# Columns used to predict the Iris classes. All columns: SepalLengthCm, SepalWidthCm, PetalLengthCm, PetalWidthCm.
predictor_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm']

# Whether or not the input data is shuffled before it is split. Initially, the first 50 rows are Iris Setosa, the
# next 50 rows are Iris Versicolor and the last 50 rows are Iris Virginica.
# NOTE: without shuffling, the training rows are simply the first train_size rows of the file, so (given the
# ordering described above) a small train_size leaves whole species out of the training set.
shuffle_data = False

# Number of rows to be used for training the model. In total there are 150 rows in the dataset. The remaining rows
# will be used to test the trained model.
train_size = 75

# The maximum depth of the decision tree(s).
max_depth = 1

# Number of trees in the random forest.
n_forest_estimators = 3

In [None]:
# Fit one decision tree on the training rows and report its accuracy on the test rows

# Load the train/test split for the current parameter values
iris_classes, x_testset, x_trainset, y_testset, y_trainset = get_data(
    predictor_columns,
    shuffle_data,
    train_size,
)

# Build the classifier and train it; fit() hands back the fitted estimator itself
tree = DecisionTreeClassifier(max_depth=max_depth, random_state=0).fit(x_trainset, y_trainset)

# Score on the test set. The targets are one hot encoded, so a row only counts as
# correct when all of its label columns are predicted correctly.
acc = tree.score(x_testset, y_testset)

# Assemble the report first, then print it line by line
separator = "------------------------------------------------------------------------------"
report_lines = [
    "REPORT",
    separator,
    f"Predictor columns: {predictor_columns!s}",
    f"Shuffle data: {shuffle_data!s}",
    f"Training dataset size: {train_size!s}",
    f"Max depth: {max_depth!s}",
    separator,
    f"Tree test accuracy: {acc!s}",
]
for line in report_lines:
    print(line)

In [None]:
# Print the fitted decision tree as Graphviz source, together with reading instructions

graph = export_graphviz(
    tree,
    impurity=False,
    node_ids=False,
    feature_names=predictor_columns,
)

# Instructions come first, the Graphviz source itself is printed last
output_lines = (
    "Copy and paste code below to http://www.webgraphviz.com/ to see tree structure",
    "Read the 'value' field as follows:",
    "[[# not class1, # class1], [# not class2, # class2], [# not class3, # class3]]",
    "------------------------------------------------------------------------------",
    graph,
)
for line in output_lines:
    print(line)

In [None]:
# Fit a random forest on the training rows and report its accuracy on the test rows

# Reload the split so this cell also picks up parameter changes when run on its own
iris_classes, x_testset, x_trainset, y_testset, y_trainset = get_data(
    predictor_columns,
    shuffle_data,
    train_size,
)

# Build the forest and train it; fit() hands back the fitted estimator itself
forest = RandomForestClassifier(
    n_estimators=n_forest_estimators,
    max_depth=max_depth,
    random_state=0,
).fit(x_trainset, y_trainset)

# Score on the test set (same one hot encoded targets as for the single tree)
acc = forest.score(x_testset, y_testset)

# Assemble the report first, then print it line by line
separator = "------------------------------------------------------------------------------"
report_lines = [
    "REPORT",
    separator,
    f"Columns: {predictor_columns!s}",
    f"Shuffle data: {shuffle_data!s}",
    f"Training dataset size: {train_size!s}",
    f"Max depth: {max_depth!s}",
    f"Number of estimators in forest: {n_forest_estimators!s}",
    separator,
    f"Forest test accuracy: {acc!s}",
]
for line in report_lines:
    print(line)