<font color='blue'>Cell 1
Importing libraries

In [None]:
# rpart provides recursive-partitioning (decision tree) models.
# Install it only when it is missing, so re-running this cell does not
# download and reinstall the package every time.
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart")
}
library("rpart")

<font color='blue'>Cell 2
Reading the data

In [None]:
# Load the data set from the CSV file; its first line holds the column names
data <- read.csv("data1.csv", header = TRUE)
# Preview a few rows from three regions of the file
# (presumably one region per class/label; confirm the row layout of the file)
data[c(1:3, 52:53, 102:103), ]

<font color='blue'>Cell 3
Dataset verification

In [None]:
# Check a data set for missing values, one column at a time.
#
# dataset: a data frame (or a matrix with column names) to inspect.
#
# Prints "Data missing in Column <name>" for the first column that contains
# an NA and stops checking there. Prints a confirmation message when no
# column contains an NA. Called for its printed output.
verify_dataset <- function(dataset) {
  for (each_column in colnames(dataset)) {
    # anyNA() already yields a single logical, so no comparison with "TRUE"
    # is needed
    if (anyNA(dataset[, each_column])) {
      # The trailing newline keeps later console output off this line
      cat(paste0("Data missing in Column ", each_column, "\n"))
      return(invisible(NULL))
    }
  }
  print("Dataset is complete. No missing value")
}

verify_dataset(data) # Run the missing-value check on the loaded data set

<font color='blue'>Cell 4
Creating testing and training data sets

In [None]:
# Split a data set into a training part (70% of rows) and a testing part.
#
# data: a data frame whose rows are to be divided.
#
# Returns an unnamed list: element 1 is the training data, element 2 is the
# testing data (all rows not selected for training, in their original order).
#
# Note: the seed is fixed so every call selects the same rows; as a side
# effect this resets the session's random number generator.
split_dataset_test_train <- function(data) {
  set.seed(37)
  n_rows <- nrow(data)
  n_train <- floor(0.7 * n_rows)
  # sample.int(n, k) makes the same draw as sample(1:n, k) but is also well
  # defined when the data set has zero rows
  index <- sample.int(n_rows, n_train)
  # Select the testing rows positively: data[-index, ] would return no rows
  # at all when index is empty
  test_index <- setdiff(seq_len(n_rows), index)
  # drop = FALSE keeps single-column data frames as data frames
  training_data <- data[index, , drop = FALSE]
  testing_data <- data[test_index, , drop = FALSE]
  list(training_data, testing_data)
}

# Build the train/test split once; testtrain[[1]] is the training set and
# testtrain[[2]] is the testing set
testtrain <- split_dataset_test_train(data)
print(testtrain) # Show both parts of the split
training_data <- testtrain[[1]] # Keep the training rows for fitting the tree

<font color='blue'>Cell 5
Create the classification decision tree and the prediction function using the training data and the gini index

In [None]:
# Fit a classification tree on train_data and predict a class for each row
# of dataset.
#
# dataset: data frame of rows to classify.
# train_data: data frame used to fit the tree; "variety" is the class label
#   and every other column is used as a predictor.
#
# Returns a one-column data frame holding the predicted class of each row.
tree_predict <- function(dataset, train_data) {
  # Classification tree with splits chosen by the Gini index
  tree <- rpart(
    formula = variety ~ .,
    data = train_data,
    method = "class",
    parms = list(split = "gini")
  )
  # The predict() call stays inline with these exact names: as.data.frame()
  # takes the resulting column name from the text of this expression
  as.data.frame(predict(tree, dataset, type = "class"))
}

# Predict a class for every row of the full data set with a tree fitted on the training rows
tree_pred <- tree_predict(data, training_data)

<font color='blue'>Cell 6
Define the normalized confusion matrix and apply it to the predictions for the full data set

In [None]:
# Normalize a confusion table row by row.
#
# cm: a two-dimensional table or matrix of counts, of any size.
#
# Returns an object of the same shape in which each entry has been divided by
# the sum of its row, rounded to 2 decimal places. A row whose sum is zero
# yields NaN entries.
normalizeCM <- function(cm) {
  stopifnot(length(dim(cm)) == 2L)
  # rowSums(cm) is recycled down the columns, so every entry is divided by
  # the total of its own row; this works for any number of classes
  round(cm / rowSums(cm), 2)
}

# Cross-tabulate the predictions (rows) against the recorded variety (columns)
# for the full data set, then print the row-normalized confusion table
normalizeCM(table(tree_pred[,1], data$variety))

<font color='blue'> Exercise 2.1


<font color='blue'>Cell 7
Function that prompts for a test value and shows an error message when the entry is invalid

In [None]:
# Prompt the user for a numeric value of one feature until a valid one is
# entered.
#
# feature_name: label shown in the prompt.
# min_value, max_value: inclusive bounds for an acceptable entry
#   (defaults 0 and 10).
#
# Returns the accepted entry as a double. Signals an error when the session
# is not interactive and no text was supplied, since re-prompting could never
# succeed in that case.
inputValue <- function(feature_name, min_value = 0, max_value = 10) {
  prompt_text <- paste("Enter ", feature_name, ": ")
  repeat {
    raw_entry <- readline(prompt = prompt_text)
    if (!nzchar(raw_entry) && !interactive()) {
      stop("No input available for ", feature_name, call. = FALSE)
    }
    # Non-numeric text converts to NA; the coercion warning is suppressed
    # because that case is reported to the user just below
    value <- suppressWarnings(as.double(raw_entry))
    if (!is.na(value) && value >= min_value && value <= max_value) {
      return(value)
    }
    # message() is shown immediately, whereas a warning() raised inside a
    # function is held back until control returns to the top level
    message("Invalid entry, please try again")
  }
}

<font color='blue'>Cell 8
Make predictions based on user inputted values

In [None]:
# Distinct class labels found in the variety column, in factor-level order
species_levels <- levels(factor(data$variety))

# Predict the species for one flower and print the result.
#
# measurements: four values in the order sepal length, sepal width,
#   petal length, petal width.
# train_data: training data passed on to tree_predict() to fit the tree.
#
# Prints the predicted label and returns it invisibly as a character string.
predictSpecies <- function(measurements, train_data) {
  # One-row data frame of the entered values; the column names are assumed to
  # match the feature columns of train_data (confirm against the CSV header)
  predict_features <- data.frame(
    "sepal.length" = measurements[[1]],
    "sepal.width" = measurements[[2]],
    "petal.length" = measurements[[3]],
    "petal.width" = measurements[[4]]
  )
  # tree_predict() returns a one-column data frame holding the predicted class
  prediction <- tree_predict(predict_features, train_data)
  # Read the label straight from the predicted factor instead of mapping its
  # integer code through a separately built level vector, which only agrees
  # when both sets of levels happen to be ordered identically
  species_pred <- as.character(prediction[[1]])
  # Inform the user of the prediction; the newline ends the output line
  cat(paste0("That flower is most likely: ", species_pred, "\n"))
  invisible(species_pred)
}

<font color='blue'>Cell 9
Enter user inputs and display the predicted species

In [None]:
# Collect the four measurements from the user, one prompt per feature
print("Measurements need to be numeric values between 0 and 10")
sepal_length <- inputValue("sepal.length")
sepal_width <- inputValue("sepal.width")
petal_length <- inputValue("petal.length")
petal_width <- inputValue("petal.width")

# Combine the entries into one numeric vector, in the order in which
# predictSpecies() reads them
features <- c(sepal_length, sepal_width, petal_length, petal_width)

# Predict and report the species for the entered flower
predictSpecies(features, training_data)