<font color='blue'>Cell 1
Importing libraries

In [16]:
import numpy as np
import pandas as pd

<font color='blue'>Cell 2
Reading the data

In [17]:
# Load the data set; the path is relative to the notebook's working directory.
data = pd.read_csv('data1.csv')
# Preview a few rows: np.r_ concatenates the ranges into the row labels
# 0-2, 51-52 and 101-102, which .loc then selects (all columns).
data.loc[np.r_[0:3, 51:53, 101:103], :]

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
51,6.4,3.2,4.5,1.5,Versicolor
52,6.9,3.1,4.9,1.5,Versicolor
101,5.8,2.7,5.1,1.9,Virginica
102,7.1,3.0,5.9,2.1,Virginica


<font color='blue'>Cell 3
Dataset verification

In [18]:
# Check the dataset to make sure no data is missing
def verify_dataset(data):
    """Report which columns of ``data`` contain missing values.

    Every column is inspected. One line is printed for each column that
    holds at least one null entry; when no column has any, a single
    confirmation line is printed instead.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    None
        The outcome is reported on standard output only.
    """
    has_missing = data.isnull().any()
    columns_with_gaps = [column for column, flag in has_missing.items() if flag]

    # Report only after *all* columns were scanned, so a clean first column
    # can no longer hide gaps in later ones. The kernel is deliberately left
    # running (no quit()) so the notebook stays usable after a failed check.
    for column in columns_with_gaps:
        # str() keeps non-string column labels from breaking the concatenation.
        print("Data missing in Column " + str(column))

    if not columns_with_gaps:
        print("Dataset is complete. No missing value")

    return
# Run the missing-value check on the loaded data; the outcome is printed below the cell.
verify_dataset(data)

Dataset is complete. No missing value


<font color='blue'>Cell 4
Creating testing and training data sets

In [19]:
## Splitting the dataset into training and testing sets
def split_dataset_test_train(data, train_fraction=0.7, random_state=None):
    """Shuffle ``data`` and cut it into a training and a testing frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split. The frame itself is not modified.
    train_fraction : float, default 0.7
        Share of the rows that goes into the training frame. The number of
        training rows is ``int(train_fraction * len(data))`` (rounded down).
    random_state : optional
        Forwarded to ``DataFrame.sample``. Pass a fixed value (e.g. an int)
        to obtain the same split on every run; the default ``None`` keeps
        the shuffle unseeded.

    Returns
    -------
    list
        ``[training_data, testing_data]``, each with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``train_fraction`` is outside the interval [0, 1].
    """
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")

    # sample(frac=1) returns every row exactly once, in random order.
    shuffled = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Compute the boundary once so both slices are guaranteed to agree on it.
    split_at = int(train_fraction * len(shuffled))
    training_data = shuffled.iloc[:split_at].reset_index(drop=True)
    testing_data = shuffled.iloc[split_at:].reset_index(drop=True)
    return [training_data, testing_data]
    
# Split the loaded data. split_dataset_test_train returns [training, testing],
# so testtrain[0] is the training frame and testtrain[1] the testing frame.
testtrain = split_dataset_test_train(data)
# Show both frames as plain text to eyeball the split.
print(testtrain)

[     sepal.length  sepal.width  petal.length  petal.width     variety
0             7.1          3.0           5.9          2.1   Virginica
1             4.4          2.9           1.4          0.2      Setosa
2             5.7          2.9           4.2          1.3  Versicolor
3             4.9          2.4           3.3          1.0  Versicolor
4             6.3          2.5           4.9          1.5  Versicolor
..            ...          ...           ...          ...         ...
100           6.4          3.1           5.5          1.8   Virginica
101           6.0          2.2           4.0          1.0  Versicolor
102           5.0          3.0           1.6          0.2      Setosa
103           6.4          2.7           5.3          1.9   Virginica
104           5.5          2.5           4.0          1.3  Versicolor

[105 rows x 5 columns],     sepal.length  sepal.width  petal.length  petal.width     variety
0            6.8          2.8           4.8          1.4  Versico

<font color='blue'>Cell 5
Calculate gini index for a given split 

In [20]:
def gini_index(data, target_col):
    """Gini impurity of the values found in ``data[target_col]``.

    The impurity is ``1 - sum(p_k ** 2)`` where ``p_k`` is the share of rows
    carrying the k-th distinct value of the column.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate.
    target_col : str
        Name of the column whose value distribution is measured.

    Returns
    -------
    float
        0.0 when every row has the same value; larger values mean a more
        even mix. When the column has no rows the sum is empty and the
        result is 1.0.
    """
    _, class_counts = np.unique(data[target_col], return_counts = True)
    n_rows = sum(class_counts)

    # Accumulate the squared share of each distinct value, one at a time.
    squared_share_total = 0.0
    for class_count in class_counts:
        share = class_count / n_rows
        squared_share_total = squared_share_total + share * share

    return 1 - squared_share_total

<font color='blue'>Cell 6
Information gain

In [21]:
def information_gain(data, target_col, threshold, target_class = "variety"):
    """Reduction in Gini impurity obtained by splitting ``data`` at a threshold.

    Rows with ``data[target_col] < threshold`` form the left part, rows with
    ``data[target_col] >= threshold`` the right part. The gain is the
    impurity of the whole frame minus the size-weighted impurity of the two
    parts.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split.
    target_col : str
        Name of the feature column the threshold is applied to.
    threshold : float
        Cut-off value separating the left and right parts.
    target_class : str, default "variety"
        Name of the label column whose impurity is measured.

    Returns
    -------
    float
        Impurity before the split minus impurity after it; 0.0 for an empty
        frame, where there is nothing to split.
    """
    n_rows = data.shape[0]
    if n_rows == 0:
        # Avoids dividing by zero below; an empty frame cannot be improved.
        return 0.0

    # Honour the caller's label column instead of a hard-coded name.
    total_gini_index = gini_index(data, target_class)
    data_left = data[data[target_col] < threshold]
    data_right = data[data[target_col] >= threshold]

    gini_index_after_split = (
        data_left.shape[0] / n_rows * gini_index(data_left, target_class)
        + data_right.shape[0] / n_rows * gini_index(data_right, target_class)
    )
    return total_gini_index - gini_index_after_split

<font color='blue'>Cell 7
Establish optimal splits based on the best features, best cutoffs, and best information gains

In [22]:
def selectBestFeatureAndCutoff(data, target_class = "variety", cutoff_step = 0.1):
    """Search for the single threshold split with the highest information gain.

    Every numeric column other than ``target_class`` is treated as a
    feature. For each feature, candidate cut-offs run from the column's
    minimum up to (not including) its maximum in increments of
    ``cutoff_step``. The first candidate reaching the highest gain wins.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; must contain the ``target_class`` column.
    target_class : str, default "variety"
        Name of the label column.
    cutoff_step : float, default 0.1
        Spacing between candidate cut-off values.

    Returns
    -------
    list
        ``[best_feature, best_cutoff, best_info_gain]``. When no candidate
        yields a gain above zero (or ``data`` is empty) the placeholders
        ``["None", 0.0, 0.0]`` are returned.
    """
    best_feature = "None"
    best_cutoff = 0.0
    best_info_gain = 0.0

    if len(data) == 0:
        # min()/max() of an empty column are NaN, which np.arange rejects.
        return [best_feature, best_cutoff, best_info_gain]

    # Use every numeric non-label column rather than "the first four".
    feature_names = [
        column for column in data.columns
        if column != target_class and pd.api.types.is_numeric_dtype(data[column])
    ]

    for feature in feature_names:
        min_value = data[feature].min()
        max_value = data[feature].max()
        for cutoff in np.arange(min_value, max_value, cutoff_step):
            # Evaluate each candidate once and reuse the value.
            gain = information_gain(data, feature, cutoff, target_class)
            if best_info_gain < gain:
                best_info_gain = gain
                best_cutoff = cutoff
                best_feature = feature

    return [best_feature, best_cutoff, best_info_gain]

<font color='blue'>Cell 8
Define the tree node class and the decision tree class, i.e. the root (first node), the recursive tree-building method, and the prediction method


In [23]:
class Node:
    """A single vertex of the decision tree.

    Attributes
    ----------
    feature, cut_off
        The feature name and threshold given at construction.
    label
        The class label given at construction (``None`` unless supplied).
    is_leaf
        Flag given at construction (``False`` unless supplied).
    left_child, right_child
        Both start as ``None``; they are assigned after construction.
    """

    def __init__(self, feature, cut_off, label = None, is_leaf = False):
        # Split rule stored on this vertex.
        self.feature, self.cut_off = feature, cut_off
        # Leaf payload.
        self.label, self.is_leaf = label, is_leaf
        # No children yet.
        self.left_child = self.right_child = None
class DTree:
    """Binary decision tree over threshold splits, labelled by the "variety" column."""

    def __init__(self):
        # No tree until train() is called; predict() checks for this.
        self.root = None

    # method to train a decision tree
    def train(self, data):
        """Build the tree from ``data`` and store its root on ``self.root``."""
        self.root = self.build_tree(data)

    # method to build decision tree
    def build_tree(self, data):
        """Recursively build the subtree for ``data`` and return its root Node.

        Parameters
        ----------
        data : pandas.DataFrame
            Rows for this subtree; must contain a "variety" column.

        Returns
        -------
        Node
            A leaf when all rows share one label or when no split improves
            purity (the most frequent label is used then); otherwise an
            internal node with both children attached.

        Raises
        ------
        ValueError
            If ``data`` has no rows.
        """
        labels = data["variety"]
        if len(labels) == 0:
            raise ValueError("cannot build a decision tree from an empty data set")

        # Pure node: decide this before running the (expensive) split search.
        if len(np.unique(labels)) == 1:
            return Node(None, None, labels.iloc[0], True)

        best_feature, best_cutoff, best_info_gain = selectBestFeatureAndCutoff(data)

        # No split separates the remaining labels (e.g. identical feature
        # values with different labels): stop with a majority-vote leaf
        # instead of indexing the "None" placeholder feature.
        if best_info_gain <= 0:
            return Node(None, None, labels.mode().iloc[0], True)

        data_left = data[data[best_feature] < best_cutoff]
        data_right = data[data[best_feature] >= best_cutoff]

        # Defensive: a one-sided split would recurse on an empty frame.
        if len(data_left) == 0 or len(data_right) == 0:
            return Node(None, None, labels.mode().iloc[0], True)

        current_node = Node(best_feature, best_cutoff)
        current_node.left_child = self.build_tree(data_left)
        current_node.right_child = self.build_tree(data_right)
        return current_node

    # Make a prediction with a decision tree
    def predict(self, data):
        """Return the label of the leaf reached by one observation.

        Parameters
        ----------
        data : mapping-like (e.g. pandas.Series)
            Indexable by feature name; ``data[feature]`` is compared with
            each node's cut-off.

        Raises
        ------
        RuntimeError
            If the tree has not been trained yet.
        """
        if self.root is None:
            raise RuntimeError("DTree.predict() called before train()")

        current_node = self.root
        # Walk down: left when the value is below the cut-off, right otherwise.
        while not current_node.is_leaf:
            if data[current_node.feature] < current_node.cut_off:
                current_node = current_node.left_child
            else:
                current_node = current_node.right_child
        return current_node.label
    

<font color='blue'>Cell 9
Train the decision tree

In [24]:
# Fit a tree on the training portion of the split (testtrain[0]).
# Both `d_tree` and `training_data` are notebook-level names that later cells read.
d_tree = DTree()
training_data = testtrain[0]
d_tree.train(training_data)

<font color='blue'>Cell 10
Define the confusion matrix

In [25]:
def print_ConfusionMatrix(result):
    """Build a row-normalised confusion table from a flat list of counts.

    Parameters
    ----------
    result : sequence
        At least nine counts in the order
        ``[SS, SVi, SVe, ViVi, ViVe, ViS, VeVe, VeVi, VeS]`` where each name
        is ``<observed><predicted>`` with S = Setosa, Vi = Virginica and
        Ve = Versicolor. Any further entries are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per predicted class and one column per observed class.
        Each cell is the count divided by the total number of rows that
        received that prediction, so every row sums to 1. A class that was
        never predicted has no defined shares; its row holds NaN.
    """
    (count_SS, count_SVi, count_SVe,
     count_ViVi, count_ViVe, count_ViS,
     count_VeVe, count_VeVi, count_VeS) = result[:9]

    # Row totals: everything that was predicted as the given class.
    predicted_S = count_SS + count_ViS + count_VeS
    predicted_Vi = count_SVi + count_ViVi + count_VeVi
    predicted_Ve = count_SVe + count_ViVe + count_VeVe

    def share(count, total):
        # Guard against dividing by zero when a class is never predicted.
        return count / total if total else float("nan")

    # Escaped backslash: same text as before, without the invalid "\O" escape.
    header = "predict\\Observe"
    data = {
        header: ["Setosa (predict)", "Virginica (predict)", "Versicolor (predict)"],
        "Setosa (observed)": [
            share(count_SS, predicted_S),
            share(count_SVi, predicted_Vi),
            share(count_SVe, predicted_Ve),
        ],
        "Virginica (observed)": [
            share(count_ViS, predicted_S),
            share(count_ViVi, predicted_Vi),
            share(count_ViVe, predicted_Ve),
        ],
        "Versicolor (observed)": [
            share(count_VeS, predicted_S),
            share(count_VeVi, predicted_Vi),
            share(count_VeVe, predicted_Ve),
        ],
    }

    output = pd.DataFrame(
        data,
        columns = [header, "Setosa (observed)", "Virginica (observed)", "Versicolor (observed)"],
    )
    return output

<font color='blue'>Cell 11
Count the predictions needed for the confusion matrix

In [26]:
def predict_batch(data, tree = None):
    """Classify every row of ``data`` and tally observed/predicted pairs.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to classify; the observed label is read from the "variety"
        column and the whole row is handed to the tree's ``predict``.
    tree : optional
        An already trained tree exposing ``predict(row)``. When omitted, a
        fresh ``DTree`` is trained on the notebook-level ``training_data``.

    Returns
    -------
    list
        ``[SS, SVi, SVe, ViVi, ViVe, ViS, VeVe, VeVi, VeS, total_T, total_F]``
        where each pair is ``<observed><predicted>`` (S = Setosa,
        Vi = Virginica, Ve = Versicolor), ``total_T`` counts rows whose
        prediction equals the observed label and ``total_F`` the rest.
    """
    if tree is None:
        tree = DTree()
        tree.train(training_data)

    S, Vi, Ve = "Setosa", "Virginica", "Versicolor"
    # One counter per (observed, predicted) pair. A single table keeps the
    # diagonal consistent with the off-diagonal naming (Vi = Virginica,
    # Ve = Versicolor).
    pair_counts = {(observed, predicted): 0
                   for observed in (S, Vi, Ve)
                   for predicted in (S, Vi, Ve)}
    count_total_T = 0
    count_total_F = 0

    for i in range(data.shape[0]):
        instance = data.iloc[i]
        true_label = instance["variety"]
        predict_label = tree.predict(instance)

        if true_label == predict_label:
            count_total_T += 1
        else:
            count_total_F += 1

        # Labels outside the three known classes only affect the totals.
        pair = (true_label, predict_label)
        if pair in pair_counts:
            pair_counts[pair] += 1

    return [
        pair_counts[S, S], pair_counts[S, Vi], pair_counts[S, Ve],
        pair_counts[Vi, Vi], pair_counts[Vi, Ve], pair_counts[Vi, S],
        pair_counts[Ve, Ve], pair_counts[Ve, Vi], pair_counts[Ve, S],
        count_total_T, count_total_F,
    ]


<font color='blue'>Cell 12
Look at the confusion matrix for training data

In [27]:
# Evaluate on the training portion of the split (testtrain[0]).
training_data = testtrain[0]
# The returned DataFrame is the cell's last expression, so it is displayed as a table.
print_ConfusionMatrix(predict_batch(training_data))

Unnamed: 0,predict\Observe,Setosa (observed),Virginica (observed),Versicolor (observed)
0,Setosa (predict),1.0,0.0,0.0
1,Virginica (predict),0.0,1.0,0.0
2,Versicolor (predict),0.0,0.0,1.0


<font color='blue'>Cell 13
Look at the confusion matrix for testing data

In [28]:
# Evaluate on the held-out testing portion of the split (testtrain[1]).
testing_data = testtrain[1]
# The returned DataFrame is the cell's last expression, so it is displayed as a table.
print_ConfusionMatrix(predict_batch(testing_data))

Unnamed: 0,predict\Observe,Setosa (observed),Virginica (observed),Versicolor (observed)
0,Setosa (predict),1.0,0.0,0.0
1,Virginica (predict),0.0,1.0,0.0
2,Versicolor (predict),0.0,0.055556,0.944444


<font color='blue'>Cell 14
Function to make predictions

In [29]:
# Convenience wrapper: classify one flower from its four measurements
def predict(d_tree, sepal_length, sepal_width, petal_length, petal_width):
    """Wrap four measurements in a labelled Series and classify it.

    Parameters
    ----------
    d_tree
        Object exposing ``predict(series)``.
    sepal_length, sepal_width, petal_length, petal_width
        The four measurements, stored under the labels 'sepal.length',
        'sepal.width', 'petal.length' and 'petal.width' respectively.

    Returns
    -------
    Whatever ``d_tree.predict`` returns for the assembled Series.
    """
    feature_names = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']
    measurements = [sepal_length, sepal_width, petal_length, petal_width]
    flower = pd.Series(measurements, index = feature_names)
    return d_tree.predict(flower)

<font color='blue'> Exercise 2.1


In [30]:

def _read_measurement(prompt, retry_prompt = None, low = 0.0, high = 10.0):
    """Ask on stdin until a number between ``low`` and ``high`` (inclusive) is entered.

    ``prompt`` is shown first; after a rejected entry ``retry_prompt`` is
    shown instead (it defaults to ``prompt``). Text that is not a number,
    NaN, and out-of-range values are all rejected with the same message.
    """
    if retry_prompt is None:
        retry_prompt = prompt

    current_prompt = prompt
    while True:
        raw_entry = input(current_prompt)
        try:
            value = float(raw_entry)
        except ValueError:
            # Non-numeric text: re-prompt instead of crashing the cell.
            value = None

        # The chained comparison is False for NaN, so NaN is rejected as well.
        if value is not None and low <= value <= high:
            return value

        print(f'Invalid Entry. Please enter a value between {low:g} and {high:g}')
        current_prompt = retry_prompt


def input_test_seq():
    """Interactively read four flower measurements and print the predicted class.

    Each measurement is requested on stdin and must lie between 0 and 10
    (cm); invalid entries are asked for again. The values are passed to the
    notebook-level ``predict`` together with the notebook-level ``d_tree``
    and the result is printed. Returns ``None``.
    """
    sepal_length = _read_measurement('Enter the Sepal length in cm :',
                                     'Enter the sepal length in cm :')
    sepal_width = _read_measurement('Enter the Sepal width in cm :',
                                    'Enter the sepal width in cm :')
    petal_length = _read_measurement('Enter the petal length in cm :')
    petal_width = _read_measurement('Enter the petal width in cm :')

    result_category = predict(d_tree, sepal_length, sepal_width, petal_length, petal_width)
    print("This flower is a ", result_category)

    return

# Interactive demo: prompts for four measurements and prints the predicted class.
# Relies on the notebook-level `d_tree` and `predict` defined in earlier cells.
input_test_seq()


Enter the Sepal length in cm :1
Enter the Sepal width in cm :2
Enter the petal length in cm :3
Enter the petal width in cm :4
This flower is a  Virginica
