# <font color='blue'>
# Exercise 2.2

<font color='blue'>Cell 1
Importing libraries

In [26]:
import numpy as np
import pandas as pd

<font color='blue'>Cell 2
Reading the data

In [27]:
data = pd.read_csv('cancerdata1.csv')
data.loc[np.r_[0:3, 51:53, 101:103], :]

Unnamed: 0,ESR1,PGR,BCL2,NAT1,Results
0,5.1,3.5,1.4,0.2,Cured
1,4.9,3.0,1.4,0.2,Cured
2,4.7,3.2,1.3,0.2,Cured
51,6.4,3.2,4.5,1.5,Recurrence
52,6.9,3.1,4.9,1.5,Recurrence
101,5.8,2.7,5.1,1.9,Dead
102,7.1,3.0,5.9,2.1,Dead


<font color='blue'>Cell 3
Dataset verification

In [28]:
#Check the dataset to make sure no data is missing and Check the class labels
def verify_dataset(data):
  #Use data_found as a dummy variable to determine whether to print missing value information
    data_found = 1
    for each_column in data.columns:
        if data[each_column].isnull().any():
            print("Data missing in Column " + each_column)
            data_found = 0
            quit()

        if data_found == 1:
            print("Dataset is complete. No missing value")

        return
#Call verify_dataset and check data
verify_dataset(data)

Dataset is complete. No missing value


<font color='blue'>Cell 4
Creating testing and training data sets

In [29]:
#Splitting The Database in training and testing
def split_dataset_test_train(data):
    data = data.sample(frac=1).reset_index(drop=True)
    ##Use the .sample() function to scramble the data set
    training_data = data.iloc[:int(0.7 * len(data))].reset_index(drop=True)
    ##Determine the integer location (iloc) from beginning of array (:) to 0.7*150 and do a ”cleanup” with a reset call
    testing_data = data.iloc[int(0.7 * len(data)):].reset_index(drop=True)
    return [training_data, testing_data]

#Call split_dataset_test_train and check data sets
testtrain = split_dataset_test_train(data)
print(testtrain)

[     ESR1  PGR  BCL2  NAT1     Results
0     6.7  3.3   5.7   2.5        Dead
1     5.7  4.4   1.5   0.4       Cured
2     6.0  2.7   5.1   1.6  Recurrence
3     6.6  2.9   4.6   1.3  Recurrence
4     5.1  3.8   1.5   0.3       Cured
..    ...  ...   ...   ...         ...
100   6.3  2.5   4.9   1.5  Recurrence
101   5.0  3.0   1.6   0.2       Cured
102   5.0  2.0   3.5   1.0  Recurrence
103   5.5  2.6   4.4   1.2  Recurrence
104   7.7  3.0   6.1   2.3        Dead

[105 rows x 5 columns],     ESR1  PGR  BCL2  NAT1     Results
0    5.2  3.4   1.4   0.2       Cured
1    5.5  2.3   4.0   1.3  Recurrence
2    6.8  2.8   4.8   1.4  Recurrence
3    5.1  3.5   1.4   0.2       Cured
4    6.0  2.2   4.0   1.0  Recurrence
5    4.5  2.3   1.3   0.3       Cured
6    5.8  2.7   5.1   1.9        Dead
7    6.3  2.3   4.4   1.3  Recurrence
8    4.7  3.2   1.3   0.2       Cured
9    5.6  2.8   4.9   2.0        Dead
10   5.2  2.7   3.9   1.4  Recurrence
11   5.1  3.8   1.9   0.4       Cured
12   5.6  3.

<font color='blue'>Cell 5
Calculate gini index for a given split

In [30]:
def gini_index(data, target_col):
    elements, counts = np.unique(data[target_col], return_counts = True)
    total_counts = sum(counts)
    sum_prob = 0.0
    for i in range (elements.size):
        prob_i = counts[i] / total_counts
        sum_prob = sum_prob + prob_i * prob_i

    gini_index= 1 - sum_prob
    return gini_index

<font color='blue'>Cell 6
Information gain

In [31]:
def information_gain(data, target_col, threshold, target_class = "Results"):
    total_gini_index = gini_index(data, "Results")
    data_left = data[data[target_col] < threshold]
    data_right = data[data[target_col] >= threshold]
    gini_index_after_split = data_left.shape[0]/ data.shape[0] * gini_index(data_left, "Results") + data_right.shape[0]/data.shape[0] * gini_index(data_right, "Results")
    info_gain = total_gini_index - gini_index_after_split
    return info_gain

<font color='blue'>Cell 7
Establish optimal splits based on the best features, best cutoffs, and best information gains

In [32]:
def selectBestFeatureAndCutoff(data, target_class = "Results"):
    featureList = list(data)[0:4]
    best_feature = "None"
    best_cutoff = 0.0
    best_info_gain = 0.0
    for feature in featureList:
        max_value = data[feature].max()
        min_value = data[feature].min()
        for cutoff in np.arange(min_value, max_value, 0.1):
            if best_info_gain < information_gain(data, feature, cutoff):
                best_info_gain = information_gain(data, feature, cutoff)
                best_cutoff = cutoff
                best_feature = feature

    return [best_feature, best_cutoff, best_info_gain]

<font color='blue'>Cell 8
Define the decision tree root (ie the first node), create the associated recursive splitting function, and create the associated prediction function


In [33]:
class Node:
    def __init__(self, feature, cut_off, label = None, is_leaf = False):
        self.feature = feature
        self.cut_off = cut_off
        self.left_child = None
        self.right_child = None
        self.is_leaf = is_leaf
        self.label = label
        #print("node's label: ")
        #print(self.label)
class DTree:
    # method to train a decision tree
    def train(self, data):
        self.root = self.build_tree(data)

    # method to build decision tree
    def build_tree(self, data):
        best_feature, best_cutoff, best_info_gain = selectBestFeatureAndCutoff(data)
        # if all data has the same label , we are at a leaf node
        if len(np.unique(data["Results"])) == 1:
            return Node(best_feature, best_cutoff, data["Results"].iloc[0], True)

        # if we are not the leaf
        # first lets split data
        data_left = data[data[best_feature] < best_cutoff]
        data_right = data[data[best_feature] >= best_cutoff]

        #build current node
        current_node = Node(best_feature, best_cutoff)
        #add left node
        current_node.left_child = self.build_tree(data_left)
        #add right node
        current_node.right_child = self.build_tree(data_right)

        return current_node
 # Make a prediction with a decision tree
    def predict(self, data):
        current_node = self.root
        while(True):

            # if we are at the leaf node , return label
            if current_node.is_leaf == True:
                return current_node.label
            # otherwise we need figure out where to go next
            feature = current_node.feature
            cutoff = current_node.cut_off
            if data[feature]  < cutoff:
                current_node = current_node.left_child
            else:
                current_node = current_node.right_child


<font color='blue'>Cell 9
Train the decision tree

In [34]:
d_tree = DTree()
training_data = testtrain[0]
d_tree.train(training_data)

<font color='blue'>Cell 10
Define the confusion matrix

In [35]:
def print_ConfusionMatrix(result):
    count_SS = result[0]
    count_SVi = result[1]
    count_SVe = result[2]
    count_ViVi = result[3]
    count_ViVe = result[4]
    count_ViS = result[5]
    count_VeVe = result[6]
    count_VeVi = result[7]
    count_VeS = result[8]
    count_total_T =  result[9]
    count_total_F =  result[10]

    print ("True - Cured, Predicted - Cured : count_SS = ",  count_SS)
    print ("True - Cured, Predicted - Recurrence: count_SVi = ",  count_SVi)
    print ("True - Cured, Predicted - Dead: count_SVe = ",  count_SVe)

    print ("True - Dead, Predicted - Dead: count_ViVi = ",  count_ViVi)
    print ("True - Dead, Predicted - Recurrence: count_ViVe = ",  count_ViVe)
    print ("True - Dead, Predicted - Cured: Cured = ",  count_ViS)

    print ("True - Recurrence, Predicted - Recurrence: count_VeVe = ",  count_VeVe)
    print ("True - Recurrence, Predicted - Dead: count_VeVi = ",  count_VeVi)
    print ("True - Recurrence, Predicted - Cured:count_VeS = ",  count_VeS)

    print ("count_total_T = ",  count_total_T)
    print ("count_total_F = ",  count_total_F)


    print ("1) count_SS / (count_SS + count_ViS + count_VeS) = ", count_SS / (count_SS + count_ViS + count_VeS))
    if (count_SS + count_ViS + count_VeS)!=0:
      count_SS_ratio=count_SS / (count_SS + count_ViS + count_VeS)
    else:
      count_SS_ratio=0

    print ("2) count_SVi / (count_SVi + count_ViVi + count_VeVi) = ", count_SVi / (count_SVi + count_ViVi + count_VeVi))
    print ("3) count_SVe / (count_SVe + count_ViVe + count_VeVe) = ", count_SVe / (count_SVe + count_ViVe + count_VeVe))


    print ("4) count_ViS / (count_SS + count_ViS + count_VeS) = ", count_ViS / (count_SS + count_ViS + count_VeS))
    print ("5) count_ViVi / (count_SVi + count_ViVi + count_VeVi) = ", count_ViVi / (count_SVi + count_ViVi + count_VeVi))
    print ("6) count_ViVe / (count_SVe + count_ViVe + count_VeVe) = ", count_ViVe / (count_SVe + count_ViVe + count_VeVe))


    print ("7) count_VeS / (count_SS + count_ViS + count_VeS) = ", count_VeS / (count_SS + count_ViS + count_VeS))
    print ("8) count_VeVi / (count_SVi + count_ViVi + count_VeVi) = ", count_VeVi / (count_SVi + count_ViVi + count_VeVi))
    print ("9) count_VeVe / (count_SVe + count_ViVe + count_VeVe) = ", count_VeVe / (count_SVe + count_ViVe + count_VeVe))


    data = {"predict\Observe": ["Cured (predict)", "Recurrence (predict)", "Deceased (predict)"],
            "Cured (observed)": [ count_SS_ratio, count_SVi / (count_SVi + count_ViVi + count_VeVi), count_SVe / (count_SVe + count_ViVe + count_VeVe)],
            "Recurrence (observed)": [count_ViS / (count_SS + count_ViS + count_VeS), count_ViVi / (count_SVi + count_ViVi + count_VeVi), count_ViVe / (count_SVe + count_ViVe + count_VeVe)],
            "Deceased (observed)": [count_VeS / (count_SS + count_ViS + count_VeS), count_VeVi / (count_SVi + count_ViVi + count_VeVi), count_VeVe / (count_SVe + count_ViVe + count_VeVe)]
            }

    output = pd.DataFrame(data, columns = ["predict\Observe", "Cured (observed)", "Recurrence (observed)", "Deceased (observed)"])
    return output

<font color='blue'>Cell 11
Create the confusion matrix

In [36]:
def predict_batch(data):
    d_tree = DTree()
    d_tree.train(training_data)
    count_SS = 0
    count_SVi = 0
    count_SVe = 0
    count_ViVi = 0
    count_ViS = 0
    count_ViVe = 0
    count_VeVe = 0
    count_VeS = 0
    count_VeVi = 0
    count_total_T = 0
    count_total_F  = 0

    for i in range (data.shape[0]):
        instance = data.iloc[i]
        true_label = instance["Results"]
        predict_label = d_tree.predict(data.iloc[i])
        print (i, ") true_label  = ", true_label , "predict_label  = ", predict_label )
        if true_label == predict_label:
            count_total_T = count_total_T + 1
            if true_label == "Cured":
                count_SS = count_SS + 1
            elif true_label == "Dead":
                count_ViVi = count_ViVi + 1
            elif true_label == "Recurrence":
                count_VeVe = count_VeVe + 1
        else:
            count_total_F = count_total_F + 1
            if true_label == "Cured" and predict_label == "Recurrence":
                count_SVi = count_SVi + 1
            elif true_label == "Cured" and predict_label == "Dead":
                count_SVe = count_SVe + 1
            elif true_label == "Dead" and predict_label == "Recurrence":
                count_VeVi = count_VeVi + 1
            elif true_label == "Dead" and predict_label == "Cured":
                count_VeS = count_VeS + 1
            elif true_label == "Recurrence" and predict_label == "Dead":
                count_ViVe = count_ViVe + 1
            elif true_label == "Recurrence" and predict_label == "Cured":
                count_ViS = count_ViS + 1

    return [count_SS, count_SVi, count_SVe, count_ViVi, count_ViVe, count_ViS, count_VeVe, count_VeVi, count_VeS, count_total_T, count_total_F]


<font color='blue'>Cell 12
Look at the confusion matrix for training data

In [37]:
training_data = testtrain[0]
print ("training_data = ", training_data)
predict_batch_results=predict_batch(training_data)
print ("predict_batch_results = ", predict_batch_results)
print_ConfusionMatrix(predict_batch(training_data))

training_data =       ESR1  PGR  BCL2  NAT1     Results
0     6.7  3.3   5.7   2.5        Dead
1     5.7  4.4   1.5   0.4       Cured
2     6.0  2.7   5.1   1.6  Recurrence
3     6.6  2.9   4.6   1.3  Recurrence
4     5.1  3.8   1.5   0.3       Cured
..    ...  ...   ...   ...         ...
100   6.3  2.5   4.9   1.5  Recurrence
101   5.0  3.0   1.6   0.2       Cured
102   5.0  2.0   3.5   1.0  Recurrence
103   5.5  2.6   4.4   1.2  Recurrence
104   7.7  3.0   6.1   2.3        Dead

[105 rows x 5 columns]
0 ) true_label  =  Dead predict_label  =  Dead
1 ) true_label  =  Cured predict_label  =  Cured
2 ) true_label  =  Recurrence predict_label  =  Recurrence
3 ) true_label  =  Recurrence predict_label  =  Recurrence
4 ) true_label  =  Cured predict_label  =  Cured
5 ) true_label  =  Cured predict_label  =  Cured
6 ) true_label  =  Cured predict_label  =  Cured
7 ) true_label  =  Recurrence predict_label  =  Recurrence
8 ) true_label  =  Recurrence predict_label  =  Recurrence
9 ) true_lab

Unnamed: 0,predict\Observe,Cured (observed),Recurrence (observed),Deceased (observed)
0,Cured (predict),1.0,0.0,0.0
1,Recurrence (predict),0.0,1.0,0.0
2,Deceased (predict),0.0,0.0,1.0


<font color='blue'>Cell 13
Look at the confusion matrix for testing data

In [38]:
testing_data = testtrain[1]
print_ConfusionMatrix(predict_batch(testing_data))

0 ) true_label  =  Cured predict_label  =  Cured
1 ) true_label  =  Recurrence predict_label  =  Recurrence
2 ) true_label  =  Recurrence predict_label  =  Recurrence
3 ) true_label  =  Cured predict_label  =  Cured
4 ) true_label  =  Recurrence predict_label  =  Recurrence
5 ) true_label  =  Cured predict_label  =  Cured
6 ) true_label  =  Dead predict_label  =  Dead
7 ) true_label  =  Recurrence predict_label  =  Recurrence
8 ) true_label  =  Cured predict_label  =  Cured
9 ) true_label  =  Dead predict_label  =  Dead
10 ) true_label  =  Recurrence predict_label  =  Recurrence
11 ) true_label  =  Cured predict_label  =  Cured
12 ) true_label  =  Recurrence predict_label  =  Recurrence
13 ) true_label  =  Recurrence predict_label  =  Recurrence
14 ) true_label  =  Recurrence predict_label  =  Recurrence
15 ) true_label  =  Cured predict_label  =  Cured
16 ) true_label  =  Dead predict_label  =  Dead
17 ) true_label  =  Recurrence predict_label  =  Recurrence
18 ) true_label  =  Dead p

Unnamed: 0,predict\Observe,Cured (observed),Recurrence (observed),Deceased (observed)
0,Cured (predict),1.0,0.0,0.0
1,Recurrence (predict),0.0,0.857143,0.142857
2,Deceased (predict),0.0,0.0,1.0


<font color='blue'>Cell 14
Function to make predictions

In [39]:
# method that run prediction
def predict(d_tree, ESR1, PGR, BCL2, NAT1):
    test_data = pd.Series([ESR1, PGR, BCL2, NAT1], index = ['ESR1', 'PGR', 'BCL2', 'NAT1'])
    return d_tree.predict(test_data)

<font color='blue'>Cell 15
Make predictions

In [40]:
def input_test_seq():
    ESR1_expr = float(input('Enter the expression level of ESR1:'))
    while True:
        if float(ESR1_expr)< 0 or float(ESR1_expr) > 10:
            print('Invalid Entry. Please enter value less than 10')
            ESR1_expr = float(input('Enter the expression level of ESR1:'))
            continue
        else:
            break

    PGR_expr = float(input('Enter the expression level of PGR:'))
    while True:
        if float(PGR_expr) < 0 or float(PGR_expr) > 10:
            print('Invalid Entry. Please enter value less than 10')
            PGR_expr = float(input('Enter the expression level of PGR:'))
            continue
        else:
            break

    BCL2_expr = float(input('Enter the expression level of BCL2:'))
    while True:
        if float(BCL2_expr) <0 or float(BCL2_expr) > 10:
            print('Invalid Entry. Please enter value less than 10')
            BCL2_expr = float(input('Enter the expression level of BCL2:'))
            continue
        else:
            break

    NAT1_expr = float(input('Enter the expression level of NAT1:'))
    while True:
        if float(NAT1_expr) < 0 or float(NAT1_expr) > 10:
            print('Invalid Entry. Please enter value less than 10')
            NAT1_expr = float(input('Enter the expression level of NAT1:'))
            continue
        else:
            break

    predict_expr = [ESR1_expr,PGR_expr,BCL2_expr,NAT1_expr]
    result_category = predict(d_tree, predict_expr[0],predict_expr[1],predict_expr[2],predict_expr[3])
    print("This patient is ", result_category)

    return

input_test_seq()


Enter the expression level of ESR1:1
Enter the expression level of PGR:1
Enter the expression level of BCL2:1
Enter the expression level of NAT1:1
This patient is  Cured
