In [1]:
import turicreate, copy
import numpy as np

### Load the Data and View the Raw Data

In [2]:
raw_data = turicreate.SFrame('../../data/processed/trim_small_data.csv')

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[str,str,str,str,str,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [3]:
raw_data

sex,age_group,Race and ethnicity (combined) ...,hosp_yn,icu_yn,death_yn,medcond_yn
Female,0 - 9 Years,"Multiple/Other, Non- Hispanic ...",Yes,Yes,No,Yes
Male,0 - 9 Years,Hispanic/Latino,No,No,No,No
Male,0 - 9 Years,"Multiple/Other, Non- Hispanic ...",No,No,No,No
Male,10 - 19 Years,Hispanic/Latino,Yes,No,No,Yes
Female,10 - 19 Years,Hispanic/Latino,No,No,No,No
Female,10 - 19 Years,"Black, Non-Hispanic",No,No,No,No
Male,10 - 19 Years,"White, Non-Hispanic",No,No,No,No
Male,10 - 19 Years,"White, Non-Hispanic",No,No,No,Yes
Male,10 - 19 Years,"White, Non-Hispanic",No,No,No,Yes
Male,10 - 19 Years,"White, Non-Hispanic",No,No,No,No


### Encoding the Raw Data and Check Correctness

##### Encoding Scheme: 
1. Sex: Female - 1 Male - 0
2. Yes/No: Yes - 1 No - 0
3. Age/Group:  < 40 - 1  >= 40 - 0
4. Race/Ethnical Group:    White - 1  Non-White - 0

##### Encode Data

In [4]:
x_data = raw_data.copy()
x_data['death_yn'] = x_data['death_yn'].apply(lambda x: 1 if x == 'Yes' else 0)
x_data['sex'] = x_data['sex'].apply(lambda x: 1 if x == 'Female' else 0)
x_data['hosp_yn'] = x_data['hosp_yn'].apply(lambda x: 1 if x == 'Yes' else 0)
x_data['icu_yn'] = x_data['icu_yn'].apply(lambda x: 1 if x == 'Yes' else 0)
x_data['medcond_yn'] = x_data['medcond_yn'].apply(lambda x: 1 if x == 'Yes' else 0)
x_data['Race and ethnicity (combined)'] = x_data['Race and ethnicity (combined)'].apply(lambda x: 1 if x == 'White, Non-Hispanic' else 0)
x_data['age_group'] = x_data['age_group'].apply(lambda x: 1 if x == '0 - 9 Years' or x == '10 - 19 Years' or x == '20 - 29 Years' or x == '30 - 39 Years' else 0)

##### Check the Correctness

In [5]:
print "=============== Gender ================="
print raw_data['sex'].value_counts()
print x_data['sex'].value_counts()
print "=============== death_yn ================="
print raw_data['death_yn'].value_counts()
print x_data['death_yn'].value_counts()
print "=============== hosp_yn ================="
print x_data['hosp_yn'].value_counts()
print raw_data['hosp_yn'].value_counts()
print "=============== icu_yn ================="
print x_data['icu_yn'].value_counts()
print raw_data['icu_yn'].value_counts()
print "=============== medcond_yn ================="
print x_data['medcond_yn'].value_counts()
print raw_data['medcond_yn'].value_counts()
print "=============== Race/Ethnical Group ================="
print x_data['Race and ethnicity (combined)'].value_counts()
print raw_data['Race and ethnicity (combined)'].value_counts()
print "=============== Age Group ================="
print x_data['age_group'].value_counts()
print raw_data['age_group'].value_counts()

+--------+--------+
| value  | count  |
+--------+--------+
| Female | 127076 |
|  Male  | 115345 |
+--------+--------+
[2 rows x 2 columns]

+-------+--------+
| value | count  |
+-------+--------+
|   1   | 127076 |
|   0   | 115345 |
+-------+--------+
[2 rows x 2 columns]

+-------+--------+
| value | count  |
+-------+--------+
|   No  | 217742 |
|  Yes  | 24679  |
+-------+--------+
[2 rows x 2 columns]

+-------+--------+
| value | count  |
+-------+--------+
|   0   | 217742 |
|   1   | 24679  |
+-------+--------+
[2 rows x 2 columns]

+-------+--------+
| value | count  |
+-------+--------+
|   0   | 180705 |
|   1   | 61716  |
+-------+--------+
[2 rows x 2 columns]

+-------+--------+
| value | count  |
+-------+--------+
|   No  | 180705 |
|  Yes  | 61716  |
+-------+--------+
[2 rows x 2 columns]

+-------+--------+
| value | count  |
+-------+--------+
|   0   | 218765 |
|   1   | 23656  |
+-------+--------+
[2 rows x 2 columns]

+-------+--------+
| value | count  |
+---

##### View the Data After Encoding

In [6]:
x_data

sex,age_group,Race and ethnicity (combined) ...,hosp_yn,icu_yn,death_yn,medcond_yn
1,1,0,1,1,0,1
0,1,0,0,0,0,0
0,1,0,0,0,0,0
0,1,0,1,0,0,1
1,1,0,0,0,0,0
1,1,0,0,0,0,0
0,1,1,0,0,0,0
0,1,1,0,0,0,1
0,1,1,0,0,0,1
0,1,1,0,0,0,0


### Start Analyzing

##### Spliting Majority class and Minority class

In [7]:
majority_class = x_data[x_data['death_yn'] == 0]
minority_class = x_data[x_data['death_yn'] == 1]

##### Bootstrap a balanced sample

In [8]:
def bootstrap(majority_class, minority_class, seed):
    fraction = .113
    undersample = minority_class.copy()
    undersample = undersample + majority_class.sample(fraction, seed)
    # undersample = minority_class.sample(.8, seed) + majority_class.sample(fraction * .8, seed)
    
    return undersample

##### Build a decision tree

In [9]:
class Node:

    def __init__(self, data, level):

        self.left = None
        self.right = None
        self.data = data
        self.level = level + 1

# Print the tree
    def PrintTree(self):
        if self.left:
            self.left.PrintTree()
        print( self.data),
        if self.right:
            self.right.PrintTree()
            
def height(node):
    if node.left is None and node.right is None:
        return 1
    if node.left:
        h_l = height(node.left)
    if node.right: 
        h_r = height(node.right)
    return (h_l + 1 if h_l > h_r else h_r + 1)

def levelOrder(node):
    queue = []
    queue.append(node)
    prev_l = 0

    while len(queue) is not 0:
        cur = queue.pop(0)
        if cur.left is not None:
            queue.append(cur.left) 
        if cur.right is not None:
            queue.append(cur.right)
        this_l = cur.level
        if prev_l != this_l:
            print
        prev_l = this_l
        print str(cur.data) + " ",

In [10]:
def splitData(x_sample, x_features, index_max):
    left_sample = x_sample[x_sample[x_features[index_max]] == 0]
    right_sample = x_sample[x_sample[x_features[index_max]] == 1]
    
    return (left_sample, right_sample)

In [11]:
def calculateH_x_j_y(sample_size, x_sarray, y_sarray):
    # initialize
    c_zero_zero = 0.0
    c_zero_one = 0.0
    c_one_zero = 0.0
    c_one_one = 0.0
    res_h_x_1_y = 0.0
    
    # count
    y_ones = y_sarray.nnz()
    y_zeros = sample_size - y_ones
    for i in xrange(0, int(sample_size)):
        if x_sarray[i] == 0 and y_sarray[i] == 0:
            c_zero_zero = c_zero_zero + 1
        if x_sarray[i] == 0 and y_sarray[i] == 1:
            c_zero_one = c_zero_one + 1
        if x_sarray[i] == 1 and y_sarray[i] == 0:
            c_one_zero = c_one_zero + 1
        if x_sarray[i] == 1 and y_sarray[i] == 1:
            c_one_one = c_one_one + 1
    
    # calculate
    if c_zero_zero != 0: 
        res_h_x_1_y = res_h_x_1_y + calEntropy(c_zero_zero / sample_size, c_zero_zero / y_zeros)        
    if c_one_zero != 0: 
        res_h_x_1_y = res_h_x_1_y + calEntropy(c_one_zero / sample_size, c_one_zero / y_zeros)
    if c_zero_one != 0: 
        res_h_x_1_y = res_h_x_1_y + calEntropy(c_zero_one / sample_size, c_zero_one / y_ones)
    if c_one_one != 0: 
        res_h_x_1_y = res_h_x_1_y + calEntropy(c_one_one / sample_size, c_one_one / y_ones)
    
    # print res_h_x_1_y
    return res_h_x_1_y

In [12]:
def calEntropy(prob1, prob2):
    return prob1 * np.log2(1/prob2)
def calculateH_x_j(sample_size, x_sarray):
    # count
    x_j_ones = x_sarray.nnz()
    x_j_zeros = sample_size - x_j_ones
    
    # calculate
    res_h_x_j = 0.0
    if x_j_zeros != 0:
        res_h_x_j = res_h_x_j + calEntropy(x_j_zeros / sample_size, x_j_zeros / sample_size)
    if x_j_ones != 0:
        res_h_x_j = res_h_x_j + calEntropy(x_j_ones / sample_size, x_j_ones / sample_size)
    
    # print res_h_x_j
    return res_h_x_j

In [13]:
def calculateMutualInformation(x_sarray, y_sarray):
    # initialize
    sample_size = float(len(x_sarray))
    
    # calculate H(x_j)
    h_x_j = calculateH_x_j(sample_size=sample_size,
                           x_sarray=x_sarray)
    
    # calculate H(x_j, y)
    h_x_j_y = calculateH_x_j_y(sample_size=sample_size, 
                               x_sarray=x_sarray, 
                               y_sarray=y_sarray)
    
    # calculate I(x_j, y)
    I_x_j_y = h_x_j - h_x_j_y
    
    # print I_x_j_y
    return I_x_j_y

In [14]:
def buildDecisionTree(x_features, under_sample, output):
    # initial value
    num_features = len(x_features)
    size = len(under_sample)
    
    mar_y = 1 if under_sample[output].nnz() > size / 2 else 0
    # print "size is " + str(size) + " and mar_y is " + str(mar_y) + " and non zero count is " + str(under_sample[output].nnz())
    
    # check for the first two stop conditions
    if size <= 49358 * 0.05 or num_features == 0:
        # print "this is a leaf with num_features " + str(num_features)
        return Node(str(mar_y), 6 - num_features)
    
    # find features with largest mutual information
    mut_info_max, index_max = -1, -1
    for i in xrange(0, num_features):
        mut_info = calculateMutualInformation(x_sarray=under_sample[x_features[i]], y_sarray=under_sample[output])
        if mut_info_max < mut_info:
            mut_info_max, index_max = mut_info, i
    max_feature = x_features[index_max]
    new_features = copy.deepcopy(x_features)
    new_features.remove(max_feature)
    
    # initialize root
    root = Node(x_features[index_max], 6 - num_features)
        
    # split data
    left_sample, right_sample = splitData(x_sample=under_sample, x_features=x_features, index_max=index_max)
    
    # recursively call
    root.left = buildDecisionTree(new_features, left_sample, output)
    root.right = buildDecisionTree(new_features, right_sample, output)
    
    return root

##### Build Random Forest

In [15]:
def randomForest(num_tree, x_features, majority, minority, output):
    tree_list = []

    for i in xrange(0, num_tree):
        # randomly split data
        under_sample = bootstrap(majority_class=majority, minority_class=minority, seed=i)

        # train data
        tree_root = buildDecisionTree(x_features=x_features, under_sample=under_sample, output=output)
        
        tree_list.append(tree_root)        
    return tree_list

##### Cross Validation

In [16]:
def findLeaf(node, feature_vec):
    # this is leaf
    if node.left is None and node.right is None:
        return node.data
    
    # print node.data + " " + str(feature_vec[node.data])
    # Go to left
    if feature_vec[node.data] == 0.0:
        return findLeaf(node.left, feature_vec)
    
    # Go to right
    if feature_vec[node.data] == 1.0:
        return findLeaf(node.right, feature_vec)

In [17]:
def evaluation(tree_list, test_sample, output):
    # initialize variable
    num_sample = len(test_sample)
    num_tree = len(tree_list)
    predict_list = []
    positive = test_sample[output].nnz()
    
    # prediction
    for i in xrange(num_sample):
        tmp_sum = 0.0
        for j in xrange(num_tree):
            tmp_sum = tmp_sum + int(findLeaf(tree_list[j], test_sample[i]))
        prediction = 1 if tmp_sum > num_tree / 2.0 else 0
        predict_list.append(prediction)
        
    # calculate metrics
    true_positive = sum(predict_list[:positive])
    false_positive = sum(predict_list[positive:])
    true_negative = num_sample - positive - false_positive
    false_negative = positive - true_positive
    
    TNR = true_negative / float(true_negative + false_positive)
    TPR = true_positive / float(true_positive + false_negative)
    accuracy = (true_positive + true_negative) / float(num_sample)
    G_mean = np.sqrt(TNR + TPR)
    precision = true_positive / float(true_positive + false_positive)
    recall = true_positive / float(true_positive + false_negative)
    F_measure = 2 * precision * recall / float(precision + recall)
    
    # print "true positive is " + str(true_positive)
    # print "false positive is " + str(false_positive)
    # print "true negative is " + str(true_negative)
    # print "false negative is " + str(false_negative)
    
    # print "true positive rate is " + str(TPR)
    # print "true negative rage is " + str(TNR)
    # print "G-mean is " + str(G_mean)
    # print "precision is " + str(precision)
    # print "recall is " + str(recall)
    # print "F-measure is " + str(F_measure)
    # print "accuracy is " + str((true_positive + true_negative)/float(num_sample))
    
    return (TNR, TPR, accuracy, G_mean, precision, recall, F_measure)

In [18]:
def crossvalidation(majority_class, minority_class, output, k, x_features, forest):
    # initialize variable
    majority_len = len(majority_class)
    minority_len = len(minority_class)
    majority_interval = majority_len / float(k)
    minority_interval = minority_len / float(k)
    accuracy_list = []
    TNR_list = []
    TPR_list = []
    G_list = []
    precision_list = []
    recall_list = []
    F_list = []
    
    # train and test
    for i in xrange(k):
        # split up data
        majority_test = majority_class[i * majority_interval:(i + 1) * majority_interval]
        majority_train = majority_class[:i*majority_interval] + majority_class[(i + 1) * majority_interval:]
        minority_test = minority_class[i * minority_interval:(i + 1) * minority_interval]
        minority_train = minority_class[:i*minority_interval] + minority_class[(i + 1) * minority_interval:]
        test_data = minority_test + majority_test
        
        # train
        this_forest = randomForest(forest, x_features, majority_train, minority_train, output)
        
        # test
        TNR, TPR, accuracy, G_mean, precision, recall, F_measure = evaluation(this_forest, test_data, output)
        
        accuracy_list.append(accuracy)
        TNR_list.append(TNR)
        TPR_list.append(TPR)
        G_list.append(G_mean)
        precision_list.append(precision)
        recall_list.append(recall)
        F_list.append(F_measure)
        
    print "average accuracy is " + str(sum(accuracy_list) / float(k))
    print "true positive rate is " + str(sum(TPR_list) / float(k))
    print "true negative rage is " + str(sum(TNR_list) / float(k))
    print "G-mean is " + str(sum(G_list) / float(k))
    print "precision is " + str(sum(precision_list) / float(k))
    print "recall is " + str(sum(recall_list) / float(k))
    print "F-measure is " + str(sum(F_list) / float(k))
        

### Run the 10 Fold Cross Validation with Random Forest of 10 Decision Trees

In [19]:
# initialize some variables
x_features = ['medcond_yn', 'hosp_yn', 'icu_yn', 'Race and ethnicity (combined)', 'age_group', 'sex']
output = 'death_yn'

In [20]:
crossvalidation(majority_class, minority_class, output, 10, x_features, 10)

average accuracy is 0.861005819304
true positive rate is 0.883101050561
true negative rage is 0.858501942879
G-mean is 1.3194182466261162
precision is 0.42691435017
recall is 0.883101050561
F-measure is 0.571652551127
