In [88]:
import pandas as pd
import numpy as np
import math
from collections import Counter
import matplotlib.pyplot as plt

In [89]:
trim_small_data = pd.read_csv('../../data/processed/trim_small_data.csv')

FileNotFoundError: [Errno 2] File ../../data/processed/trim_small_data.csv does not exist: '../../data/processed/trim_small_data.csv'

In [55]:
trim_small_data.head()

Unnamed: 0,sex,age_group,Race and ethnicity (combined),hosp_yn,icu_yn,death_yn,medcond_yn
0,Female,0 - 9 Years,"Multiple/Other, Non-Hispanic",Yes,Yes,No,Yes
1,Male,0 - 9 Years,Hispanic/Latino,No,No,No,No
2,Male,0 - 9 Years,"Multiple/Other, Non-Hispanic",No,No,No,No
3,Male,10 - 19 Years,Hispanic/Latino,Yes,No,No,Yes
4,Female,10 - 19 Years,Hispanic/Latino,No,No,No,No


In [56]:
#trim_small_data['hosp_yn'].replace('Yes',1, inplace=True)
for columns in('hosp_yn','icu_yn','death_yn','medcond_yn','sex'):
    trim_small_data[columns] = trim_small_data[columns].map({'Yes':1,'No':0,'Female':1,'Male':0})

In [57]:
trim_small_data.head()

Unnamed: 0,sex,age_group,Race and ethnicity (combined),hosp_yn,icu_yn,death_yn,medcond_yn
0,1,0 - 9 Years,"Multiple/Other, Non-Hispanic",1,1,0,1
1,0,0 - 9 Years,Hispanic/Latino,0,0,0,0
2,0,0 - 9 Years,"Multiple/Other, Non-Hispanic",0,0,0,0
3,0,10 - 19 Years,Hispanic/Latino,1,0,0,1
4,1,10 - 19 Years,Hispanic/Latino,0,0,0,0


In [58]:
age_group = trim_small_data['age_group'].groupby(trim_small_data['age_group']).size()

In [59]:
age_group.head()

age_group
0 - 9 Years       5564
10 - 19 Years    18244
20 - 29 Years    41221
30 - 39 Years    33880
40 - 49 Years    34433
Name: age_group, dtype: int64

In [60]:
race = trim_small_data['Race and ethnicity (combined)'].groupby(trim_small_data['Race and ethnicity (combined)']).size()

In [61]:
race.head()

Race and ethnicity (combined)
American Indian/Alaska Native, Non-Hispanic      995
Asian, Non-Hispanic                             8099
Black, Non-Hispanic                            38499
Hispanic/Latino                                56298
Multiple/Other, Non-Hispanic                    7044
Name: Race and ethnicity (combined), dtype: int64

In [62]:
def Bernoulli_Probability(data):
    p = np.count_nonzero(data)/data.size
    return ( lambda x: p if x > 0 else (1-p) )

def Multinomial_Probability(data, smoothing=0):
    values = data.value_counts()
    total = values.sum()
    N = values.nunique()
    result = {i : (value + smoothing) / (total + smoothing * N) for i, value in values.items()}
    return (lambda x: result[x])

def Gaussian_Probability(data):
    mu = data.mean()
    std = data.std()
    t = np.sqrt(2 * math.pi * (std**2))
    Gaussian = ( lambda x: 1/t * math.exp(-( (x - mu)**2 ) / (2 * (std**2)) ) )
    return Gaussian

In [63]:
trim_small_data.head()

Unnamed: 0,sex,age_group,Race and ethnicity (combined),hosp_yn,icu_yn,death_yn,medcond_yn
0,1,0 - 9 Years,"Multiple/Other, Non-Hispanic",1,1,0,1
1,0,0 - 9 Years,Hispanic/Latino,0,0,0,0
2,0,0 - 9 Years,"Multiple/Other, Non-Hispanic",0,0,0,0
3,0,10 - 19 Years,Hispanic/Latino,1,0,0,1
4,1,10 - 19 Years,Hispanic/Latino,0,0,0,0


In [64]:
def get_train_NaiveBayes(data):
    pos = {}
    neg = {}
    P = data.loc[data['death_yn'] == 1].drop("death_yn",axis = 1)
    N = data.loc[data['death_yn'] == 0].drop("death_yn",axis = 1)
    NB_distribution = {
    'sex': Bernoulli_Probability,
    'age_group': Multinomial_Probability,
    'Race and ethnicity (combined)': Multinomial_Probability,
    'hosp_yn': Bernoulli_Probability,
    'icu_yn': Bernoulli_Probability,
    'medcond_yn': Bernoulli_Probability   
    }

    features = P.keys()

    for feature in features:
        distribution_func = NB_distribution[feature]
        pre_P = P[feature]
        pos[feature] = distribution_func(pre_P)
        
        pre_N = N[feature]
        neg[feature] = distribution_func(pre_N)
    
    return [pos,neg,features]
        
def predict_with_trained(train_NaiveBayes, personal_feature):
    pos = train_NaiveBayes[0]
    neg = train_NaiveBayes[1]
    features = train_NaiveBayes[2]
    P = [pos[i](personal_feature[i]) for i in features]
    Q = [neg[i](personal_feature[i]) for i in features]
    p = np.prod(P)
    q = np.prod(Q)
    if p >= q:
        return 1
    else:
        return 0
    
def predict(train_NaiveBayes, data):
    if data.ndim == 1:
        predict(train_NaiveBayes,data)
    N = data.shape[0]
    return [predict(train_NaiveBayes,data.iloc[i]) for i in range(N)]
    


In [65]:
def ten_fold_Split(data):
    
    data_len =len(data.index)
    data_per_set = int(data_len/10)
    result_data = []
    
    for i in range(10):
        start = data_per_set*i
        end = data_per_set*(i+1) if i!=9 else data_len
        
        df1 = data[:start]
        df2 = data[end:]
        
        testing = data[start:end]
        training =pd.concat([df1,df2])
        
        result_data.append([testing,training])
        
    return result_data

In [66]:
def correct_count (test_data, trained_NaiveBayes):
    correct_count = 0
    test_output = 0
    for index, input in test_data.iterrows():
        real_output = input["death_yn"]
        x = input.drop("death_yn")
        test_output = predict_with_trained(trained_NaiveBayes,x)
        
        if test_output == real_output:
            correct_count+=1
    return correct_count

In [67]:
def count(test_data, trained_NaiveBayes):
    correct_count = 0
    true_positive = 0
    true_negative = 0
    false_positive = 0
    false_negative = 0
    test_output = 0
    for index, input in test_data.iterrows():
        real_output = input["death_yn"]
        x = input.drop("death_yn")
        predict_output = predict_with_trained(trained_NaiveBayes,x)
        if predict_output == 1:
            if real_output == 1:
                true_positive += 1
            else:
                false_positive +=1
        else:
            if real_output == 1:
                false_negative += 1
            else:
                true_negative +=1
                
        if test_output == real_output:
            correct_count+=1
        
    return [correct_count,true_positive,false_positive,true_negative,false_negative]

In [68]:
def ten_fold_cross_validation(data):
    t_size = len(data)
    t_split = ten_fold_Split(data)
    correct_total = 0
    
    for data_entry in t_split:
        testing_data = data_entry[0]
        training_data = data_entry[1]
        trained_NaiveBayes = get_train_NaiveBayes(training_data)
        correct_total += correct_count (testing_data, trained_NaiveBayes)

    accuracy = correct_total/t_size
    return accuracy

In [69]:
def ten_fold_cross_validation_all(data):
    t_size = len(data)
    t_split = ten_fold_Split(data)
    correct_total = 0
    true_positive_total = 0
    false_positive_total = 0
    true_negative_total = 0
    false_negative_total = 0
    
    for data_entry in t_split:
        testing_data = data_entry[0]
        training_data = data_entry[1]
        trained_NaiveBayes = get_train_NaiveBayes(training_data)
        correct_count,true_positive,false_positive,true_negative,false_negative = count (testing_data, trained_NaiveBayes)
        correct_total += correct_count
        true_positive_total += true_positive
        false_positive_total += false_positive
        true_negative_total += true_negative
        false_negative_total += false_negative
    return [correct_total,true_positive_total,false_positive_total,true_negative_total,false_negative_total,t_size]

In [70]:
trained_NaiveBayes = get_train_NaiveBayes(trim_small_data)

In [72]:
result = ten_fold_cross_validation_all(trim_small_data)

In [73]:
def Accuracy(correct_total,total_size):
    return correct_total/total_size

In [74]:
def Sensitivity(true_positive,false_negative):
    return true_positive/(true_positive+false_negative)

In [75]:
def Specificity(true_negative,false_positive):
    return true_negative/(false_positive+true_negative)

In [76]:
def G_mean(true_positive,false_negative,true_negative,false_positive):
    Sens =Sensitivity(true_positive,false_negative)
    Spec = Specificity(true_negative,false_positive)
    return np.sqrt(Sens*Spec)

In [77]:
def Precision (true_positive,false_positive):
    return true_positive/(true_positive+false_positive)

In [78]:
def Recall (true_positive,false_negative):
    return true_positive/(true_positive+false_negative)

In [79]:
def F_measure (true_positive,false_positive,false_negative):
    p = Precision (true_positive,false_positive)
    r = Recall (true_positive,false_negative)
    return 2*p*r/(p+r)

In [80]:
def printResult(correct_total,true_positive,false_positive,true_negative,false_negative,total_size):
    print("Accuracy: ",Accuracy(correct_total,total_size))
    print("True Positive rate: ", Sensitivity(true_positive,false_negative))
    print("True Negative rate: ", Specificity(true_negative,false_positive))
    print("G-Mean: ", G_mean(true_positive,false_negative,true_negative,false_positive))
    print("Precision: ", Precision (true_positive,false_positive))
    print("Recall: ", Recall (true_positive,false_negative))
    print("F-Measure ", F_measure (true_positive,false_positive,false_negative))

In [81]:
printResult(result[0],result[1],result[2],result[3],result[4],result[5])

Accuracy:  0.8981977633950854
True Positive rate:  0.8847198022610316
True Negative rate:  0.8658641878920925
G-Mean:  0.8752412199484213
Precision:  0.42777375051429245
Recall:  0.8847198022610316
F-Measure  0.5767036450079239
