In [None]:
import sys
import os

# Put the parent of the current working directory on sys.path so that the
# project-local modules imported further down can be resolved.
current_directory = os.getcwd()
project_directory = os.path.abspath(
    os.path.join(current_directory, os.pardir)
)

# Skip the append when the entry already exists (e.g. the cell is re-run).
if sys.path.count(project_directory) == 0:
    sys.path.append(project_directory)

import sanity_checks_methods
import ML_algorithms
import pandas as pd
import null_utility

# Read the stroke dataset from the project's dataset folder.
dataset_path = os.path.join(project_directory, 'dataset/stroke_data.csv')
df = pd.read_csv(dataset_path)

# Number of rows drawn from each outcome class.
# NOTE(review): sample() raises if a class has fewer rows than this — confirm
# both classes in the CSV are large enough.
group_size = 5000

# Split the rows by the value of the `stroke` column.
no_stroke_group = df.loc[df['stroke'] == 0]
stroke_group = df.loc[df['stroke'] == 1]

# Draw the same number of rows from each class with a fixed seed so the
# subset is reproducible across runs.
sampled_no_stroke_group = no_stroke_group.sample(n=group_size, random_state=42)
sampled_stroke_group = stroke_group.sample(n=group_size, random_state=42)

# Stack the two samples (non-stroke rows first) and pass the result through
# the project's cleaning routine.
df = sanity_checks_methods.clean_dataset(
    pd.concat([sampled_no_stroke_group, sampled_stroke_group])
)

# Independent copy taken before any null values are injected into `df`.
df_to_test = df.copy()

Adding 10% null values to the `hypertension` feature

In [None]:
# Scenario: inject 10% null values into `hypertension`, run the decision tree
# and the SVM, then put the saved original values back into `df`.
print('ADDING 10% TO FEATURE: HYPERTENSION')
print('--------------------------------------')
# The helper returns the affected row labels and the values they held, which
# is what the restore step below needs.
indices, original_values = null_utility.add_null_values(df, 'hypertension', 10)
try:
    print('\n DECISION TREE PREDICTIONS AND RESULTS')
    print('---------------------------------------------------')
    # presumably trains on `df` and evaluates against `df_to_test` — confirm in ML_algorithms
    y_pred_prob_hypertension_dt, y_test_hypertension_dt, decision_tree = ML_algorithms.model_dt(df, df_to_test)
    mean_accuracy_dt_hypertension, confidence_interval_dt_hypertension = ML_algorithms.k_fold_cross_validation_dt(decision_tree, df_to_test)

    print('\n PRINT SVM PREDICTIONS AND RESULTS')
    print('---------------------------------------------------')
    y_pred_prob_hypertension_svm, y_test_hypertension_svm, svm = ML_algorithms.model_svm(df, df_to_test)
    # NOTE(review): the `_dt`-named cross-validation helper is reused for the SVM;
    # confirm it is model-agnostic.
    mean_accuracy_svm_hypertension, confidence_interval_svm_hypertension = ML_algorithms.k_fold_cross_validation_dt(svm, df_to_test)
finally:
    # Restore even when a model call raises: `df` is shared by every later
    # cell, so leaving the injected nulls behind would contaminate them.
    df.loc[indices, 'hypertension'] = original_values

Adding 10% null values to the `heart_disease` feature

In [None]:
# Scenario: inject 10% null values into `heart_disease`, run the decision tree
# and the SVM, then put the saved original values back into `df`.
print('ADDING 10% TO FEATURE: HEART_DISEASE')
print('--------------------------------------')
# The helper returns the affected row labels and the values they held, which
# is what the restore step below needs.
indices, original_values = null_utility.add_null_values(df, 'heart_disease', 10)
try:
    print('\n DECISION TREE PREDICTIONS AND RESULTS')
    print('---------------------------------------------------')
    # presumably trains on `df` and evaluates against `df_to_test` — confirm in ML_algorithms
    y_pred_prob_heart_dt, y_test_heart_dt, decision_tree = ML_algorithms.model_dt(df, df_to_test)
    mean_accuracy_dt_heart, confidence_interval_dt_heart = ML_algorithms.k_fold_cross_validation_dt(decision_tree, df_to_test)

    print('\n PRINT SVM PREDICTIONS AND RESULTS')
    print('---------------------------------------------------')
    y_pred_prob_heart_svm, y_test_heart_svm, svm = ML_algorithms.model_svm(df, df_to_test)
    # NOTE(review): the `_dt`-named cross-validation helper is reused for the SVM;
    # confirm it is model-agnostic.
    mean_accuracy_svm_heart, confidence_interval_svm_heart = ML_algorithms.k_fold_cross_validation_dt(svm, df_to_test)
finally:
    # Restore even when a model call raises: `df` is shared by every later
    # cell, so leaving the injected nulls behind would contaminate them.
    df.loc[indices, 'heart_disease'] = original_values

Adding 10% null values to the `avg_glucose_level` feature

In [None]:
# Scenario: inject 10% null values into `avg_glucose_level`, run the decision
# tree and the SVM, then put the saved original values back into `df`.
print('ADDING 10% TO FEATURE: AVG_GLUCOSE_LEVEL')
print('--------------------------------------')
# The helper returns the affected row labels and the values they held, which
# is what the restore step below needs.
indices, original_values = null_utility.add_null_values(df, 'avg_glucose_level', 10)
try:
    print('\n DECISION TREE PREDICTIONS AND RESULTS')
    print('---------------------------------------------------')
    # presumably trains on `df` and evaluates against `df_to_test` — confirm in ML_algorithms
    y_pred_prob_glucose_dt, y_test_glucose_dt, decision_tree = ML_algorithms.model_dt(df, df_to_test)
    mean_accuracy_dt_glucose, confidence_interval_dt_glucose = ML_algorithms.k_fold_cross_validation_dt(decision_tree, df_to_test)

    print('\n PRINT SVM PREDICTIONS AND RESULTS')
    print('---------------------------------------------------')
    y_pred_prob_glucose_svm, y_test_glucose_svm, svm = ML_algorithms.model_svm(df, df_to_test)
    # NOTE(review): the `_dt`-named cross-validation helper is reused for the SVM;
    # confirm it is model-agnostic.
    mean_accuracy_svm_glucose, confidence_interval_svm_glucose = ML_algorithms.k_fold_cross_validation_dt(svm, df_to_test)
finally:
    # Restore even when a model call raises: `df` is shared by every later
    # cell, so leaving the injected nulls behind would contaminate them.
    df.loc[indices, 'avg_glucose_level'] = original_values

Adding 10% null values to the `bmi` feature

In [None]:
# Scenario: inject 10% null values into `bmi`, run the decision tree and the
# SVM, then put the saved original values back into `df`.
print('ADDING 10% TO FEATURE: BMI')
print('--------------------------------------')
# The helper returns the affected row labels and the values they held, which
# is what the restore step below needs.
indices, original_values = null_utility.add_null_values(df, 'bmi', 10)
try:
    print('\n DECISION TREE PREDICTIONS AND RESULTS')
    print('---------------------------------------------------')
    # presumably trains on `df` and evaluates against `df_to_test` — confirm in ML_algorithms
    y_pred_prob_bmi_dt, y_test_bmi_dt, decision_tree = ML_algorithms.model_dt(df, df_to_test)
    mean_accuracy_dt_bmi, confidence_interval_dt_bmi = ML_algorithms.k_fold_cross_validation_dt(decision_tree, df_to_test)

    print('\n PRINT SVM PREDICTIONS AND RESULTS')
    print('---------------------------------------------------')
    y_pred_prob_bmi_svm, y_test_bmi_svm, svm = ML_algorithms.model_svm(df, df_to_test)
    # NOTE(review): the `_dt`-named cross-validation helper is reused for the SVM;
    # confirm it is model-agnostic.
    mean_accuracy_svm_bmi, confidence_interval_svm_bmi = ML_algorithms.k_fold_cross_validation_dt(svm, df_to_test)
finally:
    # Restore even when a model call raises: `df` is shared by every later
    # cell, so leaving the injected nulls behind would contaminate them.
    df.loc[indices, 'bmi'] = original_values

Adding 10% null values to the two most important features (`bmi` and `avg_glucose_level`)

In [None]:
# Scenario: inject 10% null values into `bmi` and `avg_glucose_level` at the
# same time, run both models, then restore every modified column in `df`.
indices_dict = {}           # feature name -> row labels returned by the helper
original_values_dict = {}   # feature name -> values those rows held beforehand
features = ['bmi', 'avg_glucose_level']

try:
    for feature in features:
        indices, original_values = null_utility.add_null_values(df, feature, 10)
        indices_dict[feature] = indices
        original_values_dict[feature] = original_values

    print('\n DECISION TREE PREDICTIONS AND RESULTS')
    print('---------------------------------------------------')
    # presumably trains on `df` and evaluates against `df_to_test` — confirm in ML_algorithms
    y_pred_prob_bmi_glucose_dt, y_test_bmi_glucose_dt, decision_tree = ML_algorithms.model_dt(df, df_to_test)
    mean_accuracy_dt_bmi_glucose, confidence_interval_dt_bmi_glucose = ML_algorithms.k_fold_cross_validation_dt(decision_tree, df_to_test)

    print('\n PRINT SVM PREDICTIONS AND RESULTS')
    print('---------------------------------------------------')
    y_pred_prob_bmi_glucose_svm, y_test_bmi_glucose_svm, svm = ML_algorithms.model_svm(df, df_to_test)
    # NOTE(review): the `_dt`-named cross-validation helper is reused for the SVM;
    # confirm it is model-agnostic.
    mean_accuracy_svm_bmi_glucose, confidence_interval_svm_bmi_glucose = ML_algorithms.k_fold_cross_validation_dt(svm, df_to_test)
finally:
    # Runs even when injection or a model call raises, so the shared `df` is
    # never left with nulls. Iterating the recorded keys restores only the
    # columns that were actually modified (same order as `features`).
    for feature in indices_dict:
        df.loc[indices_dict[feature], feature] = original_values_dict[feature]

Inserting 10% null values into the health-related features (avg_glucose_level, bmi, hypertension, smoking_status, heart_disease, age)

In [None]:
# Scenario: inject 10% null values into every health-related feature at the
# same time, run both models, then restore every modified column in `df`.
indices_dict = {}           # feature name -> row labels returned by the helper
original_values_dict = {}   # feature name -> values those rows held beforehand
features = ['avg_glucose_level', 'age', 'bmi', 'hypertension', 'smoking_status', 'heart_disease']

try:
    for feature in features:
        indices, original_values = null_utility.add_null_values(df, feature, 10)
        indices_dict[feature] = indices
        original_values_dict[feature] = original_values

    print('\n DECISION TREE PREDICTIONS AND RESULTS')
    print('---------------------------------------------------')
    # presumably trains on `df` and evaluates against `df_to_test` — confirm in ML_algorithms
    y_pred_prob_health_dt, y_test_health_dt, decision_tree = ML_algorithms.model_dt(df, df_to_test)
    mean_accuracy_dt_health, confidence_interval_dt_health = ML_algorithms.k_fold_cross_validation_dt(decision_tree, df_to_test)

    print('\n PRINT SVM PREDICTIONS AND RESULTS')
    print('---------------------------------------------------')
    y_pred_prob_health_svm, y_test_health_svm, svm = ML_algorithms.model_svm(df, df_to_test)
    # NOTE(review): the `_dt`-named cross-validation helper is reused for the SVM;
    # confirm it is model-agnostic.
    mean_accuracy_svm_health, confidence_interval_svm_health = ML_algorithms.k_fold_cross_validation_dt(svm, df_to_test)
finally:
    # Runs even when injection or a model call raises, so the shared `df` is
    # never left with nulls. Iterating the recorded keys restores only the
    # columns that were actually modified (same order as `features`).
    for feature in indices_dict:
        df.loc[indices_dict[feature], feature] = original_values_dict[feature]

Inserting 10% null values into the non-health-related features (sex, Residence_type, ever_married, work_type)

In [None]:
# Scenario: inject 10% null values into every non-health feature at the same
# time, run both models, then restore every modified column in `df`.
indices_dict = {}           # feature name -> row labels returned by the helper
original_values_dict = {}   # feature name -> values those rows held beforehand
features = ['sex', 'Residence_type', 'ever_married', 'work_type']

try:
    for feature in features:
        indices, original_values = null_utility.add_null_values(df, feature, 10)
        indices_dict[feature] = indices
        original_values_dict[feature] = original_values

    print('\n DECISION TREE PREDICTIONS AND RESULTS')
    print('---------------------------------------------------')
    # presumably trains on `df` and evaluates against `df_to_test` — confirm in ML_algorithms
    y_pred_prob_not_health_dt, y_test_not_health_dt, decision_tree = ML_algorithms.model_dt(df, df_to_test)
    mean_accuracy_dt_not_health, confidence_interval_dt_not_health = ML_algorithms.k_fold_cross_validation_dt(decision_tree, df_to_test)

    print('\n PRINT SVM PREDICTIONS AND RESULTS')
    print('---------------------------------------------------')
    y_pred_prob_not_health_svm, y_test_not_health_svm, svm = ML_algorithms.model_svm(df, df_to_test)
    # NOTE(review): the `_dt`-named cross-validation helper is reused for the SVM;
    # confirm it is model-agnostic.
    mean_accuracy_svm_not_health, confidence_interval_svm_not_health = ML_algorithms.k_fold_cross_validation_dt(svm, df_to_test)
finally:
    # Runs even when injection or a model call raises, so the shared `df` is
    # never left with nulls. Iterating the recorded keys restores only the
    # columns that were actually modified (same order as `features`).
    for feature in indices_dict:
        df.loc[indices_dict[feature], feature] = original_values_dict[feature]

## Conclusion

### ROC Curve - Decision Tree

In [None]:
# Collect the decision-tree outputs of every scenario as
# (y_pred_prob, y_test, label) triples, in the order the scenarios were run.
roc_results = [
    (y_pred_prob, y_test, label)
    for label, (y_pred_prob, y_test) in {
        'Hypertension': (y_pred_prob_hypertension_dt, y_test_hypertension_dt),
        'Heart': (y_pred_prob_heart_dt, y_test_heart_dt),
        'Avg_glucose_level': (y_pred_prob_glucose_dt, y_test_glucose_dt),
        'BMI': (y_pred_prob_bmi_dt, y_test_bmi_dt),
        'BMI AND Avg_Glucose_Level': (y_pred_prob_bmi_glucose_dt, y_test_bmi_glucose_dt),
        'Health': (y_pred_prob_health_dt, y_test_health_dt),
        'Not Health': (y_pred_prob_not_health_dt, y_test_not_health_dt),
    }.items()
]

# Plot ROC curves
ML_algorithms.plot_roc_curve_conclusion_with_results(roc_results)

### ROC Curve - Support Vector Machine 

In [None]:
# Collect the SVM outputs of every scenario as (y_pred_prob, y_test, label)
# triples, in the order the scenarios were run.
roc_results = [
    (y_pred_prob, y_test, label)
    for label, (y_pred_prob, y_test) in {
        'Hypertension': (y_pred_prob_hypertension_svm, y_test_hypertension_svm),
        'Heart': (y_pred_prob_heart_svm, y_test_heart_svm),
        'Avg_glucose_level': (y_pred_prob_glucose_svm, y_test_glucose_svm),
        'BMI': (y_pred_prob_bmi_svm, y_test_bmi_svm),
        'BMI AND Avg_Glucose_Level': (y_pred_prob_bmi_glucose_svm, y_test_bmi_glucose_svm),
        'Health': (y_pred_prob_health_svm, y_test_health_svm),
        'Not Health': (y_pred_prob_not_health_svm, y_test_not_health_svm),
    }.items()
]

# Plot ROC curves
ML_algorithms.plot_roc_curve_conclusion_with_results(roc_results)

### Confidence Interval - Decision Tree

In [None]:
# Collect the decision-tree cross-validation results of every scenario as
# (label, mean_accuracy, confidence_interval) triples, in run order.
model_results = [
    (label, mean_accuracy, confidence_interval)
    for label, (mean_accuracy, confidence_interval) in {
        'Hypertension': (mean_accuracy_dt_hypertension, confidence_interval_dt_hypertension),
        'Heart': (mean_accuracy_dt_heart, confidence_interval_dt_heart),
        'Avg_Glucose_Level': (mean_accuracy_dt_glucose, confidence_interval_dt_glucose),
        'BMI': (mean_accuracy_dt_bmi, confidence_interval_dt_bmi),
        'BMI AND Avg_Glucose_Level': (mean_accuracy_dt_bmi_glucose, confidence_interval_dt_bmi_glucose),
        'Health': (mean_accuracy_dt_health, confidence_interval_dt_health),
        'Not Health': (mean_accuracy_dt_not_health, confidence_interval_dt_not_health),
    }.items()
]

# Hand the collected triples to the project's confidence-interval plot.
ML_algorithms.plot_confidence_intervals(model_results)

### Confidence Interval - Support Vector Machine

In [None]:
# Collect the SVM cross-validation results of every scenario as
# (label, mean_accuracy, confidence_interval) triples, in run order.
model_results = [
    (label, mean_accuracy, confidence_interval)
    for label, (mean_accuracy, confidence_interval) in {
        'Hypertension': (mean_accuracy_svm_hypertension, confidence_interval_svm_hypertension),
        'Heart': (mean_accuracy_svm_heart, confidence_interval_svm_heart),
        'Avg_Glucose_Level': (mean_accuracy_svm_glucose, confidence_interval_svm_glucose),
        'BMI': (mean_accuracy_svm_bmi, confidence_interval_svm_bmi),
        'BMI AND Avg_Glucose_Level': (mean_accuracy_svm_bmi_glucose, confidence_interval_svm_bmi_glucose),
        'Health': (mean_accuracy_svm_health, confidence_interval_svm_health),
        'Not Health': (mean_accuracy_svm_not_health, confidence_interval_svm_not_health),
    }.items()
]

# Hand the collected triples to the project's confidence-interval plot.
ML_algorithms.plot_confidence_intervals(model_results)