In [None]:
import sys
import os

# Resolve the project root (the parent of the notebook's working directory)
# and make it importable, so the project-local modules imported right after
# this block can be found.
current_directory = os.getcwd()
project_directory = os.path.abspath(os.path.join(current_directory, os.pardir))

# Only append once so re-running the cell does not grow sys.path.
if sys.path.count(project_directory) == 0:
    sys.path.append(project_directory)

import sanity_checks_methods
import ML_algorithms
import pandas as pd
import util_inaccuracy

# Load the raw stroke dataset from the project's dataset folder.
dataset_path = os.path.join(project_directory, 'dataset/stroke_data.csv')
raw_df = pd.read_csv(dataset_path)

# Build a class-balanced sample: the same number of rows is drawn from the
# no-stroke (0) and stroke (1) groups, with a fixed seed for reproducibility.
group_size = 5000
balanced_parts = [
    raw_df[raw_df['stroke'] == stroke_label].sample(n=group_size, random_state=42)
    for stroke_label in (0, 1)
]

# Clean the balanced frame with the project's sanity-check helper.
df = sanity_checks_methods.clean_dataset(pd.concat(balanced_parts))

# Independent copy passed as the second argument to the model helpers in the
# experiment cells, while the perturbed frames are derived from `df`.
df_to_test = df.copy()

Adding 10% inaccuracies to the `avg_glucose_level` values

In [None]:

# Experiment: perturb only avg_glucose_level, then fit and evaluate both models.
print('ADDING 10% TO FEATURE: avg_glucose_level')
print('--------------------------------------')
df_modified = util_inaccuracy.modify_values('avg_glucose_level', df, 10)

print('\nDECISION TREE PREDICTIONS AND RESULTS')
print('---------------------------------------------------')
# model_dt returns (predicted probabilities, test labels, fitted model); unpack
# it so that only the fitted model is handed to cross-validation and the
# ROC inputs are kept for the conclusion section.
y_pred_prob_glucose_dt, y_test_glucose_dt, decision_tree = ML_algorithms.model_dt(df_modified, df_to_test)
mean_accuracy_dt_glucose, confidence_interval_dt_glucose = ML_algorithms.k_fold_cross_validation_dt(decision_tree, df_to_test)

print('\nSVM PREDICTIONS AND RESULTS')
print('---------------------------------------------------')
y_pred_prob_glucose_svm, y_test_glucose_svm, svm = ML_algorithms.model_svm(df_modified, df_to_test)
# NOTE(review): the SVM is cross-validated with the `_dt`-named helper, as in
# every other cell; presumably it is model-agnostic. Confirm in ML_algorithms.
mean_accuracy_svm_glucose, confidence_interval_svm_glucose = ML_algorithms.k_fold_cross_validation_dt(svm, df_to_test)


Adding 10% inaccuracies to the `bmi` values

In [None]:
# Experiment: perturb only bmi, then fit and evaluate both models.
print('ADDING 10% TO FEATURE: bmi')
print('--------------------------------------')
df_modified = util_inaccuracy.modify_values('bmi', df, 10)

print('\n DECISION TREE PREDICTIONS AND RESULTS')
print('---------------------------------------------------')
# Results are stored under *_bmi_* names (they were previously stored under
# *_age_* names, which mislabelled this experiment in the conclusion plots).
y_pred_prob_bmi_dt, y_test_bmi_dt, decision_tree = ML_algorithms.model_dt(df_modified, df_to_test)
mean_accuracy_dt_bmi, confidence_interval_dt_bmi = ML_algorithms.k_fold_cross_validation_dt(decision_tree, df_to_test)

print('\n PRINT SVM PREDICTIONS AND RESULTS')
print('---------------------------------------------------')
y_pred_prob_bmi_svm, y_test_bmi_svm, svm = ML_algorithms.model_svm(df_modified, df_to_test)
# NOTE(review): the SVM is cross-validated with the `_dt`-named helper;
# presumably it is model-agnostic. Confirm in ML_algorithms.
mean_accuracy_svm_bmi, confidence_interval_svm_bmi = ML_algorithms.k_fold_cross_validation_dt(svm, df_to_test)

Adding 10% inaccuracies to both the `bmi` and `avg_glucose_level` features

In [None]:
# Experiment: perturb avg_glucose_level and bmi together, then fit and
# evaluate both models.
print('ADDING 10% TO FEATURE: avg_glucose_level & bmi')
print('--------------------------------------')

# Chain the perturbations: each call receives the frame returned by the previous one.
df_modified = df
for perturbed_feature in ('avg_glucose_level', 'bmi'):
    df_modified = util_inaccuracy.modify_values(perturbed_feature, df_modified, 10)

print('\n DECISION TREE PREDICTIONS AND RESULTS')
print('---------------------------------------------------')
dt_outcome = ML_algorithms.model_dt(df_modified, df_to_test)
y_pred_prob_bmi_glucose_dt, y_test_bmi_glucose_dt, decision_tree = dt_outcome
dt_cv_outcome = ML_algorithms.k_fold_cross_validation_dt(decision_tree, df_to_test)
mean_accuracy_dt_glucose_bmi, confidence_interval_dt_glucose_bmi = dt_cv_outcome

print('\n PRINT SVM PREDICTIONS AND RESULTS')
print('---------------------------------------------------')
svm_outcome = ML_algorithms.model_svm(df_modified, df_to_test)
y_pred_prob_bmi_glucose_svm, y_test_bmi_glucose_svm, svm = svm_outcome
# NOTE(review): the SVM is cross-validated with the `_dt`-named helper;
# presumably it is model-agnostic. Confirm in ML_algorithms.
svm_cv_outcome = ML_algorithms.k_fold_cross_validation_dt(svm, df_to_test)
mean_accuracy_svm_glucose_bmi, confidence_interval_svm_glucose_bmi = svm_cv_outcome

Adding 10% inaccuracies to all non-binary features (`work_type`, `age`, `bmi`, `avg_glucose_level`)

In [None]:
# Experiment: perturb every modifiable (non-binary) feature, then fit and
# evaluate both models.
print('ADDING 10% TO MODIFIABLE FEATURES')
print('--------------------------------------')

# Apply the perturbation exactly once per feature. avg_glucose_level was
# previously modified both before the loop and inside it, i.e. twice.
df_modified = df
for column in ['work_type', 'age', 'bmi', 'avg_glucose_level']:
    df_modified = util_inaccuracy.modify_values(column, df_modified, 10)

print('\n DECISION TREE PREDICTIONS AND RESULTS')
print('---------------------------------------------------')
y_pred_prob_dt, y_test_dt, decision_tree = ML_algorithms.model_dt(df_modified, df_to_test)
mean_accuracy_dt, confidence_interval_dt = ML_algorithms.k_fold_cross_validation_dt(decision_tree, df_to_test)

print('\n PRINT SVM PREDICTIONS AND RESULTS')
print('---------------------------------------------------')
y_pred_prob_svm, y_test_svm, svm = ML_algorithms.model_svm(df_modified, df_to_test)
# NOTE(review): the SVM is cross-validated with the `_dt`-named helper;
# presumably it is model-agnostic. Confirm in ML_algorithms.
mean_accuracy_svm, confidence_interval_svm = ML_algorithms.k_fold_cross_validation_dt(svm, df_to_test)

## Conclusion

### ROC CURVE - Decision Tree

In [None]:
# (probabilities variable, true-labels variable, legend label) per experiment.
# Not every experiment listed here is guaranteed to have been run in this
# session, so results are looked up by name and missing ones are reported and
# skipped instead of aborting the whole cell with a NameError.
# NOTE(review): the last experiment also perturbs avg_glucose_level, which its
# legend label does not mention.
roc_sources = [
    ('y_pred_prob_glucose_dt', 'y_test_glucose_dt', 'Avg_glucose_level'),
    ('y_pred_prob_bmi_dt', 'y_test_bmi_dt', 'BMI'),
    ('y_pred_prob_age_dt', 'y_test_age_dt', 'Age'),
    ('y_pred_prob_work_type_dt', 'y_test_work_type_dt', 'Work Type'),
    ('y_pred_prob_bmi_glucose_dt', 'y_test_bmi_glucose_dt', 'BMI AND Avg_Glucose_Level'),
    ('y_pred_prob_dt', 'y_test_dt', 'Work Type AND Age AND BMI'),
]

notebook_ns = globals()
roc_results = []
skipped_labels = []
for prob_name, truth_name, label in roc_sources:
    if prob_name in notebook_ns and truth_name in notebook_ns:
        roc_results.append((notebook_ns[prob_name], notebook_ns[truth_name], label))
    else:
        skipped_labels.append(label)

if skipped_labels:
    print('No decision tree ROC results available for: ' + ', '.join(skipped_labels))

# Plot ROC curves
ML_algorithms.plot_roc_curve_conclusion_with_results(roc_results)

### ROC CURVE - Support Vector Machine

In [None]:
# (probabilities variable, true-labels variable, legend label) per experiment.
# The last entry uses the SVM probabilities (it previously paired the decision
# tree probabilities `y_pred_prob_dt` with the SVM labels).
# Not every experiment listed here is guaranteed to have been run in this
# session, so results are looked up by name and missing ones are reported and
# skipped instead of aborting the whole cell with a NameError.
# NOTE(review): the last experiment also perturbs avg_glucose_level, which its
# legend label does not mention.
roc_sources = [
    ('y_pred_prob_glucose_svm', 'y_test_glucose_svm', 'Avg_glucose_level'),
    ('y_pred_prob_bmi_svm', 'y_test_bmi_svm', 'BMI'),
    ('y_pred_prob_age_svm', 'y_test_age_svm', 'Age'),
    ('y_pred_prob_work_type_svm', 'y_test_work_type_svm', 'Work Type'),
    ('y_pred_prob_bmi_glucose_svm', 'y_test_bmi_glucose_svm', 'BMI AND Avg_Glucose_Level'),
    ('y_pred_prob_svm', 'y_test_svm', 'Work Type AND Age AND BMI'),
]

notebook_ns = globals()
roc_results = []
skipped_labels = []
for prob_name, truth_name, label in roc_sources:
    if prob_name in notebook_ns and truth_name in notebook_ns:
        roc_results.append((notebook_ns[prob_name], notebook_ns[truth_name], label))
    else:
        skipped_labels.append(label)

if skipped_labels:
    print('No SVM ROC results available for: ' + ', '.join(skipped_labels))

# Plot ROC curves
ML_algorithms.plot_roc_curve_conclusion_with_results(roc_results)

### Confidence Interval - Decision Tree

In [None]:
# (label, mean-accuracy variable, confidence-interval variable) per experiment.
# Not every experiment listed here is guaranteed to have been run in this
# session, so results are looked up by name and missing ones are reported and
# skipped instead of aborting the whole cell with a NameError.
# NOTE(review): the last experiment also perturbs avg_glucose_level, which its
# label does not mention.
ci_sources = [
    ('Avg_Glucose_Level', 'mean_accuracy_dt_glucose', 'confidence_interval_dt_glucose'),
    ('BMI', 'mean_accuracy_dt_bmi', 'confidence_interval_dt_bmi'),
    ('Age', 'mean_accuracy_dt_age', 'confidence_interval_dt_age'),
    ('Work Type', 'mean_accuracy_dt_work_type', 'confidence_interval_dt_work_type'),
    ('BMI AND Avg_Glucose_Level', 'mean_accuracy_dt_glucose_bmi', 'confidence_interval_dt_glucose_bmi'),
    ('Work Type AND Age AND BMI', 'mean_accuracy_dt', 'confidence_interval_dt'),
]

notebook_ns = globals()
model_results = []
skipped_labels = []
for label, mean_name, interval_name in ci_sources:
    if mean_name in notebook_ns and interval_name in notebook_ns:
        model_results.append((label, notebook_ns[mean_name], notebook_ns[interval_name]))
    else:
        skipped_labels.append(label)

if skipped_labels:
    print('No decision tree cross-validation results available for: ' + ', '.join(skipped_labels))

ML_algorithms.plot_confidence_intervals(model_results)

### Confidence Interval - Support Vector Machine

In [None]:
# (label, mean-accuracy variable, confidence-interval variable) per experiment.
# Not every experiment listed here is guaranteed to have been run in this
# session, so results are looked up by name and missing ones are reported and
# skipped instead of aborting the whole cell with a NameError.
# NOTE(review): the last experiment also perturbs avg_glucose_level, which its
# label does not mention.
ci_sources = [
    ('Avg_Glucose_Level', 'mean_accuracy_svm_glucose', 'confidence_interval_svm_glucose'),
    ('BMI', 'mean_accuracy_svm_bmi', 'confidence_interval_svm_bmi'),
    ('Age', 'mean_accuracy_svm_age', 'confidence_interval_svm_age'),
    ('Work Type', 'mean_accuracy_svm_work_type', 'confidence_interval_svm_work_type'),
    ('BMI AND Avg_Glucose_Level', 'mean_accuracy_svm_glucose_bmi', 'confidence_interval_svm_glucose_bmi'),
    ('Work Type AND Age AND BMI', 'mean_accuracy_svm', 'confidence_interval_svm'),
]

notebook_ns = globals()
model_results = []
skipped_labels = []
for label, mean_name, interval_name in ci_sources:
    if mean_name in notebook_ns and interval_name in notebook_ns:
        model_results.append((label, notebook_ns[mean_name], notebook_ns[interval_name]))
    else:
        skipped_labels.append(label)

if skipped_labels:
    print('No SVM cross-validation results available for: ' + ', '.join(skipped_labels))

ML_algorithms.plot_confidence_intervals(model_results)