In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.pandas
pd.options.plotting.backend = 'holoviews'
from tqdm.notebook import tqdm_notebook
from sklearn.utils import all_estimators
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier, ExtraTreesRegressor, StackingClassifier, VotingClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedShuffleSplit, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.utils import all_estimators
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import warnings
import os
import sys
# Unless warning options were already supplied to the interpreter, silence
# all warnings in this kernel and export PYTHONWARNINGS so Python
# subprocesses inheriting the environment are silenced too.
if len(sys.warnoptions) == 0:
    os.environ["PYTHONWARNINGS"] = "ignore"
    warnings.simplefilter("ignore")
import panel as pn
import io
import holoviews as hv
from holoviews import opts
from bokeh.io import output_notebook
import pickle
from io import BytesIO

In [None]:
# Relative path of the diabetes CSV that the whole report is built from.
dataset = 'data/diabetes.csv'

In [None]:
# Read the dataset into a DataFrame; `df` is used by every chart and model below.
df = pd.read_csv(dataset)

In [None]:
# Previously saved confusion matrix, plotted in the "Predictive Analysis
# Results" slide. Note: `df_cm` is reassigned later, when the voting model is
# refitted for the technical report.
df_cm = pd.read_csv('data/confusion_matrix.csv')

# Diabetes: Causes and Diagnosis #

**Created by: Adel Ahmed**

**Jan 2022**

### OUTLINE ###
1. Executive Summary
2. Introduction
3. Methodology
4. Results
    - Visualization – Charts
    - Dashboard
5. Discussion
6. Findings & Implications
7. Conclusion
8. Appendix

##### Section 1 #####

### EXECUTIVE SUMMARY ###
- Identify insights into the factors that cause diabetes in patients
- Explore the ability to detect diabetes in patients with machine learning

##### Section 2 #####

### INTRODUCTION ###
This report is intended for the Bangkok Hospital Administration. Its goal is to identify the factors that cause diabetes in patients and to present an automated method of detecting diabetes, in order to help doctors reach a correct diagnosis quickly.

The information on the patients consists of several medical predictor variables and one target variable, Outcome. Predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on. It has a total of 768 cases and 9 variables.

| No. | Column | Description | Data Type | Category | Type |
| :-- | :-- | :-- | :-- | :-- | :-- |
|1 | Pregnancies | Number of times pregnant | Int | Discrete | Variable |
|2 | Glucose | Plasma glucose concentration at 2 hours in an oral glucose tolerance test | Int | Discrete | Variable |
|3 | Blood pressure | Diastolic blood pressure (mm Hg) | Int | Discrete | Variable |
|4 |Skin thickness | Triceps skin fold thickness (mm) | Int | Discrete | Variable |
|5 | Insulin | 2-Hour serum insulin (mu U/ml) | Int | Discrete | Variable |
|6 | BMI | Body mass index (weight in kg/(height in m)^2) | Float | Continuous | Variable |
|7 | DiabetesPedigreeFunction | Diabetes pedigree function| Float | Continuous | Variable |
|8 | Age | Age (years) | Int | Discrete | Variable |
|9 | Outcome | Class variable (0 or 1)| Int | Discrete | Target |

##### Section 3 #####

### METHODOLOGY ###
- Collecting Data from in-house database
- Examining all data to conduct the analysis
- Exploring the data through statistics and visualization
- Listing the findings and insights from the analysis
- Modeling the cases with Machine Learning to find the best predictor
- Recommendations for future improvements


##### Section 4 #####

### RESULTS ###

* Exploratory Data Analysis Results
* Predictive Analysis Results


In [None]:
# Select the Bokeh backend for the HoloViews/hvPlot charts in the EDA slides.
hv.extension("bokeh")

#### Exploratory Data Analysis Results ####

In [None]:
# Five-bin histogram of patient ages (drawn through the plotting backend
# configured in the imports cell).
df['Age'].hist(
    title='Patients by Age Groups',
    bins=5,
)

In [None]:
# Scatter plot of BMI against age, one point per patient record.
df.hvplot.scatter(x='Age', y='BMI', title='BMI vs. Age')

In [None]:
# Switch HoloViews to the Matplotlib backend for the heat map that follows.
hv.extension('matplotlib')

#### Predictive Analysis Results ####

In [None]:
# Confusion matrix as a heat map: columns on x, index on y, counts as cell
# values; the x-axis is flipped and the counts are printed on the tiles.
(
    hv.HeatMap((df_cm.columns, df_cm.index, df_cm.values))
    .opts(invert_xaxis=True, show_values=True)
)

The selected models are not accurate enough to deploy into production.

##### Section 5 #####

### DISCUSSION ###

There is a case with 17 pregnancies that needs to be investigated further, as it might be impossible to reach that number.

##### Section 6 #####

### Findings & Implications ###
1. Blood glucose level is the main factor in deciding whether a patient has diabetes; a healthy diet is needed to avoid this disease
2. The majority of the patients are less than 34 years of age, while diabetes mainly affects the elderly population, so this sample does not reflect reality
3. The selected classifiers do not perform well according to the accuracy score; the use of deep learning is suggested to yield better results

##### Section 7 #####

### CONCLUSION ###

The data is imbalanced with respect to age group and contains some anomalies, so it does not reflect the real situation. More balanced data needs to be acquired, and this analysis should then be conducted again.

##### Section 8 #####

### APPENDIX ###

No additional resources

# Thank you #
Looking forward to seeing you again

In [None]:
# Back to the Bokeh backend for the technical report.
hv.extension("bokeh")
# Empty Panel column; every following cell appends one row to it.
tech_report = pn.Column()

In [None]:
# Heading: main objective of the analysis.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            """
**Main Objective of the Analysis.**
""",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# One-sentence statement of the objective.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "Train classification models on the dataset and choosing the best model based on accuracy.",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Heading: dataset description.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "**Brief Description of the Dataset.**",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Dataset description plus the column dictionary, rendered as a Markdown table.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            """
The objective of the dataset is to diagnostically predict whether or not a patient has diabetes, 
The datasets consists of several medical predictor variables and one target variable, Outcome. Predictor variables includes the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.It has a total of 768 rows and 9 columns

| No. | Column | Description| Data Type | Category| Type
| :-- | :-- | :-- | :-- | :-- | :-- |
|1 | Pregnancies | Number of times pregnant | Int | Discrete | Variable |
|2 | Glucose | Plasma glucose concentration a 2 hours in an oral glucose tolerance test | Int | Discrete | Variable |
|3 | Blood pressure | Diastolic blood pressure (mm Hg) | Int | Discrete | Variable |
|4 |Skin thickness | Triceps skin fold thickness (mm) | Int | Discrete | Variable |
|5 | Insulin | 2-Hour serum insulin (mu U/ml) | Int | Discrete | Variable |
|6 | BMI | Body mass index (weight in kg/(height in m)^2) | Float | Continuous | Variable |
|7 | DiabetesPedigreeFunction | Diabetes pedigree function| Float | Continuous | Variable |
|8 | Age | Age (years) | Int | Discrete | Variable |
|9 | Outcome | Class variable (0 or 1)| Int | Discrete | Target |
""",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Heading: plan of the analysis.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "**Plan for Data Exploration, Feature Engineering and Modelling**",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Numbered outline of the sections that follow in the technical report.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            """
1. Packages to be installed
2. Load the libraries
3. Load the dataset
4. General information about the dataset
5. Exploratory Data Analysis (EDA)
6. Modeling
7. Recommendations
""",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Section heading: packages to install.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "## Packages to be installed ##",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# List of extra packages named by the report.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            """
1. tpot
2. auto-sklearn
3. scipy
""",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Section heading: libraries.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "## Load the libraries ##",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# List of libraries named by the report.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            """
1. numpy
2. pandas
3. matplotlib
4. hvplot
5. tqdm
6. sklearn
7. autosklearn
8. tpot
""",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Section heading: loading the dataset.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "## Load the dataset ##",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Shows the dataset-loading code as a fenced code block inside the report
# (the snippet is display text only; it is not executed here).
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            """
```python
# location of dataset
dataset = 'data/diabetes.csv'

# reading the dataset into dataframe
df = pd.read_csv(dataset)
```""",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Section heading: general information about the dataset.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "## General Information About the Dataset",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Caption for the data sample table.
tech_report.append(
    pn.Row(
        pn.pane.Markdown("Sampling the Data", sizing_mode='scale_width')
    )
)

In [None]:
# First rows of the dataset (df.head()), shown without the index column.
tech_report.append(
    pn.Row(
        pn.pane.DataFrame(
            df.head(),
            sizing_mode='scale_width',
            index=False,
        )
    )
)

In [None]:
# Caption for the df.info() summary.
tech_report.append(
    pn.Row(
        pn.pane.Markdown("Dataset Information", sizing_mode='scale_width')
    )
)

In [None]:
# df.info() writes to a stream instead of returning a string, so capture it
# in an in-memory buffer to embed the summary in the report.
buffer = io.StringIO()
df.info(buf=buffer)
s = buffer.getvalue()
# Show the text in a Str pane, which keeps it as pre-formatted plain text.
# The generic pn.pane.Pane resolves a plain string to a Markdown pane, which
# reflows the fixed-width column table produced by df.info().
tech_report.append(pn.Row(pn.pane.Str(s, sizing_mode='scale_width')))

In [None]:
# Heading: data wrangling and feature engineering actions.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "**Actions Taken for Data Wrangling and Feature Engineering**",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Names of all numeric columns of df, as a plain Python list.
num_cols = list(df.select_dtypes(include='number').columns)

In [None]:
# Min-max scale the predictors. The last column of df (the target) is
# excluded from both the feature frame and the list of columns to scale.
sc = MinMaxScaler()
# .copy() gives df2 its own data, so the column assignment below does not
# write into a slice of df (which triggers pandas' SettingWithCopyWarning
# and can behave differently across pandas versions).
df2 = df.iloc[:, :-1].copy()
num_features = num_cols[:-1]
df2[num_features] = sc.fit_transform(df2[num_features])
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so statistics of the test rows influence the scaling. Consider fitting on
# the training rows only.

In [None]:
# Feature matrix / target vector and a single stratified 80/20 split.
features = df2.columns
target = df.columns[-1]
X = df2
Y = df[target]
sss = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=0)
train_indecies = list(sss.split(X, y=Y))
# n_splits=1, so there is exactly one (train, test) pair of index arrays.
train_index, test_index = train_indecies[0]
# split() yields positional indices, so select rows with .iloc. Label-based
# .loc only gives the same rows while the frame keeps its default 0..n-1
# index and would silently pick wrong rows (or raise) otherwise.
X_train, X_test = X.iloc[train_index], X.iloc[test_index]
y_train, y_test = Y.iloc[train_index], Y.iloc[test_index]

In [None]:
# Bullet list of the wrangling steps claimed by the report.
# NOTE(review): no column renaming or encoding step is visible in this part
# of the notebook. Confirm the list matches what is actually done.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            """
- Camel-case for column names
- Classifying columns as Numerical or Categorical
- Features Encoding
- Split the data into test and train
""",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Section heading: exploratory data analysis.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "## Exploratory Data Analysis (EDA)",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Caption for the describe() table.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "Summary Statistics for Numerical columns",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Summary statistics table. pn.panel() is the supported way to let Panel pick
# the pane type for an object; pn.pane.Pane is deprecated in its favour.
tech_report.append(pn.Row(pn.panel(df.describe(), sizing_mode='scale_width')))

In [None]:
# Caption: visual exploration of the numerical columns.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "Visual Exploration of Numerical Columns",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Caption for the age histogram.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "Distribution of Age for patients",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Five-bin age histogram. pn.panel() replaces the deprecated pn.pane.Pane
# for wrapping the plot object.
tech_report.append(pn.Row(pn.panel(df['Age'].hist(bins=5), sizing_mode='scale_width')))

In [None]:
# Caption for the Age/BMI scatter plot.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "Correlation Between Age and BMI",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# BMI-vs-age scatter plot. pn.panel() replaces the deprecated pn.pane.Pane
# for wrapping the plot object.
tech_report.append(pn.Row(pn.panel(df.hvplot(x='Age', y='BMI', kind='scatter'), sizing_mode='scale_width')))

In [None]:
# Caption for the correlation heat map.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "Correlation Plot of Numerical Features",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Annotated lower-triangle heat map of the pairwise correlations of df's columns.
# A bare Figure (rather than plt.figure()) is not registered with pyplot; it
# is only rendered through the Panel Matplotlib pane below.
fig = plt.Figure()
ax = fig.add_subplot(111)
corr = df.corr()
# Explicit boolean mask hiding the upper triangle and the diagonal. Building
# the mask from the correlation values themselves (np.triu(corr)) only hides
# a cell when its coefficient is non-zero.
mask = np.triu(np.ones_like(corr, dtype=bool))
s = sns.heatmap(corr, ax=ax, mask=mask, cmap='Wistia', cbar=False, center=0,
                square=True, linewidths=.3, annot=True, annot_kws={"fontsize": 5})
# Small tick labels so all column names fit; x labels rotated to vertical.
s.set_yticklabels(s.get_yticklabels(), fontsize=5)
s.set_xticklabels(s.get_xticklabels(), rotation=90, fontsize=5)
tech_report.append(pn.Row(pn.pane.Matplotlib(fig, sizing_mode='scale_width')))

In [None]:
# Caption: visual exploration of the categorical columns.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "Visual Exploration of Categorical Columns",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# pn.pane.Pane(df.hvplot(x='Age', y='BMI', kind='scatter')).servable()

In [None]:
# Caption for the feature-importance chart.
tech_report.append(
    pn.Row(
        pn.pane.Markdown("Feature Importance", sizing_mode='scale_width')
    )
)

In [None]:
# Rank the features by the impurity-based importances of a small
# extra-trees ensemble fitted on all rows.
# random_state pins the randomised tree construction so the ranking shown in
# the report is the same on every run.
# NOTE(review): a regressor is fitted on what the report describes as a 0/1
# class target. Confirm this is intended rather than the classifier variant.
fe = ExtraTreesRegressor(n_estimators=10, random_state=0)
fe.fit(X, Y)
fedf = (
    pd.DataFrame({'Feature': features, 'Feature_importance %': fe.feature_importances_ * 100})
    .sort_values(by=['Feature_importance %'], ascending=False)
)
# pn.panel() replaces the deprecated pn.pane.Pane for wrapping the plot.
tech_report.append(pn.Row(pn.panel(fedf.hvplot.bar(x='Feature', y='Feature_importance %', rot=20), sizing_mode='scale_width')))

In [None]:
# Section heading: modeling.
tech_report.append(
    pn.Row(
        pn.pane.Markdown("## Modeling ##", sizing_mode='scale_width')
    )
)

In [None]:
# Markdown table summarising the modelling task set-up.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            """Modelling  Task Details

| No. | Description | Category | Type |
| :-- | :-- | :-- | :-- |
|1 | Data Characteristics | Structured | Categorical |
|2 | Machine Learning Type | Classification |  Binary-Class |
|3 | Relevant ML and DL models | Machine Learning | Multiple Algorithm |
|4 | Technical Metrics | Accuracy | Percentage (the higher the better) |
|5 | Hyperparameter Optimization Techniques | Grid Search | Specified Dictionary |
|6 | Computation Method | On-primse | CPU |
|7 | Additional Features | Ensemble Modeling | Voting and Stacking |
""",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Introductory sentence for the classifier comparison.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            """
We will be using all the classifiers in the Sikit-Learn library and the comparing them according to accuracy
""",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Collect a default-constructed instance of every scikit-learn classifier
# and combine them into a voting and a stacking ensemble.
estimators = all_estimators(type_filter='classifier')

# Classifiers deliberately left out of the ensembles. The original test
# `name != 'CategoricalNB' or name != 'MLPClassifier'` is true for every
# name, so it never excluded anything.
excluded_clfs = {'CategoricalNB', 'MLPClassifier'}

all_clfs = []
skipped_clfs = []  # (name, error) for classes that cannot be built without arguments
for name, ClassifierClass in estimators:
    if name in excluded_clfs:
        continue
    try:
        clf = ClassifierClass()
    except Exception as e:
        # Best effort: some classes (e.g. meta-estimators) need constructor
        # arguments. Record them instead of dropping them silently.
        skipped_clfs.append((name, repr(e)))
        continue
    all_clfs.append((name, clf))

voting_model = VotingClassifier(estimators=all_clfs, n_jobs=2)

stack_model = StackingClassifier(estimators=all_clfs, final_estimator=LogisticRegression(), cv=3, n_jobs=2)

In [None]:
# Compare Algorithms
results_1_df = pd.read_csv('data/classification_results_1.csv')
results_1_df = results_1_df.sort_values(by=['Accuracy'], ascending=True)
tech_report.append(pn.Row(pn.pane.Pane(results_1_df.hvplot.bar(x='Classifier', y='Accuracy', invert=True, height=500))))

In [None]:
# Heading: recommendation of the final model (no sizing_mode on this pane).
tech_report.append(
    pn.Row(
        pn.pane.Markdown("**Recommendation of final model**")
    )
)

In [None]:
# NOTE(review): this text is identical to the sentence shown under the
# "Modeling" heading and says nothing about the recommended model. It looks
# like a copy-paste placeholder; confirm the intended wording.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            """
We will be using all the classifiers in the Sikit-Learn library and the comparing them according to accuracy
""",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Headline accuracy, taken from the stored results table instead of being
# recomputed from a fresh fit.
# NOTE(review): this is the highest accuracy in results_1_df, labelled as the
# Voting Classifier's. That assumes the best row is the Voting Classifier;
# confirm against the CSV.
results_1_best = results_1_df['Accuracy'].max()
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            f"Voting Classifier predictions accuracy: {results_1_best * 100 :.2f}%",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Caption for the confusion-matrix plot.
tech_report.append(
    pn.Row(
        pn.pane.Markdown("Confusion Matrix Plot", sizing_mode='scale_width')
    )
)

In [None]:
# Fit the voting ensemble on the training split and predict the held-out rows.
voting_model.fit(X_train, y_train)
# (Pickling the fitted model to 'voting_model.pkl' is disabled here.)
y_test_pred = voting_model.predict(X_test)

# Confusion matrix of true vs. predicted labels as a labelled frame. This
# replaces the df_cm that was read from data/confusion_matrix.csv at the top
# of the notebook. (Writing it back to 'confusion_matrix.csv' is disabled.)
class_labels = ['no_diabeties', 'with_diabeties']
array = confusion_matrix(y_test, y_test_pred)
df_cm = pd.DataFrame(array, index=class_labels, columns=class_labels)

# The heat map is drawn with the Matplotlib backend.
hv.extension('matplotlib')
confusion_heatmap = hv.HeatMap((df_cm.columns, df_cm.index, df_cm.values)).opts(
    invert_xaxis=True, show_values=True
)
tech_report.append(pn.Row(confusion_heatmap))

In [None]:
# Return to the Bokeh backend, then add the key-findings heading.
hv.extension('bokeh')
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            "**Summary of Key Findings and Insights.**",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Numbered list of key findings (static text; the 21% figure is hard-coded,
# not derived from the confusion matrix computed in this notebook).
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            """
1. Glucose level in the blood is main factor for the diagnosis of diabities
2. The Voting Classifier gave the best results for the training set
3. 21% of the pateinets got misclassified by the Voting Classifier, better Classifier should be seeked
""",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Section heading: recommendations.
tech_report.append(
    pn.Row(
        pn.pane.Markdown("## Recommendations ##", sizing_mode='scale_width')
    )
)

In [None]:
# Recommendation text introducing the AutoML comparison.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            """
Using automated machine learning yield better results than manual or gridseached models

for this dataset will use auto-sklearn and TPOT and compared thier results to results obtained before
""",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Second results table: horizontal bar chart of the stored accuracy per
# classifier, sorted ascending by accuracy.
results_2_df = (
    pd.read_csv('data/classification_results_2.csv')
    .sort_values(by=['Accuracy'], ascending=True)
)
# pn.panel() replaces the deprecated pn.pane.Pane for wrapping the plot.
tech_report.append(pn.Row(pn.panel(results_2_df.hvplot.bar(x='Classifier', y='Accuracy', invert=True, height=500))))

In [None]:
# NOTE(review): this is the highest accuracy in results_2_df, labelled as
# AutoSklearn's. That assumes the best row is AutoSklearn; confirm against
# the CSV.
results_2_best = results_2_df['Accuracy'].max()
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            f"AutoSklearn predictions accuracy: {results_2_best * 100 :.2f}%",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Closing remark of the recommendations section.
tech_report.append(
    pn.Row(
        pn.pane.Markdown(
            """
The default configuration for automated machine learning give better results than the model that manually selected and modified
""",
            sizing_mode='scale_width',
        )
    )
)

In [None]:
# Embed the exported slide deck in an iframe for the "Summary Report" tab.
# HTML attributes are separated by whitespace only. The commas in the
# original markup were invalid and ended up as part of the attribute values.
html = "<iframe src='assets/Diabeties_Analysis.slides.html' width='1330' height='530'></iframe>"
html_pane = pn.pane.HTML(html, sizing_mode='scale_width')

In [None]:
# Empty Panel column that collects the download buttons of the "Data" tab.
data_files = pn.Column()

In [None]:
# Download buttons for the artefacts behind the report.
# `filename` is the name the browser saves the download under, so it carries
# the real file name and extension. The human-readable text goes into `label`
# (same wording as the previous default "Download <filename>" button text).
data_files.append(pn.widgets.FileDownload(
    file='data/diabetes.csv',
    filename='diabetes.csv',
    label='Download Report Data - CSV'))
data_files.append(pn.widgets.FileDownload(
    file='data/classification_results_1.csv',
    filename='classification_results_1.csv',
    label='Download Trained Models Results - CSV'))
data_files.append(pn.widgets.FileDownload(
    file='data/voting_model.pkl',
    filename='voting_model.pkl',
    label='Download Winning Model(pickle)'))
data_files.append(pn.widgets.FileDownload(
    file='data/confusion_matrix.csv',
    filename='confusion_matrix.csv',
    label='Download Winning Model Accuracy - CSV'))

In [None]:
# Top-level application: one tab per deliverable. 'Dashboard' and 'Code' are
# still placeholders ('TBD').
report_tabs = [
    ('Summary Report', html_pane),
    ('Dashboard', 'TBD'),
    ('Technical Report', tech_report),
    ('Code', 'TBD'),
    ('Data', data_files),
]
layout = pn.Tabs(*report_tabs, sizing_mode='scale_width')
# Mark the layout as the content to render when the notebook is served by Panel.
layout.servable()