In [None]:
#importing library and packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import seaborn as sns

# **Objective**
> We will work with the **Diabetes Simple Diagnosis** dataset, which has information related to the patient's health as a way to predict the risk of diabetes. The objective is to **create a model that predicts if a patient has been diagnosed with diabetes or not**. 

In [None]:
#loading the Diabetes Simple Diagnosis CSV from the Kaggle input directory
df = pd.read_csv(
    '/kaggle/input/diabetes-simple-diagnosis/Diabetes Simple Diagnosis.csv'
)
#previewing the first rows (head() defaults to 5 rows)
df.head()

# **Variables dataset information**
* **Age**: Represents the age of the patient in years. Age can be a risk factor for diabetes, as the risk of diabetes increases with age.

* **Gender**: Indicates the gender of the patient, which can be a factor in the prediction of diabetes. Some studies suggest that women may have a different risk than men in developing diabetes.

* **Body Mass Index (BMI)**: BMI is a measure that uses a person's height and weight to determine whether they are in the normal weight, overweight, or obese category. A high BMI is associated with a higher risk of diabetes.

* **High Blood Pressure (High_BP)**: An indicator of whether or not a patient suffers from high blood pressure. High blood pressure is a significant risk factor for type 2 diabetes.

* **Fasting Blood Glucose (FBS)**: Represents the level of glucose in the blood after an overnight fast. Elevated fasting blood sugar levels may indicate a risk of diabetes or prediabetes.

* **HbA1c (HbA1c_level)**: A measurement of the average blood sugar level over the last 2-3 months. It is an important indicator for diabetes diagnosis and management.

* **Smoking**: Indicates whether the patient smokes or not. Smoking can be an additional risk factor for type 2 diabetes.

* **Diagnosis**: Indicates whether the patient has been diagnosed with diabetes; this is the target variable.

In [None]:
#checking dataset information (pandas' per-column summary of the dataframe)
df.info()

# **Exploratory analysis**
> In this step we will check general information about the dataset: the
  types of variables, null values, size of the dataset, cardinality of the data, etc.

In [None]:
#renaming the first column ('Unnamed: 0') to 'Id'; df is rebound to the renamed frame
df = df.rename(columns={'Unnamed: 0': 'Id'})
df.head()

In [None]:
#checking the dtype of each column
df.dtypes

In [None]:
#counting missing values per column (isna is the pandas alias of isnull)
df.isna().sum()

In [None]:
#checking dataset shape as (rows, columns)
df.shape

In [None]:
#checking dataset cardinality: number of distinct values per column, highest first
df.nunique().sort_values(ascending=False)

In [None]:
#listing the columns whose dtype is 'object' (candidates for categorical encoding)
df.columns[df.dtypes == 'object']

In [None]:
#counting how many rows fall in each Gender category
df['Gender'].value_counts()

# **Data pre processing**
> At this stage, we will encode the categorical variables and scale the numerical variables. After that, we define which columns will be the explanatory variables and which will be the target variable.

In [None]:
#one-hot encoding the Gender column into integer indicator columns
df_gender = pd.get_dummies(df['Gender'], dtype='int')
df_gender.head()

In [None]:
#concatenating the one-hot Gender columns onto the main dataframe.
#Any indicator columns left over from a previous run of this cell are dropped
#first, so re-running the cell does not append duplicate columns.
df = pd.concat(
    [df.drop(columns=df_gender.columns, errors='ignore'), df_gender],
    axis=1,
)
df.head(5)

In [None]:
#sanity check: counting each (Gender, Female, Male, Other) combination present in the data
df.value_counts(subset=['Gender', 'Female', 'Male', 'Other'])

In [None]:
#creating the scaler and fitting it on the four numeric columns
#NOTE(review): the scaler is fitted on the full dataframe, before the
#train/validation/test split performed later in the notebook, so held-out rows
#influence the scaling statistics. Consider fitting on the training split only.
transformer = RobustScaler().fit(df[['Age','BMI','FBS','HbA1c_level']])

In [None]:
#applying the fitted scaler and storing the results in new *_scaler columns;
#the original unscaled columns are kept alongside them
df[['Age_scaler','BMI_scaler','FBS_scaler','HbA1c_level_scaler']] = transformer.transform(df[['Age','BMI','FBS','HbA1c_level']])

In [None]:
#previewing the dataframe with the encoded and scaled columns added
df.head()

In [None]:
#building the feature matrix: drop the identifier, the raw (unencoded / unscaled)
#columns and the target, keeping only the model inputs
X = df.drop(columns=['Id', 'Gender', 'Age', 'BMI', 'FBS', 'HbA1c_level', 'Diagnosis'])
X.head()

In [None]:
#selecting the target (response) column
y = df['Diagnosis']
y.head()

# **Training model**
> In this stage, we begin developing the model. First, we split our dataset into training, validation and test sets. Then we create our classifiers and train them on the training data. After that, we evaluate them on the validation data.

In [None]:
#separating the dataset in training and test (80% / 20%), with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
#separating the training portion again into training and validation:
#20% of the remaining 80%, i.e. roughly 64% train / 16% validation / 20% test overall
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [None]:
#creating the classifiers: decision tree, k-nearest neighbours (k=3) and logistic regression
#(seeded where the estimator accepts a random_state)
clf_dt = tree.DecisionTreeClassifier(random_state=42)
clf_knn = KNeighborsClassifier(n_neighbors=3)
clf_lr = LogisticRegression(random_state=42, max_iter=1000)

In [None]:
#fitting each classifier on the training data
#NOTE(review): assuming scikit-learn's fit() returns the estimator itself, the
#*_temp names refer to the same objects as clf_dt / clf_knn / clf_lr - confirm
clf_dt_temp = clf_dt.fit(X_train, y_train)
clf_knn_temp = clf_knn.fit(X_train, y_train)
clf_lr_temp = clf_lr.fit(X_train, y_train)

In [None]:
#predicting the validation data with each fitted classifier
y_pred_dt = clf_dt_temp.predict(X_val)
y_pred_knn = clf_knn_temp.predict(X_val)
y_pred_lr = clf_lr_temp.predict(X_val)

In [None]:
#checking the accuracy of predictions on the validation set, one score per model
accuracy_score_dt = accuracy_score(y_val, y_pred_dt)
accuracy_score_knn = accuracy_score(y_val, y_pred_knn)
accuracy_score_lr = accuracy_score(y_val, y_pred_lr)

#displaying the three scores in the order: decision tree, KNN, logistic regression
accuracy_score_dt, accuracy_score_knn, accuracy_score_lr

# **Analyzing results**
> We validated the 3 models and the one that showed the best results was KNN, with an accuracy of 95.5%. The others were not so far from this result.

In [None]:
#collecting the baseline validation accuracies in a results table, one row per model
df_result = pd.DataFrame(
    [
        ('DecisionTree', accuracy_score_dt),
        ('KNN', accuracy_score_knn),
        ('LogisticRegression', accuracy_score_lr),
    ],
    columns=['Models', 'Initial'],
)
df_result

# **New exploratory analysis**
> Although our result was not bad, we will try to improve it. We will analyze our dataset again and see if we can make any changes that improve the performance of our models.

In [None]:
#previewing the current state of the dataframe
df.head()

In [None]:
#analyzing the relation between the age variable and the target variable:
#per age, the sum, count and mean of Diagnosis
#(assuming Diagnosis is coded 0/1, these are the number of positive cases,
#the number of patients and the share of positive cases - confirm)
df.groupby('Age')['Diagnosis'].agg(['sum','count','mean']).reset_index()

In [None]:
#checking the pairwise correlation between Age and Diagnosis
df[['Age','Diagnosis']].corr()

* children: up to 12
* teenagers: 13 to 19
* young adult: 20 to 29
* adult: 30 to 59
* young elderly: 60 to 80
* elderly: over 80 (up to 110)


In [None]:
# condition to create categories according to age
def condition(x):
    """Map an age in years to an ordinal age-range code.

    Codes:
        0 - children       (x <= 12)
        1 - teenager       (12 < x <= 19)
        2 - young adult    (19 < x < 30)
        3 - adult          (30 <= x < 60)
        4 - young elderly  (60 <= x <= 80)
        5 - elderly        (x > 80; also any value for which every
                            comparison is False, such as NaN)

    The bins are contiguous: each check only runs when all previous ones
    failed, so one upper bound per branch is enough and no age can fall
    between two ranges (previously ages 30 and 60 matched no branch and
    were labelled as elderly).
    """
    if x <= 12:
        return 0  # children
    if x <= 19:
        return 1  # teenager
    if x < 30:
        return 2  # young adult
    if x < 60:
        return 3  # adult
    if x <= 80:
        return 4  # young elderly
    return 5  # elderly

In [None]:
#applying the condition to derive the ordinal Age_range feature from Age
df['Age_range'] = df['Age'].apply(condition)
#previewing only the first rows, as in the other cells, instead of rendering the whole dataframe
df.head(5)

In [None]:
#rebuilding the feature matrix: Age_scaler is dropped too, so Age_range is the only age feature
X = df.drop(columns=['Id', 'Gender', 'Age', 'Age_scaler', 'BMI', 'FBS', 'HbA1c_level', 'Diagnosis'])

#target (response) column
y = df['Diagnosis']

X.head()

# **Training model again**
> We will follow the same steps as before, but with the new changes in our dataset: split the dataset, train the models, validate and analyze the results.

In [None]:

#splitting: first hold out 20% as the test set, then 20% of the remainder as validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

#refitting the three classifiers (decision tree, KNN, logistic regression) on the new training data
clf_dt_temp, clf_knn_temp, clf_lr_temp = (
    model.fit(X_train, y_train) for model in (clf_dt, clf_knn, clf_lr)
)

#predicting the validation data with each fitted classifier
y_pred_dt, y_pred_knn, y_pred_lr = (
    model.predict(X_val) for model in (clf_dt_temp, clf_knn_temp, clf_lr_temp)
)

#scoring each set of predictions against the validation labels
accuracy_score_dt, accuracy_score_knn, accuracy_score_lr = (
    accuracy_score(y_val, preds) for preds in (y_pred_dt, y_pred_knn, y_pred_lr)
)

print(accuracy_score_dt, accuracy_score_knn, accuracy_score_lr)
    

# **Analyzing results**
> We had a small worsening in the KNN and Logistic Regression results, and an improvement in the Decision Tree results. Therefore, the new changes were not very effective for our overall result.

In [None]:
#adding the validation accuracies of the Age_range experiment as a new column
#(row order matches the 'Models' column: decision tree, KNN, logistic regression)
df_result['Age_range'] = [accuracy_score_dt, accuracy_score_knn, accuracy_score_lr]
df_result

# **New exploratory analysis**
> Again, we will analyze our dataset to identify opportunities for improvement.

In [None]:
#computing the pairwise correlation of the main variables and the target
corr = df[['BMI', 'High_BP', 'FBS', 'HbA1c_level', 'Smoking', 'Diagnosis']].corr()

#drawing the annotated heatmap on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 5))
sns.heatmap(corr, annot=True, fmt='.2f', ax=ax)
plt.show()

In [None]:
#analyzing the relation between BMI and the target variable:
#per BMI value, the sum, count and mean of Diagnosis
df.groupby('BMI')['Diagnosis'].agg(['sum','count','mean']).reset_index()

In [None]:
# binary flag derived from the BMI value
def condition_BMI(x):
    """Return 1 when the BMI value ``x`` is 30 or higher, otherwise 0."""
    return 1 if x >= 30 else 0

In [None]:
#applying the condition to derive the binary BMI_range feature from BMI
df['BMI_range'] = df['BMI'].apply(condition_BMI)
#previewing only the first rows, as in the other cells, instead of rendering the whole dataframe
df.head(5)

In [None]:
#rebuilding the feature matrix: the scaled Age and BMI columns are dropped along with the raw ones
X = df.drop(columns=['Id', 'Gender', 'Age', 'Age_scaler', 'BMI', 'BMI_scaler', 'FBS', 'HbA1c_level', 'Diagnosis'])
#target (response) column
y = df['Diagnosis']
X.head()

# **Training model again**
> And again we will split our dataset, train the model, validate and analyze the results.

In [None]:
#splitting: first hold out 20% as the test set, then 20% of the remainder as validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

#refitting the three classifiers (decision tree, KNN, logistic regression) on the new training data
clf_dt_temp, clf_knn_temp, clf_lr_temp = (
    estimator.fit(X_train, y_train) for estimator in (clf_dt, clf_knn, clf_lr)
)

#predicting the validation data with each fitted classifier
y_pred_dt, y_pred_knn, y_pred_lr = (
    estimator.predict(X_val) for estimator in (clf_dt_temp, clf_knn_temp, clf_lr_temp)
)

#scoring each set of predictions against the validation labels
accuracy_score_dt, accuracy_score_knn, accuracy_score_lr = (
    accuracy_score(y_val, predicted) for predicted in (y_pred_dt, y_pred_knn, y_pred_lr)
)

print(accuracy_score_dt, accuracy_score_knn, accuracy_score_lr)

# **Analyzing results**
> In this last change, we had a considerable improvement in the Decision Tree, with the highest value found so far of 96%. I believe we can proceed to do our final test by applying this model to our test dataset.

In [None]:
#adding the validation accuracies of the BMI_range experiment as a new column
#(row order matches the 'Models' column: decision tree, KNN, logistic regression)
df_result['BMI_morethan30'] = [accuracy_score_dt, accuracy_score_knn, accuracy_score_lr]
df_result

# **Final results**
> Applying the decision tree model to our test dataset

In [None]:
#predicting the test data (Decision Tree model)
#clf_dt_temp is the tree as last fitted in the preceding training cell, and
#X_test comes from that same split, so the feature columns match
y_pred = clf_dt_temp.predict(X_test)

In [None]:
#checking the accuracy of the decision tree predictions on the held-out test set
accuracy_score_y = accuracy_score(y_test, y_pred)
accuracy_score_y