![](img1.png)

# Problem Definition

**Dataset:**

This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. All patients here are females at least 21 years old of Pima Indian heritage.

![](img2.png)


**Aim:**

The objective is to predict, from the diagnostic measurements included in the dataset, whether or not a person has diabetes.


**Variables:**

The dataset consists of several medical predictor variables and one target variable, Outcome. Features include the number of pregnancies each person has had, BMI, insulin level, age, and so on.

**Pregnancies:** Number of pregnancies 

**Glucose:** Plasma glucose concentration at 2 hours in an oral glucose tolerance test  

**BloodPressure:** Diastolic blood pressure (mm Hg)   

**SkinThickness:** Triceps skinfold thickness (mm)  

**Insulin:** 2-Hour serum insulin (mu U/ml)  

**BMI:** Body mass index (weight in kg/(height in m)^2)  

**DiabetesPedigreeFunction:** Diabetes pedigree function  

**Age:** Age (years)  

**Outcome:** Class variable (0 = no diabetes, 1 = diabetes)

In [None]:
# Import libraries

# data analysis libraries:
import numpy as np
import pandas as pd 

# data visualization libraries:
import seaborn as sns
import matplotlib.pyplot as plt

# ML libraries:
from sklearn.preprocessing import scale 
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

from warnings import filterwarnings
filterwarnings('ignore')

# Exploratory Data Analysis

## Loading Data and Quick Look

In [None]:
# Load data

df = pd.read_csv("diabetes.csv")
df.head()

In [None]:
df.shape

In [None]:
df.describe().T

## Target Analysis

In [None]:
df["Outcome"].value_counts()

In [None]:
df["Outcome"].value_counts() / len(df) * 100

In [None]:
sns.countplot(x="Outcome", data=df);

## Analysis of Features

In [None]:
df["BloodPressure"].hist(bins=20)
plt.xlabel("BloodPressure")
plt.show()

In [None]:
def plot_numerical_col(dataframe, numerical_col):
    """Show a 20-bin histogram of a single column.

    Args:
        dataframe: DataFrame that contains the column.
        numerical_col: Label of the column to plot; also used as the x-axis label.
    """
    series = dataframe[numerical_col]
    series.hist(bins=20)
    plt.xlabel(numerical_col)
    plt.show()

In [None]:
# Treat every column with more than two distinct values as numerical; a column
# with only two distinct values (e.g. a 0/1 flag) is left out by this rule.
cols = [col for col in df.columns if df[col].nunique() > 2]

# Draw one histogram per selected column.
for col in cols:
    plot_numerical_col(df, col)

## Correlation Heatmap

In [None]:
def correlation_matrix(df, cols):
    """Draw an annotated heatmap of the pairwise correlations between ``cols``.

    Args:
        df: DataFrame holding the columns to correlate.
        cols: Column labels to include in the correlation matrix.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    # Size the current figure before drawing onto it.
    fig = plt.gcf()
    fig.set_size_inches(10, 8)
    plt.xticks(fontsize=10)
    plt.yticks(fontsize=10)
    # The heatmap is drawn for its side effect only, so its return value is
    # not bound to a name (in particular not to `fig`, the figure handle).
    sns.heatmap(
        df[cols].corr(),
        annot=True,
        linewidths=0.5,
        annot_kws={'size': 12},
        linecolor='w',
        cmap='RdBu',
    )
    plt.show()

In [None]:
correlation_matrix(df, cols)

## Target vs Features

In [None]:
df.groupby("Outcome").agg({"BloodPressure": "mean"})

In [None]:
def target_summary_with_num(dataframe, target, numerical_col):
    """Print the mean of ``numerical_col`` for each distinct value of ``target``.

    Args:
        dataframe: DataFrame containing both columns.
        target: Label of the column to group by.
        numerical_col: Label of the column whose per-group mean is printed.

    The table is followed by three newlines so consecutive summaries are
    visually separated.
    """
    grouped = dataframe.groupby(target)
    per_class_mean = grouped.agg({numerical_col: "mean"})
    print(per_class_mean, end="\n\n\n")

In [None]:
for col in cols:
    target_summary_with_num(df, "Outcome", col)

![](img3.png)

# ML without Feature Engineering & Data Preprocessing

In [None]:
y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

## Logistic Regression

In [None]:
# Baseline: logistic regression with default hyperparameters on the raw features.
# NOTE(review): warnings are silenced globally via filterwarnings('ignore'), so
# a solver convergence warning (if any) would not be visible here — confirm.
log_model = LogisticRegression().fit(X_train, y_train)
y_pred = log_model.predict(X_test)
# Share of test rows whose predicted class matches the true class.
acc_1 = accuracy_score(y_test, y_pred)
print(acc_1)

## CART

In [None]:
cart_model = DecisionTreeClassifier(random_state=46).fit(X_train, y_train)
y_pred = cart_model.predict(X_test)
acc_2 = accuracy_score(y_test, y_pred)
print(acc_2)

## Random Forests

In [None]:
# Baseline: random forest with a fixed seed on the raw features.
rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
# accuracy_score takes (y_true, y_pred); keep the same argument order as the
# other model cells so the call stays correct if the metric is ever swapped
# for an asymmetric one.
acc_3 = accuracy_score(y_test, y_pred)
print(acc_3)

# Data Preprocessing

## Missing values

In [None]:
df.isnull().sum()

## Outliers

In [None]:
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    """Compute the lower and upper outlier fences for one column.

    The fences sit 1.5 times the inter-quantile range below the ``q1``
    quantile and above the ``q3`` quantile.

    Args:
        dataframe: DataFrame containing the column.
        col_name: Label of the column to analyse.
        q1: Lower quantile, as a fraction (default 0.25).
        q3: Upper quantile, as a fraction (default 0.75).

    Returns:
        A ``(low_limit, up_limit)`` tuple.
    """
    lower_q, upper_q = (dataframe[col_name].quantile(q) for q in (q1, q3))
    margin = 1.5 * (upper_q - lower_q)
    return lower_q - margin, upper_q + margin

In [None]:
def check_outlier(dataframe, col_name):
    """Report whether ``col_name`` holds any value outside its outlier fences.

    Args:
        dataframe: DataFrame containing the column.
        col_name: Label of the column to check.

    Returns:
        bool: True if at least one value lies below the lower limit or above
        the upper limit returned by ``outlier_thresholds``, otherwise False.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    outside = (column > up_limit) | (column < low_limit)
    # Test the mask itself. Selecting the flagged rows and calling .any() on
    # them would test the truthiness of their cell values instead, so a
    # flagged row made up entirely of zeros would be reported as "no outlier".
    return bool(outside.any())

In [None]:
def replace_with_thresholds(dataframe, variable):
    """Cap one column at its outlier fences, modifying ``dataframe`` in place.

    Values below the lower limit are set to the lower limit and values above
    the upper limit are set to the upper limit, using the limits returned by
    ``outlier_thresholds``.

    Args:
        dataframe: DataFrame to modify.
        variable: Label of the column to cap.
    """
    floor, ceiling = outlier_thresholds(dataframe, variable)

    too_low = dataframe[variable] < floor
    dataframe.loc[too_low, variable] = floor

    too_high = dataframe[variable] > ceiling
    dataframe.loc[too_high, variable] = ceiling

In [None]:
for col in cols:
    print(col, check_outlier(df, col))

In [None]:
for col in cols:
    replace_with_thresholds(df, col)

In [None]:
for col in cols:
    print(col, check_outlier(df, col))

# Feature Engineering

In [None]:
# Glucose: bucket into two classes.
# pd.cut bins are right-closed by default, so the classes are
# (0, 139] -> "Normal" and (139, 200] -> "Prediabetes"; any value outside
# (0, 200] (including exactly 0) becomes NaN.

df['New_Glucose_Class'] = pd.cut(x=df['Glucose'], bins=[0,139,200],labels = ["Normal","Prediabetes"])

In [None]:
# Age: three bands — under 35 is 'Young', 35 to 55 inclusive is 'Middleage',
# over 55 is 'Old'. Rows matching none of the conditions keep a missing value.

df.loc[(df['Age'] < 35), "NEW_Age_CAT"] = 'Young'
df.loc[(df['Age'] >=35) & (df['Age'] <= 55), "NEW_Age_CAT"] = 'Middleage'
df.loc[(df['Age'] > 55) , "NEW_Age_CAT"] = 'Old'

In [None]:
# BMI: bucket into four ranges.
# pd.cut bins are right-closed by default: (0, 18.5], (18.5, 24.9],
# (24.9, 29.9] and (29.9, 100], labelled in that order; values outside
# (0, 100] (including exactly 0) become NaN.

df['New_BMI_Range'] = pd.cut(x=df['BMI'], bins=[0,18.5,24.9,29.9,100],labels = ["Underweight","Healty","Overweight","Obese"])

In [None]:
# BloodPressure: bucket into three levels.
# pd.cut bins are right-closed by default: (0, 79] -> "Normal",
# (79, 89] -> "HS1", (89, 123] -> "HS2"; values outside (0, 123]
# (including exactly 0) become NaN.

df['New_BloodPressure'] = pd.cut(x=df['BloodPressure'], bins=[0,79,89,123],labels = ["Normal","HS1","HS2"])

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Collect the labels of all object- and category-typed columns as a list.
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each so the remaining dummy columns are not redundant.
df = pd.get_dummies(df, columns=cat_cols, drop_first=True)

In [None]:
df.head()

# ML with Feature Engineering

In [None]:
y = df["Outcome"]
X = df.drop(["Outcome"], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

## Logistic Regression

In [None]:
# Logistic regression with default hyperparameters on the engineered features.
# NOTE(review): warnings are silenced globally via filterwarnings('ignore'), so
# a solver convergence warning (if any) would not be visible here — confirm.
log_model = LogisticRegression().fit(X_train, y_train)
y_pred = log_model.predict(X_test)
# Share of test rows whose predicted class matches the true class.
acc_4 = accuracy_score(y_test, y_pred)
print(acc_4)

## CART

In [None]:
cart_model = DecisionTreeClassifier(random_state=46).fit(X_train, y_train)
y_pred = cart_model.predict(X_test)
acc_5 = accuracy_score(y_test, y_pred)
print(acc_5)

## Random Forests

In [None]:
rf_model = RandomForestClassifier(random_state=46).fit(X_train, y_train)
y_pred = rf_model.predict(X_test)
acc_6 = accuracy_score(y_test, y_pred)
print(acc_6)

In [None]:
def plot_importance(model, features, num=None):
    """Plot a model's feature importances as a horizontal bar chart.

    Args:
        model: Fitted estimator exposing ``feature_importances_``.
        features: DataFrame whose ``columns`` name the features; assumed to be
            in the same order as the columns the model was fitted on.
        num: How many of the highest-ranked features to show. ``None`` (the
            default) shows all of them. The default is resolved per call
            rather than captured from a global when the function is defined.

    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    feature_imp = pd.DataFrame({'Value': model.feature_importances_, 'Feature': features.columns})
    # Rank from most to least important; slicing with num=None keeps every row.
    ranked = feature_imp.sort_values(by="Value", ascending=False)
    plt.figure(figsize=(10, 10))
    sns.set(font_scale=1)
    sns.barplot(x="Value", y="Feature", data=ranked[0:num])
    plt.title('Features')
    plt.tight_layout()
    plt.show()

In [None]:
plot_importance(rf_model, X_test)

# Conclusion

In [None]:
print(f'''Accuracy before feature engineering:

Logistic Regression: {acc_1} 
CART: {acc_2}
Random Forests: {acc_3}

Accuracy after feature engineering: 

Logistic Regression: {acc_4}
CART: {acc_5}
Random Forests: {acc_6}''')