# Import Libraries

In [None]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Import Dataset

In [None]:
# Load the diabetes records; the path is resolved relative to the working directory.
dataset = pd.read_csv(filepath_or_buffer='diabetes2.csv')

In [None]:
# Show the loaded DataFrame as this cell's output.
dataset

The data was collected and made available by “National Institute of Diabetes and Digestive and Kidney Diseases” as part of the Pima Indians Diabetes Database. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here belong to the Pima Indian heritage (subgroup of Native Americans), and are females of ages 21 and above.<br>

Goal: predict, from the given feature values, whether a woman is diabetic or not.

# Data Visualization & Analysis

## Pregnancies Distribution

In [None]:
# Histogram of Pregnancies coloured by Outcome, with a box plot in the margin.
fig = px.histogram(
    dataset['Pregnancies'],
    color=dataset['Outcome'],
    color_discrete_sequence=['red', 'blue'],
    marginal='box',
    title='Pregnancies Distribution',
).update_layout(bargap=0.1)  # update_layout returns the figure it was called on
fig.show()

***1- Diabetic; 0 - Non-Diabetic***<br>
Exponentially Decaying Distribution<br>
There are more women with a small number of pregnancies (both diabetic and non-diabetic)

## Glucose Distribution

In [None]:
# Histogram of Glucose coloured by Outcome, with a box plot in the margin.
fig = px.histogram(
    dataset['Glucose'],
    color=dataset['Outcome'],
    color_discrete_sequence=['red', 'blue'],
    marginal='box',
    title='Glucose Distribution',
).update_layout(bargap=0.1)  # update_layout returns the figure it was called on
fig.show()

***1- Diabetic; 0 - Non-Diabetic***<br>
Normal Distribution<br>
Normal blood glucose levels for adults without diabetes are ***90 to 110*** mg/dL<br>
Most of the non-diabetic women have a normal glucose level (as expected medically).<br>
Most of the diabetic women have a glucose level in the range 100 - 200.<br>
More women have an elevated glucose level (> 110) than a low glucose level (< 90).

For higher glucose levels, there is a high probability for a woman to be diabetic.

***Note :*** Some women with a normal glucose level are nevertheless diabetic.

## BloodPressure Distribution

In [None]:
# Histogram of BloodPressure coloured by Outcome, with a box plot in the margin.
fig = px.histogram(
    dataset['BloodPressure'],
    color=dataset['Outcome'],
    color_discrete_sequence=['red', 'blue'],
    marginal='box',
    title='BloodPressure Distribution',
).update_layout(bargap=0.1)  # update_layout returns the figure it was called on
fig.show()

***1- Diabetic; 0 - Non-Diabetic***<br>
Normal Distribution<br>
A normal blood pressure level is ***less than 120/80 mmHg***<br>

Most of the non-diabetic women have a pressure level in the range 60 - 84.<br>
Most of the diabetic women also have a pressure level in the range 60 - 84.<br>

***Note :*** Some women with a normal pressure level are nevertheless diabetic.

## SkinThickness Distribution

In [None]:
# Histogram of SkinThickness coloured by Outcome, with a box plot in the margin.
fig = px.histogram(
    dataset['SkinThickness'],
    color=dataset['Outcome'],
    color_discrete_sequence=['red', 'blue'],
    marginal='box',
    title='SkinThickness Distribution',
).update_layout(bargap=0.1)  # update_layout returns the figure it was called on
fig.show()

***1- Diabetic; 0 - Non-Diabetic***<br>
For SkinThickness in the range 0 - 4, the count is highest for both diabetic and non-diabetic women<br>
For SkinThickness in the range 0 - 34, a woman is more likely to be non-diabetic.<br>
For SkinThickness > 34, a woman is about equally likely to be diabetic or non-diabetic.<br>


## Insulin Distribution

In [None]:
# Histogram of Insulin coloured by Outcome, with a box plot in the margin.
fig = px.histogram(
    dataset['Insulin'],
    color=dataset['Outcome'],
    color_discrete_sequence=['red', 'blue'],
    marginal='box',
    title='Insulin Distribution',
).update_layout(bargap=0.1)  # update_layout returns the figure it was called on
fig.show()

For insulin levels 0 - 19, there are more women (both diabetic and non-diabetic) than at any other level

## BMI Distribution

In [None]:
# Histogram of BMI coloured by Outcome, with a box plot in the margin.
fig = px.histogram(
    dataset['BMI'],
    color=dataset['Outcome'],
    color_discrete_sequence=['red', 'blue'],
    marginal='box',
    title='BMI Distribution',
).update_layout(bargap=0.1)  # update_layout returns the figure it was called on
fig.show()

Normal Distribution<br>
Most of the women have BMI in the range 23 - 41<br>
There are more diabetic women for the BMI > 41

## DiabetesPedigreeFunction Distribution

In [None]:
# Histogram of DiabetesPedigreeFunction coloured by Outcome, with a box plot in the margin.
fig = px.histogram(
    dataset['DiabetesPedigreeFunction'],
    color=dataset['Outcome'],
    color_discrete_sequence=['red', 'blue'],
    marginal='box',
    title='DiabetesPedigreeFunction Distribution',
).update_layout(bargap=0.1)  # update_layout returns the figure it was called on
fig.show()

The lower the value, the larger the number of women (in both classes)<br>
As the DiabetesPedigreeFunction value increases, the count of non-diabetic women decreases<br>

## Age Distribution

In [None]:
# Histogram of Age coloured by Outcome, with a box plot in the margin.
fig = px.histogram(
    dataset['Age'],
    color=dataset['Outcome'],
    color_discrete_sequence=['red', 'blue'],
    marginal='box',
    title='Age Distribution',
).update_layout(bargap=0.1)  # update_layout returns the figure it was called on
fig.show()

Exponentially Decaying Curve<br>
Most of the non-diabetic women are in the age range 20 - 31<br>
Most of the diabetic women are in the age range 22 - 53<br>

In [None]:
# Pairwise correlation between all columns (Pearson, which is also the default).
dataset.corr(method='pearson')

# Logistic Regression

## Pregnancies Vs Outcome

In [None]:
# Single-feature baseline: fit on Pregnancies alone and score on the same rows.
Y = dataset['Outcome']
# Reshape to one column because the estimator expects a 2-D feature matrix.
X = dataset['Pregnancies'].to_numpy().reshape(-1, 1)
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))
print('Correlation is', Y.corr(dataset['Pregnancies']))

## Glucose Vs Outcome

In [None]:
# Single-feature baseline: fit on Glucose alone and score on the same rows.
Y = dataset['Outcome']
# Reshape to one column because the estimator expects a 2-D feature matrix.
X = dataset['Glucose'].to_numpy().reshape(-1, 1)
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))
print('Correlation is', Y.corr(dataset['Glucose']))

## BloodPressure Vs Outcome

In [None]:
# Single-feature baseline: fit on BloodPressure alone and score on the same rows.
Y = dataset['Outcome']
# Reshape to one column because the estimator expects a 2-D feature matrix.
X = dataset['BloodPressure'].to_numpy().reshape(-1, 1)
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))
print('Correlation is', Y.corr(dataset['BloodPressure']))

## SkinThickness Vs Outcome

In [None]:
# Single-feature baseline: fit on SkinThickness alone and score on the same rows.
Y = dataset['Outcome']
# Reshape to one column because the estimator expects a 2-D feature matrix.
X = dataset['SkinThickness'].to_numpy().reshape(-1, 1)
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))
print('Correlation is', Y.corr(dataset['SkinThickness']))

## Insulin Vs Outcome

In [None]:
# Single-feature baseline: fit on Insulin alone and score on the same rows.
Y = dataset['Outcome']
# Reshape to one column because the estimator expects a 2-D feature matrix.
X = dataset['Insulin'].to_numpy().reshape(-1, 1)
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))
print('Correlation is', Y.corr(dataset['Insulin']))

## BMI Vs Outcome

In [None]:
# Single-feature baseline: fit on BMI alone and score on the same rows.
Y = dataset['Outcome']
# Reshape to one column because the estimator expects a 2-D feature matrix.
X = dataset['BMI'].to_numpy().reshape(-1, 1)
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))
print('Correlation is', Y.corr(dataset['BMI']))

## DiabetesPedigreeFunction Vs Outcome

In [None]:
# Single-feature baseline: fit on DiabetesPedigreeFunction alone and score on the same rows.
Y = dataset['Outcome']
# Reshape to one column because the estimator expects a 2-D feature matrix.
X = dataset['DiabetesPedigreeFunction'].to_numpy().reshape(-1, 1)
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))
print('Correlation is', Y.corr(dataset['DiabetesPedigreeFunction']))

## Age Vs Outcome

In [None]:
# Single-feature baseline: fit on Age alone and score on the same rows.
Y = dataset['Outcome']
# Reshape to one column because the estimator expects a 2-D feature matrix.
X = dataset['Age'].to_numpy().reshape(-1, 1)
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))
print('Correlation is', Y.corr(dataset['Age']))

### Accuracy in Decreasing Order
***Glucose Vs Outcome*** - 0.7473958333333334<br>
***BMI Vs Outcome*** - 0.6692708333333334<br>
***Pregnancies Vs Outcome*** - 0.6640625<br>
***DiabetesPedigreeFunction Vs Outcome*** - 0.6614583333333334<br>
***Insulin Vs Outcome*** - 0.6575520833333334<br>
***SkinThickness Vs Outcome*** - 0.65234375<br>
***BloodPressure Vs Outcome*** - 0.6510416666666666<br>
***Age Vs Outcome*** - 0.6471354166666666<br>

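***Note :*** Even though the correlation between Age and Outcome is higher than that of DiabetesPedigreeFunction, Insulin, SkinThickness and BloodPressure, Age alone gives the lowest single-feature accuracy.<br>
This is because correlation and classification accuracy measure different things: a higher linear correlation with the 0/1 label does not guarantee more correct predictions at the decision threshold (and, separately, ***'Correlation doesn't imply Causation'***).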

## Glucose & BMI Vs Outcome

In [None]:
# Two-feature model (Glucose + BMI), fitted and scored on the full dataset.
Y = dataset['Outcome']
X = dataset.loc[:, ['Glucose', 'BMI']]
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))

Accuracy increases slightly, so we continue considering BMI

## Glucose, BMI & Pregnancies Vs Outcome

In [None]:
# Add Pregnancies to the feature set; fitted and scored on the full dataset.
Y = dataset['Outcome']
X = dataset.loc[:, ['Glucose', 'BMI', 'Pregnancies']]
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))

Accuracy increases slightly, so we continue considering Pregnancies

## Glucose, BMI, Pregnancies & DiabetesPedigreeFunction Vs Outcome

In [None]:
# Add DiabetesPedigreeFunction to the feature set; fitted and scored on the full dataset.
Y = dataset['Outcome']
X = dataset.loc[:, ['Glucose', 'BMI', 'Pregnancies', 'DiabetesPedigreeFunction']]
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))

Accuracy increases slightly, so we continue considering DiabetesPedigreeFunction

## Glucose, BMI, Pregnancies, DiabetesPedigreeFunction & Insulin Vs Outcome

In [None]:
# Add Insulin to the feature set; fitted and scored on the full dataset.
Y = dataset['Outcome']
X = dataset.loc[
    :, ['Glucose', 'BMI', 'Pregnancies', 'DiabetesPedigreeFunction', 'Insulin']
]
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))

Accuracy increases slightly, so we continue considering Insulin

## Glucose, BMI, Pregnancies, DiabetesPedigreeFunction, Insulin & SkinThickness Vs Outcome

In [None]:
# Add SkinThickness to the feature set; fitted and scored on the full dataset.
Y = dataset['Outcome']
X = dataset.loc[
    :,
    ['Glucose', 'BMI', 'Pregnancies', 'DiabetesPedigreeFunction', 'Insulin', 'SkinThickness'],
]
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))

Accuracy increases slightly, so we continue considering SkinThickness

## Glucose, BMI, Pregnancies, DiabetesPedigreeFunction, Insulin, SkinThickness & BloodPressure Vs Outcome

In [None]:
# Add BloodPressure to the feature set; fitted and scored on the full dataset.
Y = dataset['Outcome']
X = dataset.loc[
    :,
    [
        'Glucose', 'BMI', 'Pregnancies', 'DiabetesPedigreeFunction',
        'Insulin', 'SkinThickness', 'BloodPressure',
    ],
]
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))

Accuracy is decreased when 'BloodPressure' is included so it should not be considered.

## Glucose, BMI, Pregnancies, DiabetesPedigreeFunction, Insulin, SkinThickness & Age Vs Outcome

In [None]:
# Swap BloodPressure for Age in the feature set; fitted and scored on the full dataset.
Y = dataset['Outcome']
X = dataset.loc[
    :,
    [
        'Glucose', 'BMI', 'Pregnancies', 'DiabetesPedigreeFunction',
        'Insulin', 'SkinThickness', 'Age',
    ],
]
model = LogisticRegression(solver='liblinear').fit(X, Y)
print('Accuracy is :', accuracy_score(Y, model.predict(X)))

Accuracy of Glucose, BMI, Pregnancies, DiabetesPedigreeFunction, Insulin & SkinThickness Vs Outcome : 0.7760416666666666<br>
Accuracy of Glucose, BMI, Pregnancies, DiabetesPedigreeFunction, Insulin, SkinThickness & Age Vs Outcome : 0.7760416666666666<br>
Since both are the same, Age does not improve the accuracy on this data, so it can be neglected.<br>

<b>Note :</b> <br>
Accuracy of Glucose Vs Outcome - 0.7473958333333334<br>
Accuracy of Glucose, BMI, Pregnancies, DiabetesPedigreeFunction, Insulin & SkinThickness Vs Outcome - 0.7760416666666666<br>

That is a rise of only 0.0286458333333332 in accuracy, which is not promising, so we could just use Glucose alone.<br>
***But*** feature scaling has not been done yet, so we continue considering the other features.

# Train Test Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    dataset,
    test_size=0.2,
    random_state=42,
)

# Identify Inputs & Targets
Target Column(s) - Outcome<br>
Numeric Columns - Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age<br>
No categorical input columns<br>

In [None]:
# Feature columns fed to the model, followed by the column to predict.
input_cols = [
    'Glucose',
    'BMI',
    'Pregnancies',
    'DiabetesPedigreeFunction',
    'Insulin',
    'SkinThickness',
]
target_cols = ['Outcome']

In [None]:
# Slice the feature columns and the target column out of each split.
X_train, Y_train = train_df[input_cols], train_df[target_cols]
X_test, Y_test = test_df[input_cols], test_df[target_cols]

# Impute Nan Values (if any)

In [None]:
# Print the per-column dtypes and non-null counts to check for missing values.
dataset.info()

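There are no NaN values in the dataset. (Note: `info()` only counts NaN; the zeros seen above in columns such as SkinThickness and Insulin may be placeholders for missing measurements and are worth checking.)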

# Scaling Numeric Columns

***Try minmax scaler also***

In [None]:
# Learn the scaling statistics (mean / standard deviation) from the training
# rows only. Fitting on the whole dataset would let information from the test
# rows leak into preprocessing and make the test score optimistic.
scaler = StandardScaler()
# Select from train_df rather than X_train so that re-running this cell after
# X_train has been transformed still fits on the raw, unscaled values.
scaler.fit(train_df[input_cols])

In [None]:
# Apply the already-fitted scaler to both splits.
X_train, X_test = (scaler.transform(split) for split in (X_train, X_test))

# Encoding Categorical Columns
Since there are no categorical columns other than target column, no encoding is needed.

# Training

In [None]:
# Fit logistic regression on the training split and report accuracy on those same rows.
model = LogisticRegression(solver='liblinear')
# Y_train was selected with a list of columns, so it is 2-D, while the estimator
# wants a 1-D label vector. np.ravel replaces the unbound pd.Series.ravel call
# (deprecated in pandas 2.2, and only worked on a DataFrame by accident) and
# still works if this cell is re-run on the already flattened array.
Y_train = np.ravel(Y_train)
model.fit(X_train, Y_train)
Y_train_pred = model.predict(X_train)
print('Accuracy in Training Data Set :',accuracy_score(Y_train, Y_train_pred)*100)

# Testing

In [None]:
# Evaluate the model that was fitted in the Training cell on the held-out rows.
# The model must NOT be re-created or re-fitted here: fitting on the test split
# would turn this into a second training accuracy instead of a test accuracy.
# Flatten the single-column target to 1-D; np.ravel is safe to re-run.
Y_test = np.ravel(Y_test)
Y_test_pred = model.predict(X_test)
print('Accuracy in Testing Data Set :',accuracy_score(Y_test, Y_test_pred)*100)

Accuracy of Glucose Vs Outcome - 0.7473958333333334

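Before Feature Scaling (fitted and scored on the full dataset),<br>
Accuracy is 77.60416666666666<br>
After Feature Scaling (fitted and scored on the training split),<br>
Accuracy is 77.68729641693811<br>

Only very little improvement. Note that the two figures are measured on different sets of rows, so the difference cannot be attributed to scaling alone.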