# EDA & ML on Diabetes

## Import Libraries And Load The Data

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import itertools
# Apply the 'fivethirtyeight' matplotlib style sheet to the figures drawn in this notebook.
plt.style.use('fivethirtyeight')

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(root, file_name)
        print(full_path)


In [None]:
# Location of the diabetes CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/pima-indians-diabetes-database/diabetes.csv"

# Raw dataset as read from disk; later cells work on a copy of it.
data = pd.read_csv(DATA_PATH)

In [None]:
# Number of NaN entries in each column of the raw frame.
data.isna().sum()

In [None]:
# Summary statistics, transposed so that each feature is a row.
data.describe().transpose()

### Replace Zeros With NaN, Then Impute With The Mean Or Median

In [None]:
# Columns in which this notebook treats a recorded 0 as a missing measurement.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Work on a deep copy so the raw `data` frame stays untouched.
data_copy = data.copy(deep=True)

# Use np.nan: the capitalised np.NaN alias no longer exists in NumPy 2.0+.
data_copy[zero_as_missing_cols] = data_copy[zero_as_missing_cols].replace(0, np.nan)

# Per-column count of values that are now marked as missing.
print(data_copy.isnull().sum())

In [None]:
# Fill the NaNs column by column: mean for Glucose and BloodPressure,
# median for SkinThickness, Insulin and BMI.
#
# The result is assigned back instead of calling fillna(inplace=True) on a
# column selection: under pandas Copy-on-Write that form modifies a temporary
# object and leaves data_copy unchanged.
data_copy = data_copy.fillna({
    'Glucose': data_copy['Glucose'].mean(),
    'BloodPressure': data_copy['BloodPressure'].mean(),
    'SkinThickness': data_copy['SkinThickness'].median(),
    'Insulin': data_copy['Insulin'].median(),
    'BMI': data_copy['BMI'].median(),
})

## EDA 

## Feature Distributions For Patients With Diabetes

In [None]:
# Histogram of each feature, restricted to rows where Outcome == 1.
# The mask is built from data_copy itself so the filter and the filtered
# frame always refer to the same rows.
data1 = data_copy[data_copy['Outcome'] == 1]

# First eight columns — assumes 'Outcome' is the ninth column; TODO confirm
# against the CSV header.
columns = data_copy.columns[:8]
length = len(columns)

n_cols = 3
n_rows = -(-length // n_cols)  # ceiling division: just enough rows for every feature

# plt.figure rather than plt.subplots(): the latter would also create a
# full-figure Axes that is not part of the grid below.
fig = plt.figure(figsize=(18, 14))
fig.subplots_adjust(wspace=0.2, hspace=0.5)  # spacing is set once, not per panel

for position, column in enumerate(columns, start=1):
    ax = fig.add_subplot(n_rows, n_cols, position)
    data1[column].hist(bins=20, edgecolor='black', ax=ax)
    ax.set_title(column)
plt.show()

## Distribution of The Features

In [None]:
# Pairwise scatter plots of the imputed frame (data_copy, not the raw data),
# coloured by Outcome, with kernel-density estimates on the diagonal.
sns.pairplot(data_copy, hue='Outcome', diag_kind='kde')
plt.show()

## Heatmap For Features Correlation

In [None]:
# Annotated heatmap of the pairwise column correlations of the imputed frame.
plt.figure(figsize=(10, 7))
correlations = data_copy.corr()
p = sns.heatmap(correlations, annot=True, cmap='RdYlGn')

## ML Models

In [None]:
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

### Data Normalization

In [None]:
# Feature matrix: every column except the target.
features = data_copy.drop(columns=["Outcome"])

# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split below, so test-set statistics influence the transform.
# Consider fitting on the training split only (e.g. inside a Pipeline).
sc = StandardScaler()

# Column labels and row index are taken from the frame itself, so X stays
# correctly labelled and aligned with y even if the schema or index changes.
X = pd.DataFrame(
    sc.fit_transform(features),
    columns=features.columns,
    index=features.index,
)
y = data_copy["Outcome"]

In [None]:
# Preview the first three rows of the standardized feature matrix.
X.head(n=3)

### Test and Train Data 

In [None]:
# Hold out 25% of the rows for testing; stratify on y so both splits keep the
# same class proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [None]:
# Print the shape of each split, in the order: X_train, y_train, y_test, X_test.
for split in (X_train, y_train, y_test, X_test):
    print(split.shape)

## SVM Model

In [None]:
# Fit one SVC per kernel on the training split and report its accuracy on the
# held-out split; all other SVC settings are left at their defaults.
kernels = ['rbf', 'linear']
for kernel in kernels:
    model = svm.SVC(kernel=kernel)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, prediction)
    print('Accuracy for SVM kernel=',kernel,'is',accuracy)

## Logistic Regression

In [None]:
# Logistic regression with default settings: fit on the training split,
# predict the held-out split, and print the resulting accuracy.
model = LogisticRegression()
model.fit(X_train, y_train)

prediction = model.predict(X_test)
accuracy = metrics.accuracy_score(y_test, prediction)
print('Accuracy for Logistic Regression is', accuracy)

## Decision Tree

In [None]:
# Decision tree with default hyper-parameters. random_state is fixed (same
# seed as the train/test split) so the printed accuracy is identical on every
# Restart & Run All instead of varying between runs.
model = DecisionTreeClassifier(random_state=42)

model.fit(X_train, y_train)
prediction = model.predict(X_test)
print('Accuracy for Decision Tree is', metrics.accuracy_score(y_test, prediction))

## K-Nearest Neighbours

In [None]:
# Sweep n_neighbors over 1..10 and record the held-out accuracy for each value.
#
# NOTE(review): the accuracies are measured on the test split, so picking k
# from this curve tunes the model on test data. Consider a validation split
# or cross-validation on the training data for the selection.
a_index = list(range(1, 11))  # candidate n_neighbors values
x = list(range(11))           # x-axis tick positions (0..10)

# Collect scores in a plain list and build the Series once: Series.append no
# longer exists in pandas 2.0+, and growing a Series per iteration is quadratic.
accuracies = []
for k in a_index:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    accuracies.append(metrics.accuracy_score(y_test, prediction))
a = pd.Series(accuracies, index=a_index, dtype=float)

plt.plot(a_index, a)
plt.xticks(x)
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.show()
print('Accuracies for different values of n are:', a.values)

### The Best Result is at k = 5

In [None]:
# Refit k-nearest-neighbours with five neighbours and display its mean
# accuracy on the held-out split as the cell output.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

knn.score(X_test, y_test)

In [None]:
# Side-by-side comparison: fit every classifier on the same training split and
# tabulate its accuracy on the held-out split.
# `classifiers` (row labels) and `models` (estimators) are parallel lists and
# must stay in the same order.
classifiers = ['Linear Svm', 'Radial Svm', 'Logistic Regression', 'KNN 5', 'Decision Tree']
models = [
    svm.SVC(kernel='linear'),
    svm.SVC(kernel='rbf'),
    LogisticRegression(),
    KNeighborsClassifier(n_neighbors=5),
    # Seeded (same value as the train/test split) so this row does not change
    # between runs.
    DecisionTreeClassifier(random_state=42),
]

abc = []  # held-out accuracies, in the same order as `models`
for model in models:
    model.fit(X_train, y_train)
    prediction = model.predict(X_test)
    abc.append(metrics.accuracy_score(y_test, prediction))

models_dataframe = pd.DataFrame({'Accuracy': abc}, index=classifiers)
models_dataframe.sort_values('Accuracy', ascending=True)

End of Analysis