In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# DATASET

In [None]:
# Read the diabetes CSV from the working directory and preview the first rows.
csv_path = 'diabetes.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
# Print pandas' summary of the frame (DataFrame.info).
df.info()

In [None]:
# Descriptive statistics of the frame via DataFrame.describe().
df.describe()

In [None]:
# The same describe() table, transposed.
df.describe().T

In [None]:
# Show the first 10 rows of the frame.
df.head(10)

In [None]:
# Boolean missing-value mask for the first 10 rows (True where a cell is null).
df.isnull().head(10)

In [None]:
# Number of null cells in each column.
df.isnull().sum()

In [None]:
# Work on an independent copy so the loaded frame `df` stays untouched
# (DataFrame.copy() is a deep copy by default).
df_copy = df.copy()

In [None]:
# Display the working copy of the frame.
df_copy

In [None]:
# In these measurement columns a 0 is treated as a missing reading: replace it with NaN
# so it is counted by the missing-value summary on the last line.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
zero_as_missing_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df_copy[zero_as_missing_cols] = df_copy[zero_as_missing_cols].replace(0, np.nan)
df_copy.isnull().sum()

# Data Visualization

In [None]:
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8" and the bare
# "seaborn" name was removed in 3.8, so use whichever name this installation provides.
seaborn_style = "seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn"
plt.style.use(seaborn_style)

# Histograms of the frame as loaded (`df`, not the NaN-substituted copy).
# Binding the result to `p` keeps the returned axes array out of the cell output.
p = df.hist(figsize=(20, 20))

## Replacing NaN with mean values

In [None]:
# Fill each column's NaNs with that column's own mean.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on a column
# selection: that chained in-place form warns in pandas 2.2 and does not modify
# `df_copy` at all once Copy-on-Write is active.
imputed_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df_copy[imputed_cols] = df_copy[imputed_cols].fillna(df_copy[imputed_cols].mean())

In [None]:
# Histograms of df_copy's columns on a 20x20 inch figure; binding the result to `p`
# keeps the returned axes array from being echoed as cell output.
p=df_copy.hist(figsize=(20,20))

# Scaling The Data

In [None]:

from sklearn.preprocessing import StandardScaler

# Feature matrix: every column except the "Outcome" label.
features = df_copy.drop(columns=["Outcome"])

# NOTE(review): the scaler is fitted on all rows, before the train/test split further
# down, so test-set statistics influence the scaling. Confirm whether that is acceptable.
sc_X = StandardScaler()

# Column labels and row index are taken from the input frame rather than a hand-typed
# list, so the scaled frame cannot be mislabelled if the CSV's columns change.
X = pd.DataFrame(
    sc_X.fit_transform(features),
    columns=features.columns,
    index=features.index,
)
X.head()

In [None]:
# Target vector: the "Outcome" column of the loaded frame.
y = df["Outcome"]
y.head()

# Model Building

# Splitting the Data

In [None]:
# Build the model inputs from the cleaned frame and the already-fitted scaler.
# Taking the features from the raw `df` here would silently discard both the
# zero -> NaN -> mean imputation and the standardisation done in the earlier cells.
model_features = df_copy.drop(columns=['Outcome'])
X = pd.DataFrame(
    sc_X.transform(model_features),
    columns=model_features.columns,
    index=model_features.index,
)
y = df['Outcome']

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the partition repeatable.
split_parts = train_test_split(X, y, test_size=0.33, random_state=7)
X_train, X_test, y_train, y_test = split_parts

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 200-tree forest. `random_state` pins the forest's internal randomness so the scores
# reported below are the same on every Restart & Run All.
rfc = RandomForestClassifier(n_estimators=200, random_state=7)
rfc.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Accuracy on the rows the forest was fitted on. The label names the split so this
# line cannot be confused with the test-set score printed by the next cell.
rfc_train = rfc.predict(X_train)
print("Train Accuracy_Score=", format(metrics.accuracy_score(y_train, rfc_train)))

In [None]:
from sklearn import metrics

# Accuracy of the forest on the held-out rows. `predictions` is reused by the
# confusion-matrix cell that follows, so the name is kept.
predictions = rfc.predict(X_test)
print("Test Accuracy_Score=", format(metrics.accuracy_score(y_test, predictions)))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix and classification report for whatever is currently held in
# `predictions`, compared against the held-out labels.
for summarise in (confusion_matrix, classification_report):
    print(summarise(y_test, predictions))

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with library defaults. `random_state` fixes the estimator's
# randomness so repeated runs report the same score.
dtree = DecisionTreeClassifier(random_state=7)
dtree.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Score the decision tree on the held-out rows; `predictions` now refers to the
# tree's output and is what the next cell reports on.
predictions = dtree.predict(X_test)
dtree_accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy Score =", format(dtree_accuracy))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix first, then the classification report, for the current `predictions`.
dtree_confusion = confusion_matrix(y_test, predictions)
print(dtree_confusion)
dtree_report = classification_report(y_test, predictions)
print(dtree_report)

# XGBoost Classifier

In [None]:
from sklearn import metrics
from xgboost import XGBClassifier

# Fit the gradient-boosted model (gamma=0, as in the original cell).
xgb_model = XGBClassifier(gamma=0)
xgb_model.fit(X_train, y_train)

# Score it on the held-out rows so it can be compared with the other classifiers in
# this notebook, which all report a test-set accuracy.
xgb_pred = xgb_model.predict(X_test)
print("Accuracy Score =", format(metrics.accuracy_score(y_test, xgb_pred)))

# Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with library defaults; fit() returns the estimator itself,
# so construction and fitting are chained and the fitted model is echoed as cell output.
svc_model = SVC().fit(X_train, y_train)
svc_model

In [None]:
# Predict labels for the held-out features with the fitted SVC.
svc_pred=svc_model.predict(X_test)

In [None]:
from sklearn import metrics

# Share of held-out rows the SVC labelled correctly.
svc_accuracy = metrics.accuracy_score(y_test, svc_pred)
print("Accuracy Score", format(svc_accuracy))

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix followed by the classification report for the SVC predictions.
svc_confusion = confusion_matrix(y_test, svc_pred)
print(svc_confusion)
svc_report = classification_report(y_test, svc_pred)
print(svc_report)

In [None]:
# Display the fitted random forest's per-feature importance array.
rfc.feature_importances_

In [None]:
# Label each importance with its feature name, then draw a horizontal bar chart.
rfc_importances = pd.Series(rfc.feature_importances_, index=X.columns)
rfc_importances.plot(kind='barh')


# Saving Model - Random Forest


In [None]:
import pickle

# File the fitted forest is persisted to (relative to the working directory).
MODEL_PATH = "random_forest_model.pkl"

# Serialise the fitted forest and write the bytes to disk. `pickle.dumps` alone only
# produces an in-memory bytes object, so on its own nothing would be saved.
saved_model = pickle.dumps(rfc)
with open(MODEL_PATH, "wb") as model_file:
    model_file.write(saved_model)

# Load the model back from the file just written.
# Only unpickle files you created yourself: unpickling can execute arbitrary code.
with open(MODEL_PATH, "rb") as model_file:
    rfc_from_pickle = pickle.load(model_file)

# The restored model is used exactly like the original one.
rfc_from_pickle.predict(X_test)

In [None]:
# No installation is needed for the pickling cell above: `pickle` ships with Python's
# standard library, and this notebook never imports the third-party `pickle-mixin` package.