In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
# Matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and
# 3.8 removed the old names, so use the new name whenever it is available and
# only fall back to the legacy name on older installations.
_darkgrid_style = (
    'seaborn-v0_8-darkgrid'
    if 'seaborn-v0_8-darkgrid' in plt.style.available
    else 'seaborn-darkgrid'
)
plt.style.use(_darkgrid_style)
import warnings
# Suppress every warning for the rest of the notebook (keeps cell output clean,
# but also hides deprecation notices from the libraries used below).
warnings.filterwarnings('ignore')

In [None]:
# Load the diabetes CSV from the Kaggle input directory into `df`.
df = pd.read_csv('/kaggle/input/pima-indians-diabetes-database/diabetes.csv')

In [None]:
# Show 5 randomly chosen rows (no random_state is passed, so the rows differ between runs).
df.sample(5)

In [None]:
# Print a concise summary of the frame: column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics per column; the minimum values are what the next section examines.
df.describe()

### Can the minimum value of the columns listed below be zero (0)?
#### After doing some research, I found that a value of zero does not make sense in these columns and therefore indicates a missing value.

#### 1- BloodPressure
#### 2- SkinThickness
#### 3- BMI
#### 4- Glucose
### So I'm going to replace all of those zeros with NaN
#### Note: a value of zero in Insulin is not normal but acceptable, so Insulin is left unchanged (zeros in Glucose are treated as missing, as listed above)

In [None]:
# Columns in which a value of 0 does not make sense and therefore encodes "missing".
cols = ['BloodPressure','SkinThickness','BMI','Glucose']

# Mark those zeros as missing. `np.nan` is the canonical spelling; the
# `np.NaN` alias was removed in NumPy 2.0 and raises AttributeError there.
df[cols] = df[cols].replace(0, np.nan)

## Splitting the data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state pins the shuffle so the split is repeatable.
# The whole frame (features and Outcome) is split; the label is separated later.
train, test = train_test_split(df, test_size= 0.2, random_state= 42)

## Cleaning train data

In [None]:
# Count missing values per column in the training split.
train.isnull().sum()

In [None]:
# Histogram with a KDE overlay for every column of the training data, laid out on a 3x3 grid.
plt.figure(figsize=(20, 15))
for position, column in enumerate(train.columns, start=1):
    plt.subplot(3, 3, position)
    sns.histplot(data=train, x=column, kde=True)
    plt.xlabel(column, fontsize=15)
    plt.xticks(fontsize=10)

#### I decided to fill the missing values using scikit-learn's IterativeImputer with a random forest regressor as the estimator

## Iterative_imputer 

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# IterativeImputer is experimental; this import has to run first to enable it.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Impute with a 100-tree random forest as the per-column estimator, for at most 10 rounds.
# NOTE(review): `train` still contains the Outcome column here, so the label takes part
# in the imputation of the features — confirm this is intended.
imp = IterativeImputer(estimator= RandomForestRegressor(n_estimators=100), max_iter=10,random_state=42)
# Fit the imputer on the training split and fill its missing values.
imputed_train = imp.fit_transform(train)

In [None]:
# Wrap the imputed values back into a DataFrame carrying the original column names.
train = pd.DataFrame(imputed_train, columns = train.columns)

In [None]:
# Confirm that no missing values remain after imputation.
train.isna().sum()

#### Some values seem to be high compared to the mean and median, so I will plot these outliers

In [None]:
# One boxplot per training column on a 3x3 grid, to make the outliers visible.
plt.figure(figsize=(20, 15))
for position, column in enumerate(train.columns, start=1):
    plt.subplot(3, 3, position)
    sns.boxplot(data=train, x=column)
    plt.xlabel(column, fontsize=15)
    plt.xticks(fontsize=10)

In [None]:
def detect_outliers(df):
    """Cap IQR outliers in every column of ``df`` in place and report them.

    For each column the fences ``q1 - 1.5 * iqr`` and ``q3 + 1.5 * iqr`` are
    computed from that column's own quartiles. Values below the lower fence
    are overwritten with the lower fence and values above the upper fence
    with the upper fence.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns support quantiles and comparisons. It is
        modified in place.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns "Feature",
        "Num of Outliers" (count found before capping) and "Handled?"
        (True when no value lies outside the fences after capping).
    """
    report_columns = ["Feature", "Num of Outliers", "Handled?"]
    # Collect plain records and build the report once: DataFrame.append was
    # removed in pandas 2.0 and growing a frame row by row is quadratic anyway.
    records = []
    for col in df.columns:
        # Return values at the given quantile
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        fence_low = q1 - (1.5 * iqr)
        fence_high = q3 + (1.5 * iqr)

        below = df[col] < fence_low
        above = df[col] > fence_high
        n_outliers = int((below | above).sum())

        # Cap the outliers at the nearest fence.
        df.loc[below, col] = fence_low
        df.loc[above, col] = fence_high

        # Re-check the column itself after capping. The previous expression
        # `df[col].all() < fence_high` compared a single boolean with the
        # fence and said nothing about whether the values were in range.
        handled = not ((df[col] < fence_low) | (df[col] > fence_high)).any()
        records.append(
            {"Feature": col, "Num of Outliers": n_outliers, "Handled?": handled}
        )
    return pd.DataFrame(records, columns=report_columns)

In [None]:
# Cap the outliers of the training data (this modifies `train` in place) and display the per-column report.
detect_outliers(train)

In [None]:
# Re-draw the per-column histograms (with KDE) of the training data after the outliers were capped.
plt.figure(figsize=(20, 15))
for position, column in enumerate(train.columns, start=1):
    plt.subplot(3, 3, position)
    sns.histplot(data=train, x=column, kde=True)
    plt.xlabel(column, fontsize=15)
    plt.xticks(fontsize=10)

#### It seems that the outliers in Pregnancies were set to 13.5, so I will set them back to 13 because 13.5 is not a possible number of pregnancies

In [None]:
# Capping at the upper fence left a fractional pregnancy count; bring anything above 13 down to 13.
train.loc[train['Pregnancies'] > 13, 'Pregnancies'] = 13

In [None]:
# Check the distribution of pregnancy counts after the cap.
train['Pregnancies'].value_counts()

## Cleaning test data

#### I'll repeat what I did with the train data, but the imputations will come from the imputer fitted on the train data to prevent data leakage

In [None]:
# Count missing values per column in the test split.
test.isnull().sum()

### Iterative_imputer

In [None]:
# Reuse the imputer that was fitted on the training data (transform only, no refit).
imputed_test = imp.transform(test)
# Wrap the imputed values back into a DataFrame carrying the original column names.
test = pd.DataFrame(imputed_test, columns = test.columns)

In [None]:
# Histogram with a KDE overlay for every column of the imputed test data, laid out on a 3x3 grid.
plt.figure(figsize=(20, 15))
for position, column in enumerate(test.columns, start=1):
    plt.subplot(3, 3, position)
    sns.histplot(data=test, x=column, kde=True)
    plt.xlabel(column, fontsize=15)
    plt.xticks(fontsize=10)

In [None]:
# Cap the outliers of the test data (this modifies `test` in place) and display the per-column report.
# NOTE(review): the fences are computed from `test`'s own quartiles rather than from the
# training data, and the Pregnancies cap at 13 applied to `train` is not repeated here —
# confirm this is intended.
detect_outliers(test)

## EDA

In [None]:
# Enlarge the font scale so the annotated cells stay readable.
sns.set(font_scale=1.15)
plt.figure(figsize=(14, 10))
# Annotated heatmap of the pairwise correlations between all training columns (Outcome included);
# the colour scale is capped at 0.8.
sns.heatmap(train.corr(),vmax=.8, linewidths=0.01, square=True,annot=True,cmap='YlGnBu',linecolor="black")

#### The only feature that shows some correlation with the target column is the glucose level

In [None]:
# Bar chart of how many training rows fall into each Outcome class.
sns.countplot(x = 'Outcome', data = train)

#### The above graph shows that the data is imbalanced towards an Outcome value of 0, which means that diabetes was not present: the number of non-diabetics is almost twice the number of diabetic patients.

In [None]:
#sns.pairplot(data = train ,hue='Outcome')

### Slicing the labels

In [None]:
# Separate the predictor columns from the 'Outcome' label for both splits.
x_train, y_train = train.drop(columns='Outcome'), train['Outcome']
x_test, y_test = test.drop(columns='Outcome'), test['Outcome']

### Scaling the data

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit the scaler on the training features only, then apply that same fitted scaling to the test features.
# Both names are rebound to the scaled output, so re-running this cell alone would scale twice.
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test) 

## Modeling

### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression 
# Logistic regression with the solver allowed up to 1000 iterations.
log_reg = LogisticRegression( max_iter = 1000)
log_reg.fit(x_train, y_train)

#### Logistic regression Accuracy

In [None]:
from sklearn.metrics import accuracy_score
# Predict on the held-out test features; y_pred_l is reused later for the ensemble vote.
y_pred_l = log_reg.predict(x_test)
logistic_acc = accuracy_score(y_test, y_pred_l)
print(f'Logistic regression accuracy = {logistic_acc:.4f}')

### Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier
# 200 trees limited to depth 3; random_state makes the forest repeatable.
RFC = RandomForestClassifier(n_estimators = 200, max_depth = 3 ,random_state = 42)
RFC.fit(x_train,y_train)

#### Random Forest Accuracy

In [None]:
# Predict on the held-out test features; y_pred_r is reused later for the ensemble vote.
y_pred_r = RFC.predict(x_test)
rfc_acc = accuracy_score(y_test, y_pred_r)
print(f'Random forests classifier accuracy = {rfc_acc:.4f}')

### KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier voting over 111 neighbours.
# NOTE(review): no tuning of n_neighbors is shown in this notebook — confirm how 111 was chosen.
KNN = KNeighborsClassifier(n_neighbors = 111)
KNN.fit(x_train,y_train)

#### KNN Accuracy

In [None]:
# Predict on the held-out test features; y_pred_k is reused later for the ensemble vote.
y_pred_k = KNN.predict(x_test)
knn_acc = accuracy_score(y_test, y_pred_k) 
print(f'KNN classifier accuracy = {knn_acc:.4f}')

### XGB

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted trees: 25 trees of depth 2 with a learning rate of 0.01.
xgb = XGBClassifier(learning_rate = 0.01, n_estimators = 25, max_depth = 2)
xgb.fit(x_train,y_train)

#### XGB Accuracy

In [None]:
# Predict on the held-out test features; y_pred_x is reused later for the ensemble vote.
y_pred_x = xgb.predict(x_test)
xgb_acc = accuracy_score(y_test, y_pred_x)
print(f'XGB classifier accuracy = {xgb_acc:.4f}')

### SVM

In [None]:
from sklearn.svm import SVC
# Support vector classifier with a linear kernel.
svm = SVC(kernel= 'linear')
svm.fit(x_train,y_train)

#### SVM Accuracy

In [None]:
# Predict on the held-out test features; y_pred_s is reused later for the ensemble vote.
y_pred_s = svm.predict(x_test)
svc_acc = accuracy_score(y_test, y_pred_s)
print(f'SVM classifier accuracy = {svc_acc:.4f}')

### Ensembling the five models

In [None]:
# Majority vote: a sample is labelled 1 when the mean of the five models'
# predictions exceeds 0.5 (i.e. at least three of them predict 1), otherwise 0.
final_prediction = [
    1 if ((knn_vote + rf_vote + xgb_vote + logit_vote + svm_vote) / 5) > 0.5 else 0
    for knn_vote, rf_vote, xgb_vote, logit_vote, svm_vote in zip(
        y_pred_k, y_pred_r, y_pred_x, y_pred_l, y_pred_s
    )
]

In [None]:
from sklearn.metrics import confusion_matrix
# Confusion matrix of the majority-vote predictions against the test labels, drawn as an annotated heatmap.
conf_mat = confusion_matrix(y_test,final_prediction)
sns.heatmap(conf_mat , annot= True , cmap='Blues')

#### Ensembled accuracy

In [None]:
# Accuracy of the majority-vote predictions on the test labels.
ensambled_acc = accuracy_score(y_test,final_prediction)
print(f'Ensambeld Accuracy = {ensambled_acc:.4f}')

In [None]:
from sklearn.metrics import f1_score
# F1 score of the majority-vote predictions, reported alongside accuracy.
ensambled_f = f1_score(y_test,final_prediction)
print(f'Ensambeld f1_score = {ensambled_f:.4f}')