# Import the necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import ConfusionMatrixDisplay , classification_report , accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
import warnings


warnings.filterwarnings("ignore")

# Read the Data

In [None]:
# read the data 
df=pd.read_csv("/kaggle/input/healthcare-diabetes/Healthcare-Diabetes.csv")

# EDA

In [None]:
# print the first 5 rows of the data (the default for head())
df.head()

In [None]:
# print the last 5 rows of the data (the default for tail())
df.tail()

In [None]:
# print the shape of data
df.shape

In [None]:
# print the columns of the data 
df.columns.to_list()

In [None]:
# print information about data
df.info()

In [None]:
# describe the columns in the data set 
df.describe()

In [None]:
sns.boxplot(data=df, x = "Pregnancies")

In [None]:
sns.histplot(data = df["Pregnancies"])

In [None]:
sns.boxplot(data=df, x = "Glucose")

In [None]:
sns.histplot(data = df["Glucose"])

In [None]:
sns.boxplot(data=df, x = "BloodPressure")

In [None]:
sns.histplot(data = df["BloodPressure"])

In [None]:
sns.boxplot(data=df, x = "SkinThickness")

In [None]:
sns.histplot(data = df["SkinThickness"])

In [None]:
sns.boxplot(data=df, x = "Insulin")

In [None]:
sns.histplot(data = df["Insulin"])

In [None]:
sns.boxplot(data=df, x = "Age")

In [None]:
sns.histplot(data = df["Age"])

In [None]:
sns.boxplot(data=df, x = "DiabetesPedigreeFunction")

In [None]:
sns.histplot(data = df["DiabetesPedigreeFunction"])

In [None]:
sns.boxplot(data=df, x = "BMI")

In [None]:
sns.histplot(data = df["BMI"])

In [None]:
df["Outcome"].value_counts()

In [None]:
sns.countplot(data=df, x='Outcome')

In [None]:
df.columns

In [None]:
# Report the skewness of every predictor column: name on one line, value on the next
features = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age',
]
for feature in features:
    print(feature, df[feature].skew(), sep="\n")

In [None]:
acc_baseline = df['Outcome'].value_counts(normalize= True).max()
print(f'Baseline Accuracy is {acc_baseline}')

In [None]:
cor = df.drop(["Id"],axis=1).select_dtypes(include='number').copy()
corr = cor.corr()
fig , ax = plt.subplots(figsize=(10 , 10))
sns.heatmap(corr ,annot= True , ax=ax , cmap= 'Blues');

In [None]:
# Drop the row-identifier column in place. errors="ignore" keeps this cell
# safe to re-run: without it a second execution raises KeyError because
# "Id" has already been removed.
df.drop(columns=["Id"], inplace=True, errors="ignore")

## Model before handling outliers

In [None]:
x = df.drop(["Outcome"],axis=1)
y = df["Outcome"]

In [None]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.25,random_state=42,stratify = y)

In [None]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Baseline logistic regression on the raw features.
# max_iter=500 matches the model fitted after outlier removal, so the
# before/after comparison uses the same solver budget. Warnings are silenced
# at the top of the notebook, so a non-converged fit with the default
# iteration limit would otherwise go unnoticed.
lr = LogisticRegression(max_iter=500)
lr.fit(x_train, y_train)

In [None]:
print(lr.score(x_train,y_train))
print(lr.score(x_test,y_test))

In [None]:
features = x_test.columns
importances = lr.coef_[0]

In [None]:
# Logistic-regression coefficients per feature, sorted ascending.
# tail() keeps the five largest (most positive) coefficients for the plot.
feat_imp = pd.Series(importances, index=features).sort_values()
feat_imp.tail().plot(kind='barh')
# These are model coefficients, not Gini importances (which belong to
# tree-based models), so label the axis accordingly.
plt.xlabel("Coefficient")
plt.ylabel("Feature")
plt.title("Feature Importance");

In [None]:
lin = LinearRegression()
lin.fit(x_train,y_train)

In [None]:
# NOTE: LinearRegression.score returns the coefficient of determination (R^2),
# not classification accuracy, so these two numbers are not on the same scale
# as the accuracies printed for the classifiers in this notebook.
print(lin.score(x_train,y_train))
print(lin.score(x_test,y_test))

In [None]:
kn = KNeighborsClassifier(n_neighbors=2)
kn.fit(x_train,y_train)

In [None]:
print(kn.score(x_train,y_train))
print(kn.score(x_test,y_test))

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of every feature for the fitted KNN model `kn`,
# evaluated on the held-out split (10 shuffles per feature, fixed seed).
result = permutation_importance(kn, x_test, y_test, n_repeats=10, random_state=42)
importances = result.importances_mean

# Map each column name to its mean importance
feature_importance_dict = {
    name: score for name, score in zip(x_test.columns, importances)
}

# Rank the features from most to least important
sorted_feature_importance = sorted(
    feature_importance_dict.items(),
    key=lambda item: item[1],
    reverse=True,
)

# Print one line per feature, highest importance first
for feature, importance in sorted_feature_importance:
    print(f"Feature: {feature}, Importance: {importance}")

## handling outliers

In [None]:
def outliers(df, ft, factor=1.5):
    """Return the index labels of rows whose ``ft`` value is an IQR outlier.

    A value counts as an outlier when it lies strictly below
    ``Q1 - factor * IQR`` or strictly above ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[ft]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to inspect; it is not modified.
    ft : str
        Name of the column to test.
    factor : float, default 1.5
        Width of the fences in IQR units. The default reproduces the
        previously hard-coded 1.5 rule.

    Returns
    -------
    pandas.Index
        Labels of the outlying rows (empty when there are none).
    """
    values = df[ft]  # look the column up once instead of four times
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    lower_limit = q1 - iqr * factor
    upper_limit = q3 + iqr * factor

    return df.index[(values < lower_limit) | (values > upper_limit)]

In [None]:
df.columns

In [None]:
# Columns screened for IQR outliers (note that 'Age' is not in this list)
num = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
    'BMI', 'DiabetesPedigreeFunction',
]

# Gather the index labels flagged in any of the columns; a row flagged in
# several columns appears several times in this list.
index_list = []
for feature in num:
    index_list += list(outliers(df, feature))

In [None]:
def remove(df, ls):
    """Return a new frame without the rows whose index labels appear in ``ls``.

    Repeated labels in ``ls`` are collapsed before dropping; the frame passed
    in is left untouched.
    """
    unique_labels = set(ls)
    return df.drop(sorted(unique_labels))

In [None]:
df = remove(df,index_list)

In [None]:
df.shape

In [None]:
sns.boxplot(data=df, x = "DiabetesPedigreeFunction")

## Model after handling outliers

In [None]:
x = df.drop(["Outcome"],axis=1)
y = df["Outcome"]

In [None]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.25,random_state=42,stratify = y)

In [None]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
lr = LogisticRegression(max_iter=500)
lr.fit(x_train,y_train)

In [None]:
print(lr.score(x_train,y_train))
print(lr.score(x_test,y_test))

In [None]:
kn = KNeighborsClassifier(n_neighbors=2)
kn.fit(x_train,y_train)

In [None]:
print(kn.score(x_train,y_train))
print(kn.score(x_test,y_test))