# Importing Modules

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


In [2]:
# Load the diabetes dataset from its CSV file into a DataFrame.
df = pd.read_csv("diabetes_dataset.csv")

# Data Cleaning 

In [3]:
# Preview the first rows to check which columns are present.
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [4]:
# Count how many records fall under each gender value.
df['gender'].value_counts()

gender
Female    58552
Male      41430
Other        18
Name: count, dtype: int64

In [5]:
# Select the rows whose gender is recorded as 'Other'.
gender_other = df[df["gender"] == "Other"]

In [6]:
# Display the records whose gender is 'Other'.
gender_other

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
12669,Other,10.0,0,0,not current,14.09,5.0,140,0
14838,Other,19.0,0,0,No Info,27.32,5.7,158,0
16702,Other,39.0,0,0,not current,31.24,6.2,85,0
18691,Other,10.0,0,0,not current,16.59,6.1,160,0
23266,Other,23.0,0,0,No Info,24.23,6.1,140,0
31985,Other,53.0,0,0,No Info,27.32,6.6,160,0
33805,Other,45.0,0,0,never,27.32,4.0,159,0
34929,Other,47.0,0,0,never,36.76,6.6,90,0
35006,Other,47.0,0,0,never,36.76,3.5,200,0
40337,Other,18.0,0,0,not current,30.19,6.1,90,0


The 'Other' gender category has only 18 observations, and they all appear to fall into a single diabetes class (the rows shown above all have diabetes = 0). So, it is better to drop these records.

In [7]:
# Keep every record except those whose gender is 'Other'.
df1 = df[df["gender"] != "Other"]

In [8]:
# Display the frame after removing the 'Other' gender records.
df1

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
...,...,...,...,...,...,...,...,...,...
99995,Female,80.0,0,0,No Info,27.32,6.2,90,0
99996,Female,2.0,0,0,No Info,17.37,6.5,100,0
99997,Male,66.0,0,0,former,27.83,5.7,155,0
99998,Female,24.0,0,0,never,35.42,4.0,100,0


In [9]:
# Confirm that only the remaining gender values are left after filtering.
df1['gender'].value_counts()

gender
Female    58552
Male      41430
Name: count, dtype: int64

In [10]:
# Count the records in each smoking_history category.
df1['smoking_history'].value_counts()

smoking_history
No Info        35810
never          35092
former          9352
current         9286
not current     6439
ever            4003
Name: count, dtype: int64

As there is no clear dependency between this feature and the target, and it contains many outliers, we drop this feature.

In [11]:
# Remove the smoking_history column; all other columns are kept.
df2 = df1.drop(columns=["smoking_history"])

In [12]:
# Report, per column, whether any value is missing.
df1.isna().any()

gender                 False
age                    False
hypertension           False
heart_disease          False
smoking_history        False
bmi                    False
HbA1c_level            False
blood_glucose_level    False
diabetes               False
dtype: bool

Replacing the categorical values of gender with numerical values.

In [13]:
# Per-column encoding tables: each key is a column name and each value maps
# the column's categorical labels to integer codes.
new_fields={
    'gender':{
        'Male':0,
        'Female':1
    }
}

# Encode each listed column with an explicit map instead of DataFrame.replace.
# replace() on a string column relies on implicit object -> int downcasting,
# which newer pandas versions deprecate. map() followed by astype('int64')
# yields an integer column directly, and raises if a label is missing from
# the table rather than silently leaving a string in the feature matrix.
df3=df2.assign(**{
    column: df2[column].map(mapping).astype('int64')
    for column, mapping in new_fields.items()
})

In [14]:
# Preview the first rows after encoding gender numerically.
df3.head()

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
0,1,80.0,0,1,25.19,6.6,140,0
1,1,54.0,0,0,27.32,6.6,80,0
2,0,28.0,0,0,27.32,5.7,158,0
3,1,36.0,0,0,23.45,5.0,155,0
4,0,76.0,1,1,20.14,4.8,155,0


# Details about columns

Age is an important factor, as diabetes is more commonly diagnosed in older adults. Age ranges from 0 to 80 in our dataset.

Hypertension is a medical condition in which the blood pressure in the arteries is persistently elevated. It takes the value 0 or 1, where 0 indicates that the person does not have hypertension and 1 indicates that they do.

Heart disease is another medical condition that is associated with an increased risk of developing diabetes. It takes the value 0 or 1, where 0 indicates that the person does not have heart disease and 1 indicates that they do.

BMI (Body Mass Index) is a measure of body fat based on weight and height. Higher BMI values are linked to a higher risk of diabetes. The range of BMI in the dataset is from 10.16 to 71.55. BMI less than 18.5 is underweight, 18.5-24.9 is normal, 25-29.9 is overweight, and 30 or more is obese.

HbA1c (Hemoglobin A1c) level is a measure of a person's average blood sugar level over the past 2-3 months. Higher levels indicate a greater risk of developing diabetes. In most cases, an HbA1c level above 6.5% indicates diabetes.

Blood glucose level refers to the amount of glucose in the bloodstream at a given time. High blood glucose levels are a key indicator of diabetes.

Diabetes is the target variable being predicted, with values of 1 indicating the presence of diabetes and 0 indicating the absence of diabetes.



In [15]:
# sns.pairplot(df,hue="diabetes")

# KNN Classifier

In [16]:
#importing sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

from warnings import simplefilter
# Ignore all warnings: no category is passed, so this filter applies to every
# warning and anything raised by later cells is not displayed.
simplefilter(action='ignore')

In [17]:
# Feature matrix: every column of df3 except the 'diabetes' target.
X = df3.drop(columns=["diabetes"])
X

Unnamed: 0,gender,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level
0,1,80.0,0,1,25.19,6.6,140
1,1,54.0,0,0,27.32,6.6,80
2,0,28.0,0,0,27.32,5.7,158
3,1,36.0,0,0,23.45,5.0,155
4,0,76.0,1,1,20.14,4.8,155
...,...,...,...,...,...,...,...
99995,1,80.0,0,0,27.32,6.2,90
99996,1,2.0,0,0,17.37,6.5,100
99997,0,66.0,0,0,27.83,5.7,155
99998,1,24.0,0,0,35.42,4.0,100


In [18]:
# Target vector: the 'diabetes' column of df3.
y = df3["diabetes"]
y

0        0
1        0
2        0
3        0
4        0
        ..
99995    0
99996    0
99997    0
99998    0
99999    0
Name: diabetes, Length: 99982, dtype: int64

In [19]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=54)

Optimizing KNN

In [20]:
# score_list=[] # list to store the score
# k_list=[] # list to store the value of k
# for i in range (1,21):
#     knn = KNeighborsClassifier(n_neighbors=i)
#     knn.fit(X_train, y_train)
#     test_score=knn.score(X_test,y_test)
#     score_list.append(test_score)
#     k_list.append(i)

In [21]:
# #Plotting the score vs value of k
# plt.plot(k_list,score_list,color='blue', linestyle='dashed', marker='o',
# markerfacecolor='red', markersize=10)
# plt.title('Scores vs. K Value')
# plt.xlabel('K')
# plt.ylabel('Scores')


After optimizing, the final value chosen for k is 7.

In [22]:
# Build the classifier with 7 neighbours and fit it on the feature matrix X
# and the target vector y.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X, y)

Predicting the data.

In [23]:
X.columns

Index(['gender', 'age', 'hypertension', 'heart_disease', 'bmi', 'HbA1c_level',
       'blood_glucose_level'],
      dtype='object')

Creating a function for making predictions on new data.

In [24]:
def diabetes(gender,age,hypertension,heart,bmi,hblevel,bglevel):
    """Predict the diabetes class for one set of feature values using ``knn``.

    The arguments are given in the same order as the columns the model was
    fitted on: gender, age, hypertension, heart_disease, bmi, HbA1c_level,
    blood_glucose_level.

    Parameters
    ----------
    gender : encoded gender value (the notebook encodes Male as 0, Female as 1).
    age : age of the person.
    hypertension : 0 or 1.
    heart : heart-disease flag, 0 or 1 (the 'heart_disease' column).
    bmi : body mass index.
    hblevel : HbA1c level (the 'HbA1c_level' column).
    bglevel : blood glucose level (the 'blood_glucose_level' column).

    Returns
    -------
    The result of ``knn.predict`` for the single row, i.e. an array holding
    one predicted label.
    """
    # The model was fitted on a DataFrame, so pass a DataFrame with the same
    # column names (and order) at prediction time. Predicting from a bare
    # ndarray makes scikit-learn warn that the input lacks valid feature names.
    feature_names=['gender','age','hypertension','heart_disease','bmi',
                   'HbA1c_level','blood_glucose_level']
    row=pd.DataFrame(
        [[gender,age,hypertension,heart,bmi,hblevel,bglevel]],
        columns=feature_names,
    )
    return knn.predict(row)

In [25]:
# Example prediction for a single set of feature values.
print(diabetes(gender=0, age=73, hypertension=0, heart=0,
               bmi=25.19, hblevel=9.0, bglevel=170))

[1]


Saving the trained model to a pickle file so it can be reused for prediction.

In [26]:
import pickle

# Serialize the fitted classifier to "classifier.pkl". The with-block closes
# the file even if pickle.dump raises, which the manual open()/close() pair
# did not guarantee.
with open("classifier.pkl","wb") as pickle_out:
    pickle.dump(knn,pickle_out)