In [4]:
#Here we are going to use PIMA Indian Daibities Dataset
#The variable names are as follows:
Pregnancies – Number of times pregnant
Glucose – Plasma glucose concentration a 2 hours in an oral glucose tolerance test
Blood Pressure – Diastolic blood pressure (mm Hg)
Skin Thickness – Triceps skinfold thickness (mm)
Insulin – 2-Hour serum insulin (mu U/ml)
BMI – Body mass index (weight in kg/(height in m)^2)
DiabetesPedigreeFunction – Diabetes pedigree function
Age – Age in years
Outcome – Class variable (0 or 1)

SyntaxError: invalid character in identifier (<ipython-input-4-e27b442bbe0c>, line 3)

In [5]:
#Importing necessary modules
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [6]:
#Loading the dataset using pandas .read_csv()
data = pd.read_csv("diabetes.csv")

In [7]:
#Checking the shape of our data
data.shape

(768, 9)

In [8]:
#Printing top 5 rows to make us familier with dataset
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
#Using .info() function on a DataFrame we can get basic information about our DataFrame like features and their datatypes, no of missing values, number of rows or columns etc. as shown below:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [10]:
#Checking basic test statistics like mean, median etc.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [11]:
#let’s check how much data our in dataset is missing (in form of 0):
(data.iloc[:,1:6] == 0).sum()

Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
dtype: int64

In [12]:
#Let's convert the 0’s into NaN by using .replace() method
for i in range(1,6):
    data.iloc[:, i].replace(0, np.nan, inplace=True)

In [13]:
#Again checking how much data in our dataset is mentioned as 0:
(data.iloc[:,1:6] == 0).sum()

Glucose          0
BloodPressure    0
SkinThickness    0
Insulin          0
BMI              0
dtype: int64

In [14]:
#let’s again for confirmation, check our dataset using .head() method and print the top 10 rows
data.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,,,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,,,,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,,,,0.232,54,1


In [15]:
#The above process is not generally advised because it will delete all observations where any of variables is missing ultimately reducing the size of our dataset and quality of our model.
data = data.dropna()

In [16]:
#Checking how much data is left after using .dropna() method
data.shape

(392, 9)

In [17]:
#Almost 50% of our data is deleted which is not good for us. 
#If only a few rows contain missing values, then it’s not so bad, but generally, we need a more robust method. 
#So this method is only advised to use if NaN values are few in numbers.

In [18]:
#Filling missing values with a test statistics like mean, median or mode.
#Showing here with mean only

mean_value=data['Age'].mean()
data['Age']=data['Age'].fillna(mean_value)

In [19]:
#Using Forward fill and Backward fill
#Backward fill
data.fillna(method='bfill')
#Forward Fill
data.fillna(method='ffill')

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
13,1,189.0,60.0,23.0,846.0,30.1,0.398,59,1
...,...,...,...,...,...,...,...,...,...
753,0,181.0,88.0,44.0,510.0,43.3,0.222,26,1
755,1,128.0,88.0,39.0,110.0,36.5,1.057,37,1
760,2,88.0,58.0,26.0,16.0,28.4,0.766,22,0
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63,0


In [20]:
#Using Imputer method from sklearn.preprocessing:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(X)
X = imputer.transform(X)
#Above we can use strategy=’most_frequent’ for using mode

NameError: name 'X' is not defined

In [None]:
#Using imputer and fitting out the model at once by using a Sklearn’s pipeline object as shown below:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Imputer
imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
log = LogisticRegression()
steps = [('imputation', imp), ('logistic_regresson', log)]
pipeline = pipeline(steps)