In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from 'diabetes.csv' (path relative to the working directory) into a DataFrame.
data = pd.read_csv('diabetes.csv')

In [3]:
# Show the column labels of the loaded frame.
data.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

In [4]:
# Preview the first 3 rows.
data.head(3)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1


In [5]:
# Dimensions of the frame as (rows, columns).
data.shape

(768, 9)

In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
# In the recorded output the minimum is 0 for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI, which is worth checking for placeholder values.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [7]:
# Random 10 data points. A fixed random_state makes the displayed rows
# identical on every Restart & Run All instead of changing per execution.
data.sample(10, random_state=42)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
422,0,102,64,46,78,40.6,0.496,21,0
162,0,114,80,34,285,44.2,0.167,27,0
84,5,137,108,0,0,48.8,0.227,37,1
265,5,96,74,18,67,33.6,0.997,43,0
324,2,112,75,32,0,35.7,0.148,21,0
271,2,108,62,32,56,25.2,0.128,21,0
430,2,99,0,0,0,22.2,0.108,23,0
705,6,80,80,36,0,39.8,0.177,28,0
93,4,134,72,0,0,23.8,0.277,60,1
176,6,85,78,0,0,31.2,0.382,42,0


In [8]:
# Materialise the column labels as a plain Python list.

column_names = data.columns.tolist()
column_names

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

### Marking missing values

In [9]:
# Count the zero entries per column for the columns at positions 1..5
# (Glucose through BMI). This notebook treats a 0 in these columns as a
# missing value, so the counts below are the per-column missing totals.

num_missing = data[column_names[1:6]].eq(0).sum()
num_missing

Glucose            5
BloodPressure     35
SkinThickness    227
Insulin          374
BMI               11
dtype: int64

In [10]:
# Mark the zero placeholders as NaN in the columns at positions 1..5
# (Glucose through BMI) so pandas recognises them as missing.
zero_marked_cols = column_names[1:6]
data[zero_marked_cols] = data[zero_marked_cols].replace(0, np.nan)

# Number of NaN entries per column after the replacement.
data.isna().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [11]:
# Spot-check 10 random rows after the zero -> NaN replacement. A fixed
# random_state keeps the displayed rows reproducible across runs.
data.sample(10, random_state=42)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
608,0,152.0,82.0,39.0,272.0,41.5,0.27,27,0
624,2,108.0,64.0,,,30.8,0.158,21,0
712,10,129.0,62.0,36.0,,41.2,0.441,38,1
676,9,156.0,86.0,,,24.8,0.23,53,1
534,1,77.0,56.0,30.0,56.0,33.3,1.251,24,0
591,2,112.0,78.0,50.0,140.0,39.4,0.175,24,0
427,1,181.0,64.0,30.0,180.0,34.1,0.328,38,1
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
650,1,91.0,54.0,25.0,100.0,25.2,0.234,23,0
74,1,79.0,75.0,30.0,,32.0,0.396,22,0


### Removing rows with missing values

In [12]:
# Percentage of missing (NaN) values in each column: the mean of the boolean
# null mask is the fraction of missing entries, scaled here to percent.

data.isna().mean().mul(100)

Pregnancies                  0.000000
Glucose                      0.651042
BloodPressure                4.557292
SkinThickness               29.557292
Insulin                     48.697917
BMI                          1.432292
DiabetesPedigreeFunction     0.000000
Age                          0.000000
Outcome                      0.000000
dtype: float64

In [13]:
# dropna() returns a copy without the rows that contain at least one missing
# value. The result is rebound to `data` rather than using inplace=True, so
# the frame is not mutated in place while `data` still names the cleaned
# frame afterwards.
# NOTE: in the recorded output this keeps 392 of 768 rows; Insulin alone is
# missing in roughly 49% of rows, so most of the loss comes from that column.

print('Shape before dropna',data.shape)

data = data.dropna()

print('Shape after dropna',data.shape)

Shape before dropna (768, 9)
Shape after dropna (392, 9)
