In [35]:
import pandas as pd
import numpy as np
import os

In [36]:
# Locations are relative to the notebook's working directory.
csv_path = os.path.join('datasets', 'data-prep')                    # folder with the data-prep CSVs
diabetes_path = os.path.join('datasets', 'data-prep', 'diabetes.csv')  # raw diabetes file inside it

In [37]:
# Load the raw diabetes CSV and preview five randomly sampled rows.
# No random_state is passed to sample(), so the preview differs between runs.
diabetes = pd.read_csv(diabetes_path)
diabetes.sample(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
201,1,138,82,0,0,40.1,0.236,28,0
74,1,79,75,30,0,32.0,0.396,22,0
615,3,106,72,0,0,25.8,0.207,27,0
31,3,158,76,36,245,31.6,0.851,28,1
297,0,126,84,29,215,30.7,0.52,24,0


In [38]:
# (rows, columns) of the loaded frame.
diabetes.shape

(768, 9)

In [39]:
# Column dtypes and non-null counts for every column.
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [40]:
# Summary statistics, transposed so each data column becomes one row.
# In the output, Glucose, BloodPressure, SkinThickness, Insulin and BMI all
# show a minimum of 0.0; those zeros are converted to NaN in the next cell.
diabetes.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Pregnancies,768.0,3.845052,3.369578,0.0,1.0,3.0,6.0,17.0
Glucose,768.0,120.894531,31.972618,0.0,99.0,117.0,140.25,199.0
BloodPressure,768.0,69.105469,19.355807,0.0,62.0,72.0,80.0,122.0
SkinThickness,768.0,20.536458,15.952218,0.0,0.0,23.0,32.0,99.0
Insulin,768.0,79.799479,115.244002,0.0,0.0,30.5,127.25,846.0
BMI,768.0,31.992578,7.88416,0.0,27.3,32.0,36.6,67.1
DiabetesPedigreeFunction,768.0,0.471876,0.331329,0.078,0.24375,0.3725,0.62625,2.42
Age,768.0,33.240885,11.760232,21.0,24.0,29.0,41.0,81.0
Outcome,768.0,0.348958,0.476951,0.0,0.0,0.0,1.0,1.0


In [41]:
# The notebook treats a 0 in these measurement columns as a placeholder for a
# missing reading, so it is converted to NaN before counting and imputing.
#
# Each column is assigned back explicitly instead of calling
# `diabetes[col].replace(..., inplace=True)`: the in-place form mutates a
# Series obtained by chained indexing, which emits a FutureWarning on
# pandas 2.x and leaves `diabetes` unchanged under Copy-on-Write.
for column in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
    diabetes[column] = diabetes[column].replace(0, np.nan)

In [42]:
# Missing values per column after the zero -> NaN replacement.
diabetes.isnull().sum() # Same as isna()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [47]:
# SkinThickness as a single-column 2-D array (one row per record), the
# layout passed to the imputer in the following cells.
arr = diabetes['SkinThickness'].to_numpy().reshape(-1, 1)
arr.shape

(768, 1)

In [50]:
# Provides basic strategies for imputing missing values using - constant, mean, median, mode
from sklearn.impute import SimpleImputer

In [52]:
# Mode imputation: NaN entries will be filled with the most frequent observed value.
imp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imp.fit(arr)

SimpleImputer(add_indicator=False, copy=True, fill_value=None,
              missing_values=nan, strategy='most_frequent', verbose=0)

In [53]:
# Overwrite the column with the imputed values produced by the fitted imputer.
# `arr` is the (768, 1) array built earlier, so a single-column 2-D result is
# assigned to the column here.
diabetes['SkinThickness'] = imp.transform(arr)

In [56]:
# Summary of SkinThickness after mode imputation.
diabetes['SkinThickness'].describe()

count    768.000000
mean      29.994792
std        8.886506
min        7.000000
25%       25.000000
50%       32.000000
75%       32.000000
max       99.000000
Name: SkinThickness, dtype: float64

In [57]:
# Remaining missing values per column after imputing SkinThickness.
diabetes.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness                 0
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [58]:
from sklearn.impute import SimpleImputer

# Median imputation for Glucose: fit on the column and fill its NaN entries in one step.
arr = diabetes['Glucose'].values.reshape(-1,1)
imp = SimpleImputer(missing_values=np.nan, strategy='median')
diabetes['Glucose'] = imp.fit_transform(arr)
diabetes['Glucose'].describe()

count    768.000000
mean     121.656250
std       30.438286
min       44.000000
25%       99.750000
50%      117.000000
75%      140.250000
max      199.000000
Name: Glucose, dtype: float64

In [59]:
# Remaining missing values per column after imputing Glucose.
diabetes.isnull().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                35
SkinThickness                 0
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [60]:
from sklearn.impute import SimpleImputer

# Mean imputation for BloodPressure: NaN entries are replaced by the column mean.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
diabetes['BloodPressure'] = imp.fit_transform(diabetes['BloodPressure'].values.reshape(-1,1))
diabetes['BloodPressure'].describe()

count    768.000000
mean      72.405184
std       12.096346
min       24.000000
25%       64.000000
50%       72.202592
75%       80.000000
max      122.000000
Name: BloodPressure, dtype: float64

In [61]:
# Remaining missing values per column after imputing BloodPressure.
diabetes.isnull().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [64]:
# Summary statistics for BMI before imputation; the count in the output is 757
# because the 11 NaN entries are excluded.
diabetes[['BMI']].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
BMI,757.0,32.457464,6.924988,18.2,27.5,32.3,36.6,67.1


In [65]:
from sklearn.impute import SimpleImputer

# Constant-fill strategy: every missing BMI becomes 32, a rounded stand-in for the
# column mean (32.457464 in the describe() output of the previous cell).
imp = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=32)
diabetes['BMI'] = imp.fit_transform(diabetes['BMI'].values.reshape(-1,1))
diabetes['BMI'].describe()

count    768.000000
mean      32.450911
std        6.875366
min       18.200000
25%       27.500000
50%       32.000000
75%       36.600000
max       67.100000
Name: BMI, dtype: float64

In [66]:
# Remaining missing values per column after imputing BMI; only Insulin is still incomplete.
diabetes.isnull().sum()

Pregnancies                   0
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                     374
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [74]:
# Persist the partially imputed frame (Insulin still has missing values, hence
# "incomplete") next to the source CSV, without writing the row index.
diabetes.to_csv(
    path_or_buf=os.path.join(csv_path, 'diabetes_processed_incomplete.csv'),
    index=False,
)

In [75]:
# List the data folder to confirm the new CSV was written.
# NOTE(review): this is an IPython line magic (works only with automagic on) and
# repeats the folder path as a literal instead of reusing `csv_path`.
ls "datasets/data-prep"

 Volume in drive C is Windows
 Volume Serial Number is 2A82-BEAC

 Directory of C:\Users\pankaj\00_python_codes\Pluralsight\PythonProgramming\corepy\PythonPractise\datasets\data-prep

12-04-2020  19:24    <DIR>          .
12-04-2020  19:24    <DIR>          ..
12-04-2020  17:42            13,441 auto_mpg_processed.csv
12-04-2020  17:42            18,131 auto-mpg.csv
12-04-2020  17:42            25,336 cars.csv
12-04-2020  18:23            13,530 cars_processed.csv
12-04-2020  19:24            11,065 cars_processed2.csv
12-04-2020  17:42            23,873 diabetes.csv
12-04-2020  17:42            39,972 diabetes_processed.csv
13-04-2020  12:11            32,381 diabetes_processed_incomplete.csv
12-04-2020  17:42         1,616,446 GoSales_Tx_LogisticRegression.csv
               9 File(s)      1,794,175 bytes
               2 Dir(s)  667,812,823,040 bytes free
