## Import all the required libraries

In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## Loading the dataset

In [None]:
# Load the CSV; every column except the last is a feature and the last
# column is the label.
dataset = pd.read_csv("pima-indian-diabetes.csv")
# Take an explicit copy: X is modified later on, and a bare slice of
# `dataset` would make that a chained assignment on the parent frame.
X = dataset.iloc[:, :-1].copy()
y = dataset.iloc[:, -1].values

In [None]:
# Show the feature columns (all columns of `dataset` except the last).
print(X)

[[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]


In [None]:
# Show the label vector taken from the last column of `dataset`.
print(y)

[1 0 1 0 1 0 1 0 1 1 0 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 1 0 0 0 0 1 0 0 0 0 0
 1 1 1 0 0 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0
 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1
 1 0 0 1 1 1 0 0 0 1 0 0 0 1 1 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
 0 0 0 0 1 0 1 1 0 0 0 1 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0
 1 1 1 1 1 0 0 1 1 0 1 0 1 1 1 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 1 1 0 1 1 1 1
 0 0 0 0 0 1 0 0 1 1 0 0 0 1 1 1 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 0 0
 1 0 1 0 0 1 0 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 0 1 1 1 0 0
 1 0 1 0 1 1 0 1 0 0 1 0 1 1 0 0 1 0 1 0 0 1 0 1 0 1 1 1 0 0 1 0 1 0 0 0 1
 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 0 1 0 0 1
 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 1 0 1
 0 1 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 1
 1 1 0 0 1 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 0 

In [None]:
# List the column names of the loaded frame.
print(dataset.columns)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


## Taking care of missing data

In [None]:
# Per-column flag: True if the column holds at least one null entry.
null_values = dataset.isnull().any()

In [None]:
# Display the per-column null flags.
print(null_values)

Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool


In [None]:
# Per-column flag: True if the column holds at least one NaN.
# NOTE(review): pandas documents `isnull` as an alias of `isna`, so this
# repeats the null check on `dataset` made earlier.
nan_values = dataset.isna().any()

In [None]:
# Display the per-column NaN flags for the raw dataset.
print(nan_values)

Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool


In [None]:
# Flag every column of the raw dataset that contains a literal 0.
# Define the flags in this cell so it runs top-to-bottom; previously
# `zero_values` was only assigned in a later cell.
zero_values = (dataset == 0).any()
print(zero_values)

Pregnancies                  True
Glucose                      True
BloodPressure                True
SkinThickness                True
Insulin                      True
BMI                          True
DiabetesPedigreeFunction    False
Age                         False
Outcome                      True
dtype: bool


In [None]:
# Mark every 0 in the feature columns as missing (NaN). Rebind instead of
# using inplace=True: X was sliced from `dataset`, and in-place edits on a
# slice are a chained assignment.
# NOTE(review): 0 looks like a plausible real value for "Pregnancies";
# confirm whether that column should be treated as missing here.
X = X.replace(0, np.nan)

In [None]:
# Re-check the feature columns for literal zeros after the replacement.
zero_values = X.eq(0).any()

In [None]:
# Per-column flag on the features: True if the column now holds a NaN.
nan_values = X.isna().any()

In [None]:
# Display which feature columns contain NaN.
print(nan_values)

Pregnancies                  True
Glucose                      True
BloodPressure                True
SkinThickness                True
Insulin                      True
BMI                          True
DiabetesPedigreeFunction    False
Age                         False
dtype: bool


In [None]:
# Missing entries per feature column: total rows minus the non-missing
# count that `DataFrame.count` reports for each column.
nan_values_count = len(X) - X.count()
print(nan_values_count)

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
dtype: int64


In [None]:
# Drop the pandas wrapper and keep the features as a plain NumPy array.
X = X.to_numpy()

In [None]:
# The zeros that stood for missing readings were already converted to NaN,
# so NaN - not 0 - is the marker to impute. With missing_values=0 the
# imputer would have nothing to fill and would reject the NaN entries.
# Each missing entry is replaced by the mean of its column.
# NOTE(review): the imputer is fitted on the full feature array before the
# train/test split, so test rows contribute to the imputed means.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X = imputer.fit_transform(X)

In [None]:
# Show the feature array returned by the imputer.
print(X)

[[  6.    148.     72.    ...  33.6     0.627  50.   ]
 [  1.     85.     66.    ...  26.6     0.351  31.   ]
 [  8.    183.     64.    ...  23.3     0.672  32.   ]
 ...
 [  5.    121.     72.    ...  26.2     0.245  30.   ]
 [  1.    126.     60.    ...  30.1     0.349  47.   ]
 [  1.     93.     70.    ...  30.4     0.315  23.   ]]


In [None]:
# Bare expression: evaluates to the container type of X.
type(X)

numpy.ndarray

In [None]:
# True only if every element of X is truthy, i.e. no entry equals 0.
print(X.all())

True


In [None]:
## X no longer contains any zero values

## Encoding Categorical Data

In [None]:
## There is no categorical data in this dataset.

## Splitting the Data into the Training Set and Test Set

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shape of the training features.
print(X_train.shape)

(614, 8)


In [None]:
# Shape of the test features.
print(X_test.shape)

(154, 8)


In [None]:
# Shape of the training labels.
print(y_train.shape)

(614,)


In [None]:
# Shape of the test labels.
print(y_test.shape)

(154,)


## Feature Scaling

In [None]:
# Learn the scaling statistics from the training rows only, then apply
# that same fitted transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Show the training features after scaling.
print(X_train)

[[-0.82506128 -1.25828206  0.01321033 ...  0.01501323 -0.49073479
  -1.03594038]
 [ 1.57255664 -0.32735374  0.8068672  ... -0.59935041  2.41502991
   1.48710085]
 [-1.16757813  0.57032714 -2.17095414 ... -0.52719904  0.54916055
  -0.94893896]
 ...
 [ 1.91507348 -0.69307558  1.13773624 ...  1.91151712  1.981245
   0.44308379]
 [ 0.02940616  0.63682202  0.01321033 ...  1.44974838 -0.78487662
  -0.33992901]
 [ 0.02940616  0.10486298  1.96490883 ... -1.42187598 -0.61552223
  -1.03594038]]


In [None]:
# Show the test features after applying the scaler fitted on the training split.
print(X_test)

[[ 0.5450061  -0.7928179  -1.17834702 ...  0.23760544 -0.11637247
   0.87809089]
 [-0.82506128 -0.32735374  0.22784639 ...  0.48292008 -0.954231
  -1.03594038]
 [-0.82506128 -0.4603435  -0.68204347 ... -0.22416331 -0.9245197
  -1.03594038]
 ...
 [ 1.23003979 -0.89256022 -0.02030539 ...  0.64165309  0.04703966
   2.0961108 ]
 [-0.82506128  0.80305922 -0.18573991 ... -0.62821095 -0.39268751
  -0.33992901]
 [ 1.23003979 -1.59075646 -0.18573991 ...  0.42519899  0.70068816
   0.53008521]]
