# 1. Pima Indians Diabetes Dataset  

The Pima Indians Diabetes Dataset involves predicting the onset of diabetes within 5 years in Pima Indians given medical details.

It is a binary (2-class) classification problem. The number of observations for each class is not balanced. There are 768 observations with 8 input variables and 1 output variable. The variable names are as follows:

0. Number of times pregnant.
1. Plasma glucose concentration a 2 hours in an oral glucose tolerance test.
2. Diastolic blood pressure (mm Hg).
3. Triceps skinfold thickness (mm).
4. 2-Hour serum insulin (mu U/ml).
5. Body mass index (weight in kg/(height in m)^2).
6. Diabetes pedigree function.
7. Age (years).
8. Class variable (0 or 1).

In [56]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import Imputer

def model_fit(dataset):
    values = dataset.values
    X = values[:,1:8]
    Y = values[:,8]
    lda = LinearDiscriminantAnalysis()
    kfold = KFold(n_splits=3, random_state=7)
    result = cross_val_score(lda, X, Y, cv = kfold, scoring="accuracy")
    print("Result of LDA:", result.mean())

In [76]:
pima = pd.read_csv("pima-indians-diabetes.data.csv", header=None)
print(pima.shape)
print((pima[[1,2,3,4,5]] == 0).sum())
pima.describe()


(768, 9)
1      5
2     35
3    227
4    374
5     11
dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


# 2.Mark Missing Values

In [40]:
pima[[1,2,3,4,5]] =  pima[[1,2,3,4,5]].replace(0, np.NaN)
print(pima.isnull().sum())

0      0
1      5
2     35
3    227
4    374
5     11
6      0
7      0
8      0
dtype: int64


In [41]:
pima.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,,,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,,,,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,,,,0.232,54,1


In [42]:
model_fit(pima)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

> We got a **error** in train the model Because of ***Missing Values contain in Dataset***

# 3.Remove Rows With Missing Values

In [45]:
pima.shape

(768, 9)

In [46]:
import numpy
pima[[1,2,3,4,5]] = pima[[1,2,3,4,5]].replace(0, numpy.NaN)
pima.dropna(inplace=True)

In [47]:
pima.shape

(392, 9)

In [48]:
model_fit(pima)

Result of LDA: 0.7883734586024662


# 4.Impute Missing Values

In [51]:
pima.shape

(768, 9)

# 1.Impute value with ***mean()***

In [52]:
pima[[1,2,3,4,5]] = pima[[1,2,3,4,5]].replace(0, numpy.NaN)
# fill missing values with mean column values
pima.fillna(pima.mean(), inplace=True)

In [53]:
pima.shape

(768, 9)

In [54]:
pima.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,29.15342,155.548223,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,72.405184,29.15342,155.548223,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,29.15342,155.548223,32.457464,0.232,54,1


In [55]:
model_fit(pima)

Result of LDA: 0.7643229166666666


# 2.Impute value with ***Impute() Function***

In [58]:
pima[[1,2,3,4,5]] = pima[[1,2,3,4,5]].replace(0, numpy.NaN)

In [80]:
values = pima.values
X = values[:,1:8]
Y = values[:,8]
impute = Imputer()
transformated_values = impute.fit_transform(values)
print(numpy.isnan(transformated_values).sum())
lda = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(lda, transformated_values, y, cv=kfold, scoring='accuracy')
print("Result of LDA:", result.mean())

0
Result of LDA: 0.7734375




In [86]:
transformated_values = pd.DataFrame(transformated_values)
transformated_values.isnull().sum()


0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64