In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
"""
diabetes = pd.read_csv('/Users/ahmetokur/Desktop/Datasets/Diabetes_Preprocessing.csv')
"""
#Import the diabetes dataset by using url:
diabetes = pd.read_csv('https://raw.githubusercontent.com/ammishra08/MachineLearning/master/Datasets/pima-indians-diabetes.data', header = None)


In [3]:
diabetes.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
diabetes.shape

(768, 9)

In [5]:
diabetes.isnull().sum()

0    0
1    0
2    0
3    0
4    0
5    0
6    0
7    0
8    0
dtype: int64

#### Find the features of dataset

In [6]:
# Use the .NAMES file to view and set the features of the dataset
feature_names = ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age','Outcome']

In [7]:
# Use the feature names set earlier and fix it as the column headers of the dataset
diabetes_df = pd.read_csv('/Users/ahmetokur/Desktop/Datasets/Diabetes_Preprocessing.csv') #, names = feature_names

In [8]:
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,5,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,4.0,35.0,168.0,43.1,2.288,33,1


#### Handling missing values

In [9]:
diabetes_df.isnull().sum().sort_values(ascending = False)

Insulin                     374
SkinThickness               227
BloodPressure                35
BMI                          11
Glucose                       5
Pregnancies                   0
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [10]:
# Replace missing value by mean - continious data, discrete = median, categorical = mode
np.round(diabetes_df['Insulin'].mean())

106.0

In [11]:
diabetes_df['Insulin'].unique()

array([ nan,  94., 168.,  88., 543., 846., 175.,  23.,  83.,  96., 235.,
       146., 115.,  14.,  11., 245.,  54., 192.,  27.,   7.,  24.,  82.,
        36.,   3., 342.,  34., 142., 128.,  38.,   1.,   9.,  71., 125.,
       176.,  48.,  64., 228.,  76.,  22.,   4., 152.,  18., 135., 495.,
        37.,  51.,  99., 145., 225.,  49.,   5.,  92., 325.,  63., 284.,
       119., 155., 485.,  53., 114.,  15., 285., 156.,  78.,  13.,  55.,
        58.,  16.,  21., 318.,  44.,  19.,  28.,  87., 271., 129.,  12.,
       478.,  56.,  32., 744.,  45., 194.,  68.,  42., 258., 375.,  67.,
        57., 116., 278., 122., 545.,  75.,  74., 182., 215., 184., 132.,
       148.,  25.,  85., 231.,  29.,  52., 255., 171.,  73.,  43., 167.,
       249., 293.,  66., 465.,  89., 158.,  84.,  72.,  59.,  81., 196.,
       415., 275., 165., 579.,  31.,  61., 474.,  17., 277.,   6.,  95.,
       237., 191., 328., 265., 193.,  79.,  86., 326., 188.,  65., 166.,
       274.,  77., 126.,  33., 185.,  41., 272., 32

In [13]:
diabetes_df['Insulin'].replace(np.nan, np.round(diabetes_df['Insulin'].mean()), inplace = True)

In [14]:
diabetes_df.isnull().sum().sort_values(ascending = False)

SkinThickness               227
BloodPressure                35
BMI                          11
Glucose                       5
Pregnancies                   0
Insulin                       0
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

### Impute Methode

In [17]:
from sklearn.impute import SimpleImputer
# strategy = mean, median, most_frequent
imputer = SimpleImputer(strategy = 'median')

In [18]:
X_data = imputer.fit_transform(diabetes)

In [19]:
X_data

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

#### Prepare Clean DAtaFrame

In [20]:
diabetes_df2 = pd.DataFrame(X_data, columns = diabetes_df.columns)

In [21]:
diabetes_df2['Pregnancies'] = diabetes_df.Pregnancies
diabetes_df2['Outcome'] = diabetes_df.Outcome

#### 4: Find the response  of the dataset

In [22]:
#Select features from the dataset to create the model
X_feature = diabetes_df2.iloc[:,:-1]
X_feature

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,0.0,33.6,0.627,50.0
1,1,85.0,66.0,29.0,0.0,26.6,0.351,31.0
2,8,183.0,64.0,0.0,0.0,23.3,0.672,32.0
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21.0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33.0
...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63.0
764,2,122.0,70.0,27.0,0.0,36.8,0.340,27.0
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30.0
766,1,126.0,60.0,0.0,0.0,30.1,0.349,47.0


In [23]:
#Create the reponse object
Y_target = diabetes_df2['Outcome']
Y_target.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [24]:
#View the shape of the feature object
X_feature.shape

(768, 8)

In [25]:
#View the shape of the target object
Y_target.shape

(768,)

#### 5.Train the model by using train and test datasets

In [26]:
# Split the dataset to test and train the model
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_feature,Y_target,test_size = 0.25, random_state = 1)

In [29]:
x_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
118,4,97.0,60.0,23.0,0.0,28.2,0.443,22.0
205,5,111.0,72.0,28.0,0.0,23.9,0.407,27.0
506,0,180.0,90.0,26.0,90.0,36.5,0.314,35.0
587,6,103.0,66.0,0.0,0.0,24.3,0.249,29.0
34,10,122.0,78.0,31.0,0.0,27.6,0.512,45.0
...,...,...,...,...,...,...,...,...
645,2,157.0,74.0,35.0,440.0,39.4,0.134,30.0
715,7,187.0,50.0,33.0,392.0,33.9,0.826,34.0
72,13,126.0,90.0,0.0,0.0,43.4,0.583,42.0
235,4,171.0,72.0,0.0,0.0,43.6,0.479,26.0


In [28]:
# Normalizing the features by MinMax Scaler
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler(feature_range=(0,1))
x_train_scaler = scaler.fit_transform(x_train)
x_test_scaler = scaler.transform(x_test)

In [30]:
x_train_scaler

array([[0.26666667, 0.48989899, 0.49180328, ..., 0.42026826, 0.15953654,
        0.01960784],
       [0.33333333, 0.56060606, 0.59016393, ..., 0.3561848 , 0.14349376,
        0.11764706],
       [0.        , 0.90909091, 0.73770492, ..., 0.54396423, 0.10204991,
        0.2745098 ],
       ...,
       [0.86666667, 0.63636364, 0.73770492, ..., 0.64679583, 0.22192513,
        0.41176471],
       [0.26666667, 0.86363636, 0.59016393, ..., 0.64977645, 0.17557932,
        0.09803922],
       [0.6       , 0.51515152, 0.62295082, ..., 0.49031297, 0.25846702,
        0.49019608]])

### 6. Create a model to predict the diabetes outcome

In [36]:
# train a logistic regression model on the training set
from sklearn.linear_model import LogisticRegression
log_Reg = LogisticRegression(solver = 'liblinear', max_iter= 1e9, C = 1e7, penalty = 'l1')

In [37]:
log_Reg.fit(x_train_scaler, y_train)

LogisticRegression(C=10000000.0, max_iter=1000000000.0, penalty='l1',
                   solver='liblinear')

In [38]:
#Make predictions using the testing set
y_pred = log_Reg.predict(x_test_scaler)
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0])

#### Check the accuracy of model

In [39]:
# Evaluate the accuracy of yor model
log_Reg.score(x_test_scaler, y_test)

0.7760416666666666

In [40]:
# print the first 30 true and predicted responses
print ('actual:   ', y_test.values[0:30])
print ('predicted:', y_pred[0:30])

actual:    [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 1 1 0 0 0 1 0 1]
predicted: [0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0]


In [41]:
"""
https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html


"""

'\nhttps://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n'