### PIMA with LightGBM && MLP


In [3]:
# Python libraries
# Classic,data manipulation and linear algebra
import pandas as pd
import numpy as np

# Plots
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


# Data processing, metrics and modeling
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import precision_score, recall_score, confusion_matrix,  roc_curve, precision_recall_curve, accuracy_score, roc_auc_score
import lightgbm as lgbm
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_curve,auc
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_predict
#from yellowbrick.classifier import DiscriminationThreshold

# Stats
import scipy.stats as ss
from scipy import interp
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform 
    
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from keras import Sequential
from keras.layers import Dense
from keras.layers import Dense, Dropout, Activation, Flatten

#ignore warning messages 
import warnings
warnings.filterwarnings('ignore') 

Using TensorFlow backend.


In [4]:
# support functions
# prints missing data per column
def missing(dff):
    return pd.DataFrame(round((dff.isnull().sum() * 100/ len(dff)),2).sort_values(ascending=False))
#obtains median for target for each column
def median_target(var):   
    temp = data[data[var].notnull()]
    temp = temp[[var, 'Outcome']].groupby(['Outcome'])[[var]].median().reset_index()
    return (temp)


In [5]:
data = pd.read_csv('e:/$Notebooks/pima-indians-diabetes.csv')

In [6]:
display(data.info(),data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


None

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


The datasets consist of several medical predictor (independent) variables and one target (dependent) variable, Outcome. Independent variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

The above graph shows that the data is unbalanced. The number of non-diabetic is 268 the number of diabetic patients is 500

In [7]:
data[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']] = data[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0,np.NaN)
missing(data)

Unnamed: 0,0
Insulin,48.7
SkinThickness,29.56
BloodPressure,4.56
BMI,1.43
Glucose,0.65
Outcome,0.0
Age,0.0
DiabetesPedigreeFunction,0.0
Pregnancies,0.0


    Fill missing values with median (against target)

In [8]:
res = median_target('Insulin')
res.loc[res['Outcome'] == 0].Insulin[0]

102.5

In [9]:
res = median_target('Insulin')

data.loc[(data['Outcome'] == 0 ) & (data['Insulin'].isnull()), 'Insulin'] = res.loc[res['Outcome'] == 0].Insulin[0]
data.loc[(data['Outcome'] == 1 ) & (data['Insulin'].isnull()), 'Insulin'] = res.loc[res['Outcome'] == 1].Insulin[1]

res = median_target('SkinThickness')
data.loc[(data['Outcome'] == 0 ) & (data['SkinThickness'].isnull()), 'SkinThickness'] = res.loc[res['Outcome'] == 0].SkinThickness[0]
data.loc[(data['Outcome'] == 1 ) & (data['SkinThickness'].isnull()), 'SkinThickness'] = res.loc[res['Outcome'] == 1].SkinThickness[1]

res = median_target('BloodPressure')
data.loc[(data['Outcome'] == 0 ) & (data['BloodPressure'].isnull()), 'BloodPressure'] = res.loc[res['Outcome'] == 0].BloodPressure[0]
data.loc[(data['Outcome'] == 1 ) & (data['BloodPressure'].isnull()), 'BloodPressure'] = res.loc[res['Outcome'] == 1].BloodPressure[1]

res = median_target('BMI')
data.loc[(data['Outcome'] == 0 ) & (data['BMI'].isnull()), 'BMI'] = res.loc[res['Outcome'] == 0].BMI[0]
data.loc[(data['Outcome'] == 1 ) & (data['BMI'].isnull()), 'BMI'] = res.loc[res['Outcome'] == 1].BMI[1]

res = median_target('Glucose')
data.loc[(data['Outcome'] == 0 ) & (data['Glucose'].isnull()), 'Glucose'] = res.loc[res['Outcome'] == 0].Glucose[0]


Now - Generate new features (not done)

In [10]:
# You could trick the features creating new ones by combinations of columns. It has not been done in this example 

In [11]:
# Def X and y
X = data.iloc[:,0:8]
y = data.iloc[:,8]

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [12]:
lgb_train = lgbm.Dataset(X_train, y_train)
lgb_eval = lgbm.Dataset(X_test, y_test, reference=lgb_train)
params = {
        'task': 'train',
        'objective': 'binary',    
        'metric': 'binary_error', 
        'verbose': 1
}
gbm = lgbm.train(params,
            lgb_train,
            num_boost_round=50,       
            early_stopping_rounds=10, 
            valid_sets=[lgb_train,lgb_eval],
            valid_names=['train', 'eval'])
#Confusion matrix
y_pred=gbm.predict(X_test)
for i in range(0,len(y_pred)):
    if y_pred[i]>=.5:       # setting threshold to .5
       y_pred[i]=1
    else:  
       y_pred[i]=0
cm = confusion_matrix(y_test, y_pred)
accuracy_lgbm = accuracy_score(y_pred,y_test)

print('confussion matrix: ',cm)
print('accuracy:', accuracy_lgbm)

[1]	train's binary_error: 0.370577	eval's binary_error: 0.298701
Training until validation scores don't improve for 10 rounds.
[2]	train's binary_error: 0.370577	eval's binary_error: 0.298701
[3]	train's binary_error: 0.16946	eval's binary_error: 0.186147
[4]	train's binary_error: 0.124767	eval's binary_error: 0.17316
[5]	train's binary_error: 0.0875233	eval's binary_error: 0.17316
[6]	train's binary_error: 0.0875233	eval's binary_error: 0.177489
[7]	train's binary_error: 0.0819367	eval's binary_error: 0.168831
[8]	train's binary_error: 0.0856611	eval's binary_error: 0.177489
[9]	train's binary_error: 0.0819367	eval's binary_error: 0.164502
[10]	train's binary_error: 0.0819367	eval's binary_error: 0.164502
[11]	train's binary_error: 0.0726257	eval's binary_error: 0.160173
[12]	train's binary_error: 0.0726257	eval's binary_error: 0.155844
[13]	train's binary_error: 0.0744879	eval's binary_error: 0.168831
[14]	train's binary_error: 0.0726257	eval's binary_error: 0.168831
[15]	train's bin

In [13]:
def model_diabetes(optimizer='Adam', init='uniform', activation='relu', dropout = 0.5, layer_size=16):
    
    model = Sequential()
    model.add(Dense(12, input_dim=8, activation=activation,kernel_initializer=init))
    model.add(Dropout(dropout))
    model.add(Dense(layer_size, activation=activation))
    model.add(Dropout(dropout))
    model.add(Dense(1, activation=activation))
    model.compile(loss='binary_crossentropy', optimizer='Adam', metrics=['accuracy'])
    return model

NN = model_diabetes()
NN.summary()
#Fitting the data to the training dataset
NN.fit(X_train,y_train, batch_size=10, epochs=10)
y_pred=NN.predict(X_test)
for i in range(0,len(y_pred)):
    if y_pred[i]>=.5:       # setting threshold to .5
       y_pred[i]=1
    else:  
       y_pred[i]=0
cm = confusion_matrix(y_test, y_pred)
accuracy=accuracy_score(y_pred,y_test)

print('confussion matrix: ',cm)
print('accuracy:', accuracy)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 12)                108       
_________________________________________________________________
dropout_1 (Dropout)          (None, 12)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 16)                208       
_________________________________________________________________
dropout_2 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 333
Trainable params: 333
Non-trainable params: 0
_______________________

In [16]:
print('Accuracy lgbm : ', accuracy_lgbm, 'Accuracy MLP : ', accuracy)

Accuracy lgbm :  0.8571428571428571 Accuracy MLP :  0.7012987012987013
