In [1]:
# To enable plotting graphs in Jupyter notebook
%matplotlib inline 
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier


#Let us break the X and y dataframes into training set and test set. For this we will use
#Sklearn package's data splitting function which is based on random function

from sklearn.model_selection import train_test_split

#import numpy as np


# calculate accuracy measures and confusion matrix
#from sklearn import metrics




In [2]:
# Since it is a data file with no header, we will supply the column names which have been obtained from the above URL 
# Create a python list of column names called "names"

colnames = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

#Load the file from local directory using pd.read_csv which is a special form of read_table
#while reading the data, supply the "colnames" list

pima_df = pd.read_csv("pima-indians-diabetes.data", names= colnames)
pima_df.groupby(["class"]).count()  # count the instances of each class in the data to check if data is skewed towards a class


Unnamed: 0_level_0,preg,plas,pres,skin,test,mass,pedi,age
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,500,500,500,500,500,500,500,500
1,268,268,268,268,268,268,268,268


In [3]:
# replace the missing values in pima_df with median value :Note, we do not need to specify the column names
# every column's missing value is replaced with that column's median respectively
pima_df = pima_df.fillna(pima_df.median())

In [4]:
array = pima_df.values
X = array[:,0:7] # select all rows and first 8 columns which are the attributes
Y = array[:,8]   # select all rows and the 8th column which is the classification "Yes", "No" for diabeties
test_size = 0.30 # taking 70:30 training and test set
seed = 7  # Random numbmer seeding for reapeatability of the code
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

In [5]:
#dt_model = DecisionTreeClassifier(criterion = 'entropy', class_weight={0:.80,1:.20}, max_depth = 9, min_samples_leaf=5 )
dt_model = DecisionTreeClassifier(criterion = 'entropy', random_state = 1)
# When the weights are changed in favor of class 1 (diabetic) because their instances are half that in the other class, the 
# accuracy drops.... Why?

# When the weights are changee to favor the 0 class, the accuracy improves. ... Why?

In [6]:
lrcl = LogisticRegression(random_state=1)
rfcl = RandomForestClassifier(random_state=1)
nbcl = GaussianNB()
bgcl = BaggingClassifier(base_estimator=dt_model, n_estimators=50 , random_state=1)  
#the base_estimator can be null. The bagging classifer  will build it's own tree

enclf = VotingClassifier(estimators = [('lor', lrcl), ('rf', rfcl), ('nb', nbcl), ('bg', bgcl)], voting = 'hard')

In [7]:
for clf, label in zip([lrcl , rfcl, nbcl, enclf, bgcl], ['Logistic Regression', 'RandomForest', 'NaiveBayes', 'Ensemble', 'Bagging']):
    scores = cross_val_score(clf, X, Y, cv=5, scoring='accuracy')
    print("Accuracy: %0.02f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label ))



Accuracy: 0.77 (+/- 0.02) [Logistic Regression]
Accuracy: 0.74 (+/- 0.02) [RandomForest]
Accuracy: 0.77 (+/- 0.02) [NaiveBayes]




Accuracy: 0.76 (+/- 0.02) [Ensemble]
Accuracy: 0.75 (+/- 0.02) [Bagging]
