In [37]:
# read the data into a pandas DataFrame
import pandas as pd
path = 'data/pima-indians-diabetes.data'
col_names = ['pregnant', 'glucose', 'bp', 'skin', 'insulin', 'bmi', 'pedigree', 'age', 'label']
pima = pd.read_csv(path, header=None, names=col_names)

In [33]:
pima

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [38]:
pima.dtypes

pregnant      int64
glucose       int64
bp            int64
skin          int64
insulin       int64
bmi         float64
pedigree    float64
age           int64
label         int64
dtype: object

**Question: Can we predict the diabetes status of a patient given their health measurements?**

In [39]:
# define X and y
X = pima[['pregnant', 'insulin', 'bmi', 'age']]
y = pima.label

In [40]:
X

Unnamed: 0,pregnant,insulin,bmi,age
0,6,0,33.6,50
1,1,0,26.6,31
2,8,0,23.3,32
3,1,94,28.1,21
4,0,168,43.1,33
...,...,...,...,...
763,10,180,32.9,63
764,2,0,36.8,27
765,5,112,26.2,30
766,1,0,30.1,47


In [41]:
y

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: label, Length: 768, dtype: int64

In [42]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)

In [43]:
# train a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='liblinear')
logreg.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

Unnamed: 0,pregnant,insulin,bmi,age
729,2,0,22.9,22
127,3,135,42.9,30
399,3,0,21.1,25
540,3,155,36.4,32
500,6,193,29.3,39
...,...,...,...,...
763,9,0,22.5,33
192,9,94,33.1,40
629,5,0,34.6,45
559,11,0,46.2,42


In [46]:
# make predictions
y_pred_class = logreg.predict(X_test)

## Classification accuracy: percentage of correct predictions

In [47]:
# calculate accuracy
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.6927083333333334

**Null Accuracy**: Accuracy that could be achieved by always predicting the most frequent class

In [49]:
# Examine the testing set
y_test.value_counts()

0    130
1     62
Name: label, dtype: int64

In [51]:
#********************************
# calculate percantage of ones
#*******************************
y_test.sum()/len(y_test)

0.3229166666666667

In [52]:
# or 
y_test.mean()

0.3229166666666667

In [53]:
#********************************
# calculate percantage of zeros
#*******************************
1 - y_test.mean()

0.6770833333333333

In [56]:
#********************************
# calculate null accuracy
#*******************************
max(y_test.mean(), 1 - y_test.mean())

0.6770833333333333

In [58]:
# print the first 25 true and predicted responses
print('True:', y_test.values[0:25])
print('Pred:', y_pred_class[0:25])

True: [1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0]
Pred: [0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


### Confusion matrix
Table that describes the performance of a classification model

In [59]:
print(metrics.confusion_matrix(y_test, y_pred_class))

[[118  12]
 [ 47  15]]


In [62]:
cfmatrix = metrics.confusion_matrix(y_test, y_pred_class)

**True Positives (TP)**: We correctly predicted that they do have diabetes i.e. 1

**True Negatives (TN)** We correctly predicted that do not have diabetes.

**False Positives (FP)** We incorrectly predicted that they do have diabetes -- *Type I error*

**False Negatives (FN)** We incorrectly predicted that they do not have diabetes -- *Type II error*

In [70]:
# print the first 25 true and predicted responses
print('True:', y_test.values[0:25])
print('Pred:', y_pred_class[0:25])

True: [1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0]
Pred: [0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [73]:
confusion = metrics.confusion_matrix(y_test, y_pred_class)
confusion

array([[118,  12],
       [ 47,  15]], dtype=int64)

In [76]:
TP = confusion[1,1]
TP

15

In [77]:
TN = confusion[0,0]
TN

118

In [80]:
FP = confusion[0, 1]
FN = confusion[1, 0]

In [84]:
#*************************
# Classification Accuracy
#************************
print((TP + TN)/(TP + TN + FP + FN))

0.6927083333333334


In [85]:
print(metrics.accuracy_score(y_test, y_pred_class))

0.6927083333333334


In [86]:
#*************************
# Classification Error
#************************
print((FP + FN) / (TP + TN + FP + FN))

0.3072916666666667


In [87]:
print(1 - metrics.accuracy_score(y_test, y_pred_class))

0.30729166666666663


### Sensitivity

When **actual** value is **postive**, how often is the prediction **correct**?

- How sensitive is the classifier to detecting positive instances?

- Also known as  "True Positive Rate" or "Recall"

In [88]:
print(TP / (TP + FN))
print(metrics.recall_score(y_test, y_pred_class))

0.24193548387096775
0.24193548387096775


### Specificity

When **actual** value is **negative**, how often the prediction correct?

In [90]:
TN/(TN + FP)

0.9076923076923077

### Precision
When a positive value is predicted, how often is the prediction correct?

In [91]:
TP/(TP + FP)

0.5555555555555556