# Reviewing Classification Problems

**GOALS**:
- Identify big idea with `LogisticRegression`
- Evaluate performance in terms of Accuracy, Precision, and Recall


In [12]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_breast_cancer

In [13]:
cancer = load_breast_cancer()

In [14]:
df = pd.DataFrame(cancer.data, columns= cancer.feature_names)

In [15]:
df['target'] = cancer.target

In [16]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
mean radius                569 non-null float64
mean texture               569 non-null float64
mean perimeter             569 non-null float64
mean area                  569 non-null float64
mean smoothness            569 non-null float64
mean compactness           569 non-null float64
mean concavity             569 non-null float64
mean concave points        569 non-null float64
mean symmetry              569 non-null float64
mean fractal dimension     569 non-null float64
radius error               569 non-null float64
texture error              569 non-null float64
perimeter error            569 non-null float64
area error                 569 non-null float64
smoothness error           569 non-null float64
compactness error          569 non-null float64
concavity error            569 non-null float64
concave points error       569 non-null float64
symmetry error             569 

In [18]:
X = df[['mean radius', 'mean fractal dimension']]
X_train, X_test, y_train, y_test = train_test_split(X, cancer.target)
clf = LogisticRegression()

In [19]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [20]:
pred = clf.predict(X_test)
print(classification_report(pred, y_test))

             precision    recall  f1-score   support

          0       0.75      1.00      0.85        38
          1       1.00      0.88      0.93       105

avg / total       0.93      0.91      0.91       143



In [21]:
from sklearn.metrics import confusion_matrix

In [22]:
confusion_matrix(clf.predict(X_test), y_test)

array([[38,  0],
       [13, 92]])

### Problem

Using the PIMA diabetes dataset, your goal is to build a classifier that is:

1. Accurate
2. Appropriate

For information about the data, please see the brief description of the variables here: https://www.kaggle.com/uciml/pima-indians-diabetes-database/home 

Your results should include a clear framing of the question, brief description of the approach you used, and suggestions as to what else might be done to effect a better model.

In [25]:
pima = pd.read_csv('data/pima_diabetes.csv', index_col=0)

In [26]:
pima.head()

Unnamed: 0,preg,plas,pres,skin,test,mass,pedi,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [27]:
pima['bmi_age'] = pima['mass']/pima['age']

In [28]:
corr = pima.corr()

In [29]:
corr['class']

preg       0.221898
plas       0.466581
pres       0.065068
skin       0.074752
test       0.130548
mass       0.292695
pedi       0.173844
age        0.238356
class      1.000000
bmi_age   -0.029840
Name: class, dtype: float64

In [33]:
X = pima[['preg', 'plas', 'test', 'mass', 'pedi', 'age']]
y = pima['class']

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)
clf = LogisticRegression()
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [37]:
pred = clf.predict(X_test)
print(classification_report(pred, y_test))

             precision    recall  f1-score   support

          0       0.92      0.81      0.86       147
          1       0.55      0.76      0.64        45

avg / total       0.83      0.80      0.81       192



In [27]:
X = pima.drop('class', axis = 1)
y = pima['class']
X_train, X_test, y_train, y_test = train_test_split(X, y)
clf = LogisticRegression()
clf.fit(X_train, y_train)
train_pred = classification_report(y_train, clf.predict(X_train))
test_pred = classification_report(y_test, clf.predict(X_test))
print("Training Performance\n", train_pred, "\nTest Performance\n", test_pred)

Training Performance
              precision    recall  f1-score   support

          0       0.79      0.90      0.84       373
          1       0.75      0.57      0.65       203

avg / total       0.78      0.78      0.77       576
 
Test Performance
              precision    recall  f1-score   support

          0       0.77      0.90      0.83       127
          1       0.70      0.48      0.57        65

avg / total       0.75      0.76      0.74       192



In [38]:
from sklearn.dummy import DummyClassifier

In [39]:
pima['class'].value_counts()

0    500
1    268
Name: class, dtype: int64

In [44]:
dum = DummyClassifier(strategy='most_frequent')
dum.fit(X_train, y_train)
pred = dum.predict(X_test)
print(classification_report(pred, y_test))

             precision    recall  f1-score   support

          0       1.00      0.68      0.81       192
          1       0.00      0.00      0.00         0

avg / total       1.00      0.68      0.81       192



  'recall', 'true', average, warn_for)


In [45]:
confusion_matrix(pred, y_test)

array([[130,  62],
       [  0,   0]])

In [43]:
DummyClassifier?

[0;31mInit signature:[0m [0mDummyClassifier[0m[0;34m([0m[0mstrategy[0m[0;34m=[0m[0;34m'stratified'[0m[0;34m,[0m [0mrandom_state[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mconstant[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
DummyClassifier is a classifier that makes predictions using simple rules.

This classifier is useful as a simple baseline to compare with other
(real) classifiers. Do not use it for real problems.

Read more in the :ref:`User Guide <dummy_estimators>`.

Parameters
----------
strategy : str, default="stratified"
    Strategy to use to generate predictions.

    * "stratified": generates predictions by respecting the training
      set's class distribution.
    * "most_frequent": always predicts the most frequent label in the
      training set.
    * "prior": always predicts the class that maximizes the class prior
      (like "most_frequent") and ``predict_proba`` returns the class prior.
    * "uniform": 

### Solution Possibilities

In [28]:
X = pima.drop(['class', 'preg', 'plas'], axis = 1)
y = pima['class']
X_train, X_test, y_train, y_test = train_test_split(X, y)
clf = LogisticRegression()
clf.fit(X_train, y_train)
train_pred = classification_report(y_train, clf.predict(X_train))
test_pred = classification_report(y_test, clf.predict(X_test))
print("Training Performance\n", train_pred, "\nTest Performance\n", test_pred)

Training Performance
              precision    recall  f1-score   support

          0       0.72      0.89      0.80       379
          1       0.61      0.34      0.43       197

avg / total       0.68      0.70      0.67       576
 
Test Performance
              precision    recall  f1-score   support

          0       0.68      0.90      0.78       121
          1       0.62      0.28      0.39        71

avg / total       0.66      0.67      0.63       192



In [29]:
X = pima.mass
y = pima['class']
X_train, X_test, y_train, y_test = train_test_split(X, y)
clf = LogisticRegression()
clf.fit(X_train.reshape(-1,1), y_train)
train_pred = classification_report(y_train, clf.predict(X_train.reshape(-1,1)))
test_pred = classification_report(y_test, clf.predict(X_test.reshape(-1,1)))
print("Training Performance\n", train_pred, "\nTest Performance\n", test_pred)

Training Performance
              precision    recall  f1-score   support

          0       0.69      0.93      0.79       377
          1       0.60      0.20      0.30       199

avg / total       0.66      0.68      0.62       576
 
Test Performance
              precision    recall  f1-score   support

          0       0.67      0.90      0.77       123
          1       0.54      0.20      0.29        69

avg / total       0.62      0.65      0.60       192



  """
  
  import sys


In [34]:
X = pima[['pres', 'pedi']]
y = pima['class']
X_train, X_test, y_train, y_test = train_test_split(X, y)
clf = LogisticRegression()
clf.fit(X_train, y_train)
train_pred = classification_report(y_train, clf.predict(X_train))
test_pred = classification_report(y_test, clf.predict(X_test))
print("Training Performance\n", train_pred, "\nTest Performance\n", test_pred)

Training Performance
              precision    recall  f1-score   support

          0       0.67      0.97      0.79       379
          1       0.55      0.08      0.14       197

avg / total       0.63      0.66      0.57       576
 
Test Performance
              precision    recall  f1-score   support

          0       0.65      0.97      0.77       121
          1       0.64      0.10      0.17        71

avg / total       0.64      0.65      0.55       192



In [35]:
clf.score(X_test, y_test)

0.6458333333333334

In [36]:
from sklearn.metrics import accuracy_score

In [37]:
accuracy_score(y_test, clf.predict(X_test))

0.6458333333333334