# Reviewing Classification Problems

**GOALS**:
- Identify big idea with `LogisticRegression`
- Evaluate performance in terms of Accuracy, Precision, and Recall


In [55]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, classification_report,confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.datasets import load_breast_cancer

In [2]:
cancer = load_breast_cancer()

In [3]:
df = pd.DataFrame(cancer.data, columns= cancer.feature_names)

In [4]:
df['target'] = cancer.target

In [5]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
mean radius                569 non-null float64
mean texture               569 non-null float64
mean perimeter             569 non-null float64
mean area                  569 non-null float64
mean smoothness            569 non-null float64
mean compactness           569 non-null float64
mean concavity             569 non-null float64
mean concave points        569 non-null float64
mean symmetry              569 non-null float64
mean fractal dimension     569 non-null float64
radius error               569 non-null float64
texture error              569 non-null float64
perimeter error            569 non-null float64
area error                 569 non-null float64
smoothness error           569 non-null float64
compactness error          569 non-null float64
concavity error            569 non-null float64
concave points error       569 non-null float64
symmetry error             569 

In [7]:
X = df[['mean radius', 'mean fractal dimension']]
X_train, X_test, y_train, y_test = train_test_split(X, cancer.target)
clf = LogisticRegression()

In [8]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [9]:
pred = clf.predict(X_test)
print(classification_report(pred, y_test))

             precision    recall  f1-score   support

          0       0.70      0.86      0.77        35
          1       0.95      0.88      0.91       108

avg / total       0.89      0.87      0.88       143



# For preceision:  it's ok to miss true positives--the goal is to not include any true negatives.

* Precision = TP/(TP+FP)

# For recall:  it's OK to have false positive--the goal is to not miss any true positives.

* Recall = TP/(TP+FN)

In [16]:
from sklearn.metrics import confusion_matrix

In [17]:
confusion_matrix(clf.predict(X_test), y_test)

array([[30,  5],
       [13, 95]])

### Problem

Using the PIMA diabetes dataset, your goal is to build a classifier that is:

1. Accurate
2. Appropriate

For information about the data, please see the brief description of the variables here: https://www.kaggle.com/uciml/pima-indians-diabetes-database/home 

Your results should include a clear framing of the question, brief description of the approach you used, and suggestions as to what else might be done to effect a better model.

In [11]:
pima = pd.read_csv('data/pima_diabetes.csv', index_col=0)

In [16]:
pima.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 768 entries, 0 to 767
Data columns (total 10 columns):
preg        768 non-null int64
plas        768 non-null int64
pres        768 non-null int64
skin        768 non-null int64
test        768 non-null int64
mass        768 non-null float64
pedi        768 non-null float64
age         768 non-null int64
class       768 non-null int64
age_mass    768 non-null float64
dtypes: float64(3), int64(7)
memory usage: 66.0 KB


In [41]:
pima['age_mass'] = pima['age'] * pima['mass']
pima['ins_plas'] = pima['plas'] * pima['test'] * pima['pedi']

In [45]:
X = pima.drop(['class'], axis = 1)
y = pima['class']
X_train, X_test, y_train, y_test = train_test_split(X, y)
clf = LogisticRegression()
clf.fit(X_train, y_train)
train_pred = classification_report(y_train, clf.predict(X_train))
test_pred = classification_report(y_test, clf.predict(X_test))
print("Training Performance\n", train_pred, "\nTest Performance\n", test_pred)

Training Performance
              precision    recall  f1-score   support

          0       0.79      0.89      0.84       366
          1       0.75      0.60      0.67       210

avg / total       0.78      0.78      0.78       576
 
Test Performance
              precision    recall  f1-score   support

          0       0.78      0.85      0.81       134
          1       0.57      0.45      0.50        58

avg / total       0.72      0.73      0.72       192



In [28]:
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [46]:
X = pima.drop(['pres', 'skin','class'], axis = 1)
y = pima['class']
X_train, X_test, y_train, y_test = train_test_split(X, y)
pipe = make_pipeline(LogisticRegression())
pipe.fit(X_train, y_train)
train_pred = classification_report(y_train, pipe.predict(X_train))
test_pred = classification_report(y_test, pipe.predict(X_test))
print("Training Performance\n", train_pred, "\nTest Performance\n", test_pred)

Training Performance
              precision    recall  f1-score   support

          0       0.78      0.90      0.83       378
          1       0.72      0.51      0.60       198

avg / total       0.76      0.76      0.75       576
 
Test Performance
              precision    recall  f1-score   support

          0       0.80      0.96      0.87       122
          1       0.89      0.57      0.70        70

avg / total       0.83      0.82      0.81       192



In [47]:
from sklearn.dummy import DummyClassifier

In [57]:
dum = DummyClassifier()
dum.fit (X_train, y_train)
dum.predict(X_test)
print(classification_report(dum.predict(X_test), y_test))
confusion_matrix(y_test, dum.predict(X_test))

             precision    recall  f1-score   support

          0       0.57      0.58      0.57       119
          1       0.29      0.27      0.28        73

avg / total       0.46      0.46      0.46       192



array([[77, 45],
       [38, 32]], dtype=int64)

In [48]:
from sklearn.cross_validation import cross_val_predict

In [49]:
cross_val_predict?

[1;31mSignature:[0m [0mcross_val_predict[0m[1;33m([0m[0mestimator[0m[1;33m,[0m [0mX[0m[1;33m,[0m [0my[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0mcv[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0mn_jobs[0m[1;33m=[0m[1;36m1[0m[1;33m,[0m [0mverbose[0m[1;33m=[0m[1;36m0[0m[1;33m,[0m [0mfit_params[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m [0mpre_dispatch[0m[1;33m=[0m[1;34m'2*n_jobs'[0m[1;33m)[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Generate cross-validated estimates for each input data point

.. deprecated:: 0.18
    This module will be removed in 0.20.
    Use :func:`sklearn.model_selection.cross_val_predict` instead.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
estimator : estimator object implementing 'fit' and 'predict'
    The object to use to fit the data.

X : array-like
    The data to fit. Can be, for example a list, or an array at least 2d.

y : array-like, optional, default: None
    The target variable 

### Solution Possibilities

In [28]:
X = pima.drop(['class', 'preg', 'plas'], axis = 1)
y = pima['class']
X_train, X_test, y_train, y_test = train_test_split(X, y)
clf = LogisticRegression()
clf.fit(X_train, y_train)
train_pred = classification_report(y_train, clf.predict(X_train))
test_pred = classification_report(y_test, clf.predict(X_test))
print("Training Performance\n", train_pred, "\nTest Performance\n", test_pred)

Training Performance
              precision    recall  f1-score   support

          0       0.72      0.89      0.80       379
          1       0.61      0.34      0.43       197

avg / total       0.68      0.70      0.67       576
 
Test Performance
              precision    recall  f1-score   support

          0       0.68      0.90      0.78       121
          1       0.62      0.28      0.39        71

avg / total       0.66      0.67      0.63       192



In [29]:
X = pima.mass
y = pima['class']
X_train, X_test, y_train, y_test = train_test_split(X, y)
clf = LogisticRegression()
clf.fit(X_train.reshape(-1,1), y_train)
train_pred = classification_report(y_train, clf.predict(X_train.reshape(-1,1)))
test_pred = classification_report(y_test, clf.predict(X_test.reshape(-1,1)))
print("Training Performance\n", train_pred, "\nTest Performance\n", test_pred)

Training Performance
              precision    recall  f1-score   support

          0       0.69      0.93      0.79       377
          1       0.60      0.20      0.30       199

avg / total       0.66      0.68      0.62       576
 
Test Performance
              precision    recall  f1-score   support

          0       0.67      0.90      0.77       123
          1       0.54      0.20      0.29        69

avg / total       0.62      0.65      0.60       192



  """
  
  import sys


In [34]:
X = pima[['pres', 'pedi']]
y = pima['class']
X_train, X_test, y_train, y_test = train_test_split(X, y)
clf = LogisticRegression()
clf.fit(X_train, y_train)
train_pred = classification_report(y_train, clf.predict(X_train))
test_pred = classification_report(y_test, clf.predict(X_test))
print("Training Performance\n", train_pred, "\nTest Performance\n", test_pred)

Training Performance
              precision    recall  f1-score   support

          0       0.67      0.97      0.79       379
          1       0.55      0.08      0.14       197

avg / total       0.63      0.66      0.57       576
 
Test Performance
              precision    recall  f1-score   support

          0       0.65      0.97      0.77       121
          1       0.64      0.10      0.17        71

avg / total       0.64      0.65      0.55       192



In [35]:
clf.score(X_test, y_test)

0.6458333333333334

In [36]:
from sklearn.metrics import accuracy_score

In [37]:
accuracy_score(y_test, clf.predict(X_test))

0.6458333333333334