## Classification Evaluation Metrics Practice 
Load the Wisconsin breast cancer data from sklearn (a binary classification problem), do a train/test split, and fit a logistic regression model and a 10-nearest-neighbors model. Instead of using any built-in sklearn scoring methods, write your own accuracy, precision, recall, and F1 evaluation functions that take arrays of actual and predicted target labels as arguments. Score your models on the test set.

e.g. def accuracy(actuals, preds)

In [67]:
import math
import pickle
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", 150)
pd.set_option("display.precision", 3)
pd.options.mode.chained_assignment = None

import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, Ridge, RidgeCV, LogisticRegression
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier


In [7]:
# Inspect the loader through the name imported at the top of the notebook; the
# bare `sklearn` name is never bound by the `from sklearn.<module> import ...`
# statements used there, so `sklearn.datasets...` fails on a fresh kernel.
load_breast_cancer

<function sklearn.datasets._base.load_breast_cancer(*, return_X_y=False, as_frame=False)>

In [31]:
data = load_breast_cancer(as_frame=False)

In [32]:
data

{'data': array([[1.799e+01, 1.038e+01, 1.228e+02, ..., 2.654e-01, 4.601e-01,
         1.189e-01],
        [2.057e+01, 1.777e+01, 1.329e+02, ..., 1.860e-01, 2.750e-01,
         8.902e-02],
        [1.969e+01, 2.125e+01, 1.300e+02, ..., 2.430e-01, 3.613e-01,
         8.758e-02],
        ...,
        [1.660e+01, 2.808e+01, 1.083e+02, ..., 1.418e-01, 2.218e-01,
         7.820e-02],
        [2.060e+01, 2.933e+01, 1.401e+02, ..., 2.650e-01, 4.087e-01,
         1.240e-01],
        [7.760e+00, 2.454e+01, 4.792e+01, ..., 0.000e+00, 2.871e-01,
         7.039e-02]]),
 'target': array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0,
        1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0,
        1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1,
        1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0

In [35]:
df = pd.DataFrame(data["data"])

In [36]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
0,17.99,10.38,122.80,1001.0,0.118,0.278,0.300,0.147,0.242,0.079,1.095,0.905,8.589,153.40,0.006,0.049,0.054,0.016,0.030,0.006,25.380,17.33,184.60,2019.0,0.162,0.666,0.712,0.265,0.460,0.119
1,20.57,17.77,132.90,1326.0,0.085,0.079,0.087,0.070,0.181,0.057,0.543,0.734,3.398,74.08,0.005,0.013,0.019,0.013,0.014,0.004,24.990,23.41,158.80,1956.0,0.124,0.187,0.242,0.186,0.275,0.089
2,19.69,21.25,130.00,1203.0,0.110,0.160,0.197,0.128,0.207,0.060,0.746,0.787,4.585,94.03,0.006,0.040,0.038,0.021,0.022,0.005,23.570,25.53,152.50,1709.0,0.144,0.424,0.450,0.243,0.361,0.088
3,11.42,20.38,77.58,386.1,0.142,0.284,0.241,0.105,0.260,0.097,0.496,1.156,3.445,27.23,0.009,0.075,0.057,0.019,0.060,0.009,14.910,26.50,98.87,567.7,0.210,0.866,0.687,0.258,0.664,0.173
4,20.29,14.34,135.10,1297.0,0.100,0.133,0.198,0.104,0.181,0.059,0.757,0.781,5.438,94.44,0.011,0.025,0.057,0.019,0.018,0.005,22.540,16.67,152.20,1575.0,0.137,0.205,0.400,0.163,0.236,0.077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.111,0.116,0.244,0.139,0.173,0.056,1.176,1.256,7.673,158.70,0.010,0.029,0.052,0.025,0.011,0.004,25.450,26.40,166.10,2027.0,0.141,0.211,0.411,0.222,0.206,0.071
565,20.13,28.25,131.20,1261.0,0.098,0.103,0.144,0.098,0.175,0.055,0.765,2.463,5.203,99.04,0.006,0.024,0.040,0.017,0.019,0.002,23.690,38.25,155.00,1731.0,0.117,0.192,0.322,0.163,0.257,0.066
566,16.60,28.08,108.30,858.1,0.085,0.102,0.093,0.053,0.159,0.056,0.456,1.075,3.425,48.55,0.006,0.037,0.047,0.016,0.013,0.004,18.980,34.12,126.70,1124.0,0.114,0.309,0.340,0.142,0.222,0.078
567,20.60,29.33,140.10,1265.0,0.118,0.277,0.351,0.152,0.240,0.070,0.726,1.595,5.772,86.22,0.007,0.062,0.071,0.017,0.023,0.006,25.740,39.42,184.60,1821.0,0.165,0.868,0.939,0.265,0.409,0.124


In [37]:
df2 = pd.DataFrame(data["target"])

In [39]:
df3 = pd.concat([df, df2], axis=1)

In [40]:
df3

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,0.1
0,17.99,10.38,122.80,1001.0,0.118,0.278,0.300,0.147,0.242,0.079,1.095,0.905,8.589,153.40,0.006,0.049,0.054,0.016,0.030,0.006,25.380,17.33,184.60,2019.0,0.162,0.666,0.712,0.265,0.460,0.119,0
1,20.57,17.77,132.90,1326.0,0.085,0.079,0.087,0.070,0.181,0.057,0.543,0.734,3.398,74.08,0.005,0.013,0.019,0.013,0.014,0.004,24.990,23.41,158.80,1956.0,0.124,0.187,0.242,0.186,0.275,0.089,0
2,19.69,21.25,130.00,1203.0,0.110,0.160,0.197,0.128,0.207,0.060,0.746,0.787,4.585,94.03,0.006,0.040,0.038,0.021,0.022,0.005,23.570,25.53,152.50,1709.0,0.144,0.424,0.450,0.243,0.361,0.088,0
3,11.42,20.38,77.58,386.1,0.142,0.284,0.241,0.105,0.260,0.097,0.496,1.156,3.445,27.23,0.009,0.075,0.057,0.019,0.060,0.009,14.910,26.50,98.87,567.7,0.210,0.866,0.687,0.258,0.664,0.173,0
4,20.29,14.34,135.10,1297.0,0.100,0.133,0.198,0.104,0.181,0.059,0.757,0.781,5.438,94.44,0.011,0.025,0.057,0.019,0.018,0.005,22.540,16.67,152.20,1575.0,0.137,0.205,0.400,0.163,0.236,0.077,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.111,0.116,0.244,0.139,0.173,0.056,1.176,1.256,7.673,158.70,0.010,0.029,0.052,0.025,0.011,0.004,25.450,26.40,166.10,2027.0,0.141,0.211,0.411,0.222,0.206,0.071,0
565,20.13,28.25,131.20,1261.0,0.098,0.103,0.144,0.098,0.175,0.055,0.765,2.463,5.203,99.04,0.006,0.024,0.040,0.017,0.019,0.002,23.690,38.25,155.00,1731.0,0.117,0.192,0.322,0.163,0.257,0.066,0
566,16.60,28.08,108.30,858.1,0.085,0.102,0.093,0.053,0.159,0.056,0.456,1.075,3.425,48.55,0.006,0.037,0.047,0.016,0.013,0.004,18.980,34.12,126.70,1124.0,0.114,0.309,0.340,0.142,0.222,0.078,0
567,20.60,29.33,140.10,1265.0,0.118,0.277,0.351,0.152,0.240,0.070,0.726,1.595,5.772,86.22,0.007,0.062,0.071,0.017,0.023,0.006,25.740,39.42,184.60,1821.0,0.165,0.868,0.939,0.265,0.409,0.124,0


In [42]:
data.feature_names

array(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error',
       'fractal dimension error', 'worst radius', 'worst texture',
       'worst perimeter', 'worst area', 'worst smoothness',
       'worst compactness', 'worst concavity', 'worst concave points',
       'worst symmetry', 'worst fractal dimension'], dtype='<U23')

In [50]:
columns = list(data.feature_names)

In [51]:
columns

['mean radius',
 'mean texture',
 'mean perimeter',
 'mean area',
 'mean smoothness',
 'mean compactness',
 'mean concavity',
 'mean concave points',
 'mean symmetry',
 'mean fractal dimension',
 'radius error',
 'texture error',
 'perimeter error',
 'area error',
 'smoothness error',
 'compactness error',
 'concavity error',
 'concave points error',
 'symmetry error',
 'fractal dimension error',
 'worst radius',
 'worst texture',
 'worst perimeter',
 'worst area',
 'worst smoothness',
 'worst compactness',
 'worst concavity',
 'worst concave points',
 'worst symmetry',
 'worst fractal dimension']

In [53]:
df.columns = columns

In [55]:
df4 = pd.concat([df, df2], axis=1)

In [58]:
df4.rename(columns={0:"target"}, inplace=True)

In [59]:
df4

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.80,1001.0,0.118,0.278,0.300,0.147,0.242,0.079,1.095,0.905,8.589,153.40,0.006,0.049,0.054,0.016,0.030,0.006,25.380,17.33,184.60,2019.0,0.162,0.666,0.712,0.265,0.460,0.119,0
1,20.57,17.77,132.90,1326.0,0.085,0.079,0.087,0.070,0.181,0.057,0.543,0.734,3.398,74.08,0.005,0.013,0.019,0.013,0.014,0.004,24.990,23.41,158.80,1956.0,0.124,0.187,0.242,0.186,0.275,0.089,0
2,19.69,21.25,130.00,1203.0,0.110,0.160,0.197,0.128,0.207,0.060,0.746,0.787,4.585,94.03,0.006,0.040,0.038,0.021,0.022,0.005,23.570,25.53,152.50,1709.0,0.144,0.424,0.450,0.243,0.361,0.088,0
3,11.42,20.38,77.58,386.1,0.142,0.284,0.241,0.105,0.260,0.097,0.496,1.156,3.445,27.23,0.009,0.075,0.057,0.019,0.060,0.009,14.910,26.50,98.87,567.7,0.210,0.866,0.687,0.258,0.664,0.173,0
4,20.29,14.34,135.10,1297.0,0.100,0.133,0.198,0.104,0.181,0.059,0.757,0.781,5.438,94.44,0.011,0.025,0.057,0.019,0.018,0.005,22.540,16.67,152.20,1575.0,0.137,0.205,0.400,0.163,0.236,0.077,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,21.56,22.39,142.00,1479.0,0.111,0.116,0.244,0.139,0.173,0.056,1.176,1.256,7.673,158.70,0.010,0.029,0.052,0.025,0.011,0.004,25.450,26.40,166.10,2027.0,0.141,0.211,0.411,0.222,0.206,0.071,0
565,20.13,28.25,131.20,1261.0,0.098,0.103,0.144,0.098,0.175,0.055,0.765,2.463,5.203,99.04,0.006,0.024,0.040,0.017,0.019,0.002,23.690,38.25,155.00,1731.0,0.117,0.192,0.322,0.163,0.257,0.066,0
566,16.60,28.08,108.30,858.1,0.085,0.102,0.093,0.053,0.159,0.056,0.456,1.075,3.425,48.55,0.006,0.037,0.047,0.016,0.013,0.004,18.980,34.12,126.70,1124.0,0.114,0.309,0.340,0.142,0.222,0.078,0
567,20.60,29.33,140.10,1265.0,0.118,0.277,0.351,0.152,0.240,0.070,0.726,1.595,5.772,86.22,0.007,0.062,0.071,0.017,0.023,0.006,25.740,39.42,184.60,1821.0,0.165,0.868,0.939,0.265,0.409,0.124,0


In [61]:
X = df4.iloc[:, :-1]
y = df4.iloc[:, -1:]

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [64]:
# NOTE(review): with these default settings the fit in the next cell stops at the
# solver's iteration limit (see the warning in its output). Raising max_iter or
# scaling the features, as that warning suggests, should let it converge.
lm1 = LogisticRegression()

In [65]:
# y_train is a single-column DataFrame (it was sliced with iloc[:, -1:]);
# flatten it to a 1-D array so the estimator receives labels of shape
# (n_samples,) instead of a column vector.
lm1.fit(X_train, y_train.values.ravel())

  return f(**kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [66]:
lm1.score(X_train, y_train)

0.9560439560439561

In [76]:
logregguess = lm1.predict(X_test)

In [77]:
type(logregguess)

numpy.ndarray

In [70]:
knn = KNeighborsClassifier(n_neighbors=10)

In [71]:
# y_train is a single-column DataFrame (it was sliced with iloc[:, -1:]);
# flatten it to a 1-D array so the classifier receives labels of shape
# (n_samples,) instead of a column vector.
knn.fit(X_train, y_train.values.ravel())

  """Entry point for launching an IPython kernel.


KNeighborsClassifier(n_neighbors=10)

In [72]:
knn.score(X_train, y_train)

0.9296703296703297

In [75]:
knn_guess = knn.predict(X_test)

In [84]:
y_test["logguess"] = logregguess

In [85]:
y_test["knnguess"] = knn_guess

In [87]:
y_test.reset_index(inplace=True)

In [88]:
y_test

Unnamed: 0,index,target,logguess,knnguess
0,204,1,1,1
1,70,0,0,0
2,131,0,0,0
3,431,1,1,1
4,540,1,1,1
5,567,0,0,0
6,369,0,0,0
7,29,0,0,0
8,81,1,1,1
9,477,1,1,1


In [97]:
def accuracy(actuals, preds):
    """Return the fraction of predictions that equal the actual labels.

    Both inputs are converted to flat NumPy arrays and compared position by
    position, so a pandas Series whose index is not 0..n-1 (for example the
    labels straight out of ``train_test_split``) is scored correctly instead
    of being looked up by index label.

    Parameters
    ----------
    actuals : array-like
        True target labels.
    preds : array-like
        Predicted target labels, same length as ``actuals``.

    Returns
    -------
    float
        Share of positions where the two labels agree; ``0.0`` when the
        inputs are empty.

    Raises
    ------
    ValueError
        If ``actuals`` and ``preds`` differ in length.
    """
    actuals = np.asarray(actuals).ravel()
    preds = np.asarray(preds).ravel()
    if actuals.size != preds.size:
        raise ValueError(
            f"actuals and preds must have the same length "
            f"(got {actuals.size} and {preds.size})"
        )
    # Nothing to score: avoid dividing by zero.
    if actuals.size == 0:
        return 0.0
    # Integer count / integer size keeps the result a plain Python float.
    return int(np.count_nonzero(actuals == preds)) / actuals.size
        
def precision(actuals, preds):
    """Return the share of predicted positives (label 1) that are truly positive.

    Both inputs are converted to flat NumPy arrays and compared position by
    position, so pandas Series with an arbitrary index are handled correctly.

    Parameters
    ----------
    actuals : array-like
        True target labels; the positive class is ``1``.
    preds : array-like
        Predicted target labels, same length as ``actuals``.

    Returns
    -------
    float
        ``true positives / predicted positives``; ``0.0`` when nothing was
        predicted positive (the ratio is undefined in that case).

    Raises
    ------
    ValueError
        If ``actuals`` and ``preds`` differ in length.
    """
    actuals = np.asarray(actuals).ravel()
    preds = np.asarray(preds).ravel()
    if actuals.size != preds.size:
        raise ValueError(
            f"actuals and preds must have the same length "
            f"(got {actuals.size} and {preds.size})"
        )
    predicted_positive = preds == 1
    n_predicted = int(np.count_nonzero(predicted_positive))
    # No positive predictions: return 0.0 rather than dividing by zero.
    if n_predicted == 0:
        return 0.0
    true_positives = int(np.count_nonzero(predicted_positive & (actuals == 1)))
    return true_positives / n_predicted

    
def recall(actuals, preds):
    """Return the share of actual positives (label 1) that were predicted positive.

    Both inputs are converted to flat NumPy arrays and compared position by
    position, so pandas Series with an arbitrary index are handled correctly.

    Parameters
    ----------
    actuals : array-like
        True target labels; the positive class is ``1``.
    preds : array-like
        Predicted target labels, same length as ``actuals``.

    Returns
    -------
    float
        ``true positives / actual positives``; ``0.0`` when there are no
        actual positives (the ratio is undefined in that case).

    Raises
    ------
    ValueError
        If ``actuals`` and ``preds`` differ in length.
    """
    actuals = np.asarray(actuals).ravel()
    preds = np.asarray(preds).ravel()
    if actuals.size != preds.size:
        raise ValueError(
            f"actuals and preds must have the same length "
            f"(got {actuals.size} and {preds.size})"
        )
    actual_positive = actuals == 1
    n_actual = int(np.count_nonzero(actual_positive))
    # No actual positives: return 0.0 rather than dividing by zero.
    if n_actual == 0:
        return 0.0
    true_positives = int(np.count_nonzero(actual_positive & (preds == 1)))
    return true_positives / n_actual

In [92]:
accuracy(y_test["target"], y_test["logguess"])

0.956140350877193

In [93]:
accuracy(y_test["target"], y_test["knnguess"])

0.9736842105263158

In [115]:
precision_log = precision(y_test["target"], y_test["logguess"])
precision_log

0.9459459459459459

In [116]:
precision_knn = precision(y_test["target"], y_test["knnguess"])
precision_knn

0.9722222222222222

In [100]:
recall(y_test["target"], y_test["logguess"])

0.9859154929577465

In [101]:
recall(y_test["target"], y_test["knnguess"])

0.9859154929577465

In [103]:
# NOTE(review): 1 - precision is the false discovery rate (false positives /
# predicted positives), not the false positive rate (false positives / actual
# negatives) that the variable name suggests.
false_positives_log = 1 - precision(y_test["target"], y_test["logguess"])

In [105]:
# NOTE(review): 1 - precision is the false discovery rate (false positives /
# predicted positives), not the false positive rate (false positives / actual
# negatives) that the variable name suggests.
false_positives_knn = 1 - precision(y_test["target"], y_test["knnguess"])

In [106]:
recall_log = recall(y_test["target"], y_test["logguess"])

In [110]:
recall_knn = recall(y_test["target"], y_test["knnguess"])

In [108]:
recall_log / false_positives_log # TPR/FDR, not TPR/FPR: the denominator is 1 - precision (false discovery rate)

18.239436619718308

In [111]:
recall_knn / false_positives_knn # TPR/FDR, not TPR/FPR: the denominator is 1 - precision (false discovery rate)

35.49295774647886

In [113]:
def f1(actuals, preds):
    """Return the F1 score: the harmonic mean of precision and recall.

    Parameters
    ----------
    actuals : array-like
        True target labels; the positive class is ``1``.
    preds : array-like
        Predicted target labels, same length as ``actuals``.

    Returns
    -------
    float
        ``2 * precision * recall / (precision + recall)``; ``0.0`` when both
        precision and recall are zero (the ratio is undefined in that case).
    """
    p = precision(actuals, preds)
    r = recall(actuals, preds)
    # Both components zero: return 0.0 rather than dividing by zero.
    if p + r == 0:
        return 0.0
    return 2 * (p * r) / (p + r)


# Logistic-regression F1 on the test set, computed from the labels directly
# rather than from precision/recall variables set in other cells.
F1 = f1(y_test["target"], y_test["logguess"])

In [114]:
F1

0.9655172413793103

In [117]:
F1_knn = 2 * (precision_knn * recall_knn) / (precision_knn + recall_knn)

In [118]:
F1_knn

0.979020979020979