In [17]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import pandas as pd

In [None]:
# Load the breast-cancer dataset and pull out the feature matrix and labels.
data = load_breast_cancer()

# By convention, X holds the features (inputs) and y holds the target (labels).
X, y = data.data, data.target

# Show the encoded labels, then the class names those codes stand for.
print(y, data.target_names, sep='\n')

In [None]:
# Put the feature matrix in a DataFrame with named columns for easier inspection.
df = pd.DataFrame(data=data.data, columns=data.feature_names)

# Preview the first five rows.
print(df.head(5))
# Optional: export the raw feature table to disk.
# df.to_csv('rawdata.csv')

In [22]:
# Hold out a test set: scoring the model on the same rows it was trained on
# would hide overfitting and give an over-optimistic result.

# test_size is the fraction of all rows reserved for testing; the rest form
# the training set.
# train_test_split shuffles the rows before splitting by default, which reduces
# bias from the order in which the dataset was assembled. random_state is the
# seed for that shuffle, so the same split is produced on every run.
TEST_FRACTION = 0.25
SPLIT_SEED = 16

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# Row/column counts for the full data, the training part and the test part.
print(X.shape, X_train.shape, X_test.shape, sep='\n')

(569, 30)
(426, 30)
(143, 30)


In [21]:
# K-nearest-neighbors classifier with scikit-learn's default settings.
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)

# Predicted labels first, then the true labels, for a side-by-side look.
print(knn_pred)
print(y_test)

# NOTE: precision and recall treat label 1 as the "positive" class
# (scikit-learn's default pos_label=1). Check data.target_names to see which
# diagnosis label 1 stands for before interpreting these scores.

# Accuracy: share of all predictions that are correct,
# (TP + TN) / (TP + TN + FP + FN).
acc = accuracy_score(y_test, knn_pred)
print('acc:', acc)

# Precision: share of predicted positives that really are positive,
# TP / (TP + FP).
prc = precision_score(y_test, knn_pred)
print('prc:', prc)

# Recall: share of actual positives that the model found, TP / (TP + FN).
# A low recall means many positive cases were missed.
rec = recall_score(y_test, knn_pred)
print('rec:', rec)
# All three scores should be as high as possible.

# confusion_matrix puts the true labels on the rows and the predicted labels on
# the columns, with labels in sorted order. For 0/1 labels the layout is:
# [[TN FP]
#  [FN TP]]
# Correct predictions sit on the main diagonal, so the goal is to have as many
# counts as possible there.
cfu = confusion_matrix(y_test, knn_pred)
print(cfu)


[1 1 1 1 1 1 0 1 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 0 1 1 1 0 1 1 0 1 1 1 1 1 0
 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 0 1 1 1 1 1 1 0 0 1
 0 1 1 0 1 0 0 1 0 1 1 0 1 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1 0 1 1 0 1 1 1 0 1
 0 1 1 1 1 0 0 1 1 1 0 1 0 1 1 1 0 1 0 0 1 0 1 1 0 1 1 0 0 1 0 1]
[1 1 1 1 1 1 1 1 0 1 0 0 1 0 1 0 0 1 1 0 0 1 0 0 1 1 1 0 0 1 0 1 1 1 1 1 0
 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1 1 1 0 1 0 1 0 0 1 1 1 0 0 1 1 1 1 1 1 0 0 1
 1 1 1 0 1 0 0 1 0 1 1 0 1 1 1 1 1 1 0 0 0 0 0 1 0 1 1 1 0 1 1 0 1 1 1 0 1
 0 1 1 1 1 0 0 1 1 1 0 1 0 1 1 1 0 1 1 0 1 0 1 1 0 1 1 0 0 1 0 1]
acc: 0.916083916083916
prc: 0.9157894736842105
rec: 0.9560439560439561
[[44  8]
 [ 4 87]]


### Explore what a k neighbors classifier is; what parameters can you give it to increase scores
kNN is an instance-based (non-generalizing) learner: it stores the training samples instead of fitting an internal model.

kNN calculates the distance between a query point and every training data point. The distance can be straight-line (Euclidean) or taxicab (Manhattan). It then selects the k nearest training points and lets them vote on the class.
("Classification is computed from a simple majority vote of the nearest neighbors of each point: a query point is assigned the data class which has the most representatives within the nearest neighbors of the point.")


- Higher k = smoother decision boundary, lower variance, higher bias, more time
- Lower k = less time, more noise impact, more overfitting likelihood
- Cross-validation: try many values of k and keep the one that scores best on held-out data

By default, neighbors are uniformly weighted. This can be changed to distance weighting (`weights='distance'`), where each neighbor's vote is weighted by the inverse of its distance, so closer neighbors count for more.