In [72]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import VALID_METRICS
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics.pairwise import distance_metrics
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC
import pandas as pd

# Load the breast-cancer dataset that ships with scikit-learn.
data = load_breast_cancer()

# Feature matrix (model inputs) and label vector (classification output).
X, y = data.data, data.target

In [6]:
# Explore the data with pandas: one column per feature plus the class label.
df = pd.DataFrame(data=X, columns=data.feature_names).assign(target=y)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [9]:
# Hold out part of the data for evaluation.
#   test_size    - fraction of the samples reserved for TESTING (0.25 = 25%);
#                  the remaining 75% are used for training.
#   random_state - seed for the shuffle; with a fixed seed every run yields the
#                  same split, without it each run produces a different one.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=16,
)


In [14]:
# Learning algorithm: baseline K-nearest-neighbours classifier.
# n_neighbors=5 is the library default, spelled out here for readability.
knn_model = KNeighborsClassifier(n_neighbors=5)


### Learning Algorithm: K Nearest Neighbors
* A classification algorithm 
* Assign a label to a new point based on the most common label among its k nearest neighbors in the training data
* Analogous to (nearest-neighbor) regression, but it predicts discrete classes instead of continuous values

#### SKLearning KNeighborsClassifier()
* n_neighbors : default = 5
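* weights : 'uniform' OR 'distance' : default = 'uniform'
    - how the neighbors' votes are weighted: all equally ('uniform') or by the inverse of their distance ('distance'), so closer neighbors count more
    - distance weights are useful when you have multiple clusters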
* algorithm : 'auto', 'ball_tree', 'kd_tree', 'brute' : default = 'auto' 
    * search algorithm to find the nearest neighbors
    * 'auto' will automatically select based on data
    * mostly a performance (i.e. runtime) parameter
* leaf_size : int : default = 30
    * size of leaf passed to search algorithm (KD or Ball Trees)
* p : float : default = 2
    * power parameter for the Minkowski metric: $d(x, y) = \left(\sum_i |x_i - y_i|^p\right)^{1/p}$
    * p = 1 is Manhattan distance
    * p = 2 is Euclidean distance
* metric : default = 'minkowski'
    * distance metric
    * see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.distance_metrics.html#sklearn.metrics.pairwise.distance_metrics for all metrics
* metric_params : 
    * addl args for metric, see above
* n_jobs : int : default = None
    * number of parallel jobs used for the neighbor search (None means 1, -1 means all processors)
    * only affects processing time, not the predictions

### Sources
* https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
* https://www.ibm.com/topics/knn


In [None]:
# Train the baseline KNN model on the training split.
knn_model.fit(X_train, y_train)

# Predict labels for the held-out test split.
knn_pred = knn_model.predict(X_test)

# Score the predictions against the true test labels.
# NOTE: precision_score / recall_score treat label 1 as the "positive" class by
# default (pos_label=1). In load_breast_cancer label 1 is 'benign' (check
# data.target_names), so these two scores describe the benign class; pass
# pos_label=0 to score the malignant class instead.
accuracy = accuracy_score(y_test, knn_pred)     # (TP + TN) / total predictions
precision = precision_score(y_test, knn_pred)   # TP / (TP + FP)
recall = recall_score(y_test, knn_pred)         # TP / (TP + FN)
confusion = confusion_matrix(y_test, knn_pred)  # rows = true label, columns = predicted label

# Show the results; without this the cell computes the metrics but displays nothing.
print(confusion)
pd.Series({"Accuracy": accuracy, "Precision": precision, "Recall": recall}, name="KNN")

## Model Evaluation
* A good model will maximize all of these metrics 
### Accuracy
* How many correct predictions (pos and neg) out of total predictions 
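### Precision
* TP / (TP + FP): of everything predicted positive, how many are truly positive
* Low precision means many false alarms (negatives misdiagnosed as positive)
### Recall
* TP / (TP + FN): of all actual positives, how many the model found
* Low recall means many missed cases!
* Note: scikit-learn treats label 1 as "positive" by default; in this dataset label 1 is benign (see `data.target_names`), so pass `pos_label=0` to measure missed malignant cases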

### Confusion Matrix
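* `confusion_matrix(y_true, y_pred)` returns a 2x2 array of counts with rows = true label and columns = predicted label (labels in sorted order, 0 then 1, with label 1 as the positive class): <br>
TN | FP <br>
FN | TP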

In [65]:
# Find the best KNN configuration by sweeping the neighbour count, the search
# algorithm and the weighting scheme, scoring each candidate on the test split.
#
# NOTE(review): the candidates are compared on the same test split that is used
# to report their scores, so the "best" rows are tuned to that split and their
# scores are optimistic. Consider selecting with cross-validation on the
# training data and keeping the test split for the final check only.
algorithms = ['ball_tree', 'kd_tree', 'brute']
weights = ['uniform', 'distance']

# Collect one record per configuration and build the DataFrame once at the end;
# appending rows to a DataFrame inside the loop is slow and yields object dtypes.
records = []
for k in range(1, 50):
    for alg in algorithms:
        for w in weights:
            # Separate names so the baseline `knn_model` / `knn_pred` from the
            # earlier cells are not overwritten by the last candidate tried.
            candidate = KNeighborsClassifier(n_neighbors=k, algorithm=alg, weights=w)
            candidate.fit(X_train, y_train)
            candidate_pred = candidate.predict(X_test)
            a = accuracy_score(y_test, candidate_pred)   # (TP + TN) / total predictions
            p = precision_score(y_test, candidate_pred)  # TP / (TP + FP)
            r = recall_score(y_test, candidate_pred)     # TP / (TP + FN)
            records.append({
                "Neighbors": k,
                "Search Alg": alg,
                "Weights": w,
                "Accuracy": a,
                "Precision": p,
                "Recall": r,
                "Sum": a + p + r,
            })

stats = pd.DataFrame(
    records,
    columns=["Neighbors", "Search Alg", "Weights", "Accuracy", "Precision", "Recall", "Sum"],
)

# idxmax() and boolean masks work with index LABELS, so select with .loc
# (.iloc only gave the same rows because the index happened to be 0..n-1).
max_sum = stats["Sum"].max()
best = stats.loc[stats["Sum"] == max_sum]          # every configuration tied for the top sum
best_p = stats.loc[stats["Precision"].idxmax()]    # first row with the highest precision
best_a = stats.loc[stats["Accuracy"].idxmax()]     # first row with the highest accuracy
best_r = stats.loc[stats["Recall"].idxmax()]       # first row with the highest recall

print(f"The  model with best overall performace is \n{best}")
print(f"The  model with best precision is \n{best_p}")
print(f"The  model with best accuracy is \n{best_a}")
print(f"The  model with best recall is \n{best_r}")

    

The  model with best overall performace is 
    Neighbors Search Alg   Weights  Accuracy  Precision    Recall       Sum
0           1  ball_tree   uniform   0.93007   0.909091  0.989011  2.828172
1           1  ball_tree  distance   0.93007   0.909091  0.989011  2.828172
2           1    kd_tree   uniform   0.93007   0.909091  0.989011  2.828172
3           1    kd_tree  distance   0.93007   0.909091  0.989011  2.828172
4           1      brute   uniform   0.93007   0.909091  0.989011  2.828172
5           1      brute  distance   0.93007   0.909091  0.989011  2.828172
7           2  ball_tree  distance   0.93007   0.909091  0.989011  2.828172
9           2    kd_tree  distance   0.93007   0.909091  0.989011  2.828172
11          2      brute  distance   0.93007   0.909091  0.989011  2.828172
The  model with best precision is 
Neighbors            14
Search Alg    ball_tree
Weights         uniform
Accuracy        0.93007
Precision      0.926316
Recall         0.967033
Sum            2.

### Logistic Regression
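* fits a linear decision boundary (a line / hyperplane) that divides the feature space into 2 regions, one per class; a sigmoid turns the distance from that boundary into a class probability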

In [71]:
# Train a logistic-regression model on the training split.
# max_iter is raised to 3000 to give the solver more iterations to converge.
lr_model = LogisticRegression(max_iter = 3000)
lr_model.fit(X_train, y_train)

# Predict labels for the held-out test split.
lr_pred = lr_model.predict(X_test)

# Score the predictions against the true test labels
# (precision / recall use scikit-learn's default positive class, label 1).
accuracy = accuracy_score(y_test, lr_pred)     # (TP + TN) / total predictions
precision = precision_score(y_test, lr_pred)   # TP / (TP + FP)
recall = recall_score(y_test, lr_pred)         # TP / (TP + FN)
confusion = confusion_matrix(y_test, lr_pred)  # rows = true label, columns = predicted label

# Show the results; without this the cell computes the metrics but displays nothing.
print(confusion)
pd.Series({"Accuracy": accuracy, "Precision": precision, "Recall": recall}, name="Logistic Regression")


### Support Vector Machine (SVM)
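* A linear classifier for high-dimensional data (not a regression): fit a hyperplane that divides the data into 2 segments, chosen to maximize the margin to the nearest points of each class (the support vectors)
* can perform a non-linear transformation (kernel) to get data that is not linearly separable into a shape where a hyperplane can separate it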

In [75]:
# Train a support vector machine with a linear kernel on the training split,
# then predict labels for the held-out test split.
svm_model = SVC(kernel = 'linear')
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)

# Score the predictions against the true test labels
# (precision / recall use scikit-learn's default positive class, label 1).
accuracy = accuracy_score(y_test, svm_pred)     # (TP + TN) / total predictions
precision = precision_score(y_test, svm_pred)   # TP / (TP + FP)
recall = recall_score(y_test, svm_pred)         # TP / (TP + FN)
confusion = confusion_matrix(y_test, svm_pred)  # rows = true label, columns = predicted label

# Show the results; without this the cell computes the metrics but displays nothing.
print(confusion)
pd.Series({"Accuracy": accuracy, "Precision": precision, "Recall": recall}, name="SVM (linear kernel)")
