[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)

In [264]:
import pandas as pd
import numpy as np
import math

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# My own implementation of K Nearest Neighbors

An implementation of KNN from scratch. The metric of proximity used is Euclidean distance.

In [232]:
iris = pd.read_csv('iris.csv', names = column_names).values
def euclidean_distance(train, test):
    return math.sqrt(sum((train - test) ** 2))


def get_neighbors(train, labels, test_vals, k_val):
    distances = []
    for i in range(len(train)):
        train_vals = train[i]
        train_label = labels[i]
        distance = euclidean_distance(train_vals, test_vals)
        distances.append((train_vals, train_label, distance))
    distances.sort(key = lambda x: x[2])
    neighbors = distances[:k_val]
    return neighbors  


def predict_label(test_neighbors):
    label_count = {}
    for vals in test_neighbors:
        neighbor_vals = vals[0]
        neighbor_label = vals[1]
        if neighbor_label not in label_count:
            label_count[neighbor_label] = 1
        else:
            label_count[neighbor_label] += 1
    prediction = max(label_count, key = label_count.get)
    return prediction


def print_accuracy(predictions, actual, k_val):
    num_correct = 0
    for i in range(len(predictions)):
        predicted_label = predictions[i]
        actual_label = actual[i]
        if predicted_label == actual_label:
            num_correct += 1
    accuracy = str((float(num_correct)/len(actual)) * 100) + '%'
    print("With a k-value of", k_val, "we classified", num_correct, "labels accurately out of", len(actual))
    print("Accuracy of: ", accuracy)

    

def knn(df, k_val):
    X = df[:, :-1]
    y = df[:, 4]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, stratify = y)
    predictions = []
    for vals in X_test:
        neighbors = get_neighbors(X_train, y_train, vals, k_val)
        predicted_label = predict_label(neighbors)
        predictions.append(predicted_label)    
    print_accuracy(predictions, y_test, k_val)
        
        
knn(iris, int(input("K value: "))) 

K value: 21
With a k-value of 21 we classified 29 labels accurately out of 30
Accuracy of:  96.66666666666667%


# Comparison of sklearn classification methods

## 1. K Nearest Neighbors

In [177]:
iris = pd.read_csv('iris.csv', names = column_names)
X = iris.iloc[:, :-1].values  
y = iris.iloc[:, 4].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state = 0)
classifier = KNeighborsClassifier(n_neighbors=5, metric = 'euclidean')  
classifier.fit(X_train, y_train)
predictions = classifier.predict(X_test)
print_accuracy(predictions, y_test, 5)

With a k-value of 5 we classified 29 labels accurately out of 30
Accuracy of:  96.66666666666667%


In [193]:
print(predictions)

['Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-virginica'
 'Iris-setosa' 'Iris-virginica' 'Iris-setosa' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-virginica' 'Iris-setosa'
 'Iris-versicolor' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-virginica' 'Iris-versicolor' 'Iris-setosa' 'Iris-setosa'
 'Iris-virginica' 'Iris-setosa' 'Iris-setosa' 'Iris-versicolor'
 'Iris-versicolor' 'Iris-setosa']


In [195]:
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[11  0  0]
 [ 0 12  1]
 [ 0  0  6]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       1.00      0.92      0.96        13
 Iris-virginica       0.86      1.00      0.92         6

    avg / total       0.97      0.97      0.97        30



## 2. SVC

In [263]:
svm = SVC(kernel ='rbf', random_state= 0, gamma=0.10,C=1.0)
svm.fit(X_train, y_train)
print("Train Accuracy: ", svm.score(X_train, y_train))

Train Accuracy:  0.9666666666666667


In [200]:
predictions = svm.predict(X_test)
predictions

array(['Iris-virginica', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-virginica', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-setosa'], dtype=object)

In [201]:
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[11  0  0]
 [ 0 13  0]
 [ 0  0  6]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       1.00      1.00      1.00        13
 Iris-virginica       1.00      1.00      1.00         6

    avg / total       1.00      1.00      1.00        30



## 3. Random Forests

In [262]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)
print("Train Accuracy: ", random_forest.score(X_train, y_train))

random_forest.fit(X_test, y_test)
print("Test Accuracy: ", random_forest.score(X_test, y_test))

Train Accuracy:  1.0
Test Accuracy:  1.0


In [229]:
predictions = random_forest.predict(X_test)
predictions

array(['Iris-virginica', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-virginica', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-setosa'], dtype=object)

In [230]:
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[11  0  0]
 [ 0 13  0]
 [ 0  0  6]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       1.00      1.00      1.00        13
 Iris-virginica       1.00      1.00      1.00         6

    avg / total       1.00      1.00      1.00        30



## 4. MLPClassifier

In [244]:
mlpc = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)

In [245]:
mlpc.fit(X_train, y_train)

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 2), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [260]:
predictions = mlpc.predict(X_test)
predictions

array(['Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica',
       'Iris-virginica', 'Iris-virginica', 'Iris-virginica'], dtype='<U15')

In [261]:
print("Train Accuracy: ", mlpc.score(X_train, y_train))
print("Test Accuracy: ", mlpc.score(X_test, y_test))

Train Accuracy:  0.36666666666666664
Test Accuracy:  0.2


In [249]:
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))

[[ 0  0 11]
 [ 0  0 13]
 [ 0  0  6]]
                 precision    recall  f1-score   support

    Iris-setosa       0.00      0.00      0.00        11
Iris-versicolor       0.00      0.00      0.00        13
 Iris-virginica       0.20      1.00      0.33         6

    avg / total       0.04      0.20      0.07        30



  'precision', 'predicted', average, warn_for)


## 5. Logistic Regression

In [253]:
log_regression = LogisticRegression(C=1e5, solver='lbfgs', multi_class='multinomial')
log_regression.fit(X_train, y_train)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='multinomial', n_jobs=1, penalty='l2',
          random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
          warm_start=False)

In [254]:
predictions = log_regression.predict(X_test)
predictions

array(['Iris-virginica', 'Iris-versicolor', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-virginica', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-virginica', 'Iris-versicolor', 'Iris-setosa', 'Iris-setosa',
       'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-setosa'], dtype=object)

In [255]:
print(confusion_matrix(y_test,predictions))  
print(classification_report(y_test, predictions))

[[11  0  0]
 [ 0 13  0]
 [ 0  0  6]]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        11
Iris-versicolor       1.00      1.00      1.00        13
 Iris-virginica       1.00      1.00      1.00         6

    avg / total       1.00      1.00      1.00        30



In [258]:
print("Train Accuracy: ", log_regression.score(X_train, y_train))
print("Test Accuracy: ", log_regression.score(X_test, y_test))

Train Accuracy:  0.9833333333333333
Test Accuracy:  1.0


## 6. Stochastic Gradient Descent

In [265]:
clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [266]:
predictions = clf.predict(X_test)
predictions

array(['Iris-virginica', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-virginica',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor',
       'Iris-versicolor', 'Iris-versicolor', 'Iris-versicolor'],
      dtype='<U15')

In [267]:
print(confusion_matrix(y_test,predictions))  
print(classification_report(y_test, predictions))

[[ 0 11  0]
 [ 0 13  0]
 [ 0  4  2]]
                 precision    recall  f1-score   support

    Iris-setosa       0.00      0.00      0.00        11
Iris-versicolor       0.46      1.00      0.63        13
 Iris-virginica       1.00      0.33      0.50         6

    avg / total       0.40      0.50      0.37        30



  'precision', 'predicted', average, warn_for)


In [268]:
print("Train Accuracy: ", clf.score(X_train, y_train))
print("Test Accuracy: ", clf.score(X_test, y_test))

Train Accuracy:  0.31666666666666665
Test Accuracy:  0.5
