# Import Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pandas is used to load the dataset; NumPy and Matplotlib are imported here for use later in the notebook.

In [None]:
# Load the dataset from 'data.csv' (a relative path, resolved against the
# kernel's current working directory) into a DataFrame.
df = pd.read_csv('data.csv')

In [None]:
# Bare expression as the last line of the cell: the notebook renders the DataFrame.
df

Display the data table.

# Split Dataset

In [None]:
# Feature matrix: every column from 'MDVP:Fo(Hz)' through 'PPE'. Label-based
# slices include both endpoints and everything between them, so the target
# column 'status' is dropped explicitly: if it sits inside that range it would
# otherwise leak the label into the features and inflate every accuracy
# computed from x. errors='ignore' makes the drop a no-op when 'status' lies
# outside the slice.
# NOTE(review): in the standard UCI Parkinsons column order 'status' is between
# 'HNR' and 'RPDE', i.e. inside this slice -- confirm against data.csv.
x = df.loc[:, 'MDVP:Fo(Hz)':'PPE'].drop(columns='status', errors='ignore').values
# Target vector: the 'status' label column.
y = df.loc[:, 'status'].values

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as a test set. Stratifying on y keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    stratify=y,
    random_state=6000,
)

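Split the data into an 80% training set and a 20% stratified hold-out test set for training a k-NN classifier. The training split is used later for cross-validation (grid search, learning curve and validation curve), while the test split is used for the hold-out accuracy estimate.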

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

In [None]:
# Three-step pipeline: standardize the features, project them onto
# 2 principal components, then classify with k-nearest neighbors.
pipe_K = make_pipeline(
    StandardScaler(),
    PCA(n_components=2),
    KNeighborsClassifier(),
)

In [None]:
# Fit the whole pipeline on the training split and predict the held-out samples
# (fit returns the fitted pipeline itself, so the calls can be chained).
y_predict = pipe_K.fit(x_train, y_train).predict(x_test)
# Mean accuracy on the held-out test split.
print('TEST ACCURACY: %.3f' % pipe_K.score(x_test, y_test))

Fit `pipe_K` (the pipeline built with `make_pipeline`) on the training data and report its accuracy on the held-out test set.

In [None]:
# GridSearchCV lives in sklearn.model_selection. The old sklearn.grid_search
# module was deprecated in scikit-learn 0.18 and removed in 0.20, so importing
# from it raises ModuleNotFoundError on current releases.
from sklearn.model_selection import GridSearchCV

# Candidate neighbor counts to evaluate.
k = [4,5,6,7,8,9,10]
# make_pipeline names each step after its lowercased class name, hence the
# 'kneighborsclassifier__' prefix that routes the parameter to the k-NN step.
param_grid = [{'kneighborsclassifier__n_neighbors': k}]

# 5-fold cross-validated search over k, scored by accuracy.
grids = GridSearchCV(estimator=pipe_K, param_grid=param_grid, scoring='accuracy', cv = 5)

# Search on the training split only; the test split stays untouched.
grids = grids.fit(x_train, y_train)
print(grids.best_score_)
print(grids.best_params_)

Input candidate k values (4 through 10) into the k-nearest neighbors classifier. Then, using grid search with 5-fold cross-validation, identify the optimal k.

In [None]:
from sklearn.model_selection import learning_curve
# 5-fold cross-validated learning curve for pipe_K on the training split,
# evaluated at ten training-set fractions from 10% to 100%.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=pipe_K,
    X=x_train,
    y=y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=5,
    n_jobs=1,
)

# Mean and standard deviation along axis 1 for each training-set size.
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)

# Training accuracy: solid blue line with circle markers, plus a +/- 1 std band.
plt.plot(train_sizes, train_mean, color='blue', marker='o',
         markersize=5, label='Training Accuracy')
plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                 alpha=0.15, color='blue')

# Validation accuracy: dashed green line with square markers, plus a +/- 1 std band.
plt.plot(train_sizes, test_mean, color='green', linestyle='--', marker='s',
         markersize=5, label='Validation Accuracy')
plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                 alpha=0.15, color='green')

# Axes cosmetics; the y-range is fixed to 0.75-0.95.
plt.grid()
plt.xlabel('Number of Training Samples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.75, 0.95])
plt.tight_layout()
plt.show()

The code above plots the learning curve using matplotlib. Training accuracy and validation accuracy are drawn in two different colors and marker types, with a dashed line for the validation accuracy. The `fill_between` function then shades a band of ±1 standard deviation (across the 5 cross-validation folds) around each mean curve. I limited the y scale in order to show all data points without including extra space.

In [None]:
from sklearn.model_selection import validation_curve

# 5-fold cross-validated validation curve: score pipe_K on the training split
# for each candidate neighbor count in k.
train_scores, test_scores = validation_curve(
    estimator=pipe_K,
    X=x_train,
    y=y_train,
    param_name='kneighborsclassifier__n_neighbors',
    param_range=k,
    cv=5,
)

# Mean and standard deviation along axis 1 for each value of k.
train_mean = train_scores.mean(axis=1)
train_std = train_scores.std(axis=1)
test_mean = test_scores.mean(axis=1)
test_std = test_scores.std(axis=1)

# Training accuracy: solid blue line with circle markers, plus a +/- 1 std band.
plt.plot(k, train_mean, color='blue', marker='o',
         markersize=5, label='Training Accuracy')
plt.fill_between(k, train_mean + train_std, train_mean - train_std,
                 alpha=0.15, color='blue')

# Validation accuracy: dashed green line with square markers, plus a +/- 1 std band.
plt.plot(k, test_mean, color='green', linestyle='--', marker='s',
         markersize=5, label='Validation Accuracy')
plt.fill_between(k, test_mean + test_std, test_mean - test_std,
                 alpha=0.15, color='green')

# Axes cosmetics; the y-range is fixed to 0.8-0.95.
plt.grid()
plt.legend(loc='lower right')
plt.xlabel('K Nearest Neighbors')
plt.ylabel('Accuracy')
plt.ylim([0.8, 0.95])
plt.tight_layout()
plt.show()

The code above plots the validation curve using matplotlib. Training accuracy and validation accuracy are drawn in two different colors and marker types, with a dashed line for the validation accuracy. The `fill_between` function then shades a band of ±1 standard deviation (across the 5 cross-validation folds) around each mean curve. I limited the y scale in order to show all data points without including extra space.