In [3]:
import numpy as np
import pandas as pd 
import scipy.spatial.distance as dist
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification

## data clean

In [None]:
df = pd.read_excel('data/Flash Point and Cetane Number Predictions for Fuel Compounds.xls', skiprows=4)

In [63]:
df = df.rename(index=str, columns={"Unnamed: 0": "name", 
                                   "Unnamed: 1": "family", 
                                   "Unnamed: 2": "SMILES", 
                                   "Unnamed: 3": "FP_set", 
                                   "Unnamed: 4": "FP_exp", 
                                   "Unnamed: 5": "FP_calc", 
                                   "Unnamed: 6": "FP_std", 
                                   "Unnamed: 7": "CN_set", 
                                   "Unnamed: 8": "CN_exp", 
                                   "Unnamed: 9": "CN_calc", 
                                   "Unnamed: 10": "CN_std", })
X = df.iloc[:,12:39]
y, labels=pd.factorize(df.family)


train_fp = df.loc[df['FP_set']=='Training']
X_train_fp = train_fp.iloc[:,12:39]
y_train_fp, y_train_labels_fp = pd.factorize(train_fp.family)

test_fp = df.loc[df['FP_set']=='Validation']
X_test_fp = test_fp.iloc[:,12:39]
y_test_fp, y_test_labels_fp = pd.factorize(test_fp.family)


In [67]:
k = 5
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X, y)
y_pred_fp = knn.predict(X_test_fp)

In [68]:
y_pred_fp

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2, 1, 5, 3, 3, 5, 3, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 6, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7], dtype=int64)

In [69]:
y_test_fp

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7], dtype=int64)

In [71]:
accu = metrics.accuracy_score(y_test_fp, y_pred_fp)
accu

0.93495934959349591

### define functions

In [76]:
def choose_k(X_train, y_train, X_test, y_test, list_of_k):
    dic = {}
    for k in list_of_k:
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X, y)
        y_pred = knn.predict(X_test)
        accu = metrics.accuracy_score(y_test, y_pred)
        dic['k value: %s, accuracy' % k] = accu
    return dic

In [78]:
k=[1,2,3,4,5]
choose_k(X, y, X_test_fp, y_test_fp, k)

{'k value: 1, accuracy': 1.0,
 'k value: 2, accuracy': 0.98373983739837401,
 'k value: 3, accuracy': 0.95934959349593496,
 'k value: 4, accuracy': 0.95934959349593496,
 'k value: 5, accuracy': 0.93495934959349591}