K Nearest Neighbour

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.neighbors import KNeighborsClassifier
import joblib

In [3]:
# Load the exoplanet table (path is relative to the notebook's directory)
data = pd.read_csv("../Resources/exoplanet_data.csv")

# Drop columns that contain no values at all
data = data.dropna(axis='columns', how='all')

# Drop every row that still has at least one missing value
data = data.dropna()

# Convert dtypes of int64 to float64 with a single astype call, rather than
# re-copying the whole frame once per integer column
data = data.astype(
    {name: 'float64' for name, dtype in data.dtypes.items() if dtype == 'int64'}
)

In [4]:
# Target: the disposition label; features: every remaining column
y = data["koi_disposition"]
X = data.drop(columns=["koi_disposition"])

# Stratified split on the label with a fixed seed (default split proportions)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

In [5]:
# Fit the min-max scaler on the training features only, then apply that
# same fitted scaling to both splits
X_scaler = MinMaxScaler()
X_scaler.fit(X_train)
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

Train the Model

In [6]:
# Accuracy on each split for every odd k from 1 to 29
train_scores, test_scores = [], []

# NOTE: k, knn, train_score and test_score stay bound after the loop and are
# read by later cells, so they must keep these names.
for k in range(1, 30, 2):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)

    train_score = knn.score(X_train_scaled, y_train)
    test_score = knn.score(X_test_scaled, y_test)

    train_scores += [train_score]
    test_scores += [test_score]

In [7]:
# NOTE(review): k, train_score and test_score are the values left over from
# the final loop iteration, so only the last k (29) is reported here; the
# per-k results live in train_scores / test_scores.
train_report = f"k: {k},  Training Data Score: {train_score:.3f}"
test_report = f"k: {k},  Testing Data Score: {test_score:.3f}"
print(train_report)
print(test_report)

k: 29,  Training Data Score: 0.832
k: 29,  Testing Data Score: 0.824


In [12]:
# Candidate neighbour counts: the odd values from 3 to 21
param_grid = {"n_neighbors": list(range(3, 22, 2))}

# Two searches over the same grid, both built on the `knn` estimator left over
# from the k sweep. `tuned_model` relies on the GridSearchCV defaults and is
# fitted in the next cell; `grid` asks for 5 folds explicitly and n_jobs=-1.
tuned_model = GridSearchCV(estimator=knn, param_grid=param_grid, verbose=3)
grid = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=3,
)

# Train the model with GridSearch; binding the result to `_` keeps the fitted
# estimator's repr out of the cell output
_ = grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END .....................n_neighbors=3;, score=0.800 total time=   0.4s
[CV 2/5] END .....................n_neighbors=3;, score=0.805 total time=   0.4s
[CV 4/5] END .....................n_neighbors=3;, score=0.812 total time=   0.5s
[CV 5/5] END .....................n_neighbors=3;, score=0.814 total time=   0.5s
[CV 3/5] END .....................n_neighbors=3;, score=0.815 total time=   0.5s
[CV 1/5] END .....................n_neighbors=5;, score=0.809 total time=   0.5s
[CV 2/5] END .....................n_neighbors=5;, score=0.812 total time=   0.5s
[CV 3/5] END .....................n_neighbors=5;, score=0.812 total time=   0.6s
[CV 1/5] END .....................n_neighbors=7;, score=0.826 total time=   0.6s
[CV 3/5] END .....................n_neighbors=7;, score=0.816 total time=   0.6s
[CV 5/5] END .....................n_neighbors=5;, score=0.816 total time=   0.6s
[CV 4/5] END .....................n_neighbors=5;

In [13]:
# Fit the second grid search (the one created without cv/n_jobs arguments) on
# the scaled training data. As the last expression of the cell, the fitted
# GridSearchCV object is also displayed as the cell's output.
# NOTE(review): this repeats the search already run through `grid`, and
# `tuned_model` is rebound to a plain classifier in a later cell.
tuned_model.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END .....................n_neighbors=3;, score=0.800 total time=   0.1s
[CV 2/5] END .....................n_neighbors=3;, score=0.805 total time=   0.1s
[CV 3/5] END .....................n_neighbors=3;, score=0.815 total time=   0.1s
[CV 4/5] END .....................n_neighbors=3;, score=0.812 total time=   0.1s
[CV 5/5] END .....................n_neighbors=3;, score=0.814 total time=   0.1s
[CV 1/5] END .....................n_neighbors=5;, score=0.809 total time=   0.1s
[CV 2/5] END .....................n_neighbors=5;, score=0.812 total time=   0.1s
[CV 3/5] END .....................n_neighbors=5;, score=0.812 total time=   0.1s
[CV 4/5] END .....................n_neighbors=5;, score=0.820 total time=   0.1s
[CV 5/5] END .....................n_neighbors=5;, score=0.816 total time=   0.1s
[CV 1/5] END .....................n_neighbors=7;, score=0.826 total time=   0.1s
[CV 2/5] END .....................n_neighbors=7;

GridSearchCV(estimator=KNeighborsClassifier(n_neighbors=29),
             param_grid={'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19, 21]},
             verbose=3)

In [24]:
# Best neighbour count found by the cross-validated search
n_neighbors = grid.best_params_['n_neighbors']

# Tuned model: GridSearchCV (refit=True by default) has already retrained the
# winning configuration on the whole of X_train_scaled / y_train, so reuse that
# estimator instead of fitting an identical KNeighborsClassifier a second time.
tuned_model = grid.best_estimator_

# Scores expressed as percentages, rounded to three decimals
model_3_training_score = round(tuned_model.score(X_train_scaled, y_train) * 100, 3)
tuned_accuracy = round(tuned_model.score(X_test_scaled, y_test) * 100, 3)

print(f"Training Data Score: {model_3_training_score} %")
print(f"Testing Data Score: {tuned_accuracy} %")

Training Data Score: 86.515 %
Testing Data Score: 82.723 %


In [25]:
# Persist the tuned classifier next to the other saved models; the path is
# relative to the notebook's directory
filename = '../Models/KNN.sav'

# joblib.dump returns a value; bind it to `_` so it is not echoed as the
# cell output
_ = joblib.dump(value=tuned_model, filename=filename)