In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, cross_validate,train_test_split
from sklearn.compose import make_column_transformer
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

In [None]:
# Read both competition CSVs in one pass: `space_df` is the labelled file that
# is split into train/hold-out further down, `spaceTest` is the file the final
# model predicts on.
space_df, spaceTest = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/spaceship-titanic/train.csv',
        '../input/spaceship-titanic/test.csv',
    )
)

In [None]:
# Hold out 20% of the labelled rows; the fixed seed keeps the split reproducible.
train_df, test_df = train_test_split(space_df, random_state=123, test_size=.2)

# Features are every column except the label. The label is kept as a
# one-column DataFrame (double brackets), which is why later cells flatten it
# with `.values.ravel()` before fitting.
X_train = train_df.drop(columns=['Transported'])
y_train = train_df[['Transported']]
X_test = test_df.drop(columns=['Transported'])
y_test = test_df[['Transported']]

In [None]:
# Column groups, by how each one is preprocessed.
numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
categorical_features = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
drop_features = ['PassengerId', 'Name']
target = 'Transported'

# Numeric columns: fill missing values with the column mean, then standardise.
numeric_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories not seen during fit are ignored at
# transform time instead of raising.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'),
)

# Create our preprocessor: one transformer per column group, with the
# identifier-like columns dropped outright.
knn_preprocessor = make_column_transformer(
    (numeric_pipeline, numeric_features),
    (categorical_pipeline, categorical_features),
    ('drop', drop_features),
)

In [None]:
# Per-candidate cross-validation summary: one entry is appended to every list
# for each value of n_neighbors tried below. Re-created on each run of this
# cell, so re-running does not accumulate duplicate rows.
results_dict = {
    "neighbors": [],
    "train_accuracy": [],
    "cv_accuracy": [],
    "fit_time": [],
    "score_time": []
}

# These lists are not filled anywhere in this cell; they are kept only so that
# any other cell referring to these names still finds them.
train_scores = []
test_scores = []
neighbors = []

# Find the best n_neighbors: 5-fold cross-validation for odd k from 1 to 29.
for i in range(1, 30, 2):
    pipe_hyper_knn = make_pipeline(knn_preprocessor, KNeighborsClassifier(n_neighbors=i))
    # No separate .fit() on the full training split here: cross_validate fits
    # its own clones of the pipeline on each fold, so a prior fit would only
    # add run time without affecting the scores.
    scores = cross_validate(
        pipe_hyper_knn,
        X_train,
        y_train.values.ravel(),  # flatten the one-column label frame to 1-D
        cv=5,
        return_train_score=True,
    )

    # Record the mean over the five folds for this candidate.
    results_dict["neighbors"].append(i)
    results_dict['train_accuracy'].append(np.mean(scores['train_score']))
    results_dict['cv_accuracy'].append(np.mean(scores['test_score']))
    results_dict['fit_time'].append(np.mean(scores['fit_time']))
    results_dict['score_time'].append(np.mean(scores['score_time']))

In [None]:
# Show the search results as a table: one row per entry, one column per key of results_dict.
pd.DataFrame.from_dict(results_dict)

In [None]:
# Index the results by the number of neighbours so the plot's x-axis is k.
results_df = pd.DataFrame(results_dict).set_index("neighbors")

# Plot training accuracy against cross-validation accuracy, then show the
# full table (including the timing columns) underneath.
accuracy_columns = ["train_accuracy", "cv_accuracy"]
results_df.loc[:, accuracy_columns].plot()
display(results_df)

In [None]:
# Best n_neighbors is 25
# Rebuild the pipeline with that value and fit it on the whole training split.
pipe_hyper_knn = make_pipeline(knn_preprocessor, KNeighborsClassifier(n_neighbors=25))
pipe_hyper_knn.fit(X_train, y_train.values.ravel())

# Predictions on the hold-out split.
X_test_predictions = pipe_hyper_knn.predict(X_test)

# `modelScores` is not created by any cell visible in this notebook, so on a
# fresh kernel the assignment below would raise NameError. Create it only when
# it is missing, so a dict built by another cell is reused rather than reset.
if 'modelScores' not in globals():
    modelScores = {}

# Accuracy of the fitted pipeline on the hold-out split.
modelScores['kNN with Hyperparameter'] = pipe_hyper_knn.score(X_test, y_test)
display(modelScores['kNN with Hyperparameter'])

# Predictions for the competition test file, written out by the next cell.
kNNPredictions = pipe_hyper_knn.predict(spaceTest)

In [None]:
# Save our results in the proper format: one row per passenger, pairing each
# PassengerId from the competition test file with its predicted label.
submission_columns = ['PassengerId', 'Transported']
result = zip(spaceTest['PassengerId'], pd.Series(kNNPredictions))
out = pd.DataFrame(result, columns=submission_columns)
out = out.sort_values('PassengerId')

# Write without the DataFrame index so the file holds only the two columns.
out.to_csv('./knn_with_25_hyperparameter.csv', index=False)

# Quick summary of what was written, shown as the cell output.
out.describe()