In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [17]:
# Read the dataset and preprocess as described
full_train = pd.read_csv("datasets/train_clean_pca_transformed.csv")
full_test = pd.read_csv("datasets/test_clean_pca_transformed.csv")

target_col = "Survived"
X = full_train.drop(target_col, axis=1)
y = full_train[target_col]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# X_train, y_train = X, y

# Create the KNN Classifier
knn_model = KNeighborsClassifier()

# # Define the parameter grid for GridSearchCV
# param_grid = {
#     'n_neighbors': [3, 5, 7, 9],
#     'weights': ['uniform', 'distance'],
#     'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
#     'p': [1, 2]  # Manhattan or Euclidean distance
# }

param_grid = {
    "n_neighbors": [3, 4, 5, 6, 7, 8, 9, 10, 11],  # Increase the range of neighbors to consider
    "weights": ["uniform", 'distance'],
    "algorithm": ["ball_tree", "kd_tree", "brute"],
    "p": [1, 2, 3, 4, 5],  # Add more values for p parameter for different distance measures
    "leaf_size": [
        5,
        10,
        15,
        20,
        25,
    ],  # Explore different leaf sizes for BallTree or KDTree
    "metric": [
        "manhattan",
        "euclidean",
        "chebyshev",
        "minkowski",
    ],  # Include different distance metrics
    "n_jobs": [-1],  # Utilize all processors
}

# Create the GridSearchCV object
grid_search = GridSearchCV(
    knn_model, param_grid, n_jobs=1, cv=7, scoring="accuracy", verbose=4
)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)

print(f"Best Parameters: {best_params}")
# print(f"Best Score: {grid_search.best_score_:.4f}")
print(f"Accuracy on Test Set: {accuracy:.4f}")

Fitting 7 folds for each of 5400 candidates, totalling 37800 fits
[CV 1/7] END algorithm=ball_tree, leaf_size=5, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.804 total time=   0.0s
[CV 2/7] END algorithm=ball_tree, leaf_size=5, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.784 total time=   0.0s
[CV 3/7] END algorithm=ball_tree, leaf_size=5, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.873 total time=   0.0s
[CV 4/7] END algorithm=ball_tree, leaf_size=5, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.824 total time=   0.0s
[CV 5/7] END algorithm=ball_tree, leaf_size=5, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.762 total time=   0.0s
[CV 6/7] END algorithm=ball_tree, leaf_size=5, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.861 total time=   0.0s
[CV 7/7] END algorithm=ball_tree, leaf_size=5, metric=manhattan,

In [1]:
import joblib

joblib.dump(grid_search, 'grid_search_model.joblib')

NameError: name 'grid_search' is not defined

In [5]:
loaded_grid_search = joblib.load('grid_search_model.joblib')

In [6]:
for k, v in loaded_grid_search.cv_results_.items():
    print(k, v, sep="\t")

mean_fit_time	[0.00260191 0.00199995 0.00220642 0.00159397 0.00199981 0.0019989
 0.00200057 0.00139914 0.00220017 0.00200047 0.00159974 0.00220118
 0.0018034  0.002      0.00200253 0.00159864 0.00199962 0.00219984
 0.0026021  0.00200028 0.00219979 0.00240245 0.00260797 0.00219979
 0.00239968 0.00199971 0.00179873 0.00180082 0.00220098 0.00180087
 0.00160146 0.00200028 0.00219889 0.00200005 0.00200028 0.00219936
 0.00219846 0.00200229 0.00219936 0.00200052 0.00200248 0.001998
 0.00180178 0.00180006 0.00198932 0.00120525 0.00259995 0.00240588
 0.00199895 0.00199833 0.00179987 0.00180011 0.00160489 0.00199637
 0.00220027 0.00180001 0.00240107 0.00180044 0.00220046 0.00240092
 0.0015995  0.0022006  0.00159945 0.00179434 0.00199723 0.00179553
 0.00199995 0.00199881 0.00159845 0.00180054 0.00180283 0.00220022
 0.00200515 0.00199747 0.00220141 0.00120058 0.00199981 0.00199556
 0.002      0.00140009 0.00200014 0.00199938 0.00179863 0.00159926
 0.00159984 0.00219822 0.00159955 0.00159445 0.0018

In [13]:
real_test = pd.read_csv("datasets/test_clean_pca_transformed.csv")

In [14]:
pred = pd.DataFrame(best_model.predict(real_test.drop(columns=["PassengerId"]))).rename(columns={0: "Survived"})

In [15]:
pd.concat([real_test[["PassengerId"]], pred], axis=1).to_csv("KNN_v1.csv", index=None)