In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [24]:
# Read the dataset and preprocess as described
full_train = pd.read_csv("datasets/train_clean_pca_transformed.csv")
full_test = pd.read_csv("datasets/test_clean_pca_transformed.csv")

target_col = "Survived"
X = full_train.drop(target_col, axis=1)
y = full_train[target_col]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# X_train, y_train = X, y

# Create the KNN Classifier
knn_model = KNeighborsClassifier()

# # Define the parameter grid for GridSearchCV
# param_grid = {
#     'n_neighbors': [3, 5, 7, 9],
#     'weights': ['uniform', 'distance'],
#     'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
#     'p': [1, 2]  # Manhattan or Euclidean distance
# }

param_grid = {
    "n_neighbors": [3, 4, 5, 6],  # Increase the range of neighbors to consider
    "weights": ["uniform", 'distance'],
    "algorithm": ["ball_tree", "kd_tree", "brute"],
    "p": [1, 2, 3],  # Add more values for p parameter for different distance measures
    "leaf_size": [
        1, 2, 3, 4,
        5,
    ],  # Explore different leaf sizes for BallTree or KDTree
    "metric": [
        "manhattan",
        "euclidean",
        "chebyshev",
        "minkowski",
    ],  # Include different distance metrics
    "n_jobs": [-1],  # Utilize all processors
}

# param_grid = {'algorithm': ['ball_tree'], 'leaf_size': [5], 'metric': ['euclidean'], 'n_jobs': [-1], 'n_neighbors': [4], 'p': [1], 'weights': ['uniform']}
    

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Create the GridSearchCV object
grid_search = GridSearchCV(
    knn_model, param_grid, n_jobs=1, cv=skf, scoring="accuracy", verbose=4
)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)

print(f"Best Parameters: {best_params}")
# print(f"Best Score: {grid_search.best_score_:.4f}")
print(f"Accuracy on Test Set: {accuracy:.4f}")

Fitting 5 folds for each of 1440 candidates, totalling 7200 fits
[CV 1/5] END algorithm=ball_tree, leaf_size=1, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.811 total time=   0.0s
[CV 2/5] END algorithm=ball_tree, leaf_size=1, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.789 total time=   0.0s
[CV 3/5] END algorithm=ball_tree, leaf_size=1, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.831 total time=   0.0s
[CV 4/5] END algorithm=ball_tree, leaf_size=1, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.789 total time=   0.0s
[CV 5/5] END algorithm=ball_tree, leaf_size=1, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.817 total time=   0.0s
[CV 1/5] END algorithm=ball_tree, leaf_size=1, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=distance;, score=0.797 total time=   0.0s
[CV 2/5] END algorithm=ball_tree, leaf_size=1, metric=manhattan,

In [18]:
import joblib

joblib.dump(grid_search, 'grid_search_model.joblib')

['grid_search_model.joblib']

In [19]:
loaded_grid_search = joblib.load('grid_search_model.joblib')

In [20]:
for k, v in loaded_grid_search.cv_results_.items():
    print(k, v, sep="\t")

mean_fit_time	[0.00259852 0.00340066 0.00240078 ... 0.00199919 0.00160041 0.00179977]
std_fit_time	[0.00080121 0.00049068 0.00049103 ... 0.00063264 0.00048932 0.0003998 ]
mean_score_time	[0.02000017 0.01799951 0.01819839 ... 0.00760026 0.02439957 0.02340107]
std_score_time	[0.00442763 0.00209694 0.00337052 ... 0.00119998 0.00272765 0.00195884]
param_algorithm	['ball_tree' 'ball_tree' 'ball_tree' ... 'brute' 'brute' 'brute']
param_leaf_size	[5 5 5 ... 25 25 25]
param_metric	['manhattan' 'manhattan' 'manhattan' ... 'minkowski' 'minkowski'
 'minkowski']
param_n_jobs	[-1 -1 -1 ... -1 -1 -1]
param_n_neighbors	[3 3 3 ... 6 6 6]
param_p	[1 1 2 ... 2 3 3]
param_weights	['uniform' 'distance' 'uniform' ... 'distance' 'uniform' 'distance']
params	[{'algorithm': 'ball_tree', 'leaf_size': 5, 'metric': 'manhattan', 'n_jobs': -1, 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}, {'algorithm': 'ball_tree', 'leaf_size': 5, 'metric': 'manhattan', 'n_jobs': -1, 'n_neighbors': 3, 'p': 1, 'weights': 'distan

In [21]:
real_test = pd.read_csv("datasets/test_clean_pca_transformed.csv")

In [22]:
pred = pd.DataFrame(best_model.predict(real_test.drop(columns=["PassengerId"]))).rename(columns={0: "Survived"})

In [23]:
pd.concat([real_test[["PassengerId"]], pred], axis=1).to_csv("KNN_v1.csv", index=None)