In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

{0: 'O', 1: 'C', 2: 'B', 3: 'E', 4: 'D', 5: 'A', 6: 'F', 7: 'G', 8: 'T'}

In [77]:
# Read the dataset and preprocess as described
full_train = pd.read_csv("datasets/train_clean.csv")

deck_map = pd.DataFrame(full_train["cabin_highest_class_deck"].value_counts().index)
deck_map["index"] = deck_map.index

n_to_deck = deck_map.to_dict()[0]
deck_to_n = {v: k for k, v in n_to_deck.items()}

full_train["cabin_highest_class_deck"] = full_train["cabin_highest_class_deck"].map(
    deck_to_n
)
to_drop = [
    "Name",
    "Ticket",
    "Embarked",
    "PassengerId",
    *[c for c in full_train if "cabin" in c.lower() and c != "cabin_highest_class_deck"],
]

full_train.drop(columns=to_drop, inplace=True)
target_col = "Survived"
X = full_train.drop(target_col, axis=1)
y = full_train[target_col]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# X_train, y_train = X, y

# Create the KNN Classifier
knn_model = KNeighborsClassifier()

# # Define the parameter grid for GridSearchCV
# param_grid = {
#     'n_neighbors': [3, 5, 7, 9],
#     'weights': ['uniform', 'distance'],
#     'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
#     'p': [1, 2]  # Manhattan or Euclidean distance
# }

param_grid = {
    "n_neighbors": [3, 4, 5, 6, 7, 8, 9, 10, 11],  # Increase the range of neighbors to consider
    "weights": ["uniform", 'distance'],
    "algorithm": ["ball_tree"],
    "p": [1, 2, 3, 4, 5],  # Add more values for p parameter for different distance measures
    "leaf_size": [
        5,
        10,
        15,
        20,
        25,
    ],  # Explore different leaf sizes for BallTree or KDTree
    "metric": [
        "manhattan",
    ],  # Include different distance metrics
    "n_jobs": [-1],  # Utilize all processors
}

# Create the GridSearchCV object
grid_search = GridSearchCV(
    knn_model, param_grid, n_jobs=1, cv=5, scoring="accuracy", verbose=4
)

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, y_pred)

print(f"Best Parameters: {best_params}")
# print(f"Best Score: {grid_search.best_score_:.4f}")
print(f"Accuracy on Test Set: {accuracy:.4f}")

Fitting 5 folds for each of 450 candidates, totalling 2250 fits
[CV 1/5] END algorithm=ball_tree, leaf_size=5, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.797 total time=   3.0s
[CV 2/5] END algorithm=ball_tree, leaf_size=5, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.803 total time=   0.0s
[CV 3/5] END algorithm=ball_tree, leaf_size=5, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.838 total time=   0.0s
[CV 4/5] END algorithm=ball_tree, leaf_size=5, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.824 total time=   0.0s
[CV 5/5] END algorithm=ball_tree, leaf_size=5, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=uniform;, score=0.803 total time=   0.0s
[CV 1/5] END algorithm=ball_tree, leaf_size=5, metric=manhattan, n_jobs=-1, n_neighbors=3, p=1, weights=distance;, score=0.783 total time=   0.0s
[CV 2/5] END algorithm=ball_tree, leaf_size=5, metric=manhattan, 

In [78]:
import joblib

joblib.dump(grid_search, 'grid_search_model.joblib')

['grid_search_model.joblib']

In [79]:
loaded_grid_search = joblib.load('grid_search_model.joblib')

In [80]:
for k, v in loaded_grid_search.cv_results_.items():
    print(k, v, sep="\t")

mean_fit_time	[0.00380111 0.00239954 0.00239987 0.00259843 0.0027987  0.00300021
 0.00280151 0.00259933 0.00259838 0.00259886 0.00299811 0.00299592
 0.00219898 0.00299988 0.00259991 0.0027997  0.00239954 0.00199809
 0.00259957 0.0026011  0.00340667 0.0025991  0.00260315 0.00319958
 0.00280104 0.00259767 0.00260129 0.00220013 0.00260687 0.00239582
 0.00259895 0.00299788 0.00340042 0.00299244 0.00240149 0.0033987
 0.00260167 0.00360346 0.00299883 0.00340233 0.00219941 0.00280232
 0.00259881 0.00319896 0.00299892 0.00299835 0.00260005 0.00219736
 0.00220747 0.00379891 0.00259967 0.00279965 0.00260091 0.00259895
 0.00239978 0.0024003  0.00200028 0.00279827 0.00239992 0.00280008
 0.00219903 0.0021986  0.00259805 0.00299916 0.00280228 0.00239234
 0.00239229 0.00300026 0.00219975 0.00239882 0.00279994 0.0026053
 0.0029994  0.00259852 0.00259838 0.00239849 0.00219951 0.00259919
 0.0022007  0.00239754 0.00260015 0.00300093 0.00220141 0.00240355
 0.00239978 0.00279799 0.00240335 0.00279789 0.002

In [81]:
real_test = pd.read_csv("datasets/test_clean.csv")

In [82]:
real_test.drop(columns=list(set(to_drop)-{"PassengerId"}), inplace=True)

In [83]:
real_test = real_test.interpolate(method='from_derivatives')

real_test["cabin_highest_class_deck"] = real_test["cabin_highest_class_deck"].map(
    deck_to_n
)

In [84]:
pred = pd.DataFrame(best_model.predict(real_test.drop(columns=["PassengerId"]))).rename(columns={0: "Survived"})

In [85]:
pd.concat([real_test[["PassengerId"]], pred], axis=1).to_csv("KNN_v1.csv", index=None)