In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split, PredefinedSplit, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Per-feature preprocessing recipes. Each one is attached to its column by the
# ColumnTransformer built from these names.

# Sex: map categories to ordinal codes; a category not seen during fit raises.
sex_pipeline = make_pipeline(
    OrdinalEncoder(handle_unknown="error"),
)

# Age: fill gaps with SimpleImputer's default strategy, then standardise.
age_pipeline = make_pipeline(
    SimpleImputer(),
    StandardScaler(),
)

# Fare: fill gaps with SimpleImputer's default strategy, then min-max scale.
fare_pipeline = make_pipeline(
    SimpleImputer(),
    MinMaxScaler(),
)

# Embarked: fill gaps with the most frequent value, then one-hot encode;
# categories unseen during fit are ignored rather than raising.
loc_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"),
)

In [3]:
# Route each raw column through its recipe. Every entry is
# (step name, transformer, columns); columns not listed here are dropped.
preprocessing = ColumnTransformer(
    transformers=[
        ("sex", sex_pipeline, ["Sex"]),
        ("pass", "passthrough", ["Pclass"]),  # forwarded as-is, no transformation
        ("age", age_pipeline, ["Age"]),
        ("fare", fare_pipeline, ["Fare"]),
        ("loc", loc_pipeline, ["Embarked"]),
    ],
    remainder="drop",
)

In [4]:
# Full estimator: column preprocessing followed by a k-nearest-neighbours
# classifier. The step name "KNN" is the prefix used by the grid-search keys
# ("KNN__n_neighbors", "KNN__weights").
model_pipeline = Pipeline(
    steps=[
        ("preprocessing", preprocessing),
        ("KNN", KNeighborsClassifier()),
    ]
)

In [5]:
# Load the pre-made training and validation partitions.
# NOTE(review): unpickling can execute arbitrary code, so these files must come
# from a trusted source.
train_data, val_data = (
    pd.read_pickle(pickle_path) for pickle_path in ("./train.pkl", "./val.pkl")
)
train_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
103,104,3,"Johansson, Mr. Gustaf Joel",male,33.0,0,0,7540,8.6542,,S,0
349,350,3,"Dimic, Mr. Jovan",male,42.0,0,0,315088,8.6625,,S,0
7,8,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,0
345,346,2,"Brown, Miss. Amelia ""Mildred""",female,24.0,0,0,248733,13.0,F33,S,1
652,653,3,"Kalvik, Mr. Johannes Halvorsen",male,21.0,0,0,8475,8.4333,,S,0


In [6]:
def split_data(data: pd.DataFrame, target: str = "Survived") -> list:
    """Split a DataFrame into a feature frame and a single-column label frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Table that contains the ``target`` column. It is not modified.
    target : str, default "Survived"
        Name of the label column to separate from the features.

    Returns
    -------
    list
        ``[X, y]`` where ``X`` is ``data`` without the ``target`` column and
        ``y`` is a one-column DataFrame holding only ``target``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of ``data``.
    """
    X = data.drop(columns=[target])
    # Index with a list so y stays a one-column DataFrame (not a Series);
    # callers flatten it themselves with ``.to_numpy().ravel()``.
    y = data.loc[:, [target]]

    return [X, y]

In [7]:
# Separate features from labels for both partitions; split_data returns the
# two-element list [X, y], which is unpacked directly here.
(X_train, y_train), (X_val, y_val) = split_data(train_data), split_data(val_data)

In [8]:
# Baseline: fit preprocessing + KNN (constructor defaults) on the training split.
model_pipeline.fit(
    X_train,
    y_train.to_numpy().ravel(),  # flatten the one-column label frame to 1-D
)

In [9]:
# Score the baseline model on the validation partition.
y_pred = model_pipeline.predict(X_val)
accuracy_score(y_true=y_val, y_pred=y_pred)

0.81

In [10]:
# Stack both partitions row-wise, training rows first. The fold labels built
# for PredefinedSplit rely on this ordering.
X_combined = pd.concat([X_train, X_val], axis=0)
y_combined = pd.concat([y_train, y_val], axis=0)

In [11]:
# One fold label per row of the combined data: -1 for every training row,
# 0 for every validation row (same order as the concatenation).
test_fold = np.repeat([-1, 0], [X_train.shape[0], X_val.shape[0]])

In [12]:
# Cross-validation splitter driven by the explicit fold labels, so the search
# evaluates on the given validation rows instead of random folds.
ps = PredefinedSplit(test_fold=test_fold)

In [13]:
# Search space for the "KNN" pipeline step: 5 neighbour counts x 2 weighting
# schemes = 10 candidates.
param_grid = [
    {
        "KNN__n_neighbors": [1, 3, 5, 7, 10],
        "KNN__weights": ("uniform", "distance"),
    }
]

In [14]:
# Exhaustive search over param_grid, scored by accuracy on the predefined split.
clf = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring="accuracy",
    cv=ps,
    refit=True,  # refit the best candidate after the search
    n_jobs=-1,
    verbose=1,
)

In [15]:
# Run the search on the combined data; `ps` decides which rows are used for
# training and which for scoring.
clf.fit(
    X_combined,
    y_combined.to_numpy().ravel(),  # flatten the one-column label frame to 1-D
)

Fitting 1 folds for each of 10 candidates, totalling 10 fits


In [16]:
# Tabulate the per-candidate grid-search results for inspection.
cv_results = pd.DataFrame(clf.cv_results_)
# Use the pandas reduction instead of the builtin max(): Series.max() skips NaN
# scores (which GridSearchCV can record, e.g. for a failed fit), whereas builtin
# max() returns an order-dependent result when a NaN is present.
cv_test_score = cv_results["mean_test_score"].max()
# Best validation accuracy, shown as a percentage.
cv_test_score*100

82.0