In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

CSV_MANHATTAN_INPUT_PATH = r""
CSV_EUCLIDEAN_INPUT_PATH = r""


def _load_dataset(base_path, subdir, test_size=0.5, random_state=42):
    """Load one train/test CSV pair and build the arrays used downstream.

    Both files are read without a header row; column 0 is taken as the label
    and every remaining column as a feature. The rows of ``test.csv`` are
    split (seeded) into a validation part and a final test part, and the
    validation rows are appended *after* the training rows so that a
    predefined split can address them purely by position.

    Parameters
    ----------
    base_path : str
        Prefix prepended verbatim to the relative CSV location.
    subdir : str
        Name of the folder holding ``train.csv`` and ``test.csv``.
    test_size : float
        Share of ``test.csv`` kept as the final test set; the rest becomes
        the validation part.
    random_state : int
        Seed for the validation/test split.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_val, y_val, x_test, y_test)``.
        ``x_train`` (float32) and ``y_train`` (int32) are NumPy arrays holding
        the training rows followed by the validation rows. ``x_val`` /
        ``y_val`` are the pandas objects returned by the split. ``x_test``
        (float32) and ``y_test`` (int32) are NumPy arrays.
    """
    # Plain string concatenation with backslash separators, exactly as the
    # paths were spelled before: <base_path>\<subdir>\<file>.
    folder = base_path + "\\" + subdir
    train_frame = pd.read_csv(folder + r"\train.csv", header=None)
    holdout_frame = pd.read_csv(folder + r"\test.csv", header=None)

    # Column 0 is the label; dropping it leaves the feature columns.
    y_train = train_frame.iloc[:, 0]
    x_train = train_frame.drop(0, axis=1)
    y_holdout = holdout_frame.iloc[:, 0]
    x_holdout = holdout_frame.drop(0, axis=1)

    x_val, x_test, y_val, y_test = train_test_split(
        x_holdout, y_holdout, test_size=test_size, random_state=random_state)

    # Validation rows go last so their positions are n_train .. n_train+n_val-1.
    y_train = np.concatenate((y_train, y_val), axis=0, dtype=np.int32)
    x_train = np.concatenate((x_train, x_val), axis=0, dtype=np.float32)

    return (x_train, y_train, x_val, y_val,
            x_test.to_numpy(np.float32), y_test.to_numpy(np.int32))


def _fold_sizes(y_train_with_val, y_val):
    """Return ``(n_train_rows, n_val_rows)`` for arrays built by ``_load_dataset``."""
    n_val = len(y_val)
    return len(y_train_with_val) - n_val, n_val


(x_train_m28, y_train_m28, x_val_m28, y_val_m28,
 x_test_m28, y_test_m28) = _load_dataset(CSV_MANHATTAN_INPUT_PATH, "28 distanze")

(x_train_m24, y_train_m24, x_val_m24, y_val_m24,
 x_test_m24, y_test_m24) = _load_dataset(CSV_MANHATTAN_INPUT_PATH, "24 distanze")

(x_train_e28, y_train_e28, x_val_e28, y_val_e28,
 x_test_e28, y_test_e28) = _load_dataset(CSV_EUCLIDEAN_INPUT_PATH, "28 distanze")

(x_train_e24, y_train_e24, x_val_e24, y_val_e24,
 x_test_e24, y_test_e24) = _load_dataset(CSV_EUCLIDEAN_INPUT_PATH, "24 distanze")

# Single predefined split: fit on the original training rows, score on the
# validation rows appended behind them. It is derived from the manhattan-28
# sizes and reused for every dataset by the grid search.
_n_train, _n_val = _fold_sizes(y_train_m28, y_val_m28)
cv = [([*range(0, _n_train)],
       [*range(_n_train, _n_train + _n_val)])]

# Because `cv` is shared, every dataset must have the same train/validation
# row counts; otherwise the positional indices would select the wrong rows.
_split_sizes = {
    "manhattan28": (_n_train, _n_val),
    "manhattan24": _fold_sizes(y_train_m24, y_val_m24),
    "euclidea28": _fold_sizes(y_train_e28, y_val_e28),
    "euclidea24": _fold_sizes(y_train_e24, y_val_e24),
}
_mismatched = {name: sizes for name, sizes in _split_sizes.items()
               if sizes != (_n_train, _n_val)}
if _mismatched:
    raise ValueError(
        "shared cv split was built for (train, val) sizes "
        f"{(_n_train, _n_val)} but these datasets differ: {_mismatched}")

# Each entry: [x_train(+val), y_train(+val), x_test, y_test].
datasets = {
    "manhattan28": [x_train_m28, y_train_m28, x_test_m28, y_test_m28],
    "manhattan24": [x_train_m24, y_train_m24, x_test_m24, y_test_m24],
    "euclidea28": [x_train_e28, y_train_e28, x_test_e28, y_test_e28],
    "euclidea24": [x_train_e24, y_train_e24, x_test_e24, y_test_e24]
}


In [2]:
import mlflow
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

EXPERIMENT_NAME = "knn"

mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)


def _pca_param_value(pca_step):
    """Return the value to log for the "pca" parameter.

    The grid sets the pipeline's "pca" step either to the string
    ``"passthrough"`` or to a ``PCA`` instance: a string is logged as-is,
    an estimator is logged by its ``n_components`` setting.
    """
    if isinstance(pca_step, str):
        return pca_step
    return pca_step.n_components


scaler = StandardScaler()
knn = KNeighborsClassifier()
pipe = Pipeline(steps=[("scaler", scaler), ("pca", PCA()), ("knn", knn)])

# The whole "pca" step is searched over (skipped, or PCA with n_components
# 0.95 / 0.99), together with odd neighbour counts 5, 7, ..., 49.
parameters = {
    "pca": ["passthrough", PCA(0.95, random_state=42), PCA(0.99, random_state=42)],
    "knn__n_neighbors": list(range(5, 50, 2))
}

clf = GridSearchCV(pipe, parameters, scoring="accuracy",
                   n_jobs=3, cv=cv)

# `cv` is one positional (train, validation) split shared by all datasets.
# Verify up front that it addresses exactly the rows each dataset has, so a
# size mismatch fails here instead of after (or silently during) a search.
cv_row_count = sum(len(fold) for fold in cv[0])
size_mismatches = {name: len(data[1]) for name, data in datasets.items()
                   if len(data[1]) != cv_row_count}
if size_mismatches:
    raise ValueError(
        f"shared cv split addresses {cv_row_count} rows but these datasets "
        f"have a different number of training rows: {size_mismatches}")

for key, dataset in datasets.items():
    # One summary run per dataset holding the winning configuration ...
    with mlflow.start_run(experiment_id=experiment.experiment_id, nested=True):
        clf.fit(dataset[0], dataset[1])

        mlflow.log_param("dataset", key)
        mlflow.log_param("pca", _pca_param_value(clf.best_estimator_["pca"]))
        mlflow.log_param("knn__n_neighbors",
                         clf.best_estimator_["knn"].n_neighbors)
        # Record the score of the winning configuration next to its
        # parameters; previously the summary run carried no metric at all.
        mlflow.log_metric("best_mean_test_score", clf.best_score_)

        # ... plus one nested child run per evaluated grid candidate.
        results = clf.cv_results_
        for i in range(len(results["mean_test_score"])):
            with mlflow.start_run(experiment_id=experiment.experiment_id, nested=True):
                mlflow.log_param("dataset", key)
                mlflow.log_param("pca", _pca_param_value(results["param_pca"][i]))
                mlflow.log_param("knn__n_neighbors",
                                 results["param_knn__n_neighbors"][i])
                for metric_name in ("mean_fit_time", "mean_score_time",
                                    "mean_test_score"):
                    mlflow.log_metric(metric_name, results[metric_name][i])
