In [1]:
import pandas as pd
import numpy as np

CSV_MANHATTAN_INPUT_PATH = r""
CSV_EUCLIDEAN_INPUT_PATH = r""

# Read the Manhattan train / validation / test files in that order. They are
# parsed with header=None, so the columns are addressed by integer position.
x_train_m, x_val_m, x_test_m = (
    pd.read_csv(CSV_MANHATTAN_INPUT_PATH + file_name, header=None)
    for file_name in (r"\train.csv", r"\val.csv", r"\test.csv")
)

# Column 0 of every split is used as the target; the rest are the features.
y_train_m, x_train_m = x_train_m.iloc[:, 0], x_train_m.drop(columns=0)
y_val_m, x_val_m = x_val_m.iloc[:, 0], x_val_m.drop(columns=0)
y_test_m, x_test_m = x_test_m.iloc[:, 0], x_test_m.drop(columns=0)

# One predefined (fit, score) split: positions of the training rows first,
# then positions of the validation rows once they are appended below.
cv = [(
    list(range(len(y_train_m))),
    list(range(len(y_train_m), len(y_train_m) + len(y_val_m))),
)]

# Append the validation rows after the training rows so the positions stored
# in `cv` index into the combined arrays.
y_train_m = np.concatenate([y_train_m, y_val_m], axis=0, dtype=np.int32)
x_train_m = np.concatenate([x_train_m, x_val_m], axis=0, dtype=np.float32)
y_test_m = y_test_m.to_numpy(dtype=np.int32)
x_test_m = x_test_m.to_numpy(dtype=np.float32)


# Read the Euclidean train / validation / test files in that order. They are
# parsed with header=None, so the columns are addressed by integer position.
x_train_e, x_val_e, x_test_e = (
    pd.read_csv(CSV_EUCLIDEAN_INPUT_PATH + file_name, header=None)
    for file_name in (r"\train.csv", r"\val.csv", r"\test.csv")
)

# Column 0 of every split is used as the target; the rest are the features.
y_train_e, x_train_e = x_train_e.iloc[:, 0], x_train_e.drop(columns=0)
y_val_e, x_val_e = x_val_e.iloc[:, 0], x_val_e.drop(columns=0)
y_test_e, x_test_e = x_test_e.iloc[:, 0], x_test_e.drop(columns=0)

# Append the validation rows after the training rows and convert everything
# to fixed-width numpy arrays (int32 targets, float32 features).
y_train_e = np.concatenate([y_train_e, y_val_e], axis=0, dtype=np.int32)
x_train_e = np.concatenate([x_train_e, x_val_e], axis=0, dtype=np.float32)
y_test_e = y_test_e.to_numpy(dtype=np.int32)
x_test_e = x_test_e.to_numpy(dtype=np.float32)

# Build the label vocabulary from every label array rather than from the
# Manhattan test split alone. A class missing from that single split would
# match none of the per-label conditions and be silently re-labelled as
# class 0, because np.select falls back to its default of 0.
labels = np.unique(np.concatenate((y_train_m, y_test_m, y_train_e, y_test_e)))

# np.unique returns the labels sorted, and every value to encode is present in
# `labels`, so searchsorted yields the exact position of each value, i.e. its
# class index in 0 .. len(labels) - 1.
y_test_m = np.searchsorted(labels, y_test_m)
y_train_m = np.searchsorted(labels, y_train_m)

y_test_e = np.searchsorted(labels, y_test_e)
y_train_e = np.searchsorted(labels, y_train_e)

# Prepared arrays per input set, each stored as
# [x_train, y_train, x_test, y_test]. The "train" arrays hold the training
# rows followed by the validation rows.
datasets = {
    "manhattan": [x_train_m, y_train_m, x_test_m, y_test_m],
    # Key spelling corrected from "euclidea"; the key is what gets logged as
    # the "dataset" parameter when iterating over this dictionary.
    "euclidean": [x_train_e, y_train_e, x_test_e, y_test_e],
}


In [2]:
import mlflow
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

EXPERIMENT_NAME = "forest2"

mlflow.set_experiment(EXPERIMENT_NAME)
experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)

# Pipeline: standardise -> "pca" step -> random forest. The grid below swaps
# the "pca" step between "passthrough" and two PCA configurations.
scaler = StandardScaler()
forest = RandomForestClassifier(random_state=42)
pipe = Pipeline(
    steps=[
        ("scaler", scaler),
        ("pca", PCA()),
        ("forest", forest),
    ]
)

parameters = {
    "pca": [
        "passthrough",
        PCA(0.95, random_state=42),
        PCA(0.99, random_state=42),
    ],
    "forest__n_estimators": [100, 200, 300, 400, 500],
}

# refit=False: only the cross-validation results table is read afterwards.
clf = GridSearchCV(
    pipe,
    parameters,
    scoring="accuracy",
    n_jobs=3,
    refit=False,
    cv=cv,
)

for key, dataset in datasets.items():
    # One run per dataset, wrapping one child run per grid candidate.
    with mlflow.start_run(experiment_id=experiment.experiment_id, nested=True):
        clf.fit(dataset[0], dataset[1])

        mlflow.log_param("dataset", key)

        results = clf.cv_results_
        for idx, mean_test_score in enumerate(results["mean_test_score"]):
            with mlflow.start_run(experiment_id=experiment.experiment_id, nested=True):
                mlflow.log_param("dataset", key)

                # The "passthrough" candidate is a plain string and is logged
                # as-is; a PCA candidate is logged by its n_components value.
                pca_step = results["param_pca"][idx]
                if isinstance(pca_step, str):
                    mlflow.log_param("pca", pca_step)
                else:
                    mlflow.log_param("pca", pca_step.n_components)

                mlflow.log_param(
                    "forest__n_estimators",
                    results["param_forest__n_estimators"][idx],
                )
                mlflow.log_metric("mean_fit_time", results["mean_fit_time"][idx])
                mlflow.log_metric("mean_score_time", results["mean_score_time"][idx])
                mlflow.log_metric("mean_test_score", mean_test_score)


In [3]:
""" pipe = Pipeline(steps=[("scaler", StandardScaler()), ("pca", "passthrough"),
                ("forest", RandomForestClassifier(300, random_state=42))])

pipe.fit(datasets["manhattan"][0][:len(cv[0][0]) - 1][:],
         datasets["manhattan"][1][:len(cv[0][0]) - 1][:])
print(pipe.score(datasets["manhattan"][2], datasets["manhattan"][3]))
 """


0.6111111111111112


In [4]:
""" import os
from joblib import dump

MODEL__PATH = r""


version = "v1.1"

os.chdir(MODEL__PATH.format(type=type, version=version))
dump(pipe, "filename.joblib")
 """


['filename.joblib']