# Pipeline of Digits

This is a starting notebook for solving the "Pipeline of Digits" assignment.


This notebook was created by [Santiago L. Valdarrama](https://twitter.com/svpino) as part of the [Machine Learning School](https://www.ml.school) program.

Let's make sure we are running the latest version of the SageMaker SDK. **Restart the notebook** after you upgrade the library.

In [None]:
# !pip install -q --upgrade awscli
# !pip install -q --upgrade pip
# !pip install -q --upgrade sagemaker

# !pip install -q --upgrade pip
# !pip install -q --upgrade awscli boto3
# !pip install -q --upgrade PyYAML==6.0
# !pip install -q --upgrade sagemaker==2.165.0

# !pip show sagemaker

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell runs.
%load_ext autoreload
%autoreload 2

In [None]:
import boto3
import sagemaker
import pandas as pd
import os
import numpy as np
import json
import argparse
import tempfile

from pathlib import Path
from botocore.exceptions import ClientError
from sagemaker.inputs import FileSystemInput
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import ScriptProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.model_step import ModelStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.parameters import ParameterInteger, ParameterString, ParameterFloat
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import CacheConfig

# AWS context used by the cells below: the SageMaker session, the region of
# the default boto3 session, and the execution role of this notebook.
sagemaker_session = sagemaker.session.Session()
region = boto3.Session().region_name
role = sagemaker.get_execution_role()

## Creating the S3 Bucket

Let's create an S3 bucket where you will upload all the information generated by the pipeline. Make sure you set `BUCKET` to the name of the bucket you want to use. This name has to be unique.

If you want to create a bucket in a region other than `us-east-1`, use this command instead:

```
!aws s3api create-bucket --bucket $BUCKET --create-bucket-configuration LocationConstraint=$region
```

The `LocationConstraint` argument should specify the region where you want to create the bucket.

In [None]:
BUCKET = "brianosaurus-mlschool"

# Create the bucket in the region this notebook runs in (`region`, defined
# with the imports) instead of a hard-coded one, so the bucket and the
# SageMaker jobs that read from it always live in the same region.
s3_client = boto3.client("s3", region_name=region)

try:
    # head_bucket only succeeds when the bucket exists and is accessible to
    # us, which makes this cell safe to re-run.
    s3_client.head_bucket(Bucket=BUCKET)
    print(f"Bucket {BUCKET} already exists.")
except ClientError:
    if region == "us-east-1":
        # us-east-1 is the default location and must be requested without
        # a LocationConstraint.
        s3_client.create_bucket(Bucket=BUCKET)
    else:
        s3_client.create_bucket(
            Bucket=BUCKET,
            CreateBucketConfiguration={"LocationConstraint": region},
        )
    print(f"Created bucket {BUCKET} in {region}.")

## Loading the dataset

We have two CSV files containing the MNIST dataset. These files come from the [MNIST in CSV](https://www.kaggle.com/datasets/oddrationale/mnist-in-csv) Kaggle dataset.

The `mnist_train.csv` file contains 60,000 training examples and labels. The `mnist_test.csv` contains 10,000 test examples and labels. Each row consists of 785 values: the first value is the label (a number from 0 to 9) and the remaining 784 values are the pixel values (a number from 0 to 255).

Let's extract the `dataset.tar.gz` file.

In [None]:
MNIST_FOLDER = "."
DATASET_FOLDER = Path(MNIST_FOLDER) / "dataset"

S3_FILEPATH = f"s3://{BUCKET}/{DATASET_FOLDER}"
DATA_FILEPATH = Path(DATASET_FOLDER) / "mnist_train.csv"
TEST_FILEPATH = Path(DATASET_FOLDER) / "mnist_test.csv"

INPUT_DATA_URI = sagemaker.s3.S3Uploader.upload(
    local_path=str(DATA_FILEPATH), 
    desired_s3_uri=S3_FILEPATH,
)

TEST_INPUT_DATA_URI = sagemaker.s3.S3Uploader.upload(
    local_path=str(TEST_FILEPATH), 
    desired_s3_uri=S3_FILEPATH,
)
print(f"Dataset S3 location: {INPUT_DATA_URI}")

print(DATASET_FOLDER)
!ls -alrt

!tar -xvzf $MNIST_FOLDER/dataset.tar.gz -C $MNIST_FOLDER --no-same-owner

Let's load the first 10 rows of the test set.

In [None]:
import scipy.stats as stats
import pandas as pd
import numpy as np

from sklearn.preprocessing import FunctionTransformer
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.impute import SimpleImputer

# MNIST_FOLDER = "."
# DATASET_FOLDER = Path(MNIST_FOLDER) / "dataset"

# def _replace_outliers(data):
#     means = np.mean(data, axis=0)
#     stds = np.std(data, axis=0)
    
#     for column in range(data.shape[1]):
#         mean = means[column]
#         std = stds[column] * 3 # 3 std deviations away
        
#         c = data[:,column]
#         c[np.where(abs(c - mean) > std)] = mean
        
#     return data

# def _scale_outliers(data):
#      # Create a pipeline to replace outliers with the mean.
#     pipeline = Pipeline([
#         ("scaler", RobustScaler()),
#         ("imputer", SimpleImputer(strategy="mean")),
#         ("replace_outliers", FunctionTransformer(_replace_outliers))
#     ])

#     # Fit the pipeline to the data.
#     return pipeline.fit_transform(data)

    
# print(f"DATASET_FOLDER={DATASET_FOLDER}")
# df = pd.read_csv(Path(DATASET_FOLDER) / "mnist_train.csv")
# print(df.iloc[:,0].describe())
# print("-------------------------------------------------------------------------")


# print(df.iloc[:, 775].describe())
# df.drop(["label"], axis=1, inplace=True)

# print("-------------------------------------------------------------------------")
# df = pd.DataFrame(_scale_outliers(df))
# print(df.iloc[:, 774].describe())
# print("-------------------------------------------------------------------------")

# df = pd.read_csv(Path(DATASET_FOLDER) / "mnist_test.csv")
# df

# print("-------------------------------------------------------------------------")
# df = pd.DataFrame(_scale_outliers(df))
# print(df.iloc[:, 774].describe())
# print("-------------------------------------------------------------------------")


In [None]:
%%writefile preprocessor.py

import os
import numpy as np
import pandas as pd
import tempfile
import subprocess

from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler, FunctionTransformer

from pickle import dump


# Root directory used inside the SageMaker Processing container. The raw
# inputs are read from "<BASE_DIRECTORY>/input/..." (see the __main__ block)
# and every output folder is created directly under it.
BASE_DIRECTORY = "/opt/ml/processing"

# Notebook-relative locations of the raw CSV files. The __main__ block below
# reassigns MNIST_FOLDER, DATA_FILEPATH and TEST_FILEPATH to paths under
# /opt/ml/processing/input before calling preprocess().
MNIST_FOLDER = "."
MNIST_DIRECTORY = Path(MNIST_FOLDER) / "dataset"
DATA_FILEPATH = Path(MNIST_DIRECTORY) / "mnist_train.csv"
TEST_FILEPATH = Path(MNIST_DIRECTORY) / "mnist_test.csv"

def _save_splits(base_directory, train, validation, test):
    """
    One of the goals of this script is to output the three
    dataset splits. This function will save each of these
    splits to disk.
    """
    print("saving splits")
    
    train_path = Path(base_directory) / "train" 
    validation_path = Path(base_directory) / "validation" 
    test_path = Path(base_directory) / "test"
    
    train_path.mkdir(parents=True, exist_ok=True)
    validation_path.mkdir(parents=True, exist_ok=True)
    test_path.mkdir(parents=True, exist_ok=True)
    
    pd.DataFrame(train).to_csv(train_path / "train.csv", header=False, index=False)
    pd.DataFrame(validation).to_csv(validation_path / "validation.csv", header=False, index=False)
    pd.DataFrame(test).to_csv(test_path / "test.csv", header=False, index=False)
    

def _save_pipeline(base_directory, pipeline):
    """
    Saves the Scikit-Learn pipeline that we used to
    preprocess the data.
    """
    print("saving pipeline")
    pipeline_path = Path(base_directory) / "pipeline"
    pipeline_path.mkdir(parents=True, exist_ok=True)
    dump(pipeline, open(pipeline_path / "pipeline.pkl", 'wb'))
    

def _save_classes(base_directory, classes):
    """
    Saves the list of classes from the dataset.
    """
    path = Path(base_directory) / "classes"
    path.mkdir(parents=True, exist_ok=True)
    
    print("CLASSES", np.asarray(classes))

    np.asarray(classes).tofile(path / "classes.csv", sep = ",") 
    

def _generate_baseline_dataset(split_name, base_directory, X, y):
    """
    To monitor the data and the quality of our model we need to compare the 
    production quality and results against a baseline. To create those baselines, 
    we need to use a dataset to compute statistics and constraints. That dataset
    should contain information in the same format as expected by the production
    endpoint. This function will generate a baseline dataset and save it to 
    disk so we can later use it.
    
    """
    
    print(f"Generating baseline dataset for {split_name}")
    baseline_path = Path(base_directory) / f"{split_name}-baseline" 
    baseline_path.mkdir(parents=True, exist_ok=True)

    df = X.copy()
    
    # The baseline dataset needs a column containing the groundtruth.
    df["groundtruth"] = y
    df["groundtruth"] = df["groundtruth"].values.astype(int)
    
    print("added groundtruth")
    
    # We will use the baseline dataset to generate baselines
    # for monitoring data and model quality. To simplify the process, 
    # we don't want to include any NaN rows.
    df = df.dropna()
    
    print("dropped na")

    df.to_json(baseline_path / f"{split_name}-baseline.json") #, orient='records', lines=True)
    print("to_json'ed")
    
def _replace_outliers(data):
    means = np.mean(data, axis=0)
    stds = np.std(data, axis=0)
    
    for column in range(data.shape[1]):
        mean = means[column]
        std = stds[column] * 3 # 3 std deviations away
        
        c = data[:,column]
        c[np.where(abs(c - mean) > std)] = mean
        
    return data

def preprocess(base_directory, data_filepath, test_filepath, seed=None):
    """
    Preprocesses the supplied raw dataset and splits it into a train, validation,
    and a test set.

    The training CSV is shuffled and split 80/20 into train and validation; the
    test CSV becomes the test split. Labels are encoded with a LabelEncoder
    fitted on the training labels. Features go through an impute ->
    robust-scale -> outlier-replacement pipeline whose imputer and scaler are
    fitted on the training features.

    Besides the three splits, this function writes the train and test baseline
    datasets (built from the untransformed features), the fitted pipeline, and
    the list of classes under `base_directory`.

    Args:
        base_directory: Folder under which every output is written.
        data_filepath: Path of the training CSV; must contain a "label" column.
        test_filepath: Path of the test CSV, with the same columns.
        seed: Optional seed for the train/validation shuffle. With the default
            (None) the global NumPy random state is used; pass an integer to
            get the same split on every run.
    """

    df = pd.read_csv(data_filepath)
    test = pd.read_csv(test_filepath)

    # Every int/float column except the label is treated as a feature.
    numerical_columns = [
        column
        for column in df.columns
        if df[column].dtype in ["int64", "float64"] and column != "label"
    ]

    # NOTE: _replace_outliers is stateless; it recomputes means and standard
    # deviations from whatever data it receives on each call.
    numerical_preprocessor = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", RobustScaler()),
        ("replace_outliers", FunctionTransformer(_replace_outliers, validate=False))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("numerical", numerical_preprocessor, numerical_columns),
        ]
    )

    X = df
    columns = list(X.columns)
    X = X.to_numpy()

    # Shuffle the rows before splitting. A dedicated RandomState is only used
    # when a seed is supplied, so callers that rely on the global NumPy state
    # see no change.
    if seed is None:
        np.random.shuffle(X)
    else:
        np.random.RandomState(seed).shuffle(X)
    train, validation = np.split(X, [int(.8 * len(X))])

    X_train = pd.DataFrame(train, columns=columns)
    X_validation = pd.DataFrame(validation, columns=columns)
    X_test = pd.DataFrame(test, columns=columns)

    y_train = X_train.label
    y_validation = X_validation.label
    y_test = X_test.label

    label_encoder = LabelEncoder()

    print("transforming y values")
    y_train = label_encoder.fit_transform(y_train)
    y_validation = label_encoder.transform(y_validation)
    y_test = label_encoder.transform(y_test)

    X_train.drop(["label"], axis=1, inplace=True)
    X_validation.drop(["label"], axis=1, inplace=True)
    X_test.drop(["label"], axis=1, inplace=True)

    # Let's generate a dataset that we can later use to compute
    # baseline statistics and constraints about the data that we
    # used to train our model.
    _generate_baseline_dataset("train", base_directory, X_train, y_train)

    # To generate baseline constraints about the quality of the
    # model's predictions, we will use the test set.
    _generate_baseline_dataset("test", base_directory, X_test, y_test)

    # Transform the data using the Scikit-Learn pipeline.
    print("transforming X values")
    X_train = preprocessor.fit_transform(X_train)
    X_validation = preprocessor.transform(X_validation)
    X_test = preprocessor.transform(X_test)

    # Append the encoded label as the last column of every split.
    print("trained values, concatonating")
    train = np.concatenate((X_train, np.expand_dims(y_train, axis=1)), axis=1)
    validation = np.concatenate((X_validation, np.expand_dims(y_validation, axis=1)), axis=1)
    test = np.concatenate((X_test, np.expand_dims(y_test, axis=1)), axis=1)

    _save_splits(base_directory, train, validation, test)
    _save_pipeline(base_directory, pipeline=preprocessor)
    _save_classes(base_directory, label_encoder.classes_)
        

if __name__ == "__main__":
    # When run as a script, point the module-level paths at the folders
    # under /opt/ml/processing/input/dataset where the two CSV files are
    # expected ("train" and "test" subfolders).
    MNIST_FOLDER = "/opt/ml/processing/input"
    DATASET_FOLDER = Path(MNIST_FOLDER) / "dataset"

    DATA_FILEPATH = DATASET_FOLDER / "train" / "mnist_train.csv"
    TEST_FILEPATH = DATASET_FOLDER / "test" / "mnist_test.csv"

    # Debugging aid: print the working directory and the content of the
    # input folder before processing starts.
    print("Looking at the filesystem")
    for command in ("pwd", "ls -alrt /opt/ml/processing/input/dataset"):
        results = subprocess.check_output(command, shell=True)
        print(results.decode('utf-8'))
    print("done looking")

    preprocess(BASE_DIRECTORY, DATA_FILEPATH, TEST_FILEPATH)


In [None]:
# import os
# import numpy as np
# import pandas as pd
# import tempfile

# from pathlib import Path

# from preprocessor import preprocess

# BASE_DIRECTORY = "/opt/ml/processing"

# MNIST_FOLDER = "."
# MNIST_DIRECTORY = Path(MNIST_FOLDER) / "dataset"
# DATA_FILEPATH = Path(MNIST_DIRECTORY) / "mnist_train.csv"
# TEST_FILEPATH = Path(MNIST_DIRECTORY) / "mnist_test.csv"

# # DATA_FILEPATH="s3://brianosaurus-mlschool/dataset/mnist_train.csv"
# # TEST_FILEPATH="s3://brianosaurus-mlschool/dataset/test/test.csv"

# def print_baseline(split_name):
#     print()
#     print(f"Baseline {split_name}:")
#     with open(Path(directory) / f"{split_name}-baseline" / f"{split_name}-baseline.json") as baseline:
#         lines = [next(baseline) for _ in range(5)]
        
#     for l in lines:
#         print(l[:-1])
    
# print(f"DATA_FILEPATH={DATA_FILEPATH}")
# with tempfile.TemporaryDirectory() as directory:
#     preprocess(
#         base_directory=directory, 
#         data_filepath=DATA_FILEPATH, 
#         test_filepath=TEST_FILEPATH
#     )
    
#     print(f"Folders: {os.listdir(directory)}")
#     for dir in os.listdir(directory):
#         print(f"Files: {os.listdir(Path(directory) / dir)} and dir {dir}")
    
#     VALIDATION_SET_S3_URI = sagemaker.s3.S3Uploader.upload(
#         local_path=str(Path(directory) / "validation" / "validation.csv"), 
#         desired_s3_uri=S3_FILEPATH,
#     )

#     TRAIN_SET_S3_URI = sagemaker.s3.S3Uploader.upload(
#         local_path=str(Path(directory) / "train" / "train.csv"), 
#         desired_s3_uri=S3_FILEPATH,
#     )

#     TEST_SET_S3_URI = sagemaker.s3.S3Uploader.upload(
#         local_path=str(Path(directory) / "test" / "test.csv"), 
#         desired_s3_uri=S3_FILEPATH,
#     )

#     print(f"Train set S3 location: {TRAIN_SET_S3_URI}")
#     print(f"Test set S3 location: {TEST_SET_S3_URI}")
#     print(f"Validation set S3 location: {VALIDATION_SET_S3_URI}")


In [None]:
# Pipeline parameters. Each default below can be overridden when a pipeline
# execution is started.

# S3 locations of the raw training and test CSV files.
dataset_location = ParameterString(name="dataset_location", default_value=INPUT_DATA_URI)
test_dataset_location = ParameterString(name="test_dataset_location", default_value=TEST_INPUT_DATA_URI)

# S3 prefix that receives the outputs of the preprocessing step.
preprocessor_destination = ParameterString(
    name="preprocessor_destination", default_value=f"{S3_FILEPATH}/preprocessing"
)

# S3 prefixes that receive the train and test baseline datasets.
train_dataset_baseline_destination = ParameterString(
    name="train_dataset_baseline_destination", default_value=f"{S3_FILEPATH}/preprocessing/baselines/train"
)
test_dataset_baseline_destination = ParameterString(
    name="test_dataset_baseline_destination", default_value=f"{S3_FILEPATH}/preprocessing/baselines/test"
)


In [None]:
# Step caching configuration shared by the pipeline steps: caching enabled,
# with an expiration of "15d".
cache_config = CacheConfig(enable_caching=True, expire_after="15d")

In [None]:
sklearn_processor = SKLearnProcessor(
    base_job_name="mlschool-preprocessing",
    framework_version="0.23-1",
    instance_type="ml.t3.medium",
    instance_count=1,
    role=role,
)

# The two raw CSV files are made available to the script under
# /opt/ml/processing/input/dataset/{train,test}.
processing_inputs = [
    ProcessingInput(source=dataset_location, destination="/opt/ml/processing/input/dataset/train"),
    ProcessingInput(source=test_dataset_location, destination="/opt/ml/processing/input/dataset/test"),
]

# NOTE(review): these five outputs all upload to the same
# `preprocessor_destination` prefix, so any consumer of one output's S3Uri
# sees the files of the others as well - confirm this is intended.
processing_outputs = [
    ProcessingOutput(
        output_name=output_name,
        source=f"/opt/ml/processing/{output_name}",
        destination=preprocessor_destination,
    )
    for output_name in ("train", "validation", "test", "pipeline", "classes")
]

# The baseline datasets go to their own destinations.
processing_outputs += [
    ProcessingOutput(
        output_name="train-baseline",
        source="/opt/ml/processing/train-baseline",
        destination=train_dataset_baseline_destination,
    ),
    ProcessingOutput(
        output_name="test-baseline",
        source="/opt/ml/processing/test-baseline",
        destination=test_dataset_baseline_destination,
    ),
]

preprocess_data_step = ProcessingStep(
    name="preprocess-data",
    processor=sklearn_processor,
    inputs=processing_inputs,
    outputs=processing_outputs,
    code=f"{MNIST_FOLDER}/preprocessor.py",
    cache_config=cache_config,
)

In [None]:
# Pipeline containing only the preprocessing step. The class is referenced by
# its fully qualified name because the bare name `Pipeline` is rebound to
# sklearn.pipeline.Pipeline by a later import in this notebook.
session1_parameters = [
    dataset_location,
    test_dataset_location,
    preprocessor_destination,
    train_dataset_baseline_destination,
    test_dataset_baseline_destination,
]

session1_pipeline = sagemaker.workflow.pipeline.Pipeline(
    name="session1-pipeline",
    parameters=session1_parameters,
    steps=[preprocess_data_step],
)

In [None]:
# session_pipeline.upsert(role_arn=role)
# execution = session_pipeline.start()


In [None]:
from sagemaker.tuner import HyperparameterTuner
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TuningStep
from sagemaker.parameter import IntegerParameter, ContinuousParameter
from sagemaker.inputs import TrainingInput
from sagemaker.tensorflow import TensorFlow
from sagemaker.workflow.steps import TrainingStep
from sagemaker.workflow.pipeline_context import PipelineSession

In [None]:
%%writefile train.py

import os
import argparse

import numpy as np
import pandas as pd
import tensorflow as tf

from pathlib import Path
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD


def _read_split(csv_path):
    """Loads a header-less split CSV and returns (features, labels)."""
    # The preprocessing script writes the splits with header=False, so the
    # first line is data. Without header=None pandas would consume that first
    # sample as column names and silently drop it.
    data = pd.read_csv(csv_path, header=None)

    # The label is stored in the last column.
    return data.iloc[:, :-1], data.iloc[:, -1]


def train(base_directory, train_path, validation_path, epochs=50, batch_size=32, learning_rate=0.03):
    """
    Trains a small dense network on the preprocessed splits and saves it.

    Args:
        base_directory: Folder under which the model is saved, at
            "<base_directory>/model/001".
        train_path: Folder containing "train.csv".
        validation_path: Folder containing "validation.csv".
        epochs: Number of training epochs.
        batch_size: Mini-batch size used during training.
        learning_rate: Learning rate of the SGD optimizer.

    Both CSV files are expected to have no header row and to store the label
    in the last column. The validation accuracy is printed after training.
    """
    X_train, y_train = _read_split(Path(train_path) / "train.csv")
    X_validation, y_validation = _read_split(Path(validation_path) / "validation.csv")

    model = Sequential([
        Dense(10, input_shape=(X_train.shape[1],), activation="relu"),
        Dense(8, activation="relu"),
        Dense(10, activation="softmax"),
    ])

    model.compile(
        optimizer=SGD(learning_rate),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    model.fit(
        X_train,
        y_train,
        validation_data=(X_validation, y_validation),
        epochs=epochs,
        batch_size=batch_size,
        verbose=2,
    )

    # The predicted class is the index of the highest softmax output.
    predictions = np.argmax(model.predict(X_validation), axis=-1)
    print(f"Validation accuracy: {accuracy_score(y_validation, predictions)}")

    model_filepath = Path(base_directory) / "model" / "001"
    model.save(model_filepath)
    
if __name__ == "__main__":
    # Hyperparameters set on the training job arrive as command-line
    # arguments. SageMaker also provides a list of special parameters
    # that can be captured here; the full list is at
    # https://github.com/aws/sagemaker-training-toolkit/blob/master/src/sagemaker_training/params.py
    parser = argparse.ArgumentParser()
    parser.add_argument("--base_directory", type=str, default="/opt/ml/")

    # The channel folders default to the SM_CHANNEL_* environment variables.
    parser.add_argument("--train_path", type=str, default=os.environ.get("SM_CHANNEL_TRAIN", None))
    parser.add_argument("--validation_path", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION", None))

    parser.add_argument("--epochs", type=int, default=50)
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--learning_rate", type=float, default=0.03)

    # Unrecognized arguments are ignored instead of raising an error.
    args, _ = parser.parse_known_args()

    # The parsed namespace holds exactly the keyword arguments of train().
    train(**vars(args))

In [None]:
# TensorFlow estimator that runs train.py. The hyperparameters below use the
# same values as the argument defaults in train.py.
training_hyperparameters = {
    "epochs": 50,
    "batch_size": 32,
    "learning_rate": 0.03,
}

estimator = TensorFlow(
    entry_point="train.py",
    hyperparameters=training_hyperparameters,
    framework_version="2.6",
    py_version="py38",
    instance_type="ml.m5.large",
    instance_count=1,
    script_mode=True,
    disable_profiler=True,
    role=role,
)

In [None]:
# Feed the "train" and "validation" outputs of the preprocessing step to the
# training job as CSV channels with the same names.
training_step_outputs = preprocess_data_step.properties.ProcessingOutputConfig.Outputs

train_model_step = TrainingStep(
    name="train-model",
    estimator=estimator,
    inputs={
        channel: TrainingInput(
            s3_data=training_step_outputs[channel].S3Output.S3Uri,
            content_type="text/csv",
        )
        for channel in ("train", "validation")
    },
    cache_config=cache_config,
)

In [None]:
# The tuner maximizes the "val_accuracy" value, which is extracted from the
# training log with the regular expression below.
objective_metric_name = "val_accuracy"
objective_type = "Maximize"
metric_definitions = [
    {"Name": objective_metric_name, "Regex": "val_accuracy: ([0-9\\.]+)"},
]

# Ranges searched by the tuner for each hyperparameter.
hyperparameter_ranges = {
    "epochs": IntegerParameter(10, 50),
    "learning_rate": ContinuousParameter(0.01, 0.03),
}

# Up to three training jobs in total, all allowed to run in parallel.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=metric_definitions,
    objective_type=objective_type,
    max_jobs=3,
    max_parallel_jobs=3,
)

In [None]:
# Feed the "train" and "validation" outputs of the preprocessing step to the
# tuning jobs as CSV channels with the same names.
tuning_step_outputs = preprocess_data_step.properties.ProcessingOutputConfig.Outputs

tune_model_step = TuningStep(
    name="tune-model",
    tuner=tuner,
    inputs={
        channel: TrainingInput(
            s3_data=tuning_step_outputs[channel].S3Output.S3Uri,
            content_type="text/csv",
        )
        for channel in ("train", "validation")
    },
    cache_config=cache_config,
)

In [None]:
# When True, the pipeline defined below runs the tuning step; when False, it
# runs the plain training step instead.
USE_TUNING_STEP = True

In [None]:
# Pick the step that follows preprocessing: hyperparameter tuning or a single
# training job, depending on USE_TUNING_STEP.
model_step = tune_model_step if USE_TUNING_STEP else train_model_step

session2_parameters = [
    dataset_location,
    test_dataset_location,
    preprocessor_destination,
    train_dataset_baseline_destination,
    test_dataset_baseline_destination,
]

session2_pipeline = sagemaker.workflow.pipeline.Pipeline(
    name="session2-pipeline",
    parameters=session2_parameters,
    steps=[preprocess_data_step, model_step],
)

In [None]:
# Create or update the pipeline definition using the notebook's execution
# role, then start an execution of it.
session2_pipeline.upsert(role_arn=role)
execution = session2_pipeline.start()

In [None]:
# from preprocessor import preprocess
# from train import train


# with tempfile.TemporaryDirectory() as directory:
#     # First, we preprocess the data and create the 
#     # dataset splits.
#     preprocess(
#         base_directory=directory, 
#         data_filepath=DATA_FILEPATH, 
#         test_filepath=TEST_FILEPATH
#     )
    
#     train(
#         base_directory=directory, 
#         train_path=Path(directory) / "train", 
#         validation_path=Path(directory) / "validation",
#         epochs=10
#     )
    
