In [1]:
# Banner cell: announces which job this notebook implements.
print("Welcome to Transformation and split Job!")

Welcome to Transformation and split Job!


In [2]:
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

import json
import logging
import sys
from pathlib import Path

import ipytest

# Folder holding the scripts generated by this notebook. It is added to the
# module search path so those scripts can be imported from notebook cells.
CODE_FOLDER = Path("code")
sys.path.append(f"./{CODE_FOLDER}")

# Location of the dataset, relative to the notebook.
DATA_FILEPATH = "../data/penguins.csv"

# Configure ipytest for this notebook with `raise_on_error` enabled.
ipytest.autoconfig(raise_on_error=True)

# The SageMaker SDK logs events about its default configuration at the
# INFO level. Raising the threshold of that logger to ERROR keeps those
# messages out of the cell outputs.
sagemaker_config_logger = logging.getLogger("sagemaker.config")
sagemaker_config_logger.setLevel(logging.ERROR)

In [3]:
# Switch between running the pipeline locally (True) and on SageMaker
# managed instances (False). It decides which session and instance type
# end up in `config` further down in this notebook.
LOCAL_MODE = True

In [4]:
import os

# Mandatory settings: a missing variable raises a KeyError immediately.
bucket = os.environ["BUCKET"]
role = os.environ["ROLE"]

# Optional Comet settings: each one is None when its variable is not set.
COMET_API_KEY = os.getenv("COMET_API_KEY")
COMET_PROJECT_NAME = os.getenv("COMET_PROJECT_NAME")

In [5]:
# Detect the CPU architecture of the local computer. `platform.machine()`
# reports the same value as the `uname -m` command on Unix-like systems and,
# unlike the shell command, also works where `uname` is not available
# (for example, on Windows).
import platform

# Kept as a one-element list so that `architecture[0]` keeps working the
# same way it did when the value was captured from the shell command.
architecture = [platform.machine()]

# 64-bit ARM is reported as "arm64" on macOS and as "aarch64" on Linux, and
# the casing can differ between platforms, so compare case-insensitively.
IS_ARM64_ARCHITECTURE = architecture[0].lower() in {"arm64", "aarch64"}
print(IS_ARM64_ARCHITECTURE)

False


In [6]:
import sagemaker
from sagemaker.workflow.pipeline_context import LocalPipelineSession, PipelineSession

# Build the session and the settings shared by the rest of the notebook.
# `pipeline_session` is only created when the pipeline does not run locally.
if LOCAL_MODE:
    pipeline_session = None
    # Running in Local Mode on an ARM64 machine requires a custom Docker
    # image; on any other architecture no image override is set.
    local_image = "sagemaker-tensorflow-toolkit-local" if IS_ARM64_ARCHITECTURE else None
    config = {
        "session": LocalPipelineSession(default_bucket=bucket),
        "instance_type": "local",
        "image": local_image,
    }
else:
    pipeline_session = PipelineSession(default_bucket=bucket)
    config = {
        "session": pipeline_session,
        "instance_type": "ml.m5.xlarge",
        "image": None,
    }

# Version settings for the SageMaker TensorFlow container used by the
# pipeline; they are the same in both modes.
config.update(framework_version="2.12", py_version="py310")

Windows Support for Local Mode is Experimental


In [7]:
import boto3

# Base S3 URI, inside the configured bucket, used for the penguins project.
S3_LOCATION = f"s3://{bucket}/penguins"

# SageMaker SDK session plus the low-level boto3 clients for the
# SageMaker and IAM services.
sagemaker_session = sagemaker.session.Session()
sagemaker_client = boto3.client("sagemaker")
iam_client = boto3.client("iam")
# Region name reported by a default boto3 session.
region = boto3.Session().region_name

print(region)

us-east-1


## Create the processing script

In [11]:
# Create the folder for the processing script (a no-op when it already
# exists) and add it to the module search path.
Path(CODE_FOLDER, "processing").mkdir(parents=True, exist_ok=True)
sys.path.append(f"./{CODE_FOLDER}/processing")

print("CODE_FOLDER", CODE_FOLDER)

CODE_FOLDER code


#### Demo of the `%%writefile` cell magic

In [16]:
%%writefile {CODE_FOLDER}/processing/hello.py
# Minimal script body used to demonstrate writing a cell to a file.
print("Hello Jupyter!")

Overwriting code/processing/hello.py


In [17]:
%%writefile {CODE_FOLDER}/processing/script.py

import os
import tarfile
import tempfile
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler


def preprocess(base_directory):
    """Load the input data and build the transformers used to preprocess it.

    The CSV files under `base_directory` are read into a single shuffled
    DataFrame and the target and feature transformers are constructed.

    NOTE: the transformers are only constructed here. They are not yet
    fitted or applied, the data is not yet split, and nothing is returned.

    Args:
        base_directory: Directory that contains the `input` folder with the
            CSV files to process.

    Raises:
        ValueError: If no CSV files are found in the input folder.
    """

    df = _read_data_from_csv_files(base_directory)

    # Encode the target: an ordinal encoder applied to the column at
    # position 0 of whatever data this transformer is given.
    target_transformer = ColumnTransformer(
        transformers=[("species", OrdinalEncoder(), [0])],
    )

    # Numerical columns: impute missing values with the mean of the
    # feature and then standardize them.
    numerical_transformer = make_pipeline(
        SimpleImputer(strategy="mean"),
        StandardScaler(),
    )

    # Categorical columns: impute missing values with the most frequent
    # value and then one-hot encode them.
    categorical_transformer = make_pipeline(
        SimpleImputer(strategy="most_frequent"),
        OneHotEncoder(),
    )

    # Only `island` receives the categorical transformation. The `sex`
    # column is deliberately left out because we don't think it has any
    # predictive power in this dataset.
    # NOTE(review): columns not matched by any transformer are presumably
    # dropped by ColumnTransformer's default `remainder` setting - confirm.
    feature_transformers = ColumnTransformer(
        transformers=[
            (
                "numeric",
                numerical_transformer,
                make_column_selector(dtype_exclude="object"),
            ),
            (
                "categorical",
                categorical_transformer,
                ["island"],
            ),
        ],
    )


def _read_data_from_csv_files(base_directory):
    """
    This function reads every single available csv files and puts those in one DF
    """

    input_dir = Path(base_directory) / "input"
    files = [file for file in input_dir.glob("*.csv")]

    if len(files) == 0:
        raise ValueError(f"There are no CSV files in {input_dir.as_posix()}")
    
    raw_data = [pd.read_csv(file) for file in files]
    df = pd.concat(raw_data)

    return df.sample(frac=1, random_state=2024) # This is to shuffle the data

# Entry point used when the file is executed as a script.
# NOTE(review): SageMaker Processing jobs conventionally mount their data
# under /opt/ml/processing - confirm this path matches the job's
# configured input destination.
if __name__ == "__main__":
    preprocess(base_directory="/opt/ml/preprocessing")

Writing code/processing/script.py
