# 01 - Initial Setup

In [2]:
# Load necessary extensions
%load_ext autoreload
%autoreload 2
%load_ext dotenv
%dotenv

# Standard library imports
import json
import logging
import os
import sys
from pathlib import Path

# Third-party library imports
import boto3
import ipytest
import pandas as pd
import sagemaker
from sagemaker.workflow.pipeline_context import PipelineSession, LocalPipelineSession

# Only show ERROR and above from the "sagemaker.config" logger to reduce output clutter
logging.getLogger("sagemaker.config").setLevel(logging.ERROR)

# Configure ipytest for running tests inside the notebook; raise_on_error=True
# is requested so a failing test run surfaces as an error in the cell
ipytest.autoconfig(raise_on_error=True)

# Absolute locations of the source code folders, created on first run
CODE_FOLDER = Path("../src").resolve()
CODE_FOLDER.mkdir(parents=True, exist_ok=True)

INFERENCE_CODE_FOLDER = (CODE_FOLDER / "inference").resolve()
INFERENCE_CODE_FOLDER.mkdir(parents=True, exist_ok=True)

# Create an empty __init__.py in the code folder to allow importing modules.
# An existing file is left untouched.
init_file_path = CODE_FOLDER / "__init__.py"
if not init_file_path.exists():
    init_file_path.touch()

# Make the code folders importable. Only entries that are not already present
# are appended, so re-running this cell does not pile up duplicates in sys.path.
sys.path.extend(
    [
        entry
        for entry in (str(CODE_FOLDER), str(INFERENCE_CODE_FOLDER))
        if entry not in sys.path
    ]
)

# Data file path
# NOTE(review): this is relative to the current working directory, while the
# code folder is located at "../src" -- confirm the data folder really lives
# next to the notebook rather than next to "src".
DATA_FILE_PATH = "data/penguins.csv"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv


In [3]:
%%writefile {CODE_FOLDER}/paths.py

# Module: src/paths.py
# Central definitions of the project's directory layout.

# --- Imports ---
import os
from pathlib import Path

# --- Constants ---
# Absolute directory that contains this module.
_MODULE_DIR = Path(__file__).parent.resolve()

# The directory one level above the module's directory, and the folders under it.
PARENT_DIR = _MODULE_DIR.parent
DATA_DIR = PARENT_DIR.joinpath("data")
CODE_FOLDER = PARENT_DIR.joinpath("src")
INFERENCE_CODE_FOLDER = CODE_FOLDER.joinpath("inference")


Writing /Users/carlos/Projects/palmer-penguins-classification/src/paths.py


In [None]:
# Toggle between running the pipeline locally (True) and on SageMaker (False)
LOCAL_MODE = True

# Settings read from the process environment (the %dotenv magic loads a .env file)
bucket = os.getenv("BUCKET")
role = os.getenv("ROLE")

# os.getenv returns None for unset variables, which would otherwise go unnoticed
# (e.g. S3_LOCATION would become "s3://None/penguins"). Surface it right away
# without changing any of the values assigned in this cell.
_missing_env = [name for name, value in (("BUCKET", bucket), ("ROLE", role)) if not value]
if _missing_env:
    logging.warning(
        "Environment variable(s) %s not set; values derived from them, "
        "such as S3_LOCATION, will be invalid.",
        ", ".join(_missing_env),
    )

# Base S3 prefix for this project inside the configured bucket
S3_LOCATION = f"s3://{bucket}/penguins"

In [None]:
# Build the session and instance settings for the selected execution mode.
# pipeline_session is only created when not running locally; otherwise it is None.
if LOCAL_MODE:
    pipeline_session = None
    config = {
        "session": LocalPipelineSession(default_bucket=bucket),
        "instance_type": "local",
    }
else:
    pipeline_session = PipelineSession(default_bucket=bucket)
    config = {"session": pipeline_session, "instance_type": "ml.m5.xlarge"}

# Settings shared by both modes
config.update(framework_version="2.11", py_version="py39")

In [None]:
# SageMaker SDK session created with default settings
sagemaker_session = sagemaker.session.Session()

# Low-level boto3 clients for the SageMaker and IAM services
sagemaker_client, iam_client = (
    boto3.client(service_name) for service_name in ("sagemaker", "iam")
)

# Region name reported by a default boto3 session
region = boto3.Session().region_name

## Exploratory Data Analysis

### Sobre os Dados

Os dados foram coletados e disponibilizados pela [Dra. Kristen Gorman e pela Estação Palmer, Antártica LTER](https://allisonhorst.github.io/palmerpenguins/articles/intro.html), integrante da Rede de Pesquisa Ecológica de Longo Prazo. Eles oferecem uma oportunidade única de entender melhor a ecologia dos pinguins na região da Antártica, graças à dedicação e expertise da Dra. Gorman e ao trabalho contínuo da Estação Palmer.

In [None]:
# Read the dataset from DATA_FILE_PATH and preview its first rows
df = pd.read_csv(DATA_FILE_PATH)
df.head()