In [1]:
# Importar as bibliotecas:
import boto3
import pandas as pd

In [2]:
# UserID, Account e ARN

sts = boto3.client("sts")
identity = sts.get_caller_identity()
identity


{'UserId': 'AROAT4TB747SNER233QOX:SageMaker',
 'Account': '267567228900',
 'Arn': 'arn:aws:sts::267567228900:assumed-role/iseg-prd-sagemaker-role/SageMaker',
 'ResponseMetadata': {'RequestId': 'b4a51180-bc4d-4d28-82dd-230a14154e2e',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': 'b4a51180-bc4d-4d28-82dd-230a14154e2e',
   'x-amz-sts-extended-request-id': 'MTp1cy1lYXN0LTE6UzoxNzY3NzQxMTE4OTgwOkc6dDloZkhlaFE=',
   'content-type': 'text/xml',
   'content-length': '448',
   'date': 'Tue, 06 Jan 2026 23:11:58 GMT'},
  'RetryAttempts': 0}}

In [3]:
# Criar o bucket S3:

BUCKET_NAME = "aidm-creditcard-fraud-267567228900"
REGION = "eu-west-1"  # usa a mesma região do SageMaker

s3 = boto3.client("s3", region_name=REGION)

s3.create_bucket(
    Bucket=BUCKET_NAME,
    CreateBucketConfiguration={"LocationConstraint": REGION}
)

print(f"Bucket criado: {BUCKET_NAME}")

BucketAlreadyOwnedByYou: An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.

In [None]:
# Criar as pastas do S3:

BUCKET_NAME = "aidm-creditcard-fraud-267567228900"
PREFIXES = [
    "creditcard-fraud/data/raw/",
    "creditcard-fraud/data/splits/",
    "creditcard-fraud/models/",
    "creditcard-fraud/monitoring/",
]

s3 = boto3.client("s3")

for prefix in PREFIXES:
    s3.put_object(Bucket=BUCKET_NAME, Key=prefix)
    print(f"Criado prefix: {prefix}")

In [None]:
# Carregar no s3 o ficheiro transactions.csv

LOCAL_FILE_PATH = "../data/transactions.csv"

# Local no s3
BUCKET_NAME = "aidm-creditcard-fraud-267567228900"
S3_KEY = "transactions/data/raw/transactions.csv"

# Upload
s3.upload_file(
    Filename=LOCAL_FILE_PATH,
    Bucket=BUCKET_NAME,
    Key=S3_KEY
)
print("Upload concluído para:")
print(f"s3://{BUCKET_NAME}/{S3_KEY}")

In [None]:
# Leitura das primeiras linhas do csv

df = pd.read_csv("../data/transactions.csv")

print("Shape (linhas, colunas):", df.shape)
print("\nColunas:", list(df.columns))
df.head()


In [None]:
#Confirmar a distribuição 

target = "Class" 
counts = df[target].value_counts(dropna=False)
percent = (counts / len(df) * 100).round(4)

print("Contagens por classe:")
print(counts)

print("\nPercentagem por classe:")
print(percent)


In [None]:
# Efetuar o split dos dados:
!python ../src/steps/split_data.py

In [None]:
# Correr train_local.py
!python ../src/steps/train_local.py

In [None]:
# Upload dos splits para o s3:
!python ../src/steps/upload_splits.py

In [None]:
# Verificar o caminho:

s3 = boto3.client("s3")
resp = s3.list_objects_v2(
    Bucket="aidm-creditcard-fraud-267567228900",
    Prefix="transactions/data/splits/"
)

for obj in resp.get("Contents", []):
    print(obj["Key"], obj["Size"])

In [None]:
# Lançar o treino:

import sagemaker
from sagemaker.sklearn.estimator import SKLearn

sess = sagemaker.Session()
role = sagemaker.get_execution_role()

BUCKET = "aidm-creditcard-fraud-267567228900"
TRAIN_S3 = f"s3://{BUCKET}/transactions/data/splits/train.csv"
VAL_S3   = f"s3://{BUCKET}/transactions/data/splits/val.csv"

estimator = SKLearn(
    entry_point="train_sm.py",       # o ficheiro que o SageMaker vai executar
    source_dir="../src/steps",        # onde esse ficheiro está (relativo ao notebook)
    role=role,
    instance_type="ml.m5.large",
    instance_count=1,
    framework_version="1.2-1",
    py_version="py3",
    hyperparameters={
        "max_iter": 1000             # vira: --max_iter 1000 (argparse lê isto)
    },
)

estimator.fit({
    "train": TRAIN_S3,               # vira SM_CHANNEL_TRAIN
    "validation": VAL_S3,            # vira SM_CHANNEL_VALIDATION
})


In [13]:
print(estimator.model_data)


s3://sagemaker-eu-west-1-267567228900/sagemaker-scikit-learn-2026-01-06-22-05-39-455/output/model.tar.gz


In [16]:
# Avaliar o treino:
!python ../src/steps/evaluate_sm_model.py \
  --model_tar_s3 "s3://sagemaker-eu-west-1-267567228900/sagemaker-scikit-learn-2026-01-05-08-55-56-843/output/model.tar.gz" \
  --test_s3 "s3://aidm-creditcard-fraud-267567228900/transactions/data/splits/test.csv"


  tar.extractall(path=extract_dir)
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
Test metrics (final):
test:f1=0.12464046021093
test:pr_auc=0.7920898072998782
Relatório guardado em: reports/test_metrics.json


In [10]:
import sagemaker
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter

sess = sagemaker.Session()
role = sagemaker.get_execution_role()

BUCKET = "aidm-creditcard-fraud-267567228900"
TRAIN_S3 = f"s3://{BUCKET}/transactions/data/splits/train.csv"
VAL_S3   = f"s3://{BUCKET}/transactions/data/splits/val.csv"

estimator = SKLearn(
    entry_point="train_sm.py",
    source_dir="../src/steps",
    role=role,
    instance_type="ml.m5.large",
    instance_count=1,
    framework_version="1.2-1",
    py_version="py3",
    sagemaker_session=sess,
    base_job_name="transactionsfraud-hpo",
)

objective_metric_name = "validation:pr_auc"

hyperparameter_ranges = {
    "C": ContinuousParameter(0.01, 10.0),
    "max_iter": IntegerParameter(500, 2000),
}

tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    hyperparameter_ranges=hyperparameter_ranges,
    metric_definitions=[
        {"Name": "validation:pr_auc", "Regex": r"validation:pr_auc=([0-9\.eE+-]+)"},
        {"Name": "validation:f1",     "Regex": r"validation:f1=([0-9\.eE+-]+)"},
    ],
    objective_type="Maximize",
    max_jobs=6,
    max_parallel_jobs=2,
)

tuner.fit({"train": TRAIN_S3, "validation": VAL_S3})


No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config
No finished training job found associated with this estimator. Please make sure this estimator is only used for building workflow config


................................................................!


In [11]:
import boto3, sagemaker

sm = boto3.client("sagemaker", region_name=sagemaker.Session().boto_region_name)
tj = tuner.latest_tuning_job.name
desc = sm.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=tj)

print("TuningJobStatus:", desc["HyperParameterTuningJobStatus"])
print("FailureReason:", desc.get("FailureReason"))


TuningJobStatus: Completed
FailureReason: None


In [22]:
import boto3
import sagemaker
from botocore.exceptions import ClientError

sess = sagemaker.Session()
sm = boto3.client("sagemaker", region_name=sess.boto_region_name)

model_package_group_name = "transactionsfraud-sklearn"

try:
    sm.describe_model_package_group(ModelPackageGroupName=model_package_group_name)
    print("Model Package Group já existe:", model_package_group_name)
except ClientError as e:
    code = e.response["Error"].get("Code", "")
    msg = e.response["Error"].get("Message", "")
    if code in ["ValidationException", "ResourceNotFound"] and "does not exist" in msg:
        sm.create_model_package_group(
            ModelPackageGroupName=model_package_group_name,
            ModelPackageGroupDescription="Credit card fraud detection (SKLearn) - registered models"
        )
        print("Model Package Group criado:", model_package_group_name)
    else:
        raise



# --- Melhor training job e artefacto ---
best_job = tuner.best_training_job()
best_model_artifact = tuner.best_estimator().model_data  # ou use o S3 que você já imprimiu

# --- Métrica objetivo e métricas adicionais ---
df = tuner.analytics().dataframe()
best_row = df.sort_values("FinalObjectiveValue", ascending=False).iloc[0]

best_pr_auc = float(best_row["FinalObjectiveValue"])

# tenta encontrar uma coluna de F1 no dataframe (pode variar)
best_f1 = None
for col in ["validation:f1", "validation_f1", "validation:f1 "]:
    if col in df.columns:
        try:
            best_f1 = float(best_row[col])
            break
        except Exception:
            pass

# --- Imagem usada no treino (garante compatibilidade com model.tar.gz) ---
best_desc = sm.describe_training_job(TrainingJobName=best_job)
image_uri = best_desc["AlgorithmSpecification"]["TrainingImage"]

# --- Nome do package ---
model_package_name = f"transactionsfraud-sklearn-{datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')}"

# --- Criar Model Package ---
create_resp = sm.create_model_package(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageDescription=f"Best HPO model from {best_job}",
    InferenceSpecification={
        "Containers": [
            {
                "Image": image_uri,
                "ModelDataUrl": best_model_artifact,
            }
        ],
        "SupportedContentTypes": ["text/csv"],
        "SupportedResponseMIMETypes": ["text/csv", "application/json"],
    },
    ModelApprovalStatus="PendingManualApproval",
    CustomerMetadataProperties={
        "best_training_job": best_job,
        "objective_metric": "validation:pr_auc",
        "validation_pr_auc": f"{best_pr_auc:.6f}",
        "validation_f1": f"{best_f1:.6f}" if best_f1 is not None else "NA",
    },
)

print("ModelPackageArn:", create_resp["ModelPackageArn"])
print("Best job:", best_job)
print("Best model artifact:", best_model_artifact)
print("Best PR AUC:", f"{best_pr_auc:.6f}")
print("Best F1:", f"{best_f1:.6f}" if best_f1 is not None else "NA")


Model Package Group criado: transactionsfraud-sklearn

2026-01-06 23:35:12 Starting - Found matching resource for reuse
2026-01-06 23:35:12 Downloading - Downloading the training image
2026-01-06 23:35:12 Training - Training image download completed. Training in progress.
2026-01-06 23:35:12 Uploading - Uploading generated training model
2026-01-06 23:35:12 Completed - Resource released due to keep alive period expiry
ModelPackageArn: arn:aws:sagemaker:eu-west-1:267567228900:model-package/transactionsfraud-sklearn/1
Best job: sagemaker-scikit-lea-260106-2329-005-08ccec11
Best model artifact: s3://sagemaker-eu-west-1-267567228900/sagemaker-scikit-lea-260106-2329-005-08ccec11/output/model.tar.gz
Best PR AUC: 0.629903
Best F1: NA


In [23]:
import boto3, sagemaker, re

region = sagemaker.Session().boto_region_name
logs = boto3.client("logs", region_name=region)

log_group = "/aws/sagemaker/TrainingJobs"
best_job = "sagemaker-scikit-lea-260106-2329-005-08ccec11"

# achar stream
resp = logs.describe_log_streams(
    logGroupName=log_group,
    logStreamNamePrefix=best_job,
    descending=True,
    limit=5
)
log_stream = resp["logStreams"][0]["logStreamName"]

# ler eventos
events = []
token = None
for _ in range(10):
    kwargs = dict(
        logGroupName=log_group,
        logStreamName=log_stream,
        startFromHead=True,
        limit=1000,
    )
    if token:
        kwargs["nextToken"] = token
    out = logs.get_log_events(**kwargs)
    events.extend(out.get("events", []))
    nt = out.get("nextForwardToken")
    if nt == token:
        break
    token = nt

text = "\n".join(e["message"] for e in events)

m_f1 = re.findall(r"validation:f1=([0-9\.eE+-]+)", text)
m_pr = re.findall(r"validation:pr_auc=([0-9\.eE+-]+)", text)

print("PR AUC encontrados:", m_pr[-3:])
print("F1 encontrados:", m_f1[-3:])

best_f1 = float(m_f1[-1]) if m_f1 else None
print("Best F1:", best_f1)


PR AUC encontrados: ['0.629903']
F1 encontrados: ['0.124521']
Best F1: 0.124521


In [24]:
import boto3, sagemaker
sm = boto3.client("sagemaker", region_name=sagemaker.Session().boto_region_name)

model_package_arn = "arn:aws:sagemaker:eu-west-1:267567228900:model-package/transactionsfraud-sklearn/1"

sm.update_model_package(
    ModelPackageArn=model_package_arn,
    ModelApprovalStatus="Approved"
)
print("Approved:", model_package_arn)


Approved: arn:aws:sagemaker:eu-west-1:267567228900:model-package/transactionsfraud-sklearn/1
