In [9]:
# Importar as bibliotecas:
import boto3
import pandas as pd

In [10]:
# UserID, Account e ARN

sts = boto3.client("sts")
identity = sts.get_caller_identity()
identity


{'UserId': 'AROAT4TB747SNER233QOX:SageMaker',
 'Account': '267567228900',
 'Arn': 'arn:aws:sts::267567228900:assumed-role/iseg-prd-sagemaker-role/SageMaker',
 'ResponseMetadata': {'RequestId': '1e22ec7b-b38c-4f21-bd10-6b81ccd7ad45',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1e22ec7b-b38c-4f21-bd10-6b81ccd7ad45',
   'x-amz-sts-extended-request-id': 'MTp1cy1lYXN0LTE6UzoxNzY3NTQ5NTY2ODA3Okc6cllpWk12SFk=',
   'content-type': 'text/xml',
   'content-length': '448',
   'date': 'Sun, 04 Jan 2026 17:59:26 GMT'},
  'RetryAttempts': 0}}

In [19]:
import os
from pathlib import Path

print("CWD:", os.getcwd())

print("\nConteúdo da pasta atual (notebooks/):")
print([p.name for p in Path(".").iterdir()][:30])

print("\nExiste ../data ? ->", Path("../data").exists())
if Path("../data").exists():
    print("Conteúdo de ../data:")
    print([p.name for p in Path("../data").iterdir()][:50])

print("\nExiste ../data/splits ? ->", Path("../data/splits").exists())


CWD: /home/sagemaker-user/fraud_mlops

Conteúdo da pasta atual (notebooks/):
['.ipynb_checkpoints', 'data']

Existe ../data ? -> False

Existe ../data/splits ? -> False


In [11]:
# Criar o bucket S3:

BUCKET_NAME = "aidm-creditcard-fraud-267567228900"
REGION = "eu-west-1"  # usa a mesma região do SageMaker

s3 = boto3.client("s3", region_name=REGION)

s3.create_bucket(
    Bucket=BUCKET_NAME,
    CreateBucketConfiguration={"LocationConstraint": REGION}
)

print(f"Bucket criado: {BUCKET_NAME}")



BucketAlreadyOwnedByYou: An error occurred (BucketAlreadyOwnedByYou) when calling the CreateBucket operation: Your previous request to create the named bucket succeeded and you already own it.

In [6]:
# Criar as pastas do S3:

BUCKET_NAME = "aidm-creditcard-fraud-267567228900"
PREFIXES = [
    "creditcard-fraud/data/raw/",
    "creditcard-fraud/data/splits/",
    "creditcard-fraud/models/",
    "creditcard-fraud/monitoring/",
]

s3 = boto3.client("s3")

for prefix in PREFIXES:
    s3.put_object(Bucket=BUCKET_NAME, Key=prefix)
    print(f"Criado prefix: {prefix}")


Criado prefix: creditcard-fraud/data/raw/
Criado prefix: creditcard-fraud/data/splits/
Criado prefix: creditcard-fraud/models/
Criado prefix: creditcard-fraud/monitoring/


In [18]:
# Carregar no s3 o ficheiro transactions.csv

LOCAL_FILE_PATH = "../data/transactions.csv"

# Local no s3
BUCKET_NAME = "aidm-creditcard-fraud-267567228900"
S3_KEY = "transactions/data/raw/transactions.csv"

# Upload
s3.upload_file(
    Filename=LOCAL_FILE_PATH,
    Bucket=BUCKET_NAME,
    Key=S3_KEY
)
print("Upload concluído para:")
print(f"s3://{BUCKET_NAME}/{S3_KEY}")


FileNotFoundError: [Errno 2] No such file or directory: '../data/transactions.csv'

In [14]:
# Leitura das primeiras linhas do csv

df = pd.read_csv("data/transactions.csv")

print("Shape (linhas, colunas):", df.shape)
print("\nColunas:", list(df.columns))
df.head()


Shape (linhas, colunas): (284807, 31)

Colunas: ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount', 'Class']


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [15]:
#Confirmar a distribuição 

target = "Class" 
counts = df[target].value_counts(dropna=False)
percent = (counts / len(df) * 100).round(4)

print("Contagens por classe:")
print(counts)

print("\nPercentagem por classe:")
print(percent)


Contagens por classe:
Class
0    284315
1       492
Name: count, dtype: int64

Percentagem por classe:
Class
0    99.8273
1     0.1727
Name: count, dtype: float64


In [16]:
# Efetuar o split dos dados:
!python src/steps/split_data.py


python: can't open file '/home/sagemaker-user/fraud_mlops/src/steps/split_data.py': [Errno 2] No such file or directory
