In [1]:
import boto3
import pandas as pd
import joblib
import datetime
import os
from io import BytesIO

In [2]:
# Bucket and path configuration. In a real deployment the model URI would be
# taken directly from the SageMaker Model Registry instead of being hardcoded.
#
# S3_BUCKET is read from the environment so the notebook survives
# "Restart Kernel -> Run All"; the fallback is a placeholder (like the
# placeholder keys below) and must be replaced with a real bucket name.
S3_BUCKET = os.environ.get("S3_BUCKET", "your-bucket-name")
S3_INPUT_PATH = "processed/yyyy-mm-dd/job_run_id/data.parquet"
S3_MODEL_PATH = "models/nequi-risk-nlp/model-YYYYMMDD_HHMMSS/model.joblib"
# Documents the output key layout; the key actually used for the upload is
# built at run time as output_s3_path.
S3_OUTPUT_PATH = "predictions/yyyy-mm-dd/job_run_id/predictions.csv"

In [3]:
# Dynamic parameters used to version each run's outputs automatically.
# The clock is read once so that the date folder and the run id always refer
# to the same instant (two separate now() calls could straddle midnight and
# put the run under a date that does not match its run id).
run_started_at = datetime.datetime.now()
today = run_started_at.strftime("%Y-%m-%d")
run_id = run_started_at.strftime("%Y%m%d%H%M%S")
output_s3_path = f"predictions/{today}/{run_id}/predictions.csv"

In [None]:
# Create the S3 client shared by every download/upload cell below.
s3 = boto3.client(service_name="s3")

In [None]:
# Fetch the preprocessed input data from S3 and parse the Parquet payload
# directly from memory (no temporary file is written).
print("Descargando datos de entrada...")
input_response = s3.get_object(Bucket=S3_BUCKET, Key=S3_INPUT_PATH)
input_buffer = BytesIO(input_response["Body"].read())
df_input = pd.read_parquet(input_buffer)

print("Datos cargados:", df_input.shape)

In [None]:
# Download the serialized model artifact from S3 to a local file so it can be
# loaded from disk in the next cell.
print("Descargando modelo desde S3...")
local_model_path = "/tmp/model.joblib"
s3.download_file(
    Bucket=S3_BUCKET,
    Key=S3_MODEL_PATH,
    Filename=local_model_path,
)

In [None]:
# Deserialize the model downloaded to local_model_path.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load artifacts from a trusted bucket/prefix.
model = joblib.load(filename=local_model_path)
print("Modelo cargado.")

In [None]:
# Select the text column fed to the model and run batch prediction over it.
X_pred = df_input.loc[:, "clean_text"]
print("Iniciando inferencia batch...")
y_pred = model.predict(X_pred)

In [None]:
# Attach the predictions to a copy of the input so df_input is left untouched
# and this cell can be re-run safely.
df_output = df_input.copy()
df_output["prediction"] = y_pred

# Save locally first: upload_file sends a file from disk, so the CSV must
# exist at local_output_path before the upload below. (These two lines were
# previously commented out, which made the upload fail with a NameError.)
local_output_path = "/tmp/predictions.csv"
df_output.to_csv(local_output_path, index=False)

# Upload the predictions to the versioned S3 key for this run.
print(f"Guardando resultados en S3: {output_s3_path}")
s3.upload_file(local_output_path, S3_BUCKET, output_s3_path)

print("Batch inference finalizada. Resultados guardados en S3.")

In [None]:
# Run log: a short plain-text summary of this execution, stored next to the
# predictions under the same date/run-id prefix.
log_lines = [
    "Batch inference run",
    f"Input data: s3://{S3_BUCKET}/{S3_INPUT_PATH}",
    f"Model used: s3://{S3_BUCKET}/{S3_MODEL_PATH}",
    f"Output predictions: s3://{S3_BUCKET}/{output_s3_path}",
    f"Total records: {len(df_output)}",
    f"Run date: {today}",
    f"Run id: {run_id}",
]
# Leading and trailing newlines are kept so the file content is unchanged.
log_message = "\n" + "\n".join(log_lines) + "\n"

log_path = f"predictions/{today}/{run_id}/log.txt"
local_log_path = "/tmp/log.txt"
with open(local_log_path, "w") as log_file:
    log_file.write(log_message)
s3.upload_file(local_log_path, S3_BUCKET, log_path)

print("Log de inferencia batch guardado en S3.")