In [9]:
# import awswrangler as wr  # For use in the AWS environment
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib
import json
import boto3
import datetime

In [11]:
# Read the parquet sample from the sibling Preprocesamiento directory
# and preview its first rows.
sample_path = "../Preprocesamiento/sample_data.parquet"
df = pd.read_parquet(sample_path)

df.head()

Unnamed: 0,clean_text,num_question_marks,num_exclamations,has_problem,has_issue,has_not_working,has_refund,has_error,has_fail,has_help,has_wtf,has_worst,has_urgent,has_bad
0,sprintcare and how do you propose we do that,0,0,0,0,0,0,0,0,0,0,0,0,0
1,sprintcare i have sent several private message...,0,0,0,0,0,0,0,0,0,0,0,0,0
2,sprintcare i did,0,0,0,0,0,0,0,0,0,0,0,0,0
3,sprintcare is the worst customer service,0,0,0,0,0,0,0,0,0,0,1,0,0
4,sprintcare you gonna magically change your con...,0,0,0,0,0,0,0,0,0,0,0,0,0


In [3]:
def label_message(row):
    """Classify one row as "critical" or "non_critical".

    Every index label of ``row`` that starts with ``"has_"`` is treated as a
    flag. The row is "critical" as soon as one of those flags equals 1;
    otherwise (including when there are no such labels) it is "non_critical".

    Parameters
    ----------
    row : pandas.Series
        A single record whose index labels are strings.

    Returns
    -------
    str
        ``"critical"`` or ``"non_critical"``.
    """
    # Collect the flag labels up front, then stop at the first one set to 1.
    flag_columns = tuple(name for name in row.index if name.startswith("has_"))
    for column in flag_columns:
        if row[column] == 1:
            return "critical"
    return "non_critical"

# Vectorised labelling: a message is "critical" when any has_* flag column
# equals 1, otherwise "non_critical". This yields the same labels as
# df.apply(label_message, axis=1) but avoids one Python-level call per row
# on a frame of ~500k rows.
flag_columns = [name for name in df.columns if name.startswith("has_")]
is_critical = df[flag_columns].eq(1).any(axis=1)
df["label"] = is_critical.map({True: "critical", False: "non_critical"})
print(df["label"].value_counts())

df.head()

label
non_critical    334582
critical        165418
Name: count, dtype: int64


Unnamed: 0,clean_text,num_question_marks,num_exclamations,has_problem,has_issue,has_not_working,has_refund,has_error,has_fail,has_help,has_wtf,has_worst,has_urgent,has_bad,label
0,sprintcare and how do you propose we do that,0,0,0,0,0,0,0,0,0,0,0,0,0,non_critical
1,sprintcare i have sent several private message...,0,0,0,0,0,0,0,0,0,0,0,0,0,non_critical
2,sprintcare i did,0,0,0,0,0,0,0,0,0,0,0,0,0,non_critical
3,sprintcare is the worst customer service,0,0,0,0,0,0,0,0,0,0,1,0,0,critical
4,sprintcare you gonna magically change your con...,0,0,0,0,0,0,0,0,0,0,0,0,0,non_critical


In [None]:
# Model input is the cleaned message text; the target is the label column.
X, y = df['clean_text'], df['label']

# Hold out 30% of the rows for evaluation; the fixed seed makes the
# partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2025)

In [5]:
# Text features: TF-IDF limited to 500 vocabulary terms.
tfidf_step = TfidfVectorizer(max_features=500)

# Classifier: logistic regression with balanced class weights and a fixed seed.
classifier_step = LogisticRegression(max_iter=100, class_weight='balanced', random_state=2025)

pipeline = Pipeline(steps=[('tfidf', tfidf_step), ('clf', classifier_step)])
pipeline.fit(X_train, y_train)

In [6]:
# Predict on the test split and collect the classification metrics as a
# nested dict (output_dict=True) so individual values can be read later.
y_pred = pipeline.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred, output_dict=True)
print(report)

{'critical': {'precision': 0.9914801541686389, 'recall': 0.8850371204957447, 'f1-score': 0.9352397151057723, 'support': 49703.0}, 'non_critical': {'precision': 0.945907055560289, 'recall': 0.9962311933557335, 'f1-score': 0.9704171320351576, 'support': 100297.0}, 'accuracy': 0.9593866666666667, 'macro avg': {'precision': 0.9686936048644639, 'recall': 0.940634156925739, 'f1-score': 0.9528284235704649, 'support': 150000.0}, 'weighted avg': {'precision': 0.9610078536944945, 'recall': 0.9593866666666667, 'f1-score': 0.9587609776775492, 'support': 150000.0}}


In [7]:
# Pick the headline numbers out of the classification report.
critical_scores = report["critical"]
macro_scores = report["macro avg"]

metrics_dict = {
    "f1_score_critical": critical_scores["f1-score"],
    "Accuracy": report["accuracy"],
    "Macro_F1": macro_scores["f1-score"],
}
print(metrics_dict)

{'f1_score_critical': 0.9352397151057723, 'Accuracy': 0.9593866666666667, 'Macro_F1': 0.9528284235704649}


In [8]:
# Save the fitted pipeline locally under a timestamped file name
run_started = datetime.datetime.now()
now_date = run_started.strftime("%Y%m%d_%H%M%S")
local_model_path = f"model_{now_date}.joblib"
joblib.dump(pipeline, local_model_path)

# Save the metrics locally as JSON, sharing the same timestamp
metrics_path = f"metrics_{now_date}.json"
with open(metrics_path, mode='w') as metrics_file:
    json.dump(metrics_dict, metrics_file)

In [None]:
# Upload the model and the metrics to S3 under a per-run key prefix
s3_bucket = "your-s3-bucket-name"
s3_prefix = f"models/nequi-risk-nlp/model-{now_date}/"
s3_key = f"{s3_prefix}model.joblib"
metrics_key = f"{s3_prefix}metrics.json"

s3 = boto3.client('s3')

# Model first, then metrics; both land under the same prefix.
uploads = (
    (local_model_path, s3_key),
    (metrics_path, metrics_key),
)
for local_file, remote_key in uploads:
    s3.upload_file(local_file, s3_bucket, remote_key)

print(f"Model uploaded to s3://{s3_bucket}/{s3_key}")

In [None]:
### Plantilla para versionado en sagemaker con model registry
# from sagemaker.model_metrics import ModelMetrics, MetricsSource

# sm_client = boto3.client("sagemaker", region_name=region)
# sagemaker_session = Session()

# model_package_group_name = "nequi-risk-nlp"


# # Si es la primera vez: crea el grupo de paquetes
# try:
#     sm_client.create_model_package_group(
#         ModelPackageGroupName=model_package_group_name,
#         ModelPackageGroupDescription="Modelos de clasificación de mensajes críticos Nequi"
#     )
# except sm_client.exceptions.ResourceInUse:
#     print(f"ModelPackageGroup {model_package_group_name} ya existe")

In [None]:
# Registrar el modelo
# model_metrics = ModelMetrics(
#     model_statistics=MetricsSource(
#         s3_uri=f"s3://{s3_bucket}/{metrics_key}"
#     ),
#     model_data_quality=MetricsSource(
#         s3_uri=f"s3://{s3_bucket}/{metrics_key}"))




In [None]:
# model_package_input = {
#     "ModelPackageGroupName": model_package_group_name,
#     "ModelPackageDescription": f"Modelo LR + TFIDF ({now_date})",
#     "InferenceSpecification": {
#         "Containers": [
#             {
#                 "Image": sagemaker.image_uris.retrieve("sklearn", region, version="1.6.1"),
#                 "ModelDataUrl": s3_model_path,
#                 "Environment": {
#                     "SAGEMAKER_SUBMIT_DIRECTORY": s3_model_path,
#                     "SAGEMAKER_PROGRAM": "inference.py" # Script de inferencia
#                 }
#             }
#         ],
#         "SupportedContentTypes": ["text/csv", "application/json"],
#         "SupportedResponseMIMETypes": ["text/csv", "application/json"]
#     },
#     "ModelMetrics": model_metrics,
#     "CertifyForMarketplace": False
# }

# model_package_response = sm_client.create_model_package(**model_package_input)
# print(f"Modelo registrado en el Model Registry: {model_package_response['ModelPackageArn']}")