In [15]:
# Carga de librerias
# ------------------
# Librerias de uso general
import holidays
from google.cloud import storage

# Manejo de datos
import numpy as np
import pandas as pd

import kagglehub
import shutil
import os

import optuna
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import f1_score, make_scorer

from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM  # Importación para OC-SVM
from sklearn.neighbors import LocalOutlierFactor # Importación para LOF

from sklearn.feature_extraction import DictVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
import os
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
import pickle
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score, mean_squared_error
from math import sqrt
import pandas as pd
import numpy as np
from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, TimeSeriesSplit
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score
import warnings
warnings.filterwarnings("ignore")

# Visualización
import matplotlib.pyplot as plt
import seaborn as sns

# Estadística y series temporales
from statsmodels.tsa.seasonal import seasonal_decompose

# Se importan las funciones
from sklearn.metrics import  mean_absolute_error

In [16]:
# ---------- 0. configuración ----------
CSV_PATH = "/Users/angeleduardogamarrarios/Repositorio_UDEM/MLops_AMICO/data/costs.csv"       # ajusta si hace falta
TEST_DAYS = 60               # últimos N días para test
RND_ITER = 12                # RandomizedSearchCV iteraciones (ajusta)
TS_SPLITS = 3                # TimeSeriesSplit folds (ajusta)
PERM_REPEATS = 10            # permutación (ajusta)


In [17]:
# ---------- 1. carga y limpieza básica ----------
df = pd.read_csv(CSV_PATH)

# eliminar fila de "Service total" si existe
df = df[df['Service'] != 'Service total']

# convertir columna de fecha en indice
df['Service'] = pd.to_datetime(df['Service'])
df = df.rename(columns={'Service':'date'}).sort_values('date').set_index('date')

# forzar numérico y revisar columnas
df = df.apply(pd.to_numeric, errors='coerce')
num_cols = df.columns.tolist()

print("Filas:", len(df), "Columnas:", num_cols)

Filas: 351 Columnas: ['Relational Database Service($)', 'EC2-Instances($)', 'FSx($)', 'Elastic File System($)', 'EC2-Other($)', 'CloudWatch($)', 'S3($)', 'Elastic Load Balancing($)', 'Backup($)', 'Key Management Service($)', 'DataSync($)', 'Secrets Manager($)', 'Resilience Hub($)', 'Total costs($)']


In [18]:
column_mapper = {
    'Service': 'fecha',
    'Relational Database Service($)': 'rds',
    'EC2-Instances($)': 'ec2',
    'FSx($)': 'fsx',
    'Elastic File System($)': 'efs',
    'EC2-Other($)': 'ec2_other',
    'CloudWatch($)': 'cloudwatch',
    'Elastic Load Balancing($)': 'elb',
    'S3($)': 's3',
    'Backup($)': 'backup',
    'Key Management Service($)': 'kms',
    'DataSync($)': 'data_sync',
    'Secrets Manager($)': 'secrets_manager',
    'Resilience Hub($)': 'resiliency',
    'Total costs($)': 'total_costs'
}

In [19]:
# Mapeo de columnas 
df.rename(columns=column_mapper, inplace=True)

In [20]:
# ---------- 2. EDA rápido (resumen + skew + patrón semanal) ----------
print("\nResumen estadístico:")
print(df.describe().T)

skewness = df.skew().sort_values(ascending=False)
print("\nSkewness (top):")
print(skewness.head(10))

weekly_mean = df.groupby(df.index.day_name()).mean()
print("\nMedia por día de la semana (muestra):")
print(weekly_mean.head())


Resumen estadístico:
                 count        mean         std           min         25%  \
rds              351.0   97.240692   87.975006  1.491792e+01   41.109960   
ec2              329.0   71.525192   25.109641  5.154986e-01   60.758766   
fsx              351.0   14.678272    1.855596  1.208052e+01   12.584705   
efs              351.0    8.064240    6.456387  1.786584e+00    2.040983   
ec2_other        351.0    6.076750    2.900253  3.117944e-01    5.541606   
cloudwatch       351.0    4.398582    7.681060  3.109096e-03    0.478766   
s3               351.0    2.218248    0.787167  1.023371e+00    1.693569   
elb              351.0    2.162289    0.004710  2.160000e+00    2.160033   
backup           350.0    1.514773    1.210465  2.810710e-01    0.322099   
kms              351.0    0.260617    0.032317  2.263914e-01    0.237089   
data_sync        119.0    0.286251    1.415347  6.440000e-08    0.000001   
secrets_manager  349.0    0.000513    0.000527  2.000000e-05    0.

In [21]:
# ---------- 3. imputación ----------
# Strategy: cambiar a 0
df_imputed = df.fillna(0)

In [6]:
# ---------- 4. Box-Cox selectivo ----------
# Aplicar Box-Cox solo a columnas muy sesgadas (skew > 1)
skewed_cols = skewness[skewness > 1].index.tolist()
df_bc = df_imputed.copy()

for c in skewed_cols:
    # Box-Cox exige valores > 0. si hay ceros o negativos shift pequeño
    min_val = df_bc[c].min()
    shift = 0.0 if min_val > 0 else abs(min_val) + 1e-6
    try:
        transformed, lam = boxcox(df_bc[c] + shift)
        df_bc[c] = transformed
        print(f"Box-Cox aplicado a {c}, lambda={lam:.4f}")
    except Exception as e:
        # fallback log1p si falla
        df_bc[c] = np.log1p(df_bc[c] + shift)
        print(f"Box-Cox falló en {c}, aplicado log1p")


Box-Cox aplicado a Secrets Manager($), lambda=0.3130
Box-Cox aplicado a DataSync($), lambda=-0.7391
Box-Cox aplicado a Elastic Load Balancing($), lambda=-693.3143
Box-Cox aplicado a Relational Database Service($), lambda=-0.0614
Box-Cox aplicado a CloudWatch($), lambda=0.0712
Box-Cox aplicado a Key Management Service($), lambda=-4.2383


In [7]:
# ---------- 5. normalización por día de la semana ----------
df_bc['day_of_week'] = df_bc.index.day_name()
scaled = df_bc.copy()
features = [c for c in num_cols]  # lista de features reales

for day in scaled['day_of_week'].unique():
    mask = scaled['day_of_week'] == day
    if mask.sum() < 2:
        # si no hay suficientes ejemplos para el día, omitir
        continue
    scaler = StandardScaler()
    scaled.loc[mask, features] = scaler.fit_transform(scaled.loc[mask, features])


In [22]:
# ---------- 6. división train/test temporal ----------
split_date = scaled.index.max() - pd.Timedelta(days=TEST_DAYS)
train_df = scaled[scaled.index < split_date].drop(columns=['day_of_week'])
test_df = scaled[scaled.index >= split_date].drop(columns=['day_of_week'])

X_train = train_df.values
X_test = test_df.values
cols = train_df.columns.tolist()

print(f"\nTrain shape: {X_train.shape}, Test shape: {X_test.shape}")



Train shape: (290, 14), Test shape: (61, 14)


In [23]:
# ---------- 7. búsqueda de hiperparámetros ----------
# scoring personalizado: media de decision_function (cuanto mayor, mejor)
def scoring_fn(estimator, X, y=None):
    return float(np.mean(estimator.decision_function(X)))

iso = IsolationForest(random_state=42)

param_dist = {
    'n_estimators': [50, 100, 200, 300],
    'max_samples': [0.5, 0.7, 0.9, 'auto'],
    'contamination': [0.001, 0.005, 0.01, 0.02, 0.05],
    'max_features': [0.5, 0.7, 1.0]
}

tscv = TimeSeriesSplit(n_splits=TS_SPLITS)
rnd = RandomizedSearchCV(iso, param_distributions=param_dist, n_iter=RND_ITER,
                         cv=tscv, random_state=42, n_jobs=-1, scoring=scoring_fn, verbose=1)
rnd.fit(X_train)

print("\nMejores parámetros (RandomizedSearch):")
print(rnd.best_params_)

# ajustar pequeño grid alrededor del mejor para refinar (GridSearch)
best = rnd.best_params_
grid = {
    'n_estimators': sorted(list({max(10, best['n_estimators']-50), best['n_estimators'], best['n_estimators']+50})),
    'max_samples': sorted(list(set([best['max_samples'] if best['max_samples']=='auto' else max(0.1, best['max_samples']-0.1), best['max_samples'], min(1.0, best['max_samples']+0.1)]))),
    'contamination': sorted(list({max(0.0005, best['contamination']/2), best['contamination'], min(0.1, best['contamination']*2)})),
    'max_features': sorted(list({max(0.1, best['max_features']-0.2), best['max_features'], min(1.0, best['max_features']+0.2)}))
}
# limpiar valores inválidos
grid['max_samples'] = [v for v in grid['max_samples'] if (isinstance(v, str) or (isinstance(v, float) and 0 < v <= 1))]

gsearch = GridSearchCV(IsolationForest(random_state=42), param_grid=grid, cv=tscv, n_jobs=-1, scoring=scoring_fn, verbose=1)
gsearch.fit(X_train)

print("\nMejores parámetros (GridSearch):")
print(gsearch.best_params_)

best_model = gsearch.best_estimator_

Fitting 3 folds for each of 12 candidates, totalling 36 fits

Mejores parámetros (RandomizedSearch):
{'n_estimators': 50, 'max_samples': 0.9, 'max_features': 0.7, 'contamination': 0.001}
Fitting 3 folds for each of 81 candidates, totalling 243 fits

Mejores parámetros (GridSearch):
{'contamination': 0.0005, 'max_features': 0.7, 'max_samples': 0.9, 'n_estimators': 10}


In [24]:
# ---------- 8. predecir y marcar anomalías ----------
test_scores = best_model.decision_function(X_test)   # mayor => más normal
test_pred = best_model.predict(X_test)                # 1 normal, -1 anomalía
test_anomaly = np.where(test_pred == 1, 0, 1)         # 1 = anomalía (más intuitivo)

test_out = test_df.copy()
test_out['anomaly_score'] = test_scores
test_out['anomaly'] = test_anomaly

print("\nAnomalías en test (conteo):", int(test_out['anomaly'].sum()))
print(test_out[['anomaly_score','anomaly']].head(10))

# ---------- 9. importancia de features (permutación) ----------
# Usamos como 'y' las scores del modelo en train y medimos R2 del estimator.decision_function
y_train_scores = best_model.decision_function(X_train)

def scoring_fn_r2(estimator, X, y):
    return r2_score(y, estimator.decision_function(X))

perm = permutation_importance(best_model, X_train, y_train_scores, scoring=scoring_fn_r2,
                              n_repeats=PERM_REPEATS, random_state=42, n_jobs=-1)
perm_importances = pd.Series(perm.importances_mean, index=cols).sort_values(ascending=False)

print("\nTop features por importancia (permutación):")
print(perm_importances.head(10))


Anomalías en test (conteo): 27
            anomaly_score  anomaly
date                              
2025-03-18       0.112032        0
2025-03-19       0.124936        0
2025-03-20       0.112604        0
2025-03-21       0.112991        0
2025-03-22       0.107334        0
2025-03-23       0.075393        0
2025-03-24       0.070000        0
2025-03-25       0.049060        0
2025-03-26       0.062226        0
2025-03-27       0.062226        0

Top features por importancia (permutación):
EC2-Instances($)                  0.926207
EC2-Other($)                      0.737711
Elastic File System($)            0.546623
Backup($)                         0.503320
DataSync($)                       0.481063
S3($)                             0.466884
Total costs($)                    0.447476
Relational Database Service($)    0.422563
CloudWatch($)                     0.405069
Key Management Service($)         0.357044
dtype: float64


In [11]:
# ---------- 10. exportar resultados ----------
test_out.to_csv("test_anomaly_results.csv")
perm_importances.to_csv("feature_importances_permutation.csv")
print("\nResultados guardados: test_anomaly_results.csv, feature_importances_permutation.csv")


Resultados guardados: test_anomaly_results.csv, feature_importances_permutation.csv


In [14]:
import joblib
# Guardar el modelo entrenado
joblib.dump(iso, 'modelo_entrenado_amico_1.pkl')
modelo_cargado = joblib.load('modelo_entrenado_1.pkl')
pred = modelo_cargado.predict(X_test[:5])
print(pred)

FileNotFoundError: [Errno 2] No such file or directory: 'modelo_entrenado_1.pkl'