## Instalar dependencias y definir esquema

In [1]:
#importar modulos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import mysql.connector

#columnas numericas
continuous = ["culmen_length_mm", "culmen_depth_mm",
    "flipper_length_mm", "body_mass_g"]

#columnas categoricas
categorical = ["species","island","sex"]

## Crear Tablas

In [2]:
#iniciar conexion con mysql
conn = mysql.connector.connect(
    #host="10.43.101.168",       # e.g., "localhost" or your database server
    host="10.43.101.166",       # e.g., "localhost" or your database server
    user="mlflow",   # e.g., "root"
    password="mlflow",
    database="mlflow"
)
cursor = conn.cursor()

#crear tablas de datos crudos y datos procesados
cursor.execute("""
        CREATE TABLE IF NOT EXISTS penguins (
            species VARCHAR(20),
            island VARCHAR(20),
            culmen_length_mm FLOAT,
            culmen_depth_mm FLOAT,
            flipper_length_mm FLOAT,
            body_mass_g FLOAT,
            sex VARCHAR(20)
        )
        """)

cursor.execute("""
        CREATE TABLE IF NOT EXISTS penguins_proc (
            species INTEGER,
            island INTEGER,
            culmen_length_mm FLOAT,
            culmen_depth_mm FLOAT,
            flipper_length_mm FLOAT,
            body_mass_g FLOAT,
            sex INTEGER
        )
        """)

conn.commit()

## Cargar Datos Crudos a MySQL

In [3]:
#leer csv
df = pd.read_csv("penguins_size.csv").dropna()

#crear query que inserta datos
query = "INSERT INTO penguins (species, island, culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g,	sex) \
    VALUES (%s, %s, %s, %s, %s, %s, %s)"

#almacenar datos en tuplas
values = [tuple(row) for row in df.values]

#insertar datos en mysql
cursor.executemany(query, values)
conn.commit()

df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


## Preprocesar datos

In [4]:
#cargar datos de mysql a un dataframe
query = "SELECT * FROM penguins"
cursor.execute(query)
columns = [col[0] for col in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=columns)

# Codificar variables categoricas
le = LabelEncoder()
for variable in categorical:
    df[variable] = le.fit_transform(df[variable])

# Normalizar variables numericas
scaler = Normalizer()
df[continuous] = scaler.fit_transform(df[continuous])

#convertir datos a tipo facil de convertir a mysql
df = df.astype(object)  

#queyr para insertar datos
query = "INSERT INTO penguins_proc (species, island, culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g, sex) \
VALUES (%s, %s, %s, %s, %s, %s, %s)"

#almacenar datos en tuplas
values = [tuple(row) for row in df.values]

#insertar datos en mysql
cursor.executemany(query, values)
conn.commit()

df.head()


Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,0.010414,0.004981,0.048207,0.998771,2
1,0,2,0.010382,0.004573,0.048886,0.99874,1
2,0,2,0.012377,0.005528,0.059887,0.998113,1
3,0,2,0.01062,0.005585,0.055851,0.998367,1
4,0,2,0.010752,0.005636,0.051981,0.998574,2


## Entrenamiento de Modelos

In [6]:
#cargar datos procesados de mysql a dataframe
query = "SELECT * FROM penguins_proc"
cursor.execute(query)
columns = [col[0] for col in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=columns)

#separar etiqueta de descriptores
y = df['species']
X = df.drop(['species'], axis=1)

# dividir datos entre train y test
X_train, X_test,y_train, y_test = train_test_split(X,y , 
                               random_state=50,  
                               test_size=0.30) 

X_train.head()

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
18330,0,0.010021,0.003087,0.04482,0.99894,1
8863,1,0.012423,0.004579,0.049447,0.998689,2
7259,0,0.008923,0.002869,0.040954,0.999117,1
20163,2,0.011009,0.004493,0.052424,0.998554,2
8941,0,0.009091,0.002862,0.039966,0.999156,2


In [6]:
# definir malla de hiperparametros
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],
}

# inicializar svm
svm = SVC()

# buscar hiperparametros mas optimos
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# mejor modelo
model = grid_search.best_estimator_

# resultado
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'C': 100, 'gamma': 1}


## Entrenamiento de Modelos MLflow

In [5]:
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import os
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://10.43.101.166:9000"
os.environ['AWS_ACCESS_KEY_ID'] = 'admin'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'supersecret'

#cargar datos procesados de mysql a dataframe
query = "SELECT * FROM penguins_proc"
cursor.execute(query)
columns = [col[0] for col in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=columns)

#separar etiqueta de descriptores
y = df['species']
X = df.drop(['species'], axis=1)

# dividir datos entre train y test
X_train, X_test,y_train, y_test = train_test_split(X,y , 
                               random_state=50,  
                               test_size=0.30)



  from google.protobuf import service as _service


In [51]:
import boto3
from sklearn.model_selection import GridSearchCV

In [6]:
# connect to mlflow
mlflow.set_tracking_uri("http://10.43.101.166:5000")
mlflow.set_experiment("mlflow_tracking_examples")

mlflow.autolog(log_model_signatures=True, log_input_examples=True)

with mlflow.start_run(run_name="svm_artifacts") as run:
    params = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],
}

    # inicializar svm
    svm = SVC()

    # buscar hiperparametros mas optimos
    grid_search = GridSearchCV(svm, params, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    mlflow.log_params(params)
    mlflow.set_tag("column_names", ",".join(columns))
    mlflow.sklearn.log_model(
      sk_model=grid_search,
      artifact_path="svm",
        registered_model_name="svm-model"
    )

print('tracking uri:', mlflow.get_tracking_uri())
print('artifact uri:', mlflow.get_artifact_uri())

2025/03/18 05:20:25 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/03/18 05:26:28 INFO mlflow.sklearn.utils: Logging the 5 best runs, 15 runs will be omitted.
Successfully registered model 'svm-model'.
2025/03/18 05:26:33 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: svm-model, version 1


tracking uri: http://10.43.101.166:5000
artifact uri: s3://mlflows3/artifacts/2/76f641d3b69f41dcb7d1f8018aea6e06/artifacts


Created version '1' of model 'svm-model'.


In [7]:
import mlflow.pyfunc

model_name = "svm-model"
model_version = 1

model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/{model_version}"
)

y_pred = model.predict(X_test)
print(y_pred)

sklearn_model = mlflow.sklearn.load_model(
    model_uri=f"models:/{model_name}/{model_version}"
)
y_pred_prob = sklearn_model.predict(X_test)
print(y_pred_prob)



[2 2 0 ... 0 1 1]




[2 2 0 ... 0 1 1]


In [10]:
model_name = "svm-model"
model_version = 1

client = mlflow.tracking.MlflowClient()
client.transition_model_version_stage(
    name=model_name,
    version=model_version,
    stage="Production"
)

  client.transition_model_version_stage(


<ModelVersion: aliases=[], creation_timestamp=1742275593382, current_stage='Production', description='', last_updated_timestamp=1742275749773, name='svm-model', run_id='5a0b57a77ebd496b898334e0639cec49', run_link='', source='s3://mlflows3/artifacts/2/5a0b57a77ebd496b898334e0639cec49/artifacts/svm', status='READY', status_message='', tags={}, user_id='', version='1'>

In [31]:
import mlflow.pyfunc

model_name = "svm-model"
stage = 'Production'

model = mlflow.pyfunc.load_model(
    model_uri=f"models:/{model_name}/{stage}"
)

model
#y_pred = model.predict(X_test)
#print(y_pred)

mlflow.pyfunc.loaded_model:
  artifact_path: svm
  flavor: mlflow.sklearn
  run_id: 5a0b57a77ebd496b898334e0639cec49

In [12]:
mlflow.set_tracking_uri("http://10.43.101.166:5000")

In [26]:
import mlflow

client = mlflow.tracking.MlflowClient()

# Listar modelos registrados en el Model Registry
models = client.search_registered_models()
#for model in models:
#    print(model.name)


svm-classifier
svm-model


In [27]:
models

[<RegisteredModel: aliases={}, creation_timestamp=1742267100794, description='', last_updated_timestamp=1742273254904, latest_versions=[<ModelVersion: aliases=[], creation_timestamp=1742267100853, current_stage='Production', description='', last_updated_timestamp=1742273254904, name='svm-classifier', run_id='19c18a9aeb9b48959fe2e0b9e1608eab', run_link='', source='s3://mlflows3/artifacts/1/19c18a9aeb9b48959fe2e0b9e1608eab/artifacts/svm', status='READY', status_message='', tags={}, user_id='', version='1'>,
  <ModelVersion: aliases=[], creation_timestamp=1742273249513, current_stage='None', description='', last_updated_timestamp=1742273249513, name='svm-classifier', run_id='62d6afe3d4df4c37b9c99ead32808735', run_link='', source='s3://mlflows3/artifacts/1/62d6afe3d4df4c37b9c99ead32808735/artifacts/svm', status='READY', status_message='', tags={}, user_id='', version='2'>], name='svm-classifier', tags={}>,
 <RegisteredModel: aliases={}, creation_timestamp=1742275593316, description='', las

In [21]:
import mlflow

# Obtener el experimento asociado al modelo
experiment = mlflow.get_experiment_by_name("mlflow_tracking_examples")

# Cargar el último run asociado
runs = mlflow.search_runs(experiment.experiment_id)
last_run = runs.iloc[0]  # Último entrenamiento
last_run = last_run["run_id"]

#client = mlflow.tracking.MlflowClient()
#artifact_list = client.list_artifacts(last_run)
#print([artifact.path for artifact in artifact_list])




In [24]:
tags = client.get_run(last_run).data.tags

In [25]:
tags

{'mlflow.user': 'root',
 'mlflow.source.name': '/root/.cache/uv/archive-v0/mghnIJEArs6n84e_0uBfr/lib/python3.12/site-packages/ipykernel_launcher.py',
 'mlflow.source.type': 'LOCAL',
 'mlflow.runName': 'marvelous-stork-271'}

In [52]:
import boto3

# Configuración de MinIO
minio_endpoint = "http://10.43.101.166:9000"  # Ajusta si está en otro servidor
access_key = "admin"
secret_key = "supersecret"
bucket_name = "mlflows3"
object_key = "artifacts/1/e5b9a57d50534be9bd5ae97f5390da60/artifacts/best_estimator/model.pkl"
local_file_path = "models/model.pkl"

# Crear cliente MinIO
s3_client = boto3.client(
    "s3",
    endpoint_url=minio_endpoint,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key
)

# Descargar el archivo
s3_client.download_file(bucket_name, object_key, local_file_path)
print(f"✅ Archivo descargado en: {local_file_path}")


✅ Archivo descargado en: models/model.pkl


## Registro de modelos

## Borrar Tablas

In [30]:
query = "DROP TABLE IF EXISTS penguins"
cursor.execute(query)

query = "DROP TABLE IF EXISTS penguins_proc"
cursor.execute(query)

conn.commit()

cursor.close()
conn.close()