## Instalar dependencias y definir esquema

In [1]:
#importar modulos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import mysql.connector

#columnas numericas
continuous = ["culmen_length_mm", "culmen_depth_mm",
    "flipper_length_mm", "body_mass_g"]

#columnas categoricas
categorical = ["species","island","sex"]

## Crear Tablas

In [2]:
#iniciar conexion con mysql
conn = mysql.connector.connect(
    #host="10.43.101.168",       # e.g., "localhost" or your database server
    host="10.43.101.166",       # e.g., "localhost" or your database server
    user="mlflow",   # e.g., "root"
    password="mlflow",
    database="mlflow"
)
cursor = conn.cursor()

#crear tablas de datos crudos y datos procesados
cursor.execute("""
        CREATE TABLE IF NOT EXISTS penguins (
            species VARCHAR(20),
            island VARCHAR(20),
            culmen_length_mm FLOAT,
            culmen_depth_mm FLOAT,
            flipper_length_mm FLOAT,
            body_mass_g FLOAT,
            sex VARCHAR(20)
        )
        """)

cursor.execute("""
        CREATE TABLE IF NOT EXISTS penguins_proc (
            species INTEGER,
            island INTEGER,
            culmen_length_mm FLOAT,
            culmen_depth_mm FLOAT,
            flipper_length_mm FLOAT,
            body_mass_g FLOAT,
            sex INTEGER
        )
        """)

conn.commit()

## Cargar Datos Crudos a MySQL

In [3]:
#leer csv
df = pd.read_csv("penguins_size.csv").dropna()

#crear query que inserta datos
query = "INSERT INTO penguins (species, island, culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g,	sex) \
    VALUES (%s, %s, %s, %s, %s, %s, %s)"

#almacenar datos en tuplas
values = [tuple(row) for row in df.values]

#insertar datos en mysql
cursor.executemany(query, values)
conn.commit()

df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE


## Preprocesar datos

In [4]:
#cargar datos de mysql a un dataframe
query = "SELECT * FROM penguins"
cursor.execute(query)
columns = [col[0] for col in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=columns)

# Codificar variables categoricas
le = LabelEncoder()
for variable in categorical:
    df[variable] = le.fit_transform(df[variable])

# Normalizar variables numericas
scaler = Normalizer()
df[continuous] = scaler.fit_transform(df[continuous])

#convertir datos a tipo facil de convertir a mysql
df = df.astype(object)  

#queyr para insertar datos
query = "INSERT INTO penguins_proc (species, island, culmen_length_mm, culmen_depth_mm, flipper_length_mm, body_mass_g, sex) \
VALUES (%s, %s, %s, %s, %s, %s, %s)"

#almacenar datos en tuplas
values = [tuple(row) for row in df.values]

#insertar datos en mysql
cursor.executemany(query, values)
conn.commit()

df.head()


Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,0,2,0.010414,0.004981,0.048207,0.998771,2
1,0,2,0.010382,0.004573,0.048886,0.99874,1
2,0,2,0.012377,0.005528,0.059887,0.998113,1
3,0,2,0.01062,0.005585,0.055851,0.998367,1
4,0,2,0.010752,0.005636,0.051981,0.998574,2


## Entrenamiento de Modelos

In [14]:
#cargar datos procesados de mysql a dataframe
query = "SELECT * FROM penguins_proc"
cursor.execute(query)
columns = [col[0] for col in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=columns)

#separar etiqueta de descriptores
y = df['species']
X = df.drop(['species'], axis=1)

# dividir datos entre train y test
X_train, X_test,y_train, y_test = train_test_split(X,y , 
                               random_state=50,  
                               test_size=0.30) 

X_train.head()

Unnamed: 0,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
271,0,0.009697,0.003255,0.046688,0.998857,1
1393,0,0.010136,0.004489,0.047351,0.998817,2
2526,1,0.014315,0.005371,0.055121,0.998363,2
1564,0,0.01089,0.003472,0.049934,0.998687,1
972,0,0.009569,0.003049,0.044585,0.998955,1


In [15]:
# definir malla de hiperparametros
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],
}

# inicializar svm
svm = SVC()

# buscar hiperparametros mas optimos
grid_search = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# mejor modelo
model = grid_search.best_estimator_

# resultado
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'C': 100, 'gamma': 1}


## Entrenamiento de Modelos MLflow

In [5]:
import mlflow

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import os
os.environ['MLFLOW_S3_ENDPOINT_URL'] = "http://10.43.101.166:9000"
os.environ['AWS_ACCESS_KEY_ID'] = 'admin'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'supersecret'

#cargar datos procesados de mysql a dataframe
query = "SELECT * FROM penguins_proc"
cursor.execute(query)
columns = [col[0] for col in cursor.description]
df = pd.DataFrame(cursor.fetchall(), columns=columns)

#separar etiqueta de descriptores
y = df['species']
X = df.drop(['species'], axis=1)

# dividir datos entre train y test
X_train, X_test,y_train, y_test = train_test_split(X,y , 
                               random_state=50,  
                               test_size=0.30)



  from google.protobuf import service as _service


In [8]:
import boto3
from sklearn.model_selection import GridSearchCV

In [10]:
# connect to mlflow
mlflow.set_tracking_uri("http://10.43.101.166:5000")
mlflow.set_experiment("mlflow_tracking_examples_class1")

mlflow.autolog(log_model_signatures=True, log_input_examples=True)

with mlflow.start_run(run_name="logged_artifacts") as run:
    params = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.01, 0.1, 1],
}

    # inicializar svm
    svm = SVC()

    # buscar hiperparametros mas optimos
    grid_search = GridSearchCV(svm, params, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    mlflow.log_params(params)
    mlflow.sklearn.log_model(
      sk_model=grid_search,
      artifact_path="svm"
    )

print('tracking uri:', mlflow.get_tracking_uri())
print('artifact uri:', mlflow.get_artifact_uri())

2025/03/17 04:52:43 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/03/17 04:54:34 INFO mlflow.sklearn.utils: Logging the 5 best runs, 15 runs will be omitted.


tracking uri: http://10.43.101.166:5000
artifact uri: s3://mlflows3/artifacts/1/ec1c9e6e9af44a09ace375019d6c806b/artifacts


In [52]:
import boto3

# Configuración de MinIO
minio_endpoint = "http://10.43.101.166:9000"  # Ajusta si está en otro servidor
access_key = "admin"
secret_key = "supersecret"
bucket_name = "mlflows3"
object_key = "artifacts/1/e5b9a57d50534be9bd5ae97f5390da60/artifacts/best_estimator/model.pkl"
local_file_path = "models/model.pkl"

# Crear cliente MinIO
s3_client = boto3.client(
    "s3",
    endpoint_url=minio_endpoint,
    aws_access_key_id=access_key,
    aws_secret_access_key=secret_key
)

# Descargar el archivo
s3_client.download_file(bucket_name, object_key, local_file_path)
print(f"✅ Archivo descargado en: {local_file_path}")


✅ Archivo descargado en: models/model.pkl


## Registro de modelos

## Borrar Tablas

In [30]:
query = "DROP TABLE IF EXISTS penguins"
cursor.execute(query)

query = "DROP TABLE IF EXISTS penguins_proc"
cursor.execute(query)

conn.commit()

cursor.close()
conn.close()