In [1]:
#importar modulos
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, Normalizer
from sklearn.model_selection import train_test_split, GridSearchCV
import joblib
from sklearn import linear_model
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import mysql.connector
import os
import requests

## Lectura BBDD

In [3]:
import pandas as pd
from sqlalchemy import create_engine

def read_diabetes_data(dataset_type='train', limit=None, columns=None):
    """
    Lee datos de la base de datos diabetes.
    
    Parámetros:
    -----------
    dataset_type : str, default='train'
        Tipo de conjunto de datos a leer ('train', 'validation', o 'test')
    limit : int, opcional
        Número máximo de filas a recuperar. None para recuperar todas.
    columns : list, opcional
        Lista de columnas a recuperar. None para recuperar todas.
    
    Retorna:
    --------
    pandas.DataFrame
        DataFrame con los datos solicitados
    """
    # Parámetros de conexión
    host = '172.18.0.2'  # Nombre del servicio en Docker
    port = 3307              # Puerto expuesto
    user = 'diabetes_user'
    password = 'diabetes_password'
    database = 'diabetes'
    
    # Crear cadena de conexión
    connection_string = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
    
    try:
        # Crear motor de SQLAlchemy
        engine = create_engine(connection_string)
        
        # Verificar que el tipo de dataset es válido
        valid_types = ['train', 'validation', 'test']
        if dataset_type.lower() not in valid_types:
            raise ValueError(f"Tipo de dataset inválido. Debe ser uno de: {valid_types}")
        
        # Construir la tabla a consultar
        table_name = f"diabetes_{dataset_type.lower()}"
        
        # Construir la consulta SQL
        if columns:
            columns_str = ", ".join(columns)
            query = f"SELECT {columns_str} FROM {table_name}"
        else:
            query = f"SELECT * FROM {table_name}"
        
        if limit:
            query += f" LIMIT {limit}"
        
        # Ejecutar la consulta y devolver un DataFrame
        df = pd.read_sql(query, engine)
        print(f"Datos recuperados: {len(df)} filas de {table_name}")
        return df
    
    except Exception as e:
        print(f"Error al leer datos: {str(e)}")
        return pd.DataFrame()

# Función adicional para obtener estadísticas básicas
def get_diabetes_stats(dataset_type='train'):
    """
    Obtiene estadísticas básicas de los datos de diabetes.
    
    Parámetros:
    -----------
    dataset_type : str, default='train'
        Tipo de conjunto de datos a analizar ('train', 'validation', o 'test')
    
    Retorna:
    --------
    pandas.DataFrame
        DataFrame con estadísticas descriptivas
    """
    df = read_diabetes_data(dataset_type)
    if not df.empty:
        return df.describe()
    return pd.DataFrame()

# Función para ejecutar consultas personalizadas
def execute_diabetes_query(query):
    """
    Ejecuta una consulta SQL personalizada en la base de datos diabetes.
    
    Parámetros:
    -----------
    query : str
        Consulta SQL a ejecutar
    
    Retorna:
    --------
    pandas.DataFrame
        DataFrame con los resultados de la consulta
    """
    # Parámetros de conexión
    host = 'mysql_diabetes'
    port = 3307
    user = 'diabetes_user'
    password = 'diabetes_password'
    database = 'diabetes'
    
    connection_string = f"mysql+pymysql://{user}:{password}@{host}:{port}/{database}"
    
    try:
        engine = create_engine(connection_string)
        df = pd.read_sql(query, engine)
        print(f"Consulta ejecutada: {len(df)} filas recuperadas")
        return df
    except Exception as e:
        print(f"Error al ejecutar consulta: {str(e)}")
        return pd.DataFrame()

In [4]:
train_data = read_diabetes_data(dataset_type='train', limit=10)
train_data

Error al leer datos: (pymysql.err.OperationalError) (2003, "Can't connect to MySQL server on '172.18.0.2' (timed out)")
(Background on this error at: https://sqlalche.me/e/20/e3q8)


In [None]:
from sklearn.model_selection import train_test_split

# Primero separa test
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Luego separa validation desde el conjunto temporal
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  
# 0.25 x 0.8 = 0.2 → 20% validación

In [2]:
df_diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-null  int64 
 14  num_

In [161]:
pd.set_option('display.max_columns', None)
df_diabetes.head(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,chlorpropamide,glimepiride,acetohexamide,glipizide,glyburide,tolbutamide,pioglitazone,rosiglitazone,acarbose,miglitol,troglitazone,tolazamide,examide,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,?,Pediatrics-Endocrinology,41,0,1,0,0,0,250.83,?,?,1,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,?,?,59,0,18,0,0,0,276.0,250.01,255,9,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,?,?,11,5,13,2,0,1,648.0,250,V27,6,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,?,?,44,1,16,0,0,0,8.0,250.43,403,7,,,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,?,?,51,0,8,0,0,0,197.0,157,250,5,,,No,No,No,No,No,No,Steady,No,No,No,No,No,No,No,No,No,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [162]:
df_diabetes.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight',
       'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
       'time_in_hospital', 'payer_code', 'medical_specialty',
       'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1',
       'diag_2', 'diag_3', 'number_diagnoses', 'max_glu_serum', 'A1Cresult',
       'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
       'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
       'tolazamide', 'examide', 'citoglipton', 'insulin',
       'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone',
       'metformin-pioglitazone', 'change', 'diabetesMed', 'readmitted'],
      dtype='object')

In [163]:
isd_cols = ['encounter_id', 'patient_nbr' ]

categorical_cols = ['race', 'gender', 'age','admission_type_id', 'discharge_disposition_id', 'admission_source_id','payer_code', 'medical_specialty',
                   'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
                   'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
                   'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
                   'tolazamide', 'examide', 'citoglipton', 'insulin',
                   'glyburide-metformin', 'glipizide-metformin',
                   'glimepiride-pioglitazone', 'metformin-rosiglitazone',
                   'metformin-pioglitazone', 'change', 'readmitted','diag_1','diag_2', 'diag_3',]

numerical_cols = ['time_in_hospital','num_lab_procedures', 'num_procedures', 'num_medications','number_outpatient', 'number_emergency',
                  'number_inpatient', 'number_diagnoses']#, 'max_glu_serum', 'A1Cresult']

target_col = ['diabetesMed']


In [164]:
df_diabetes[categorical_cols].isna().sum()

race                        0
gender                      0
age                         0
admission_type_id           0
discharge_disposition_id    0
admission_source_id         0
payer_code                  0
medical_specialty           0
metformin                   0
repaglinide                 0
nateglinide                 0
chlorpropamide              0
glimepiride                 0
acetohexamide               0
glipizide                   0
glyburide                   0
tolbutamide                 0
pioglitazone                0
rosiglitazone               0
acarbose                    0
miglitol                    0
troglitazone                0
tolazamide                  0
examide                     0
citoglipton                 0
insulin                     0
glyburide-metformin         0
glipizide-metformin         0
glimepiride-pioglitazone    0
metformin-rosiglitazone     0
metformin-pioglitazone      0
change                      0
readmitted                  0
diag_1    

In [165]:
df_diabetes[numerical_cols].isna().sum()

time_in_hospital      0
num_lab_procedures    0
num_procedures        0
num_medications       0
number_outpatient     0
number_emergency      0
number_inpatient      0
number_diagnoses      0
dtype: int64

In [167]:
df_numerical_cols = df_diabetes[numerical_cols]
df_categorical_cols = df_diabetes[categorical_cols]

#df_numerical_cols = [numerical_cols]
#df_categorical_cols = [categorical_cols]

print(df_numerical_cols.shape)
print(df_categorical_cols.shape)

(101766, 8)
(101766, 36)


In [168]:
X = df_diabetes[numerical_cols + categorical_cols]
y = df_diabetes[target_col]

print(X.shape)
print(y.shape)


(101766, 44)
(101766, 1)


In [169]:
from sklearn.model_selection import train_test_split

# Primero separa test
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Luego separa validation desde el conjunto temporal
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)  
# 0.25 x 0.8 = 0.2 → 20% validación


In [174]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

# Pipeline para columnas numéricas
numeric_pipeline = Pipeline([
    ("scaler", StandardScaler())
])

# Pipeline para columnas categóricas
categorical_pipeline = Pipeline([
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
])

preprocessor = ColumnTransformer([
    ("num", numeric_pipeline, numerical_cols),
    ("cat", categorical_pipeline, categorical_cols)
])

full_pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('clf', RandomForestClassifier(random_state=42))
])

full_pipeline.fit(X_train, y_train)
y_pred = full_pipeline.predict(X_test)


  return fit_method(estimator, *args, **kwargs)


In [175]:
y_pred

array(['Yes', 'Yes', 'Yes', ..., 'Yes', 'Yes', 'Yes'], dtype=object)

In [176]:
# Evaluar el modelo
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

Accuracy: 0.9991156529429105


In [121]:
# Importar las bibliotecas necesarias
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Definir las columnas de tu dataset
isd_cols = ['encounter_id', 'patient_nbr']
categorical_cols = ['race', 'gender', 'age', 'admission_type_id', 'discharge_disposition_id', 
                    'admission_source_id', 'payer_code', 'medical_specialty', 'metformin', 'repaglinide', 
                    'nateglinide', 'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 
                    'tolbutamide', 'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 
                    'tolazamide', 'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
                    'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change', 
                    'readmitted', 'diabetesMed']
numerical_cols = ['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 
                  'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3', 
                  'number_diagnoses']#, 'max_glu_serum', 'A1Cresult']
target_col = ['diabetesMed']

# Crear el dataframe (suponiendo que df_diabetes ya está definido en tu entorno)
# df_diabetes = pd.read_csv('tu_archivo.csv')  # Si necesitas cargar el dataset

# Dividir los datos en X (entradas) e y (salida)
X = df_diabetes[numerical_cols + categorical_cols]  # Entradas
y = df_diabetes[target_col]  # Salida

# Dividir los datos en entrenamiento (70%) y prueba/validación (30%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Filtra las filas donde hay valores numéricos válidos
X_train = X_train.apply(pd.to_numeric, errors='coerce')  # Convierte valores no numéricos a NaN
X_train = X_train.dropna()  # Elimina las filas con NaN

# Definir el preprocesamiento para las columnas numéricas y categóricas
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),  # Escalar las columnas numéricas
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)  # OneHotEncoding para las columnas categóricas
    ])

# Crear el pipeline con el preprocesamiento y el modelo
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())  # Aquí puedes cambiar el modelo a tu preferencia
])

# Entrenar el pipeline con los datos de entrenamiento
pipeline.fit(X_train, y_train)

# Hacer predicciones con los datos de prueba
y_pred = pipeline.predict(X_test)

# Evaluar el modelo
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')


ValueError: Found array with 0 sample(s) (shape=(0, 11)) while a minimum of 1 is required by StandardScaler.

In [109]:
# Filtra las filas donde hay valores numéricos válidos
X_train = X_train.apply(pd.to_numeric, errors='coerce')  # Convierte valores no numéricos a NaN
X_train = X_train.dropna()  # Elimina las filas con NaN

# También verifica si hay valores no numéricos antes de entrenar el modelo
