In [None]:
!brew install libomp
!pip install seaborn

!pip install kagglehub

!pip uninstall -y xgboost
!pip install xgboost

In [None]:
# Set KMP_DUPLICATE_LIB_OK before the numeric libraries are imported.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort
# when several libraries each load their own copy — confirm it is still needed.
import os
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from category_encoders import BinaryEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier

In [None]:
# Data import: read the dataset from a CSV in the working directory
# and preview the first rows.
df = pd.read_csv('diabetes_prediction_dataset.csv')
df.head()

In [None]:
# Summary statistics of the data
df.describe()

In [None]:
# Shape (rows, columns) of the frame as loaded.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Distinct values present in the 'gender' column.
df['gender'].unique()

In [None]:
# How many rows remain once the 'Other' gender category is excluded:
# summing the boolean mask counts the rows that pass the filter.
count_without_other = (df['gender'] != 'Other').sum()
print(f"Number of rows without 'Other' in gender: {count_without_other}")

In [None]:
# Drop rows where the 'gender' column has the value 'Other'.
# .copy() makes the result an independent frame, so later column assignments
# on `df` do not trigger pandas' SettingWithCopyWarning.
df = df[df['gender'] != 'Other'].copy()

In [None]:
# Encode gender as an integer flag: Male -> 1, Female -> 0.
# NOTE: values absent from the mapping become NaN, so re-running this cell on
# the already-encoded column would turn every entry into NaN.
df['gender'] = df['gender'].map({'Male': 1, 'Female': 0})

In [None]:
# Check for missing values: null count per column.
df.isnull().sum()

In [None]:
# Histograms (30 bins each) of the frame's columns for a quick look at the distributions.
df.hist(bins=30, figsize=(15, 12))
plt.tight_layout()
plt.show()

In [None]:
# Frequency of each class of the target variable: diabetes.
diabetes_counts = df['diabetes'].value_counts()
ax = diabetes_counts.plot(kind='bar', color='skyblue')
ax.set_title('Distribución de Diabetes')
ax.set_xlabel('Diabetes')
ax.set_ylabel('Frecuencia')
plt.show()

In [None]:
# Number of fully duplicated rows.
df.duplicated().sum()

In [None]:
# Remove duplicated rows.
df = df.drop_duplicates()

In [None]:
# Sanity check: re-count duplicated rows after the drop (expected to be 0).
df.duplicated().sum()

In [None]:
# Shape after removing the 'Other' rows and the duplicates.
df.shape

In [None]:
# Random preview of the cleaned frame; the fixed seed keeps the displayed rows
# identical across Restart & Run All.
df.sample(10, random_state=42)

In [None]:
# Distinct smoking_history values; used further down to name the encoded columns.
unique_values = df['smoking_history'].unique()

In [None]:
# Single-step pipeline that binary-encodes the 'smoking_history' column.
smoking_encoder = BinaryEncoder(cols=['smoking_history'])
binary_transformer = Pipeline(steps=[("binary_encoder", smoking_encoder)])

In [None]:
# Fit the encoding pipeline on the cleaned frame and keep the encoded result.
df_encoded = binary_transformer.fit_transform(df)

In [None]:
# Columns the encoder produced for 'smoking_history'. `encoded_cols` was never
# defined anywhere in the notebook, so this cell raised NameError on a fresh kernel.
encoded_cols = [col for col in df_encoded.columns if col.startswith('smoking_history_')]

# Build the rename map: i-th encoded column -> "smoking_<i-th unique value>".
# zip() stops at the shorter sequence, so a mismatch in lengths cannot raise IndexError.
# NOTE(review): a binary encoder emits one column per binary digit, not one per
# category, so these names are labels of convenience rather than true
# per-category indicators — confirm this is the intended naming.
smoking_mapping = {
    col: f"smoking_{value.lower().replace(' ', '_')}"
    for col, value in zip(encoded_cols, unique_values)
}

In [None]:
# Apply the friendlier smoking column names built in `smoking_mapping`.
df_encoded = df_encoded.rename(columns=smoking_mapping)

print("\nSample of data with renamed columns:")
# Fixed seed so the preview is reproducible across runs.
df_encoded.sample(5, random_state=42)

In [None]:
def cap_outliers(df_encoded, column, factor=1.5):
    """Clip one column to the IQR fences [Q1 - factor*IQR, Q3 + factor*IQR].

    Args:
        df_encoded: DataFrame holding the column to cap.
        column: Name of the numeric column to clip.
        factor: Multiplier applied to the IQR when computing the fences.
            Defaults to 1.5, the value that used to be hard-coded.

    Returns:
        A new DataFrame in which `column` is clipped to the fences. The frame
        passed in is left untouched, so callers that keep a reference to the
        original do not see it silently change.
    """
    values = df_encoded[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # Work on a copy instead of writing into the caller's object.
    capped = df_encoded.copy()
    capped[column] = values.clip(lower_bound, upper_bound)
    return capped

# Cap outliers in each of the four continuous features, one column at a time.
for col in ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']:
    df_encoded = cap_outliers(df_encoded, col)

In [None]:
# Standardise the four continuous features, overwriting them in df_encoded.
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split further down, so test-set statistics leak into training — consider
# fitting on the training portion only.
scaled_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
scaler = StandardScaler()
df_encoded[scaled_cols] = scaler.fit_transform(df_encoded[scaled_cols])

In [None]:
# Inspect the column dtypes of the encoded frame.
df_encoded.dtypes

In [None]:
# Pairwise scatter/KDE grid of the continuous features, coloured by the target.
print("Generating Pair Plot...")
pairplot_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level', 'diabetes']
sns.pairplot(df_encoded[pairplot_cols], hue='diabetes', diag_kind='kde')
plt.suptitle('Pair Plot of Numerical Features by Diabetes', y=1.02)
plt.show()

In [None]:
# One box plot per continuous feature, split by the target, on a 2x2 grid.
print("Generating Box Plots...")
numerical_cols = ['age', 'bmi', 'HbA1c_level', 'blood_glucose_level']
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
# axes.flat walks the grid row by row, matching the (i // 2, i % 2) order.
for ax, col in zip(axes.flat, numerical_cols):
    sns.boxplot(x='diabetes', y=col, data=df_encoded, ax=ax)
    ax.set_title(f'{col} vs. Diabetes')
plt.tight_layout()
plt.show()

In [None]:
# Count plots of the categorical features against the target, split across two figures.
print("Generating Count Plots...")
categorical_cols = ['gender', 'smoking_never', 'smoking_no_info', 'smoking_current', 'hypertension', 'heart_disease']

first_batch = categorical_cols[:4]
second_batch = categorical_cols[4:]

# Figure 1: up to four features on a 2x2 grid.
fig1, axes1 = plt.subplots(2, 2, figsize=(12, 8))
axes1 = axes1.flatten()
for ax, feature in zip(axes1, first_batch):
    sns.countplot(x=feature, hue='diabetes', data=df_encoded, ax=ax)
    ax.set_title(f'{feature} vs. Diabetes')

plt.tight_layout()
plt.show()

# Figure 2: the remaining features on a single row.
if second_batch:
    # squeeze=False always returns a 2-D array of axes, so the single-subplot
    # case needs no special handling.
    fig2, axes2 = plt.subplots(1, len(second_batch), figsize=(12, 4), squeeze=False)
    axes2 = axes2.ravel()

    for ax, feature in zip(axes2, second_batch):
        sns.countplot(x=feature, hue='diabetes', data=df_encoded, ax=ax)
        ax.set_title(f'{feature} vs. Diabetes')

    plt.tight_layout()
    plt.show()

In [None]:
# Row count per class of the target in the encoded frame.
print("Diabetes Class Distribution:")
print(df_encoded['diabetes'].value_counts())

In [None]:
# Correlation heatmap across all non-object columns of the encoded frame.
num_df = df_encoded.select_dtypes(exclude='object')
corr_matrix = num_df.corr()

fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, ax=ax)
ax.set_title('Correlación numérica de características')
plt.show()

In [None]:
# Separate the features from the target and hold out 20% of the rows for testing.
X = df_encoded.drop(columns='diabetes')
y = df_encoded['diabetes']
# stratify=y keeps the class proportions of the target the same in both splits,
# so the test accuracy is not skewed by an unlucky draw of the rarer class.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Candidate models and the hyper-parameter grid to search for each one.
# Each entry maps a display name to {'model': estimator, 'params': grid},
# which is the shape the training loop reads.
models = {
    'Logistic Regression': {
        'model': LogisticRegression(),
        'params': {
            'C': [0.1, 1, 10],
            'solver': ['liblinear', 'lbfgs']
        }
    },
    'Random Forest': {
        # Seeded so repeated runs build the same forest.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [50, 100],
            'max_depth': [None, 10, 20]
        }
    },
    'XGBoost': {
        'model': XGBClassifier(random_state=42, eval_metric='logloss'),
        'params': {
            'n_estimators': [50, 100],
            'max_depth': [3, 6],
            'learning_rate': [0.01, 0.1]
        }
    }
}

In [None]:
# Grid-search every candidate model and score the winner on the held-out test set.
# `results` collects one {'Model', 'Accuracy'} record per model;
# `hyperparameters` keeps the best parameter combination found for each.
results = []
hyperparameters = {}

for name, config in models.items():
    print(f"\nTraining {name}...")
    # 5-fold cross-validated search over the model's grid, selecting by accuracy.
    # NOTE(review): if the target classes are imbalanced (see the class
    # distribution cell), accuracy can look high for a trivial classifier —
    # consider also reporting F1 or ROC-AUC.
    grid = GridSearchCV(config['model'], config['params'], cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    # Evaluate on data the search never saw.
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    results.append({'Model': name, 'Accuracy': accuracy})
    hyperparameters[name] = grid.best_params_
    
    print(f"Best Parameters for {name}: {grid.best_params_}")
    print(f"Test Accuracy: {accuracy:.3f}")

In [None]:
# Number of rows in `df` whose 'diabetes' value equals 1.
count_diabetes_1 = (df['diabetes'] == 1).sum()
print(f"Number of values equal to 1 in the 'diabetes' column: {count_diabetes_1}")


In [None]:
# Relationship between the target and blood glucose level:
# one density curve per diabetes class, drawn on the same axes.
fig, ax = plt.subplots(figsize=(8, 5))
for outcome, legend_label in ((0, 'No Diabetes'), (1, 'Diabetes')):
    subset = df.loc[df['diabetes'] == outcome, 'blood_glucose_level']
    sns.kdeplot(subset, label=legend_label, fill=True, ax=ax)
ax.set_title('Distribución de Glucosa en Sangre')
ax.set_xlabel('Nivel de Glucosa')
ax.set_ylabel('Densidad')
ax.legend()
plt.show()

In [None]:
# Relationship between the target and body mass index:
# one density curve per diabetes class, drawn on the same axes.
fig, ax = plt.subplots(figsize=(8, 5))
for outcome, legend_label in ((0, 'No Diabetes'), (1, 'Diabetes')):
    subset = df.loc[df['diabetes'] == outcome, 'bmi']
    sns.kdeplot(subset, label=legend_label, fill=True, ax=ax)
ax.set_title('Distribución de IMC según Diagnóstico de Diabetes')
ax.set_xlabel('IMC')
ax.set_ylabel('Densidad')
ax.legend()
plt.show()

In [None]:
# Two stacked KDE panels (2 rows, 1 column): blood glucose on top, BMI below.
fig, axs = plt.subplots(2, 1, figsize=(10, 10))

# (column, panel title, x-axis label) for each panel, top to bottom.
panels = [
    ('blood_glucose_level', 'Distribución de Glucosa en Sangre', 'Nivel de Glucosa'),
    ('bmi', 'Distribución de IMC según Diagnóstico de Diabetes', 'IMC'),
]
# (target value, legend label) for each density curve, in drawing order.
groups = [(0, 'No Diabetes'), (1, 'Diabetes')]

for ax, (feature, title, xlabel) in zip(axs, panels):
    for outcome, legend_label in groups:
        sns.kdeplot(
            data=df[df['diabetes'] == outcome][feature],
            label=legend_label,
            fill=True,
            ax=ax,
        )
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Densidad')
    ax.legend()

# Adjust spacing between the panels
plt.tight_layout()
plt.show()