In [3]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTENC

In [4]:
def apply_smote(X, y, random_state=42):
    """
    Applies SMOTE to balance the dataset.
    
    Args:
        X (pd.DataFrame): Features
        y (pd.Series or array): Target
        random_state (int): Seed for reproducibility
        
    Returns:
        X_resampled (pd.DataFrame), y_resampled (pd.Series)
    """
    
    sm = SMOTE(random_state=random_state)    # sampling_strategy=0.5)
    
    X_res, y_res = sm.fit_resample(X, y.values.ravel())
    X_resampled = pd.DataFrame(X_res, columns=X.columns)
    y_resampled = pd.Series(y_res, name="target")
    
    return X_resampled, y_resampled


def smotenc(X, y, categorical_features, random_state=42):
    """
    Applies SMOTENC to balance the dataset with categorical features.

    Args:
        X (pd.DataFrame): Features
        y (pd.Series or array): Target
        categorical_features (list): List of column indices (not names) that are categorical
        random_state (int): Seed for reproducibility

    Returns:
        X_resampled (pd.DataFrame), y_resampled (pd.Series)
    """
    sm = SMOTENC(categorical_features=categorical_features, random_state=random_state)

    X_res, y_res = sm.fit_resample(X.values, y.values.ravel())

    X_resampled = pd.DataFrame(X_res, columns=X.columns)
    y_resampled = pd.Series(y_res, name="target")

    return X_resampled, y_resampled

In [None]:
import pandas as pd
import os

os.chdir("/home/aac/Anyone/FinalProject/CreditRiskAnalysisProject/")

X_train = pd.read_csv('data/processed/X_train_balanced.csv')

print(len(X_train))

print("¿Hay NaNs en X_train BALANCED?")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(X_train.isnull().sum().to_string())

In [None]:
y_train_balanced = pd.read_csv('data/processed/y_train_balanced.csv')

print(y_train_balanced.value_counts())

In [None]:
X_train_balanced = pd.read_csv('data/processed/X_train_balanced.csv')

print(X_train_balanced.shape)

In [None]:
def eda_by_segment(df_segmented):
    
    # Ensure the segment column exists
    if 'kmeans_segment' not in df_segmented.columns:
        raise ValueError("The DataFrame must contain a 'kmeans_segment' column.")
    
    # Identify numeric columns for statistics and plotting (excluding the segment column)
    # numeric_cols = df_segmented.select_dtypes(include='number').drop(columns=['kmeans_segment'], errors='ignore').columns

    segments = sorted(df_segmented['kmeans_segment'].unique())

    # Loop through each segment and perform EDA
    for segment_label in segments:
        segment_data = df_segmented[df_segmented['kmeans_segment'] == segment_label]
        
        print(f"\n=== EDA for Segment '{segment_label}' ===")
        print(f"Number of records: {segment_data.shape[0]}")
        print("\nDescriptive Statistics:")
        print(segment_data.describe())
        
        # Correlation heatmap
        plt.figure(figsize=(10, 6))
        sns.heatmap(segment_data.corr(), annot=True, cmap='coolwarm', fmt=".2f")
        plt.title(f'Correlation Heatmap - Segment {segment_label}')
        plt.tight_layout()
        plt.show()

        # Distribution plots for each numeric variable
        for col in df_segmented.columns:
            plt.figure(figsize=(6, 4))
            sns.histplot(segment_data[col], kde=True, bins=30)
            plt.title(f'Distribution of {col} - Segment {segment_label}')
            plt.xlabel(col)
            plt.ylabel('Frequency')
            plt.tight_layout()
            plt.show()
    
    return

# --------

In [5]:
X_train = pd.read_csv('X_train_balanced.csv')
X_val = pd.read_csv('X_val_p.csv')
y_train = pd.read_csv('y_train_balanced.csv')
y_val = pd.read_csv('y_val.csv')

In [6]:
y_train.value_counts() #normalize=True)

TARGET_LABEL_BAD=1
0                     29524
1                     29524
Name: count, dtype: int64

In [9]:
smote = SMOTE(random_state=42, sampling_strategy=0.5)

# Aplicar SMOTE solo a los datos de entrenamiento
xtrain_balanceado, ytrain_balanceado = smote.fit_resample(X_train, y_train)

print("Dimensiones originales de ytrain:")
print(y_train.value_counts())
print("\nDimensiones de ytrain después de SMOTE:")
print(ytrain_balanceado.value_counts())

ValueError: The specified ratio required to remove samples from the minority class while trying to generate new samples. Please increase the ratio.

In [None]:
import pandas as pd
import os

os.chdir("/home/aac/Anyone/FinalProject/CreditRiskAnalysisProject/")

X_train = pd.read_csv('data/processed/interim/X_train_X_train_unbalanced.csv')
X_val = pd.read_csv('data/processed/interim/X_val_X_val_unbalanced.csv')

print("¿Hay NaNs en X_train?")
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(X_train.isnull().sum().to_string())
    print("\n\n","-"*20)
    print("-- Now X_val --")
    print("\n")
    print(X_val.isnull().sum().to_string())
