In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score

## Stratified K-Fold Cross-Validation
- `Description`: A variation of K-Fold Cross-Validation that ensures each fold has a similar distribution of the target variable as the original dataset.
- `When to Use`: When dealing with imbalanced datasets where some classes are underrepresented. Ensures each fold maintains the target class distribution.

In [None]:
def create_cv_dataset(df, target_col, n_splits=5, random_state=None):
    """
    Function to create a dataset with stratified K-fold assignments.

    The rows are shuffled and then split with StratifiedKFold, using the
    values of `target_col` as the stratification labels. The input DataFrame
    is not modified; a shuffled copy carrying the fold column is returned.

    Parameters:
    df (pd.DataFrame): The full dataset including the target column.
    target_col (str): The name of the target column.
    n_splits (int): The number of folds for cross-validation.
    random_state (int, optional): Seed for the shuffle. With the default
        (None) every call produces a different shuffle.

    Returns:
    pd.DataFrame: A shuffled copy of the dataset with a new column 'kfold'
    for fold assignments.
    """
    # Shuffle first: sample() returns a new DataFrame, so adding the 'kfold'
    # column afterwards does not write into the caller's object.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Create the fold column and initialize with -1 (meaning "unassigned")
    df['kfold'] = -1

    # Extract the target column used for stratification
    y = df[target_col].values

    # Initialize Stratified K-Folds (no extra shuffle; rows are already shuffled)
    kf = StratifiedKFold(n_splits=n_splits)

    # The index was reset above, so the positional indices yielded by split()
    # coincide with the row labels expected by .loc
    for fold, (_, val_idx) in enumerate(kf.split(X=df, y=y)):
        df.loc[val_idx, 'kfold'] = fold

    # Return the new dataframe with fold assignments
    return df


def preprocess_and_train(df, target_col, model, preprocessor=None, n_splits=5):
    """
    Train and score `model` once per fold, using the 'kfold' column to split.

    For every fold number in range(n_splits), the rows whose 'kfold' value
    equals that number form the validation set and all remaining rows form
    the training set. The same `model` object is re-fitted on each fold.

    Parameters:
    df (pd.DataFrame): The dataset with 'kfold' column for fold assignments.
    target_col (str): The name of the target column.
    model (sklearn estimator): The machine learning model to be trained.
    preprocessor (function, optional): Called as preprocessor(X_train, X_val)
        and expected to return the processed (X_train, X_val) pair.
    n_splits (int): The number of folds for cross-validation.

    Returns:
    list: Validation accuracy of each fold, in fold order.
    """
    # Columns that must never reach the model as features
    non_feature_cols = [target_col, 'kfold']
    scores = []

    for fold in range(n_splits):
        print(f"Processing Fold {fold}")

        # Rows tagged with the current fold are held out for validation
        held_out = df.kfold == fold
        train_part = df[~held_out].reset_index(drop=True)
        val_part = df[held_out].reset_index(drop=True)

        # Split each part into features and target
        X_train = train_part.drop(columns=non_feature_cols)
        y_train = train_part[target_col]
        X_val = val_part.drop(columns=non_feature_cols)
        y_val = val_part[target_col]

        # Optional preprocessing step, applied to the feature frames only
        if preprocessor:
            X_train, X_val = preprocessor(X_train, X_val)

        # Fit on the training part, score on the held-out part
        model.fit(X_train, y_train)
        acc = accuracy_score(y_val, model.predict(X_val))
        scores.append(acc)

        print(f"Fold {fold} Accuracy: {acc}")

    return scores


def simple_preprocessor(X_train, X_val):
    """
    Impute missing values with the column mean, then standardize.

    Each step is fitted on the training features only and then applied to
    the validation features, so no validation statistics are used in fitting.

    Parameters:
    X_train (pd.DataFrame): The training features.
    X_val (pd.DataFrame): The validation features.

    Returns:
    tuple: Processed training and validation features.
    """
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler

    # Steps run in order: mean imputation first, scaling second
    steps = (SimpleImputer(strategy='mean'), StandardScaler())
    for step in steps:
        X_train = step.fit_transform(X_train)
        X_val = step.transform(X_val)

    return X_train, X_val

## K-Fold Cross-Validation
- `Description`: Splits the dataset into k folds of approximately equal size. Each fold serves as the validation set once, while the remaining k-1 folds are used for training. The process is repeated k times, and the results are averaged.
- `When to Use`: When you want a more robust estimate of model performance compared to hold-out validation. Useful when you have a moderate amount of data.

In [None]:
def create_kfold_dataset(df, target_col, n_splits=5, random_state=42):
    """
    Function to create a dataset with K-Fold assignments.

    The rows are shuffled and then split with KFold. The input DataFrame is
    not modified; a shuffled copy carrying the fold column is returned.

    Parameters:
    df (pd.DataFrame): The full dataset including the target column.
    target_col (str): The name of the target column. Not used by plain
        K-Fold; kept so the signature matches the other fold builders.
    n_splits (int): The number of folds for cross-validation.
    random_state (int): Random seed for reproducibility.

    Returns:
    pd.DataFrame: A shuffled copy of the dataset with a new column 'kfold'
    for fold assignments.
    """
    # Shuffle first: sample() returns a new DataFrame, so adding the 'kfold'
    # column afterwards does not write into the caller's object.
    df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Create the fold column and initialize with -1 (meaning "unassigned")
    df['kfold'] = -1

    # Initialize KFold (it shuffles again with the same seed, which keeps the
    # fold assignment reproducible for a given random_state)
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # The index was reset above, so the positional indices yielded by split()
    # coincide with the row labels expected by .loc
    for fold, (_, val_idx) in enumerate(kf.split(X=df)):
        df.loc[val_idx, 'kfold'] = fold

    return df


def preprocess_and_train(df, target_col, model, preprocessor=None, n_splits=5):
    """
    Cross-validate `model` over the folds recorded in the 'kfold' column.

    Each fold number in range(n_splits) is used once as the validation set,
    with every other row used for training. The same `model` object is
    re-fitted on each fold and its validation accuracy is recorded.

    Parameters:
    df (pd.DataFrame): The dataset with 'kfold' column for fold assignments.
    target_col (str): The name of the target column.
    model (sklearn estimator): The machine learning model to be trained.
    preprocessor (function, optional): Called as preprocessor(X_train, X_val)
        and expected to return the processed (X_train, X_val) pair.
    n_splits (int): The number of folds for cross-validation.

    Returns:
    list: A list of accuracy scores, one per fold, in fold order.
    """
    from sklearn.metrics import accuracy_score

    def split_features_target(frame):
        # Features are everything except the target and the fold marker
        features = frame.drop([target_col, 'kfold'], axis=1)
        return features, frame[target_col]

    fold_accuracies = []

    for fold in range(n_splits):
        print(f"Processing Fold {fold}")

        # Partition the rows by fold membership
        training_rows = df.loc[df['kfold'] != fold].reset_index(drop=True)
        validation_rows = df.loc[df['kfold'] == fold].reset_index(drop=True)

        X_train, y_train = split_features_target(training_rows)
        X_val, y_val = split_features_target(validation_rows)

        # Optional preprocessing of the feature frames
        if preprocessor:
            X_train, X_val = preprocessor(X_train, X_val)

        # Fit, predict and score this fold
        model.fit(X_train, y_train)
        predictions = model.predict(X_val)
        acc = accuracy_score(y_val, predictions)
        fold_accuracies.append(acc)

        print(f"Fold {fold} Accuracy: {acc}")

    return fold_accuracies


def simple_preprocessor(X_train, X_val):
    """
    Mean-impute and standardize the training and validation features.

    The imputer and the scaler are both fitted on the training features and
    then reused, unchanged, to transform the validation features.

    Parameters:
    X_train (pd.DataFrame): The training features.
    X_val (pd.DataFrame): The validation features.

    Returns:
    tuple: Processed training and validation features.
    """
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler

    # Step 1: fill missing values with the per-column training mean
    mean_imputer = SimpleImputer(strategy='mean')
    train_filled = mean_imputer.fit_transform(X_train)
    val_filled = mean_imputer.transform(X_val)

    # Step 2: standardize using statistics learned from the training data
    standardizer = StandardScaler()
    train_scaled = standardizer.fit_transform(train_filled)
    val_scaled = standardizer.transform(val_filled)

    return train_scaled, val_scaled

## Stratified K-Fold Cross-Validation for Regression Model

In [None]:
def create_folds(data, target_col, n_splits=5, random_state=42):
    """
    Create stratified folds based on target binning.

    The continuous target is cut into equal-width bins and those bin labels
    are used as the stratification classes for StratifiedKFold. The input
    DataFrame is not modified; a shuffled copy carrying the fold column is
    returned.

    Parameters:
    data (pd.DataFrame): The dataset including the target column.
    target_col (str): The name of the target column.
    n_splits (int): Number of folds for cross-validation.
    random_state (int): Seed for random number generator for reproducibility.

    Returns:
    pd.DataFrame: A shuffled copy of the dataset with a new 'kfold' column
    for fold assignments.
    """
    # Shuffle first: sample() returns a new DataFrame, so adding the 'kfold'
    # column afterwards does not write into the caller's object.
    data = data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Create the fold column and initialize with -1 (meaning "unassigned")
    data['kfold'] = -1

    # Calculate the number of bins using Sturges' rule
    num_bins = int(np.floor(1 + np.log2(len(data))))

    # Bin the target variable. The labels are kept in a standalone Series
    # rather than a temporary column, so an existing 'bins' column in the
    # data is neither overwritten nor dropped.
    bins = pd.cut(data[target_col], bins=num_bins, labels=False)

    # Initialize StratifiedKFold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # The index was reset above, so the positional indices yielded by split()
    # coincide with the row labels expected by .loc
    for fold, (_, val_idx) in enumerate(skf.split(X=data, y=bins)):
        data.loc[val_idx, 'kfold'] = fold

    return data