In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OrdinalEncoder, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer, SimpleImputer
import category_encoders as ce
from sklearn import set_config

# =====================================================================================
# PART 1: DEFINE THE COMPLETE PREPROCESSING PIPELINE
# =====================================================================================

# -------------------------------------------------
# 1.1. Custom Transformation Functions
# -------------------------------------------------

def create_title_feature(df: pd.DataFrame) -> pd.DataFrame:
    """
    Extract a standardized passenger title from the 'Name' column.

    Text in parentheses and in double quotes is removed from each name, then
    the token between the first comma and the following period is taken as
    the title (e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``).

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a 'Name' column. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        Single-column frame named 'Titel' with the same index as ``df``.
        Values are 'Mr', 'Miss', 'Mrs', 'Master' or 'Rare'. 'Mlle' and 'Ms'
        are folded into 'Miss', 'Mme' into 'Mrs'; every other title, and any
        name that is missing or has no comma, becomes 'Rare'.
    """
    # Vectorized .str methods propagate NaN instead of raising, so a missing
    # name ends up as 'Rare' rather than crashing inside re.sub.
    names = df['Name'].str.replace(r'\(([^)]*)\)', '', regex=True).str.strip()
    names = names.str.replace(r'\"[^\"]*\"', '', regex=True)

    # "Surname, Title. Given names" -> "Title"
    after_comma = names.str.split(',').str.get(1).str.strip()
    title = after_comma.str.split('.').str.get(0).str.strip()

    title = title.replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

    common_titles = ['Mr', 'Miss', 'Mrs', 'Master']
    # isin() is False for NaN, so missing titles are also mapped to 'Rare'.
    title = title.where(title.isin(common_titles), 'Rare')
    return title.to_frame(name='Titel')

def extract_cabin_letter(df: pd.DataFrame) -> pd.DataFrame:
    """
    Reduce each 'Cabin' value to its first character.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing a 'Cabin' column. The frame is not modified.

    Returns
    -------
    pd.DataFrame
        Single-column frame named 'Cabin' (the name is kept so the output
        matches the original script's final column). Missing cabins are
        mapped to the placeholder 'n'.
    """
    cabin = df['Cabin']
    letters = cabin.astype(str).str[0]
    # Set the placeholder explicitly instead of relying on str(nan)[0] == 'n'.
    # NOTE(review): newer pandas releases are documented to keep missing values
    # missing through astype(str); this line gives 'n' either way.
    letters = letters.where(cabin.notna(), 'n')
    return letters.to_frame(name='Cabin')

# -------------------------------------------------
# 1.2. ColumnTransformer for Feature-Specific Steps
# -------------------------------------------------

def _ordinal_unknown_as_minus_one():
    """Return a fresh OrdinalEncoder that encodes categories unseen at fit time as -1."""
    return OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)


# 'Name' -> title string ('Titel') -> integer code
_title_pipeline = Pipeline(steps=[
    ('create', FunctionTransformer(create_title_feature, feature_names_out=lambda self, input_features: ['Titel'])),
    ('encode', _ordinal_unknown_as_minus_one()),
])

# 'Embarked': fill gaps with the literal category 'missing', then integer-encode
_embarked_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encode', _ordinal_unknown_as_minus_one()),
])

# 'Cabin' -> first character -> integer code
_cabin_pipeline = Pipeline(steps=[
    ('extract', FunctionTransformer(extract_cabin_letter, feature_names_out=lambda self, input_features: ['Cabin'])),
    ('encode', _ordinal_unknown_as_minus_one()),
])

preprocessor = ColumnTransformer(
    transformers=[
        # Explicit category order: 'male' -> 0, 'female' -> 1
        ('sex_oe', OrdinalEncoder(categories=[['male', 'female']]), ['Sex']),
        ('title_creator', _title_pipeline, ['Name']),
        ('embarked_impute_encode', _embarked_pipeline, ['Embarked']),
        ('cabin_extract_encode', _cabin_pipeline, ['Cabin']),
        # Target-based encoding of 'Ticket'; needs y at fit time
        ('ticket_loo', ce.LeaveOneOutEncoder(), ['Ticket']),
    ],
    # Columns not named above are forwarded unchanged
    remainder='passthrough',
    # Keep plain column names (no '<transformer>__' prefix) in get_feature_names_out()
    verbose_feature_names_out=False,
)

# -------------------------------------------------
# 1.3. Final Pipeline with Imputer
# -------------------------------------------------

# Step order matters: the imputer runs on the fully numeric output of the
# preprocessor. min_value=1 is the lower bound the imputer applies to the
# values it fills in.
_pipeline_steps = [
    ('preprocessor', preprocessor),
    ('imputer', IterativeImputer(min_value=1, random_state=0)),
]
titanic_pipeline = Pipeline(steps=_pipeline_steps)

# Render estimators as an HTML diagram, then show the pipeline structure
set_config(display='diagram')
print("✅ Pipeline defined successfully. Here is the structure:")
display(titanic_pipeline)


# =====================================================================================
# PART 2: VERIFY THE PIPELINE AGAINST THE ORIGINAL CODE
# =====================================================================================
# Section banner, printed once.
print("\n" + "="*80)
print("Starting verification process...")

# -------------------------------------------------
# 2.1. Run the ORIGINAL preprocessing steps
# -------------------------------------------------
print("\nStep 1: Running original notebook script...")

# Baseline: load the training CSV and drop the target and the passenger ID by hand.
# The steps below mutate X_original column by column, so their order matters.
df_orig = pd.read_csv(r"C:\Users\basde\OneDrive\Documenten\GitHub\Titanic\train.csv")
X_original = df_orig.drop(["Survived", "PassengerId"], axis=1)
y_original = df_orig["Survived"]

# --- Start of notebook code replication ---
# Sex: explicit category order, so 'male' -> 0 and 'female' -> 1.
category_order = ['male','female']
ordinal_encoder = OrdinalEncoder(categories=[category_order])
X_original['Sex'] = ordinal_encoder.fit_transform(X_original[['Sex']])
# NOTE(review): inside this raw string the doubled backslashes are escaped
# backslashes, so the pattern matches backslash-delimited text rather than
# parenthesized text (compare the pattern used in create_title_feature).
X_original['Name'] = X_original['Name'].apply(lambda x: re.sub(r'\\([^)]*\\)', '', x).strip())
# Remove double-quoted segments from the name.
X_original['Name'] = X_original['Name'].str.replace(r'\"[^\"]*\"', '', regex=True)
# Text before the first comma; this helper column is dropped again below.
X_original['LastName'] = X_original['Name'].str.split(',').str.get(0).str.strip()
# Title = text between the first comma and the following period.
name_part = X_original['Name'].str.split(',').str.get(1).str.strip()
X_original['Titel'] = name_part.str.split('.').str.get(0).str.strip()
# Fold title variants together, then collapse everything uncommon into 'Rare'.
title_mapping = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
X_original['Titel'] = X_original['Titel'].replace(title_mapping)
common_titles = ['Mr', 'Miss', 'Mrs', 'Master']
X_original['Titel'] = X_original['Titel'].apply(lambda x: x if x in common_titles else 'Rare')
X_original.drop(['Name', 'LastName'], axis=1, inplace=True)
# Embarked: missing values become the literal category 'missing'.
X_original['Embarked'] = X_original['Embarked'].fillna('missing')
# Cabin: keep only the first character of the string form of each value.
X_original['Cabin'] = X_original['Cabin'].astype(str).str[0]
# One LabelEncoder per categorical column turns the strings into integer codes.
le_titel = LabelEncoder()
le_cabin = LabelEncoder()
le_embarked = LabelEncoder()
X_original['Titel'] = le_titel.fit_transform(X_original['Titel'])
X_original['Cabin'] = le_cabin.fit_transform(X_original['Cabin'])
X_original['Embarked'] = le_embarked.fit_transform(X_original['Embarked'])
# Ticket: target-based encoding; y is passed to fit_transform together with X.
loo_encoder = ce.LeaveOneOutEncoder(cols=['Ticket'])
X_original = loo_encoder.fit_transform(X_original, y_original)

# Column order fed to the imputer and used for the final frame.
# 'PassengerId' is not listed because it was dropped when X_original was built.
final_cols_original = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Titel']
X_original = X_original[final_cols_original]

# Same imputer settings as the 'imputer' step of titanic_pipeline.
imputer_original = IterativeImputer(min_value=1, random_state=0)
original_processed_array = imputer_original.fit_transform(X_original)
original_processed_df = pd.DataFrame(original_processed_array, columns=final_cols_original)
# --- End of notebook code replication ---
print("...original script finished.")

# -------------------------------------------------
# 2.2. Run the new scikit-learn PIPELINE
# -------------------------------------------------
print("\nStep 2: Running new scikit-learn pipeline...")

# Reload the raw training data. The target and 'PassengerId' are removed here,
# before the data reaches the pipeline (the pipeline itself does not drop them).
df_pipeline_input = pd.read_csv(r"C:\Users\basde\OneDrive\Documenten\GitHub\Titanic\train.csv")
y_pipeline = df_pipeline_input["Survived"]
X_pipeline = df_pipeline_input.drop(columns=["Survived", "PassengerId"])

# fit_transform receives y so the target-based 'Ticket' encoder can be fitted
pipeline_processed_array = titanic_pipeline.fit_transform(X_pipeline, y_pipeline)
pipeline_cols = titanic_pipeline['preprocessor'].get_feature_names_out()
pipeline_processed_df = pd.DataFrame(data=pipeline_processed_array, columns=pipeline_cols)
print("...pipeline finished.")

# -------------------------------------------------
# 2.3. VERIFY the outputs are identical
# -------------------------------------------------
print("\nStep 3: Verifying outputs...")

# The comparison is positional, so put the pipeline's columns in the baseline's order
pipeline_processed_df = pipeline_processed_df.loc[:, final_cols_original]

baseline_values = original_processed_df.to_numpy()
pipeline_values = pipeline_processed_df.to_numpy()

# Report the outcome instead of letting a mismatch abort the cell
try:
    np.testing.assert_allclose(baseline_values, pipeline_values, rtol=1e-5, atol=1e-8)
except AssertionError as mismatch:
    print("\n❌ FAILURE: The outputs do not match.")
    print(mismatch)
else:
    print("\n✅ SUCCESS: The pipeline's output is numerically identical to the original script's output.")

✅ Pipeline defined successfully. Here is the structure:


0,1,2
,steps,"[('preprocessor', ...), ('imputer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('sex_oe', ...), ('title_creator', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['male', 'female']]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,func,<function cre...0028A83964CC0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function <la...0028AC31AD580>
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,func,<function ext...0028AC31AD300>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function <la...0028A83CFF240>
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_unknown,'value'
,handle_missing,'value'
,random_state,
,sigma,

0,1,2
,estimator,
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False



Starting verification process...

Starting verification process...

Step 1: Running original notebook script...
...original script finished.

Step 2: Running new scikit-learn pipeline...
...pipeline finished.

Step 3: Verifying outputs...

✅ SUCCESS: The pipeline's output is numerically identical to the original script's output.


In [2]:
# Locations of the training and test CSV files
train_csv_path = r"C:\Users\basde\OneDrive\Documenten\GitHub\Titanic\train.csv"
test_csv_path = r"C:\Users\basde\OneDrive\Documenten\GitHub\Titanic\test.csv"

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [3]:
# Keep the test IDs (a Series) aside; the column is dropped from the features later
test_passenger_id = test.loc[:, 'PassengerId']

In [4]:
# Target first, then the feature frame without the target and the ID column
y_train = train["Survived"]
X_train = train.drop(columns=["Survived", "PassengerId"])


In [5]:
# Test features: everything except the ID column
X_test = test.drop(columns="PassengerId")

In [6]:
# Fit the preprocessing steps and the imputer on the training data; y_train is
# passed along because the pipeline contains a target-based encoder for 'Ticket'.
# As the cell's last expression, the fitted pipeline is also displayed.
titanic_pipeline.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('imputer', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('sex_oe', ...), ('title_creator', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,False
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,"[['male', 'female']]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'error'
,unknown_value,
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,func,<function cre...0028A83964CC0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function <la...0028AC31AD580>
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'missing'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,func,<function ext...0028AC31AD300>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,<function <la...0028A83CFF240>
,kw_args,
,inv_kw_args,

0,1,2
,categories,'auto'
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,verbose,0
,cols,
,drop_invariant,False
,return_df,True
,handle_unknown,'value'
,handle_missing,'value'
,random_state,
,sigma,

0,1,2
,estimator,
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False


In [7]:
# Apply the already-fitted pipeline to the test features (no refit), then
# rebuild a labelled frame from the resulting array
X_test_processed_array = titanic_pipeline.transform(X_test)
pipeline_cols = titanic_pipeline['preprocessor'].get_feature_names_out()
X_test_processed_df = pd.DataFrame(data=X_test_processed_array, columns=pipeline_cols)

In [8]:
# Encode the training features with fit_transform(X, y), not transform(X).
# The pipeline contains a LeaveOneOutEncoder on 'Ticket'. It only leaves each
# row's own target out when y is supplied, which is what fit_transform does
# (the verification in Part 2 uses the same call). A plain transform() encodes
# every training row with a ticket mean that includes that row's own 'Survived'
# value, leaking the target into the features used for model selection.
# Refitting on the same data with the fixed imputer seed should reproduce the
# state from the earlier fit, so the test-set transform stays consistent.
X_train_processed_array = titanic_pipeline.fit_transform(X_train, y_train)
pipeline_cols = titanic_pipeline.named_steps['preprocessor'].get_feature_names_out()
X_train_processed_df = pd.DataFrame(X_train_processed_array, columns=pipeline_cols)

In [9]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from catboost import CatBoostClassifier
# Ensemble and Boosting models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb

# --- Model Evaluation Tools ---
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV

In [10]:
# Candidate classifiers, keyed by display name, all with default hyperparameters
# unless stated. Every estimator that accepts a seed gets random_state=42 (the
# value already used for the MLP) so the comparison is reproducible between
# runs; GaussianNB and KNeighborsClassifier take no seed, and SVC is left as
# it was.
models = {
    "Logistic Regression": LogisticRegression(solver='liblinear', random_state=42),
    "Ridge Classifier": RidgeClassifier(random_state=42),
    "Gaussian Naive Bayes": GaussianNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "SVC": SVC(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": xgb.XGBClassifier(eval_metric='logloss', random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42),
    "CatBoost": cb.CatBoostClassifier(verbose=0, random_state=42),
    "MLP Classifier": MLPClassifier(max_iter=500, random_state=42)
}

In [11]:
# Per-model fold scores and the matching model names, in evaluation order
results, names = [], []

# One splitter for all models: with a fixed seed it yields the same stratified
# folds every time, so every model is scored on identical splits.
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

print("Evaluating models...")
for name, model in models.items():
    # Accuracy on each of the 10 folds
    cv_results = cross_val_score(model, X_train_processed_df, y_train, cv=cv, scoring='accuracy')

    results.append(cv_results)
    names.append(name)

    print(f"{name}: {cv_results.mean():.4f} Accuracy with a std of {cv_results.std():.4f}")




In [12]:
# # --- Present Results in a DataFrame ---
# # Create a DataFrame to hold the results for easy comparison
# results_df = pd.DataFrame({
#     'Model': names,
#     'Mean Accuracy': [res.mean() for res in results],
#     'Std Dev': [res.std() for res in results]
# })

# # Sort the models by mean accuracy in descending order
# sorted_results_df = results_df.sort_values(by='Mean Accuracy', ascending=False).reset_index(drop=True)

# print("\n--- Model Performance Comparison ---")
# print(sorted_results_df)

In [14]:
# --- 1. Define the Parameter Grid ---
# GridSearchCV tests every combination of the listed values, so keep the grid
# small: 3 x 3 x 3 = 27 candidates here.
param_grid = {
    'depth': [8, 10, 12],
    'learning_rate': [0.03, 0.05, 0.07],
    'iterations': [250, 300, 350]
}

# --- 2. Set Up GridSearchCV ---
cat_model = CatBoostClassifier(verbose=0, random_state=42)

# Stratified 10-fold CV with a fixed seed so the folds are reproducible
cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

grid_search = GridSearchCV(
    estimator=cat_model,
    param_grid=param_grid,
    cv=cv_strategy,
    scoring='accuracy',
    verbose=1,
    # Needed for the train/validation gap below: without it cv_results_ has no
    # 'mean_train_score' entry and the lookup raises KeyError.
    return_train_score=True,
    # n_jobs=5,  # optional: run fits in parallel on 5 workers
)

# --- 3. Run the Search ---
print("Starting GridSearchCV...")
grid_search.fit(X_train_processed_df, y_train)

# --- 4. Display the Best Results and the Gap ---
print("\n--- Grid Search Complete ---")

# Mean validation score of the best parameter combination
best_cv_score = grid_search.best_score_

# Position of that combination in the cv_results_ arrays
best_model_index = grid_search.best_index_

# Mean training score of the same combination
best_train_score = grid_search.cv_results_['mean_train_score'][best_model_index]

print(f"Best CV (Validation) Score: {best_cv_score:.4f}")
print(f"Training Score for Best Model: {best_train_score:.4f}")
print(f"Overfitting Gap: {(best_train_score - best_cv_score):.4f}")
print("\nBest Hyperparameters Found:")
print(grid_search.best_params_)

Starting GridSearchCV...
Fitting 10 folds for each of 27 candidates, totalling 270 fits


ValueError: 
All the 270 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\catboost\core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\catboost\core.py", line 2321, in _prepare_train_params
    _check_train_params(params)
  File "_catboost.pyx", line 6601, in _catboost._check_train_params
  File "_catboost.pyx", line 6623, in _catboost._check_train_params
_catboost.CatBoostError: catboost/private/libs/options/plain_options_helper.cpp:512: Unknown option {catboost__learning_rate} with value "0.03"

--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\catboost\core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\catboost\core.py", line 2321, in _prepare_train_params
    _check_train_params(params)
  File "_catboost.pyx", line 6601, in _catboost._check_train_params
  File "_catboost.pyx", line 6623, in _catboost._check_train_params
_catboost.CatBoostError: catboost/private/libs/options/plain_options_helper.cpp:512: Unknown option {catboost__learning_rate} with value "0.05"

--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\catboost\core.py", line 2395, in _fit
    train_params = self._prepare_train_params(
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\basde\AppData\Local\Programs\Python\Python311\Lib\site-packages\catboost\core.py", line 2321, in _prepare_train_params
    _check_train_params(params)
  File "_catboost.pyx", line 6601, in _catboost._check_train_params
  File "_catboost.pyx", line 6623, in _catboost._check_train_params
_catboost.CatBoostError: catboost/private/libs/options/plain_options_helper.cpp:512: Unknown option {catboost__learning_rate} with value "0.07"


In [None]:
# Take the estimator GridSearchCV exposes for the best parameter combination.
# NOTE(review): this attribute only exists after grid_search.fit() has completed
# successfully (refit is left at its default in the search above).
final_model = grid_search.best_estimator_

In [None]:
print("\n--- Making predictions on the test data ---")
# Predict on the test features that went through the fitted preprocessing pipeline
test_predictions = final_model.predict(X_test_processed_df)

In [None]:
# Build the submission table. test_passenger_id is already the 'PassengerId'
# Series taken from the test set, so it is used directly; indexing it again
# with the label "PassengerId" would raise a KeyError.
submission_df = pd.DataFrame({
    "PassengerId": test_passenger_id,
    "Survived": test_predictions
})

In [None]:
# Write the submission to the current working directory, without the index column
submission_df.to_csv("submission.csv", index=False)

In [None]:
import numpy as np
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.datasets import make_classification

# Ensemble and Boosting models
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import RidgeClassifier
import lightgbm as lgb


# --- Shared CV strategy ---
# Stratified 10-fold split with a fixed seed: every model below is scored on
# the same folds.
cv_strategy = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
# --- 1. Gradient Boosting ---
model_gb = GradientBoostingClassifier(random_state=42)

# Accuracy per fold on the shared CV splits
scores_gb = cross_val_score(
    estimator=model_gb,
    X=X_train_processed_df,
    y=y_train,
    cv=cv_strategy,
    scoring='accuracy',
)

# Summary: mean and spread over the folds
print("--- Gradient Boosting ---")
print(f"Accuracy: {scores_gb.mean():.4f}")
print(f"Standard Deviation: {scores_gb.std():.4f}")

In [None]:
# --- 2. AdaBoost ---
model_ada = AdaBoostClassifier(random_state=42)

# Accuracy per fold on the shared CV splits
scores_ada = cross_val_score(
    estimator=model_ada,
    X=X_train_processed_df,
    y=y_train,
    cv=cv_strategy,
    scoring='accuracy',
)

# Summary: mean and spread over the folds
print("--- AdaBoost ---")
print(f"Accuracy: {scores_ada.mean():.4f}")
print(f"Standard Deviation: {scores_ada.std():.4f}")

In [None]:
# --- 3. Ridge Classifier ---
model_ridge = RidgeClassifier(random_state=42)

# Accuracy per fold on the shared CV splits
scores_ridge = cross_val_score(
    estimator=model_ridge,
    X=X_train_processed_df,
    y=y_train,
    cv=cv_strategy,
    scoring='accuracy',
)

# Summary: mean and spread over the folds
print("--- Ridge Classifier ---")
print(f"Accuracy: {scores_ridge.mean():.4f}")
print(f"Standard Deviation: {scores_ridge.std():.4f}")

In [None]:
# --- 4. LightGBM ---
model_lgbm = lgb.LGBMClassifier(random_state=42)

# Accuracy per fold on the shared CV splits
scores_lgbm = cross_val_score(
    estimator=model_lgbm,
    X=X_train_processed_df,
    y=y_train,
    cv=cv_strategy,
    scoring='accuracy',
)

# Summary: mean and spread over the folds
print("--- LightGBM ---")
print(f"Accuracy: {scores_lgbm.mean():.4f}")
print(f"Standard Deviation: {scores_lgbm.std():.4f}")