<center><h1  style="color:white; background-color:#000000; border-radius: 0px; padding:25px;"> Rules extraction </h1></center>

This notebook aims to evaluate different classifiers (including rules evaluation) on several learning tasks (classification and regression).

In [None]:
from collections import Counter
import os
os.chdir('../')
import time

import numpy as np
import pandas as pd

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Create the output directory for the fold files and prediction CSVs.
# os.makedirs accepts exist_ok (os.mkdir does not), so re-running this cell is safe.
os.makedirs('reproduce-exp', exist_ok=True)

# Protocol:

In [None]:
from sklearn.metrics import average_precision_score, accuracy_score,roc_auc_score,mean_squared_error, mean_absolute_error

In [None]:
from woodtapper.extract_rules import SirusClassifier,SirusRegressor
from woodtapper.extract_rules.visualization import show_rules
from data.data import load_titatnic_benard_data, load_houses_sales_reg_data

In [None]:
from math import sqrt
from sklearn.model_selection import StratifiedKFold, KFold

In [None]:
def make_folds(X, y, output_path="data_with_folds.csv", stratified=True, n_splits=5, random_state=0):
    """
    Assign every sample to a cross-validation fold and save the result as a CSV file.

    Parameters:
    - X: feature matrix (DataFrame, or anything ``pd.DataFrame`` accepts)
    - y: targets (Series/DataFrame, or anything ``pd.Series`` accepts)
    - output_path: path of the CSV file to write (written without the index)
    - stratified: use StratifiedKFold if True, plain KFold otherwise
    - n_splits: number of folds
    - random_state: seed passed to the (shuffling) splitter

    Returns:
    - DataFrame: a copy of X with an added 'target' column and a 'fold' column
      holding, for each row, the index of the fold in which it is a validation sample
    """
    # Convert to DataFrame / Series if needed
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    if not isinstance(y, (pd.Series, pd.DataFrame)):
        y = pd.Series(y, name="target")

    # Combine features and target (by position, not by index label)
    df = X.copy()
    df["target"] = y.values

    # Initialize splitter
    splitter_cls = StratifiedKFold if stratified else KFold
    kf = splitter_cls(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Assign folds. The splitter yields *positional* indices, so they are written
    # into a plain array instead of going through label-based ``df.loc``, which
    # breaks when X carries a non-default index.
    fold_ids = np.full(len(df), -1, dtype=int)
    for fold, (_, val_idx) in enumerate(kf.split(X, y)):
        fold_ids[val_idx] = fold
    df["fold"] = fold_ids

    # Save
    df.to_csv(output_path, index=False)
    print(f"✅ File saved: {output_path}")
    return df

def cross_validate_from_csv(csv_path_data,csv_path_preds,
                            model, target_col="target", fold_col="fold", is_clf=True):
    """
    Run cross-validation using a CSV with preassigned folds and save the
    out-of-fold predictions to another CSV.

    Parameters:
    - csv_path_data: path to the CSV file holding features, target and fold columns
    - csv_path_preds: path of the CSV file the predictions are written to
    - model: estimator instance exposing ``fit`` and ``predict_proba`` (if is_clf)
      or ``predict`` (otherwise); it is re-fitted in place on every fold
    - target_col: name of target column
    - fold_col: name of fold column (also used as the fold column of the output)
    - is_clf: True to store class probabilities, False to store point predictions

    Returns:
    - DataFrame with one row per sample, ordered fold by fold: the fold column
      followed by ``class_0 ... class_{k-1}`` (classification) or ``pred`` (regression)

    Raises:
    - ValueError: if the data file contains no fold at all
    """
    df = pd.read_csv(csv_path_data)
    folds = sorted(df[fold_col].unique())

    per_fold_preds = []
    for f in folds:
        in_test = df[fold_col] == f
        train_df = df[~in_test]
        test_df = df[in_test]

        X_train = train_df.drop(columns=[target_col, fold_col])
        y_train = train_df[target_col]
        X_test = test_df.drop(columns=[target_col, fold_col])

        # Fit on every fold but the current one
        model.fit(X_train, y_train)

        # Predict on the held-out fold
        if is_clf:
            probas = np.asarray(model.predict_proba(X_test))
            # One column per class, so this also works beyond the binary case
            fold_preds = pd.DataFrame(
                probas, columns=[f"class_{k}" for k in range(probas.shape[1])]
            )
        else:
            fold_preds = pd.DataFrame({"pred": np.asarray(model.predict(X_test)).ravel()})

        # Keep the fold id next to each prediction; built per fold so the fold
        # column keeps its own dtype instead of being cast together with the predictions.
        fold_preds.insert(0, fold_col, f)
        per_fold_preds.append(fold_preds)

    if not per_fold_preds:
        raise ValueError(f"No fold found in column '{fold_col}' of {csv_path_data}")

    preds_df = pd.concat(per_fold_preds, ignore_index=True)
    preds_df.to_csv(csv_path_preds, index=False)
    return preds_df

def compute_metrics_csv(csv_path_data,csv_path_preds,col_preds="class_1",
                        type_learning="clf", target_col="target",fold_col="fold", classification_threshold=0.5):
    """
    Compute per-fold metrics from a data CSV and a predictions CSV.

    Within each fold, rows of the predictions file are matched to rows of the
    data file by position, so both files must list the samples of a fold in
    the same order.

    Parameters:
    - csv_path_data: path to the CSV holding the target and fold columns
    - csv_path_preds: path to the CSV holding the fold column and the predictions
    - col_preds: prediction column to score (a probability for classification)
    - type_learning: 'clf' for classification, anything else for regression
    - target_col: name of target column
    - fold_col: name of fold column (in both files)
    - classification_threshold: probability cut-off used to derive class labels

    Returns:
    - two DataFrames with one row per fold: ('fold', 'accuracy') and
      ('fold', 'roc_auc') for classification, ('fold', 'RMSE') and
      ('fold', 'MAE') for regression

    Raises:
    - ValueError: if the data file has no fold, or a fold has a different number
      of rows in the two files
    """
    df = pd.read_csv(csv_path_data)
    df_preds = pd.read_csv(csv_path_preds)
    folds = sorted(df[fold_col].unique())
    if not folds:
        raise ValueError(f"No fold found in column '{fold_col}' of {csv_path_data}")

    is_clf = type_learning == 'clf'
    if is_clf:
        metric_name, metric_name2 = "accuracy", "roc_auc"
    else:
        # The first regression metric is the square root of the MSE, hence RMSE
        metric_name, metric_name2 = "RMSE", "MAE"

    results_metric1 = []
    results_metric2 = []
    for f in folds:
        y_test = df.loc[df[fold_col] == f, target_col].to_numpy().ravel()
        preds_probas = df_preds.loc[df_preds[fold_col] == f, col_preds].to_numpy().ravel()
        if len(y_test) != len(preds_probas):
            raise ValueError(
                f"Fold {f}: {len(y_test)} targets but {len(preds_probas)} predictions"
            )

        if is_clf:  # Classification
            preds_ = (preds_probas >= classification_threshold).astype(int)
            metric = accuracy_score(y_test, preds_)
            metric2 = roc_auc_score(y_test,preds_probas)
        else:  # Regression
            metric = sqrt(mean_squared_error(y_test, preds_probas))
            metric2 = mean_absolute_error(y_test,preds_probas)

        print(f"Fold {f} - {metric_name}: {metric:.4f}")
        results_metric1.append({"fold": f, metric_name: metric})
        print(f"Fold {f} - {metric_name2}: {metric2:.4f}")
        results_metric2.append({"fold": f, metric_name2: metric2})

    results_df_metric1 = pd.DataFrame(results_metric1)
    results_df_metric2 = pd.DataFrame(results_metric2)
    print("\n=== Overall Results ===")
    print(results_df_metric1)
    print(f"Mean {metric_name}: {results_df_metric1[metric_name].mean():.4f} with std {results_df_metric1[metric_name].std():.4f}")
    print(results_df_metric2)
    print(f"Mean {metric_name2}: {results_df_metric2[metric_name2].mean():.4f} with std {results_df_metric2[metric_name2].std():.4f}")

    return results_df_metric1,results_df_metric2

# Classification:

## Titanic:

### rules: 

In [None]:
# Load the Titanic features and target through the project's data helper.
X_titanic, y_titanic = load_titatnic_benard_data()
#df_titanic = load_titatnic_data()

In [None]:
# Display the Titanic feature matrix.
X_titanic

In [None]:
# Number of target values (one per sample).
len(y_titanic)

In [None]:
# Number of occurrences of each target value.
Counter(y_titanic)

In [None]:
# Share of label 1 among the samples labelled 0 or 1.
# The counts are computed once instead of three times.
titanic_class_counts = Counter(y_titanic)
titanic_class_counts[1] / (titanic_class_counts[0] + titanic_class_counts[1])

In [None]:
## RandomForestClassifier rules extraction
# Fit SIRUS on the whole Titanic data set; `start` / `end` bracket the fit call.
RFSirus = SirusClassifier(
    n_estimators=2000,
    max_depth=2,
    quantile=10,
    p0=0.0,
    max_n_rules=25,
    max_features=6,
    to_not_binarize_colindexes=[1, 3, 4],
    starting_index_one_hot=None,
    bootstrap=True,
    random_state=0,
    splitter="quantile",
)
start = time.time()
RFSirus.fit(X_titanic, y_titanic)
end = time.time()

In [None]:
# Attach feature names to the fitted model, then display its rules
# for class index 1 with the Sex codes shown as labels.
RFSirus.feature_names_in_ = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
]
show_rules(
    RFSirus,
    max_rules=25,
    target_class_index=1,
    value_mappings={"Sex": {0: "male", 1: "female"}},
)

### perfs:

In [None]:
# Column positions routed to the numeric and categorical transformers of the
# ColumnTransformer defined later in this notebook.
# NOTE(review): position 0 is presumably Pclass, going by the feature names set on the model above — confirm.
numeric_features = [1,2,3,4,5]
categorical_features = [0]

In [None]:
def to_str(x):
    """Return ``x`` cast to string type via ``x.astype(str)``."""
    return x.astype(str)
def to_float(x):
    """Return ``x`` cast to float type via ``x.astype(float)``."""
    return x.astype(float)
# Wrap the casting helpers so they can be used as pipeline steps.
fun_tr_str = FunctionTransformer(to_str)
fun_tr_float = FunctionTransformer(to_float)

# Numeric columns: cast to float only.
numeric_transformer = Pipeline(
    steps=[
        ("Transform_float", fun_tr_float),
    ]
)

# Categorical columns: cast to str, then one-hot encode
# (categories unseen during fit are ignored at transform time).
categorical_transformer = Pipeline(
    steps=[
        ("Transform_str", fun_tr_str),
        ("OneHot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numeric block first, categorical block second.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [None]:
# SIRUS classifier used for cross-validation, chained after the preprocessor.
# Column indexes given here refer to the preprocessor's output, not to the raw data.
RFSirus = SirusClassifier(
    n_estimators=1000,
    max_depth=2,
    quantile=10,
    p0=0,
    max_n_rules=25,
    max_features=6,
    to_not_binarize_colindexes=[0, 2, 3],
    starting_index_one_hot=5,
    random_state=0,
    splitter="quantile",
)
RFSirus_pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("rf-sirus", RFSirus),
    ]
)

In [None]:
# Write the stratified 5-fold assignment of the Titanic data to disk.
make_folds(X=X_titanic, y=y_titanic, output_path="reproduce-exp/titanic-folds.csv", 
           stratified=True, n_splits=5, random_state=0)

In [None]:
# Fit the pipeline on each training split and save the held-out class probabilities.
cross_validate_from_csv(csv_path_data="reproduce-exp/titanic-folds.csv",
                        csv_path_preds="reproduce-exp/py-predictions-titanic.csv",
                        model=RFSirus_pipeline,target_col="target", fold_col="fold", is_clf=True)

In [None]:
# Per-fold classification metrics computed from the saved predictions
# (res1: accuracy at the 0.5 threshold, res2: ROC AUC).
res1,res2 = compute_metrics_csv(csv_path_data="reproduce-exp/titanic-folds.csv",
                    csv_path_preds="reproduce-exp/py-predictions-titanic.csv",
                    type_learning="clf",col_preds="class_1",
                    target_col="target",fold_col="fold", classification_threshold=0.5)

In [None]:
# Mean of the first metric across folds; the fold index column is dropped
# because averaging fold numbers is meaningless.
res1.drop(columns="fold").mean()

In [None]:
# Mean of the second metric across folds; the fold index column is dropped
# because averaging fold numbers is meaningless.
res2.drop(columns="fold").mean()


## House sales:

In [None]:
# Load the house-sales regression features and target through the project's data helper.
X_house_sales,y_house_sales = load_houses_sales_reg_data()

In [None]:
## RandomForestRegressor rules extraction
# Fit SIRUS on the whole house-sales data set.
RFSirus = SirusRegressor(
    n_estimators=1000,
    max_depth=2,
    quantile=10,
    p0=0,
    max_n_rules=25,
    max_features=15,
    to_not_binarize_colindexes=None,
    starting_index_one_hot=None,
    random_state=0,
    splitter="quantile",
)
RFSirus.fit(X_house_sales, y_house_sales)
# Parameter note kept from the original cell: n_estimators=1000, max_features=15, random_state=19

In [None]:
# Display up to 20 rules of the fitted regressor.
show_rules(RFSirus,max_rules=20,is_regression=True)

### perfs

In [None]:
# Write the (non-stratified) 5-fold assignment of the house-sales data to disk.
make_folds(X=X_house_sales, y=y_house_sales, output_path="reproduce-exp/house_sales-folds.csv", 
           stratified=False, n_splits=5, random_state=0)

In [None]:
# Fresh, unfitted SIRUS regressor used for the cross-validation below.
RFSirus = SirusRegressor(
    n_estimators=1000,
    max_depth=2,
    quantile=10,
    p0=0,
    max_n_rules=25,
    max_features=15,
    to_not_binarize_colindexes=None,
    starting_index_one_hot=None,
    random_state=0,
    splitter="quantile",
)

In [None]:
# Fit the regressor on each training split and save the held-out predictions.
cross_validate_from_csv(csv_path_data="reproduce-exp/house_sales-folds.csv",
                        csv_path_preds="reproduce-exp/py-predictions-house_sales.csv",
                        model=RFSirus,target_col="target", fold_col="fold", is_clf=False)

In [None]:
# Per-fold regression metrics computed from the saved predictions
# (classification_threshold is not used on the regression path).
res1,res2 = compute_metrics_csv(csv_path_data="reproduce-exp/house_sales-folds.csv",
                    csv_path_preds="reproduce-exp/py-predictions-house_sales.csv",
                    type_learning="reg",col_preds="pred", 
                    target_col="target",fold_col="fold",classification_threshold=0.5)

In [None]:
# Mean of the first metric across folds; the fold index column is dropped
# because averaging fold numbers is meaningless.
res1.drop(columns="fold").mean()

In [None]:
# Mean of the second metric across folds; the fold index column is dropped
# because averaging fold numbers is meaningless.
res2.drop(columns="fold").mean()