In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PowerTransformer, MinMaxScaler, RobustScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import PowerTransformer, RobustScaler

from sklearn.compose import ColumnTransformer
import pandas as pd
from sklearn.compose import make_column_selector
from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames from transform /
# fit_transform, so column names are available to the steps defined below.
set_config(transform_output="pandas")

class SentenceGroupScaler(BaseEstimator, TransformerMixin):
    """Min-max scale selected columns independently inside each group of rows.

    Rows are grouped by ``group_columns`` (by default one group per
    ``(language, sentence)`` pair). Inside every group each column listed in
    ``features_to_scale`` is rescaled by a freshly fitted ``MinMaxScaler``.
    The transformer is stateless: ``fit`` learns nothing and the scaling
    statistics always come from the frame handed to ``transform``.

    Parameters
    ----------
    features_to_scale : list of str
        Columns to rescale. They are cast to ``float64`` in the output.
    group_columns : list of str, default=['language', 'sentence']
        Columns whose value combinations define the groups.
    """

    def __init__(self, features_to_scale, group_columns=['language', 'sentence']):
        # Constructor arguments are stored untouched and never mutated.
        self.features_to_scale = features_to_scale
        self.group_columns = group_columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the scaler keeps no fitted state."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the configured columns scaled per group.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain both ``features_to_scale`` and ``group_columns``.

        Returns
        -------
        pandas.DataFrame
            Same index and columns as ``X``. Rows whose group key is missing
            are not part of any group and keep their (float-cast) values.
        """
        X = X.copy()
        scaled = X[self.features_to_scale].astype(np.float64).to_numpy(copy=True)

        # Rows are addressed by position rather than by index label: with a
        # non-unique index, label-based ``.loc`` assignment would also select
        # rows that belong to other groups.
        for positions in X.groupby(self.group_columns).indices.values():
            # np.asarray: the scaler may hand back a DataFrame when
            # scikit-learn is globally configured for pandas output.
            scaled[positions] = np.asarray(
                MinMaxScaler().fit_transform(scaled[positions])
            )

        X[self.features_to_scale] = scaled
        return X

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)``; there is nothing to fit."""
        return self.transform(X)

    def set_output(self, *, transform=None):
        """Store the requested output container and return ``self``.

        The value is only recorded on ``_output_config``; ``transform``
        itself does not consult it.
        """
        self._output_config = transform
        return self
    
def create_feature_pipeline(sentence_scaled_features):
    """Assemble the preprocessing pipeline.

    The pipeline has a single step, ``'sentence_scale'``, which applies
    ``SentenceGroupScaler`` to ``sentence_scaled_features``. No global
    scaling step is enabled.
    """
    sentence_scaler = SentenceGroupScaler(features_to_scale=sentence_scaled_features)
    steps = [('sentence_scale', sentence_scaler)]
    return Pipeline(steps)

# Columns that are min-max scaled within each sentence
# ('subtree_size' is a currently disabled candidate).
sentence_scaled_features = ['eccentricity']

# Assemble the preprocessing pipeline and request DataFrame output from it.
feature_pipeline = create_feature_pipeline(
    sentence_scaled_features=sentence_scaled_features,
)
feature_pipeline.set_output(transform="pandas")

In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.base import clone
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
import optuna
from functools import partial
import lightgbm as lgb
from xgboost import XGBClassifier

def one_root_f1_scorer(estimator, X_with_groups_and_features, y_true, feature_columns_for_model):
    """F1 score when exactly one row per sentence is predicted as the root.

    For every ``('language', 'sentence')`` group the row with the highest
    positive-class probability is predicted as 1 and all other rows as 0
    (ties go to the first row). The resulting labels are compared with
    ``y_true`` via ``f1_score``.

    Parameters
    ----------
    estimator : object
        Fitted classifier exposing ``predict_proba``; column 1 is read as
        the positive-class probability.
    X_with_groups_and_features : pandas.DataFrame
        Must hold the group columns ``'language'`` and ``'sentence'`` as well
        as every column in ``feature_columns_for_model``.
    y_true : array-like
        True labels, in the same row order as the frame.
    feature_columns_for_model : sequence of str
        Columns passed to ``estimator.predict_proba``.

    Returns
    -------
    float
        The F1 score, or ``0.0`` when ``predict_proba`` raises.

    Raises
    ------
    ValueError
        If ``X_with_groups_and_features`` is not a DataFrame.
    """
    if not isinstance(X_with_groups_and_features, pd.DataFrame):
        raise ValueError("X_with_groups_and_features must be a pandas DataFrame for this scorer version.")

    # Make predictions using only the specified feature columns
    X_for_predict = X_with_groups_and_features[feature_columns_for_model]
    try:
        proba_values = estimator.predict_proba(X_for_predict)[:, 1]
    except Exception as e:
        print(f"Error during predict_proba in scorer: {e}. Columns expected: {feature_columns_for_model}, Columns in X_for_predict: {X_for_predict.columns.tolist() if isinstance(X_for_predict, pd.DataFrame) else 'N/A (NumPy)'}")
        return 0.0 # Return a score that indicates failure

    group_cols = ['language', 'sentence']

    # Work on a positionally indexed copy of the group columns: with a
    # non-unique input index, label-based assignment would flag several rows
    # per sentence as root.
    scored = X_with_groups_and_features[group_cols].reset_index(drop=True)
    scored['__proba__'] = np.asarray(proba_values)

    # Position of the most probable row in each sentence.
    best_rows = (
        scored.groupby(group_cols)['__proba__']
        .idxmax()
        .dropna()
        .to_numpy(dtype=np.intp)
    )

    y_pred = np.zeros(len(scored), dtype=int)
    y_pred[best_rows] = 1

    # np.asarray lets y_true be a Series, list or array and drops any index.
    return f1_score(np.asarray(y_true), y_pred)


def create_folds(df, n_folds, group_column):
    """Return a copy of ``df`` with a ``'kfold'`` column from ``GroupKFold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'is_root'`` and ``group_column``. It is not modified.
    n_folds : int
        Number of folds (``n_splits`` of ``GroupKFold``).
    group_column : str
        Column whose values must never be split across folds.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with an integer ``'kfold'`` column giving the fold in
        which each row is held out (``-1`` for a row never held out).
    """
    df_copy = df.copy()
    fold_ids = np.full(len(df_copy), -1, dtype=np.int64)

    kf = GroupKFold(n_splits=n_folds)
    splits = kf.split(X=df_copy, y=df_copy['is_root'], groups=df_copy[group_column])

    # split() yields *positional* indices, so they are written into a plain
    # array instead of being used as labels with .loc (which is only correct
    # when the frame carries a default RangeIndex).
    for fold, (_, val_idx) in enumerate(splits):
        fold_ids[val_idx] = fold

    df_copy['kfold'] = fold_ids
    return df_copy


def objective(trial, df, feature_columns, feature_pipeline, n_folds=4, group_column='sentence'):
    """Optuna objective: mean cross-validated one-root F1 of a LightGBM model.

    Samples LightGBM hyper-parameters from ``trial``, assigns folds with
    ``create_folds`` and evaluates every fold through ``run_model``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Source of hyper-parameter suggestions; per-fold status and scores are
        stored on it as user attributes named ``fold_<k>_...`` with ``k``
        counted from 1.
    df : pandas.DataFrame
        Training data, passed to ``create_folds``.
    feature_columns : sequence of str
        Columns the model is trained on.
    feature_pipeline : object or None
        Optional preprocessing pipeline forwarded to ``run_model``.
    n_folds : int, default=4
        Number of cross-validation folds.
    group_column : str, default='sentence'
        Column used to keep groups together when building folds.

    Returns
    -------
    float
        Mean score over the folds that produced a non-NaN score.

    Raises
    ------
    optuna.TrialPruned
        If no fold produced a usable score.
    """
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'verbosity': -1,
        'random_state': 42,  # keeps LightGBM runs reproducible
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=50),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),  # colsample_bytree
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),  # subsample
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
    }

    # create_folds returns a new frame carrying the 'kfold' column.
    df_with_folds = create_folds(df, n_folds, group_column=group_column)
    scores = []

    for fold in range(n_folds):
        trial.set_user_attr(f'fold_{fold+1}_status', 'starting')
        try:
            score = run_model(df_with_folds, fold, lgb.LGBMClassifier(), params,
                              feature_columns, feature_pipeline, trial=trial)
        except Exception as e:
            # A failing fold is recorded and skipped; the remaining folds
            # still contribute to the mean.
            print(f"Trial {trial.number} Fold {fold+1} failed with exception: {str(e)}")
            trial.set_user_attr(f'fold_{fold+1}_status', f'failed_exception: {str(e)}')
            continue

        if np.isnan(score):
            trial.set_user_attr(f'fold_{fold+1}_status', 'returned_nan')
            print(f"Trial {trial.number} Fold {fold+1} returned NaN.")
            continue

        scores.append(score)
        trial.set_user_attr(f'fold_{fold+1}_score', score)
        trial.set_user_attr(f'fold_{fold+1}_status', 'completed')

    if not scores:
        print(f"Trial {trial.number} pruned: No scores were recorded (all folds failed or returned NaN without appending).")
        raise optuna.TrialPruned()

    # Only non-NaN scores are ever appended, so the mean is well defined here.
    return np.mean(scores)


# Corrected run_model
def run_model(df_with_folds, fold, estimator, params, feature_columns, feature_pipeline=None, trial=None):
    """Train on every fold except ``fold`` and score the held-out fold.

    Parameters
    ----------
    df_with_folds : pandas.DataFrame
        Data with ``'is_root'`` (target) and ``'kfold'`` (fold id) columns.
    fold : int
        Fold id used for validation; all other rows are used for training.
    estimator : object
        Unfitted estimator; it is cloned and configured with ``params``.
    params : dict
        Keyword arguments for ``set_params`` on the cloned estimator.
    feature_columns : sequence of str
        Columns the model is fitted on.
    feature_pipeline : object, optional
        If given (and truthy), it is fitted on the training part and applied
        to both parts before modelling.
    trial : object, optional
        If given, failures are stored on it through ``set_user_attr``.

    Returns
    -------
    float
        Result of ``one_root_f1_scorer`` on the validation part, or NaN when
        pipeline post-processing, fitting or scoring raised.
    """
    def _fail(attr_suffix, attr_value, console_message):
        # Store the failure on the trial (when there is one), echo it and
        # hand back the NaN sentinel the caller checks for.
        if trial: trial.set_user_attr(f'fold_{fold}_{attr_suffix}', attr_value)
        print(console_message)
        return float('nan')

    train_part = df_with_folds[df_with_folds['kfold'].ne(fold)].reset_index(drop=True)
    valid_part = df_with_folds[df_with_folds['kfold'].eq(fold)].reset_index(drop=True)

    bookkeeping = ['is_root', 'kfold']
    train_X, train_y = train_part.drop(columns=bookkeeping), train_part['is_root']
    valid_X, valid_y = valid_part.drop(columns=bookkeeping), valid_part['is_root']

    model_columns = list(feature_columns)

    if feature_pipeline:
        train_X = feature_pipeline.fit_transform(train_X)
        valid_X = feature_pipeline.transform(valid_X)

        # A pipeline that returns arrays gets wrapped back into DataFrames so
        # that columns can be selected by name further down.
        if not isinstance(train_X, pd.DataFrame):
            if hasattr(feature_pipeline, 'get_feature_names_out'):
                try:
                    model_columns = feature_pipeline.get_feature_names_out()
                    train_X = pd.DataFrame(train_X, columns=model_columns)
                    valid_X = pd.DataFrame(valid_X, columns=model_columns)
                except Exception as e:
                    return _fail(
                        'feature_pipeline_error',
                        str(e),
                        f"Fold {fold} error getting feature names from pipeline: {e}",
                    )
            else:
                # No names available: assume the original feature columns apply.
                train_X = pd.DataFrame(train_X, columns=feature_columns)
                valid_X = pd.DataFrame(valid_X, columns=feature_columns)

    model = clone(estimator)
    model.set_params(**params)

    try:
        # The model only ever sees the feature columns.
        model.fit(train_X[model_columns], train_y)
    except Exception as e:
        return _fail(
            'fit_error',
            str(e) + f" Features: {model_columns}",
            f"Fold {fold} model fitting failed: {str(e)}. Features used: {model_columns}",
        )

    try:
        # The scorer receives the full validation frame, group columns included.
        return one_root_f1_scorer(
            model,
            valid_X,
            valid_y,
            feature_columns_for_model=model_columns,
        )
    except Exception as e:
        return _fail('score_error', str(e), f"Fold {fold} scoring failed: {str(e)}")


# Hyper-parameter search entry point: load the data, create an Optuna study
# and maximise the cross-validated score returned by `objective`.
if __name__ == "__main__":
    # Training table (path is relative to the working directory) and the
    # columns the model is trained on.
    train_data=pd.read_csv('../data/train_processed.csv')
    feature_columns=['is_articulation', 'betweenness', 'eigencentrality',
                 'closeness', 'degree', 'pagerank', 'eccentricity']

    # `objective` returns a mean F1 score, so larger values are better.
    study = optuna.create_study(
        direction='maximize'
    )
    
    sentence_scaled_features = ['eccentricity'
                                # , 'subtree_size'
                                ] # Features for sentence-level scaling


    # Build the preprocessing pipeline and request DataFrame output from it.
    feature_pipeline = create_feature_pipeline(sentence_scaled_features)
    feature_pipeline.set_output(transform="pandas")
    # Bind the fixed arguments so Optuna can call the objective with only `trial`.
    # NOTE(review): the pipeline built above is NOT used, because None is passed
    # below, so the sentence-level scaling is skipped during tuning. Confirm
    # this is intended.
    objective_func = partial(
        objective,
        df=train_data,
        feature_columns=feature_columns,
        feature_pipeline=None  # pass `feature_pipeline` here to enable preprocessing
    )
    
    # Run 30 trials with a progress bar.
    study.optimize(objective_func, n_trials=30, show_progress_bar=True)
    
    # Report the best trial's score and its sampled hyper-parameters.
    print("Best trial:")
    trial = study.best_trial
    print(f"  Value (F1 Score): {trial.value}")
    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

[I 2025-05-15 07:20:08,773] A new study created in memory with name: no-name-b4ed32a3-799d-4232-84c8-dcf61a90536b


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2025-05-15 07:20:26,023] Trial 0 finished with value: 0.2683809523809524 and parameters: {'n_estimators': 350, 'lambda_l1': 0.0013175308177243695, 'lambda_l2': 0.019240549286187222, 'num_leaves': 67, 'feature_fraction': 0.6033249604631762, 'bagging_fraction': 0.8297577613323112, 'bagging_freq': 7, 'min_child_samples': 65, 'learning_rate': 0.06777983812876885}. Best is trial 0 with value: 0.2683809523809524.
[I 2025-05-15 07:20:41,152] Trial 1 finished with value: 0.26866666666666666 and parameters: {'n_estimators': 650, 'lambda_l1': 0.14228363663844784, 'lambda_l2': 3.0730265178624385e-05, 'num_leaves': 26, 'feature_fraction': 0.9901059262095292, 'bagging_fraction': 0.5684702470852004, 'bagging_freq': 2, 'min_child_samples': 96, 'learning_rate': 0.08479002889542284}. Best is trial 1 with value: 0.26866666666666666.
[I 2025-05-15 07:20:50,221] Trial 2 finished with value: 0.28247619047619044 and parameters: {'n_estimators': 350, 'lambda_l1': 6.753414223904012e-07, 'lambda_l2': 4.7693