In [1]:
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.preprocessing import PowerTransformer, MinMaxScaler, RobustScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import set_config
import numpy as np
import pandas as pd
import optuna
from functools import partial

set_config(transform_output="pandas")
np.random.seed(42)


In [2]:
def one_root_f1_scorer(estimator, X_with_groups_and_features, y_true, feature_columns_for_model):
    """Custom scorer. X_with_groups_and_features should be a pandas DataFrame containing
       both group columns ('language', 'sentence') and the feature_columns_for_model."""

    if not isinstance(X_with_groups_and_features, pd.DataFrame):
        raise ValueError("X_with_groups_and_features must be a pandas DataFrame for this scorer version.")

    # Make predictions using only the specified feature columns
    X_for_predict = X_with_groups_and_features[feature_columns_for_model]
    try:
        proba_values = estimator.predict_proba(X_for_predict)[:, 1]
    except Exception as e:
        print(f"Error during predict_proba in scorer: {e}. Columns expected: {feature_columns_for_model}, Columns in X_for_predict: {X_for_predict.columns.tolist() if isinstance(X_for_predict, pd.DataFrame) else 'N/A (NumPy)'}")
        return 0.0 # Return a score that indicates failure

    # Add probabilities to a copy of the DataFrame to use its index for grouping and selection
    df_copy = X_with_groups_and_features.copy()
    df_copy['__proba__'] = proba_values

    y_pred = pd.Series(np.zeros(len(df_copy)), index=df_copy.index)
    group_cols = ['language', 'sentence']

    for _, group_df in df_copy.groupby(group_cols):
        if not group_df.empty:
            if '__proba__' in group_df.columns:
                idx_max = group_df['__proba__'].idxmax()
                y_pred.loc[idx_max] = 1
            else:
                # This case should not happen if proba_values were assigned correctly
                print(f"Warning: '__proba__' column missing in group_df during scoring.")
    
    # Ensure y_true and y_pred are numpy arrays for f1_score to avoid potential index issues
    return f1_score(y_true.values, y_pred.values)

In [3]:
def create_folds(df, n_folds, group_column):
    """Create folds for GroupKFold cross-validation"""
    df_copy = df.copy()
    df_copy['kfold'] = -1
    y = df_copy['is_root']
    groups = df_copy[group_column]

    kf = GroupKFold(n_splits=n_folds)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X=df_copy, y=y, groups=groups)):
        df_copy.loc[val_idx, 'kfold'] = fold
        
    return df_copy

In [4]:
from sklearn.base import clone

def run_model(df_with_folds, fold, estimator, params, feature_columns, feature_pipeline=None, trial=None):
    """Run model training and evaluation for a single fold."""
    df_train = df_with_folds[df_with_folds.kfold != fold].reset_index(drop=True)
    df_valid = df_with_folds[df_with_folds.kfold == fold].reset_index(drop=True)

    X_train_original = df_train.drop(columns=['is_root', 'kfold'])
    X_valid_original = df_valid.drop(columns=['is_root', 'kfold'])
    y_train = df_train['is_root']
    y_valid = df_valid['is_root']

    X_train_processed = X_train_original
    X_valid_processed = X_valid_original
    
    current_feature_columns = list(feature_columns)

    if feature_pipeline:
        X_train_processed = feature_pipeline.fit_transform(X_train_original)
        X_valid_processed = feature_pipeline.transform(X_valid_original)
        
        if not isinstance(X_train_processed, pd.DataFrame) and hasattr(feature_pipeline, 'get_feature_names_out'):
            try:
                current_feature_columns = feature_pipeline.get_feature_names_out()
                X_train_processed = pd.DataFrame(X_train_processed, columns=current_feature_columns)
                X_valid_processed = pd.DataFrame(X_valid_processed, columns=current_feature_columns)
            except Exception as e:
                if trial: trial.set_user_attr(f'fold_{fold}_feature_pipeline_error', str(e))
                print(f"Fold {fold} error getting feature names from pipeline: {e}")
                return float('nan')
        elif not isinstance(X_train_processed, pd.DataFrame):
            X_train_processed = pd.DataFrame(X_train_processed, columns=feature_columns)
            X_valid_processed = pd.DataFrame(X_valid_processed, columns=feature_columns)

    model = clone(estimator)
    model.set_params(**params) # random_state is already in params from objective
    
    try:
        model.fit(X_train_processed[current_feature_columns], y_train)
    except Exception as e:
        if trial: trial.set_user_attr(f'fold_{fold}_fit_error', str(e))
        print(f"Fold {fold} model fitting failed: {str(e)}. Features used: {current_feature_columns}")
        return float('nan')

    try:
        score = one_root_f1_scorer(
            model,
            X_valid_processed,
            y_valid,
            feature_columns_for_model=current_feature_columns
        )
    except Exception as e:
        if trial: trial.set_user_attr(f'fold_{fold}_score_error', str(e))
        print(f"Fold {fold} scoring failed: {str(e)}")
        return float('nan')
        
    return score

In [5]:

def evaluate_model_performance(model, X_with_groups_and_features, y_true, feature_columns_for_model):
    if not isinstance(X_with_groups_and_features, pd.DataFrame):
        raise ValueError("X_with_groups_and_features must be a pandas DataFrame for this scorer version.")

    # Make predictions using only the specified feature columns
    X_for_predict = X_with_groups_and_features[feature_columns_for_model]
    try:
        proba_values = model.predict_proba(X_for_predict)[:, 1]
    except Exception as e:
        print(f"Error during predict_proba in evaluation: {e}")
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}, {}

    # Add probabilities to a copy of the DataFrame
    df_copy = X_with_groups_and_features.copy()
    df_copy['__proba__'] = proba_values

    # Initialize predictions
    y_pred = pd.Series(np.zeros(len(df_copy)), index=df_copy.index)
    group_cols = ['language', 'sentence']

    # Ensure one root per sentence
    for _, group_df in df_copy.groupby(group_cols):
        if not group_df.empty:
            if '__proba__' in group_df.columns:
                idx_max = group_df['__proba__'].idxmax()
                y_pred.loc[idx_max] = 1
            else:
                print(f"Warning: '__proba__' column missing in group_df during evaluation.")

    # Calculate overall metrics
    overall_metrics = {
        'precision': precision_score(y_true.values, y_pred.values),
        'recall': recall_score(y_true.values, y_pred.values),
        'f1': f1_score(y_true.values, y_pred.values)
    }

    # Calculate per-language metrics
    per_language_metrics = {}
    for lang in X_with_groups_and_features['language'].unique():
        lang_mask = X_with_groups_and_features['language'] == lang
        lang_true = y_true[lang_mask]
        lang_pred = y_pred[lang_mask]
        
        # Only calculate metrics if there are samples for this language
        if len(lang_true) > 0:
            per_language_metrics[lang] = {
                'precision': precision_score(lang_true, lang_pred),
                'recall': recall_score(lang_true, lang_pred),
                'f1': f1_score(lang_true, lang_pred)
            }

    return overall_metrics, per_language_metrics

In [6]:
def objective(trial, df, feature_columns, feature_pipeline=None):
    try:
        # First decide which configuration to use
        # config_type = trial.suggest_categorical('config_type', ['l1_l2', 'elasticnet'])
        config_type = trial.suggest_categorical('config_type', ['l1_l2'])
        
        class_weight_option = trial.suggest_categorical('class_weight', ['balanced', 'custom'])
        class_weight = {0: 1, 1: 10} if class_weight_option == 'custom' else 'balanced'

        if config_type == 'l1_l2':
            params = {
                'penalty': trial.suggest_categorical('penalty', ['l1','l2']),
                'C': trial.suggest_float('C', 0.0000001, 100, log=True),
                'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
                'class_weight': class_weight,
                'max_iter': 2000,
                'random_state': 42
            } 
        else:  # elasticnet
            params = {
                'penalty': 'elasticnet',
                'C': trial.suggest_float('C', 0.0000001, 0.01, log=True),
                'solver': 'saga',
                'l1_ratio': trial.suggest_float('l1_ratio', 0.1, 0.9),
                'class_weight': 'balanced',
                'max_iter': 1000,
                'random_state': 42
            }
        
        n_folds = 4
        df_with_folds = create_folds(df, n_folds, group_column='sentence')
        scores = []

        for fold in range(n_folds):
            trial.set_user_attr(f'fold_{fold+1}_status', 'starting')
            try:
                score = run_model(
                    df_with_folds, 
                    fold, 
                    LogisticRegression(), 
                    params, 
                    feature_columns, 
                    feature_pipeline, 
                    trial=trial
                )
                
                if np.isnan(score):
                    trial.set_user_attr(f'fold_{fold+1}_status', 'returned_nan')
                    print(f"Trial {trial.number} Fold {fold+1} returned NaN.")
                else:
                    scores.append(score)
                    trial.set_user_attr(f'fold_{fold+1}_score', score)
                    trial.set_user_attr(f'fold_{fold+1}_status', 'completed')

                # Report intermediate value for pruning
                trial.report(np.mean(scores), fold)

            except Exception as e:
                print(f"Trial {trial.number} Fold {fold+1} failed with exception: {str(e)}")
                trial.set_user_attr(f'fold_{fold+1}_status', f'failed_exception: {str(e)}')
                continue
        
        if not scores:
            trial.set_user_attr('pruned_reason', 'No valid scores obtained')
            raise optuna.TrialPruned()
        
        mean_score = np.mean([s for s in scores if not np.isnan(s)])
        if np.isnan(mean_score):
            print(f"Trial {trial.number} resulted in all NaN scores, returning -inf.")
            return float('-inf')
            
        return mean_score
    except Exception as e:
        # print(f"Trial {trial.number} failed completely with error: {str(e)}")
        trial.set_user_attr('complete_failure', str(e))
        return float('-inf')

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PowerTransformer, MinMaxScaler, RobustScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import PowerTransformer, RobustScaler

from sklearn.compose import ColumnTransformer
import pandas as pd
from sklearn.compose import make_column_selector
from sklearn import set_config
set_config(transform_output="pandas")

class SentenceGroupScaler(BaseEstimator, TransformerMixin):
    def __init__(self, features_to_scale, group_columns=['language', 'sentence']):
        self.features_to_scale = features_to_scale
        self.group_columns = group_columns
        
    def fit(self, X, y=None):
        # No need to store anything
        return self
    
    def transform(self, X):
        X = X.copy()
        X[self.features_to_scale] = X[self.features_to_scale].astype(np.float64)
        
        for _, group_df in X.groupby(self.group_columns):
            group_idx = group_df.index
            scaler = MinMaxScaler()
            X.loc[group_idx, self.features_to_scale] = scaler.fit_transform(
                group_df[self.features_to_scale]
            )
        return X
    
    def fit_transform(self, X, y=None):
        return self.transform(X)
    def set_output(self, *, transform=None):
        self._output_config = transform
        return self
    
def create_feature_pipeline(sentence_scaled_features):
    # Create individual transformers
    
    return Pipeline([
        ('sentence_scale', SentenceGroupScaler(
            features_to_scale=sentence_scaled_features
     )),
        # ('global_scale', global_scaler)
    ])

In [8]:
# Load data
train_data = pd.read_csv('/Users/elnararb/Documents/UPC/Machine Learning/Project_ML/data/train_random_processed.csv')

In [9]:

sentence_scaled_features = ['eccentricity','path_asymmetry', 'avg_neighbor_degree', 'diameter_path'] # Features for sentence-level scaling


# Build the pipeline 
feature_pipeline = create_feature_pipeline(sentence_scaled_features)
feature_pipeline.set_output(transform="pandas")

In [None]:
feature_columns = ['closeness', 'pagerank',
                   'is_articulation', 'betweenness', 'eigencentrality', 'eccentricity', 'is_japanese',
        'lang_group_head_final_sov', 'lang_group_romance_svo', 'lang_group_germanic_v2', 'lang_group_free_order_case', 'lang_group_analytic',
      'path_asymmetry', 'avg_neighbor_degree', 'diameter_path']

# Create study 
study = optuna.create_study(
    direction='maximize'
)

sentence_scaled_features = ['eccentricity','path_asymmetry', 'avg_neighbor_degree', 'diameter_path'] # Features for sentence-level scaling


# Build the pipeline 
feature_pipeline = create_feature_pipeline(sentence_scaled_features)
feature_pipeline.set_output(transform="pandas")

# Create objective function with fixed parameters
objective_func = partial(
    objective, 
    df=train_data, 
    feature_columns=feature_columns,
    feature_pipeline=feature_pipeline
)

# Optimize with more trials
study.optimize(objective_func, n_trials=50, show_progress_bar=True) 

# Print results
print("\nBest trial:")
trial = study.best_trial
print(f"  Value (F1 Score): {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-06-01 13:47:31,584] A new study created in memory with name: no-name-140ff71e-4489-40b8-8744-bcaa21a89048


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-06-01 13:48:18,218] Trial 0 finished with value: 0.2735238095238095 and parameters: {'config_type': 'l1_l2', 'class_weight': 'custom', 'penalty': 'l1', 'C': 0.0010349538551790401, 'solver': 'liblinear'}. Best is trial 0 with value: 0.2735238095238095.




[I 2025-06-01 13:52:06,307] Trial 1 finished with value: 0.26857142857142857 and parameters: {'config_type': 'l1_l2', 'class_weight': 'balanced', 'penalty': 'l1', 'C': 0.001383719557073505, 'solver': 'saga'}. Best is trial 0 with value: 0.2735238095238095.
[I 2025-06-01 13:52:51,501] Trial 2 finished with value: 0.26076190476190475 and parameters: {'config_type': 'l1_l2', 'class_weight': 'custom', 'penalty': 'l2', 'C': 1.1740979796740421e-05, 'solver': 'saga'}. Best is trial 0 with value: 0.2735238095238095.




In [None]:
feature_columns = ['closeness', 'pagerank',
                   'is_articulation', 'betweenness', 'eigencentrality', 'eccentricity', 'is_japanese',
        'lang_group_head_final_sov', 'lang_group_romance_svo', 'lang_group_germanic_v2', 'lang_group_free_order_case', 'lang_group_analytic',
      'path_asymmetry', 'avg_neighbor_degree', 'diameter_path']

# Create study 
study = optuna.create_study(
    direction='maximize'
)

sentence_scaled_features = ['eccentricity','path_asymmetry', 'avg_neighbor_degree'] # Features for sentence-level scaling


# Build the pipeline 
feature_pipeline = create_feature_pipeline(sentence_scaled_features)
feature_pipeline.set_output(transform="pandas")

# Create objective function with fixed parameters
objective_func = partial(
    objective, 
    df=train_data, 
    feature_columns=feature_columns,
    feature_pipeline=feature_pipeline
)

# Optimize with more trials
study.optimize(objective_func, n_trials=50, show_progress_bar=True) 

# Print results
print("\nBest trial:")
trial = study.best_trial
print(f"  Value (F1 Score): {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-06-01 09:52:33,789] A new study created in memory with name: no-name-d900df97-6154-4e5d-b3d5-1e47d0bf91ce


  0%|          | 0/50 [00:00<?, ?it/s]



[I 2025-06-01 09:53:35,300] Trial 0 finished with value: 0.2682857142857143 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.00029299492989435285, 'solver': 'saga', 'class_weight': {0: 1, 1: 10}}. Best is trial 0 with value: 0.2682857142857143.




[I 2025-06-01 09:54:26,189] Trial 1 finished with value: 0.26466666666666666 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.00012185591325805504, 'solver': 'saga', 'class_weight': {0: 1, 1: 10}}. Best is trial 0 with value: 0.2682857142857143.




[I 2025-06-01 09:55:22,398] Trial 2 finished with value: 0.2673333333333333 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.0016962903783056856, 'solver': 'saga', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.2682857142857143.




[I 2025-06-01 09:56:06,341] Trial 3 finished with value: 0.25952380952380955 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.0004657039935175277, 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.2682857142857143.




[I 2025-06-01 09:57:04,161] Trial 4 finished with value: 0.2684761904761905 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.00046336760980386193, 'solver': 'saga', 'class_weight': 'balanced'}. Best is trial 4 with value: 0.2684761904761905.




[I 2025-06-01 10:00:28,399] Trial 5 finished with value: 0.2713333333333333 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 5.408192188020274, 'solver': 'saga', 'class_weight': {0: 1, 1: 10}}. Best is trial 5 with value: 0.2713333333333333.
[I 2025-06-01 10:01:12,705] Trial 6 finished with value: 0.274 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.27764795771060985, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 6 with value: 0.274.




[I 2025-06-01 10:01:58,869] Trial 7 finished with value: 0.2613333333333333 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 2.515342193275605e-05, 'solver': 'saga', 'class_weight': {0: 1, 1: 10}}. Best is trial 6 with value: 0.274.




[I 2025-06-01 10:02:42,915] Trial 8 finished with value: 0.26590476190476187 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.010052917310287752, 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 6 with value: 0.274.




[I 2025-06-01 10:03:38,050] Trial 9 finished with value: 0.2672380952380952 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.0033568884297714738, 'solver': 'saga', 'class_weight': 'balanced'}. Best is trial 6 with value: 0.274.




[I 2025-06-01 10:04:22,502] Trial 10 finished with value: 0.2 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 1.0126613920623274e-07, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 6 with value: 0.274.




[I 2025-06-01 10:05:09,641] Trial 11 finished with value: 0.2744761904761905 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 22.623135144966703, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 11 with value: 0.2744761904761905.




[I 2025-06-01 10:05:55,346] Trial 12 finished with value: 0.2742857142857143 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 86.2389120482131, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 11 with value: 0.2744761904761905.




[I 2025-06-01 10:06:40,618] Trial 13 finished with value: 0.2743809523809524 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 41.38529929971263, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 11 with value: 0.2744761904761905.




[I 2025-06-01 10:07:25,154] Trial 14 finished with value: 0.2742857142857143 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 75.68738391404577, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 11 with value: 0.2744761904761905.




[I 2025-06-01 10:08:09,910] Trial 15 finished with value: 0.2741904761904762 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.4349344453156768, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 11 with value: 0.2744761904761905.




[I 2025-06-01 10:08:54,288] Trial 16 finished with value: 0.2744761904761905 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 2.4338127171175863, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 11 with value: 0.2744761904761905.




[I 2025-06-01 10:09:40,587] Trial 17 finished with value: 0.2744761904761905 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 1.2558784167873982, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 11 with value: 0.2744761904761905.




[I 2025-06-01 10:10:27,735] Trial 18 finished with value: 0.2702857142857143 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.02775629982626467, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 11 with value: 0.2744761904761905.




[I 2025-06-01 10:11:11,948] Trial 19 finished with value: 0.2745714285714286 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 4.990410553642827, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:11:58,090] Trial 20 finished with value: 0.2732380952380953 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.12117606323870411, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:12:43,388] Trial 21 finished with value: 0.2743809523809524 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 7.750416364268543, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:13:28,077] Trial 22 finished with value: 0.2744761904761905 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 4.388787532098385, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:14:12,061] Trial 23 finished with value: 0.2743809523809524 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 7.623958299999668, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:14:57,322] Trial 24 finished with value: 0.2733333333333333 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.0968745022841927, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:15:42,245] Trial 25 finished with value: 0.2744761904761905 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 1.331627631541223, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:16:30,053] Trial 26 finished with value: 0.27314285714285713 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 13.60139702519743, 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:17:17,856] Trial 27 finished with value: 0.21638095238095237 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 3.075920472679235e-06, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:18:02,496] Trial 28 finished with value: 0.2743809523809524 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 23.421291340397563, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:18:47,517] Trial 29 finished with value: 0.2743809523809524 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 1.6847780010017674, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:19:31,957] Trial 30 finished with value: 0.2712380952380953 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.03671839531601787, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:20:16,587] Trial 31 finished with value: 0.2741904761904762 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.6010963275223997, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:21:01,734] Trial 32 finished with value: 0.2743809523809524 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 1.919755431982983, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:21:46,224] Trial 33 finished with value: 0.2743809523809524 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 23.329643782588413, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:24:57,508] Trial 34 finished with value: 0.27199999999999996 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 1.3796980496513698, 'solver': 'saga', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.
[I 2025-06-01 10:25:42,237] Trial 35 finished with value: 0.2722857142857143 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.17344603877586803, 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:29:12,403] Trial 36 finished with value: 0.2683809523809524 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 4.9352172633270115, 'solver': 'saga', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.
[I 2025-06-01 10:29:57,523] Trial 37 finished with value: 0.27304761904761904 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.6676504637884789, 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:33:20,742] Trial 38 finished with value: 0.2646666666666667 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 26.12819354236278, 'solver': 'saga', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.
[I 2025-06-01 10:34:05,215] Trial 39 finished with value: 0.27161904761904765 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.043191264460772615, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:34:49,415] Trial 40 finished with value: 0.264 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.004885267998932965, 'solver': 'liblinear', 'class_weight': 'balanced'}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:35:35,369] Trial 41 finished with value: 0.2743809523809524 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 6.485737386363507, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:36:21,204] Trial 42 finished with value: 0.2743809523809524 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 5.1168234422152645, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:37:07,539] Trial 43 finished with value: 0.2742857142857143 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 3.849521639337718, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:37:52,511] Trial 44 finished with value: 0.2613333333333333 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.0009275617057880708, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:38:42,005] Trial 45 finished with value: 0.2638095238095238 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 9.098089953196419e-05, 'solver': 'saga', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:39:27,660] Trial 46 finished with value: 0.2743809523809524 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 93.3437353887726, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:40:11,977] Trial 47 finished with value: 0.2741904761904762 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.7372787861003515, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:40:57,499] Trial 48 finished with value: 0.2744761904761905 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 15.131459672081084, 'solver': 'liblinear', 'class_weight': {0: 1, 1: 10}}. Best is trial 19 with value: 0.2745714285714286.




[I 2025-06-01 10:41:51,314] Trial 49 finished with value: 0.27314285714285713 and parameters: {'config_type': 'l1_l2', 'penalty': 'l2', 'C': 0.28371234142605584, 'solver': 'saga', 'class_weight': 'balanced'}. Best is trial 19 with value: 0.2745714285714286.

Best trial:
  Value (F1 Score): 0.2745714285714286
  Params: 
    config_type: l1_l2
    penalty: l2
    C: 4.990410553642827
    solver: liblinear
    class_weight: {0: 1, 1: 10}


# Validation

In [10]:
sentence_scaled_features = ['eccentricity','path_asymmetry', 'avg_neighbor_degree'] # Features for sentence-level scaling


# Build the pipeline 
feature_pipeline = create_feature_pipeline(sentence_scaled_features)
feature_pipeline.set_output(transform="pandas")

In [11]:
from imblearn.under_sampling import RandomUnderSampler

n_folds = 4
df_with_folds = create_folds(train_data, n_folds, group_column='sentence')

# Use the last fold as validation set
val_fold = n_folds - 1
df_train = df_with_folds[df_with_folds.kfold != val_fold].reset_index(drop=True)
df_valid = df_with_folds[df_with_folds.kfold == val_fold].reset_index(drop=True)
feature_columns = ['closeness', 'pagerank',
                   'is_articulation', 'betweenness', 'eigencentrality', 'eccentricity', 'is_japanese',
        'lang_group_head_final_sov', 'lang_group_romance_svo', 'lang_group_germanic_v2', 'lang_group_free_order_case', 'lang_group_analytic',
      'path_asymmetry', 'avg_neighbor_degree', 'diameter_path']

X_valid = df_valid.drop(columns=['is_root', 'kfold'])
y_valid = df_valid['is_root']
X_train = df_train.drop(columns=['is_root', 'kfold'])
y_train = df_train['is_root']
# Uncomment this if using Optuna
# best_model = LogisticRegression(**trial.params)

best_model = LogisticRegression(
    C=0.06800570611214875, 
    penalty='l1',
    solver='liblinear',
    class_weight={0: 1, 1: 10},
    max_iter=2000
)

X_train_trans = feature_pipeline.fit_transform(X_train)
X_valid_trans = feature_pipeline.transform(X_valid)

# rus = RandomUnderSampler(random_state=42)
# X_train_res, y_train_res = rus.fit_resample(X_train, y_train)
best_model.fit(X_train_trans[feature_columns], y_train)


# Evaluate on validation set
print("\nModel Performance (Validation Set):")
overall_metrics, per_language_metrics = evaluate_model_performance(
    best_model, 
    X_valid_trans, 
    y_valid,
    feature_columns
)

print("\nOverall Performance:")
print(f"Precision: {overall_metrics['precision']:.4f}")
print(f"Recall: {overall_metrics['recall']:.4f}")
print(f"F1 Score: {overall_metrics['f1']:.4f}")

print("\nPer-Language Performance:")
for lang, metrics in per_language_metrics.items():
    print(f"\n{lang}:")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall: {metrics['recall']:.4f}")
    print(f"  F1 Score: {metrics['f1']:.4f}")


Model Performance (Validation Set):

Overall Performance:
Precision: 0.3040
Recall: 0.3040
F1 Score: 0.3040

Per-Language Performance:

Japanese:
  Precision: 0.0480
  Recall: 0.0480
  F1 Score: 0.0480

Finnish:
  Precision: 0.3920
  Recall: 0.3920
  F1 Score: 0.3920

Galician:
  Precision: 0.3040
  Recall: 0.3040
  F1 Score: 0.3040

English:
  Precision: 0.2480
  Recall: 0.2480
  F1 Score: 0.2480

Hindi:
  Precision: 0.2640
  Recall: 0.2640
  F1 Score: 0.2640

French:
  Precision: 0.3040
  Recall: 0.3040
  F1 Score: 0.3040

Italian:
  Precision: 0.2640
  Recall: 0.2640
  F1 Score: 0.2640

Indonesian:
  Precision: 0.3280
  Recall: 0.3280
  F1 Score: 0.3280

Swedish:
  Precision: 0.3680
  Recall: 0.3680
  F1 Score: 0.3680

Spanish:
  Precision: 0.3280
  Recall: 0.3280
  F1 Score: 0.3280

Icelandic:
  Precision: 0.2720
  Recall: 0.2720
  F1 Score: 0.2720

German:
  Precision: 0.2960
  Recall: 0.2960
  F1 Score: 0.2960

Korean:
  Precision: 0.3280
  Recall: 0.3280
  F1 Score: 0.3280

Pol

In [13]:
n_folds = 4
df_with_folds = create_folds(train_data, n_folds, group_column='sentence')

# Use the last fold as validation set
val_fold = n_folds - 1
df_train = df_with_folds[df_with_folds.kfold != val_fold].reset_index(drop=True)
df_valid = df_with_folds[df_with_folds.kfold == val_fold].reset_index(drop=True)

X_valid = df_valid.drop(columns=['is_root', 'kfold'])
y_valid = df_valid['is_root']
X_train = df_train.drop(columns=['is_root', 'kfold'])
y_train = df_train['is_root']
# Uncomment this if using Optuna
# best_model = LogisticRegression(**trial.params)

# Best parameters with C > 0.0000001
# best_model = LogisticRegression(
#     C=2.7145102022431556e-07, 
#     l1_ratio=0.8868773763606811,
#     penalty='elasticnet',
#     solver='saga',
#     class_weight='balanced',
#     max_iter=1000
# )

# Best parameters with C > 1
best_model = LogisticRegression(
    C=99.86981387827636, 
    penalty='l2',
    solver='liblinear',
    class_weight={0: 1, 1: 10},
    max_iter=1000
)

feature_columns=['closeness', 'degree', 'pagerank', 
                #  'voterank',
                 'is_articulation', 'betweenness', 'eigencentrality', 'eccentricity', 'is_japanese',
        'lang_group_head_final_sov', 'lang_group_romance_svo', 'lang_group_germanic_v2', 'lang_group_free_order_case', 'lang_group_analytic']
# X_train_trans = feature_pipeline.fit_transform(X_train)
# X_valid_trans = feature_pipeline.transform(X_valid)
best_model.fit(X_train[feature_columns], y_train)


# Evaluate on validation set
print("\nModel Performance (Validation Set):")
overall_metrics, per_language_metrics = evaluate_model_performance(
    best_model, 
    X_valid, 
    y_valid,
    feature_columns
)

print("\nOverall Performance:")
print(f"Precision: {overall_metrics['precision']:.4f}")
print(f"Recall: {overall_metrics['recall']:.4f}")
print(f"F1 Score: {overall_metrics['f1']:.4f}")

print("\nPer-Language Performance:")
for lang, metrics in per_language_metrics.items():
    print(f"\n{lang}:")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall: {metrics['recall']:.4f}")
    print(f"  F1 Score: {metrics['f1']:.4f}")


Model Performance (Validation Set):

Overall Performance:
Precision: 0.2971
Recall: 0.2971
F1 Score: 0.2971

Per-Language Performance:

Japanese:
  Precision: 0.0480
  Recall: 0.0480
  F1 Score: 0.0480

Finnish:
  Precision: 0.3760
  Recall: 0.3760
  F1 Score: 0.3760

Galician:
  Precision: 0.2800
  Recall: 0.2800
  F1 Score: 0.2800

English:
  Precision: 0.2480
  Recall: 0.2480
  F1 Score: 0.2480

Hindi:
  Precision: 0.2320
  Recall: 0.2320
  F1 Score: 0.2320

French:
  Precision: 0.2960
  Recall: 0.2960
  F1 Score: 0.2960

Italian:
  Precision: 0.2320
  Recall: 0.2320
  F1 Score: 0.2320

Indonesian:
  Precision: 0.3440
  Recall: 0.3440
  F1 Score: 0.3440

Swedish:
  Precision: 0.3680
  Recall: 0.3680
  F1 Score: 0.3680

Spanish:
  Precision: 0.3280
  Recall: 0.3280
  F1 Score: 0.3280

Icelandic:
  Precision: 0.2560
  Recall: 0.2560
  F1 Score: 0.2560

German:
  Precision: 0.2800
  Recall: 0.2800
  F1 Score: 0.2800

Korean:
  Precision: 0.3200
  Recall: 0.3200
  F1 Score: 0.3200

Pol

In [15]:
# Create a DataFrame for overall metrics
overall_df = pd.DataFrame([overall_metrics], index=['Overall'])

# Create a DataFrame for per-language metrics
per_lang_df = pd.DataFrame(per_language_metrics).T

# Combine both DataFrames
combined_metrics = pd.concat([overall_df, per_lang_df])

# Format the metrics to 4 decimal places
formatted_metrics = combined_metrics.round(4)

# Display the formatted table
print("\nPerformance Metrics:")
print(formatted_metrics.to_string())

# Save to CSV
formatted_metrics.to_csv('results_log2.csv')

# print("Results have been saved to results.csv")


Performance Metrics:
            precision  recall     f1
Overall         0.304   0.304  0.304
Japanese        0.048   0.048  0.048
Finnish         0.392   0.392  0.392
Galician        0.304   0.304  0.304
English         0.248   0.248  0.248
Hindi           0.264   0.264  0.264
French          0.304   0.304  0.304
Italian         0.264   0.264  0.264
Indonesian      0.328   0.328  0.328
Swedish         0.368   0.368  0.368
Spanish         0.328   0.328  0.328
Icelandic       0.272   0.272  0.272
German          0.296   0.296  0.296
Korean          0.328   0.328  0.328
Polish          0.312   0.312  0.312
Thai            0.312   0.312  0.312
Turkish         0.344   0.344  0.344
Czech           0.352   0.352  0.352
Chinese         0.256   0.256  0.256
Portuguese      0.344   0.344  0.344
Arabic          0.344   0.344  0.344
Russian         0.376   0.376  0.376


# Test

In [17]:
feature_columns = ['closeness', 'pagerank',
                   'is_articulation', 'betweenness', 'eigencentrality', 'eccentricity', 'is_japanese',
        'lang_group_head_final_sov', 'lang_group_romance_svo', 'lang_group_germanic_v2', 'lang_group_free_order_case', 'lang_group_analytic',
      'path_asymmetry', 'avg_neighbor_degree', 'diameter_path']

In [18]:
# Create final model with best parameters and generate predictions
print("\nTraining final model with best parameters...")
import pandas as pd
from sklearn.linear_model import LogisticRegression
train_data=pd.read_csv("/Users/elnararb/Documents/UPC/Machine Learning/Project_ML/data/train_random_processed.csv")
# Process training data
y_train = train_data['is_root']
X_train_processed = train_data.drop(columns=['is_root'])

# Initialize and train model
# Using pop() (does nothing if key is not present)
# trial.params.pop('config_type', None)

# Uncomment this if using Optuna
# final_model = LogisticRegression(**trial.params)

# Best parameters with C > 0.0000001
# final_model = LogisticRegression(
#     C=2.7145102022431556e-07, 
#     l1_ratio=0.8868773763606811,
#     penalty='elasticnet',
#     solver='saga',
#     class_weight='balanced',
#     max_iter=1000
# )

# Best parameters with C > 1
# final_model = LogisticRegression(
#     C=4.990410553642827, 
#     penalty='l2',
#     solver='liblinear',
#     class_weight={0: 1, 1: 10},
#     max_iter=2000
# )

final_model = LogisticRegression(
    C=0.06800570611214875, 
    penalty='l1',
    solver='liblinear',
    class_weight={0: 1, 1: 10},
    max_iter=2000
)

X_train_trans = feature_pipeline.fit_transform(X_train_processed)
# X_valid_trans = feature_pipeline.transform(X_valid)

final_model.fit(X_train_trans[feature_columns], y_train)


Training final model with best parameters...


In [19]:
import joblib
joblib.dump(final_model, 'logreg_final4.pkl')

['logreg_final4.pkl']

In [20]:
# Load test data
test_data = pd.read_csv('/Users/elnararb/Documents/UPC/Machine Learning/Project_ML/data/test_random_processed.csv')

test_scaled = feature_pipeline.transform(test_data)

# Generate predictions
test_proba = final_model.predict_proba(test_scaled[feature_columns])[:, 1]
test_data['root_probability'] = test_proba

# Create predictions ensuring one root per sentence
predictions = pd.Series(np.zeros(len(test_data)), index=test_data.index)
group_cols = ['language', 'sentence']

for _, group_df in test_data.groupby(group_cols):
    if not group_df.empty:
        idx_max = group_df['root_probability'].idxmax()
        predictions.loc[idx_max] = 1

In [16]:
def generate_submission_file(predictions_df, original_test_data, output_path='submission.csv'):
    """Generate the final submission file in the required format"""
    # Create a mapping from (language, sentence) to original ID
    sentence_lang_to_id = {}
    
    # Create mapping from (language, sentence) to id
    for _, row in original_test_data.drop_duplicates(['language', 'sentence']).iterrows():
        sentence_lang_to_id[(row['language'], row['sentence'])] = row['id']
    
    # Extract sentence IDs and predicted root vertices
    submission = []
    
    for (lang, sent), group in predictions_df.groupby(['language', 'sentence']):
        # Get the node with highest probability for this sentence and language
        top_node = group.sort_values('root_probability', ascending=False).iloc[0]
        
        # Use the original id
        original_id = sentence_lang_to_id.get((lang, sent))
        
        submission.append({
            'id': original_id,
            'root': int(top_node['vertex'])
        })
    
    # Create and sort submission dataframe
    submission_df = pd.DataFrame(submission)
    submission_df = submission_df.sort_values('id')
    
    # Save to CSV
    submission_df.to_csv(output_path, index=False)
    print(f"Submission saved to {output_path}")
    
    return submission_df

In [21]:

test_scaled = feature_pipeline.transform(test_data)

# Generate predictions
test_proba = final_model.predict_proba(test_scaled[feature_columns])[:, 1]
test_data['root_probability'] = test_proba

# Create submission dataframe
submission = []
for (lang, sent), group in test_data.groupby(['language', 'sentence']):
    # Get the node with highest probability for this sentence and language
    top_node = group.loc[group['root_probability'].idxmax()]
    submission.append({
        'id': top_node['id'],
        'root': int(top_node['vertex'])
    })

# Create and sort submission dataframe
submission_df = pd.DataFrame(submission)
submission_df = submission_df.sort_values('id')

# Save to CSV
submission_df.to_csv('submission.csv', index=False)
print("Submission saved to submission.csv")

Submission saved to submission.csv


In [22]:
# Method 1: Using coefficients directly
# Get feature names and coefficients
feature_names = feature_columns  # or X_train_processed.columns if using pandas
coefficients = final_model.coef_[0]  # Get coefficients for first class

# Create a DataFrame for better visualization
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients,
    'Abs_Coefficient': np.abs(coefficients)
})

# Sort by absolute coefficient value
feature_importance = feature_importance.sort_values('Abs_Coefficient', ascending=False)

# Display top features
print("\nTop 10 most important features:")
print(feature_importance.head(10))


Top 10 most important features:
                Feature  Coefficient  Abs_Coefficient
3           betweenness     3.865115         3.865115
0             closeness     3.157181         3.157181
1              pagerank     1.587637         1.587637
4       eigencentrality     1.082676         1.082676
2       is_articulation     0.984942         0.984942
5          eccentricity     0.804079         0.804079
12       path_asymmetry     0.276785         0.276785
6           is_japanese     0.256801         0.256801
14        diameter_path    -0.221502         0.221502
13  avg_neighbor_degree    -0.169559         0.169559


In [23]:
import pandas as pd
import numpy as np

# Assuming final_model, feature_columns, and preprocessing are already defined
feature_names = feature_columns  # or X_train_processed.columns if applicable
coefficients = final_model.coef_[0]  # Get coefficients for first class

# Calculate absolute coefficients and normalize them
abs_coefficients = np.abs(coefficients)
normalized_importance = abs_coefficients / abs_coefficients.sum()

# Create a DataFrame for feature importances
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients,
    'Abs_Coefficient': abs_coefficients,
    'Normalized_Importance': normalized_importance
})

# Sort by absolute coefficient value
feature_importance = feature_importance.sort_values('Abs_Coefficient', ascending=False)

# Display the top 10 most important features
print("\nTop 10 most important features:")
print(feature_importance.head(10))

# Optionally, print all features
print("\nAll feature importances (normalized):")
print(feature_importance.to_string(index=False))



Top 10 most important features:
                Feature  Coefficient  Abs_Coefficient  Normalized_Importance
3           betweenness     3.865115         3.865115               0.300591
0             closeness     3.157181         3.157181               0.245535
1              pagerank     1.587637         1.587637               0.123471
4       eigencentrality     1.082676         1.082676               0.084200
2       is_articulation     0.984942         0.984942               0.076599
5          eccentricity     0.804079         0.804079               0.062534
12       path_asymmetry     0.276785         0.276785               0.021526
6           is_japanese     0.256801         0.256801               0.019971
14        diameter_path    -0.221502         0.221502               0.017226
13  avg_neighbor_degree    -0.169559         0.169559               0.013187

All feature importances (normalized):
                   Feature  Coefficient  Abs_Coefficient  Normalized_Importance
 

In [None]:
# Check the intercept
print("Model intercept:", final_model.intercept_)

# Check the coefficients
print("\nCoefficients shape:", final_model.coef_.shape)
print("All coefficients are zero:", np.all(final_model.coef_ == 0))

# Let's see what predictions look like
predictions = final_model.predict(X_train_processed[feature_columns])
probabilities = final_model.predict_proba(X_train_processed[feature_columns])

print("\nPrediction probabilities for first 5 samples:")
print(probabilities[:50])

Model intercept: [-3.08824134]

Coefficients shape: (1, 14)
All coefficients are zero: False

Prediction probabilities for first 5 samples:
[[0.73267468 0.26732532]
 [0.93784242 0.06215758]
 [0.58101784 0.41898216]
 [0.93417023 0.06582977]
 [0.7072357  0.2927643 ]
 [0.27392727 0.72607273]
 [0.7577484  0.2422516 ]
 [0.94271431 0.05728569]
 [0.70641903 0.29358097]
 [0.63911198 0.36088802]
 [0.72072594 0.27927406]
 [0.35271236 0.64728764]
 [0.35538885 0.64461115]
 [0.54504052 0.45495948]
 [0.33849164 0.66150836]
 [0.67500845 0.32499155]
 [0.71386899 0.28613101]
 [0.93234136 0.06765864]
 [0.53991606 0.46008394]
 [0.75159328 0.24840672]
 [0.94195132 0.05804868]
 [0.82205325 0.17794675]
 [0.93558184 0.06441816]
 [0.70060199 0.29939801]
 [0.9303193  0.0696807 ]
 [0.63822662 0.36177338]
 [0.72391175 0.27608825]
 [0.34735074 0.65264926]
 [0.6279275  0.3720725 ]
 [0.91909163 0.08090837]
 [0.25819169 0.74180831]
 [0.6279275  0.3720725 ]
 [0.91909163 0.08090837]
 [0.90778737 0.09221263]
 [0.216410

In [None]:
# Check if all predictions are the same
unique_predictions = np.unique(predictions)
print("\nUnique predictions:", unique_predictions)

# Check class distribution
print("\nClass distribution in training data:")
print(pd.Series(y_train).value_counts(normalize=True))


Unique predictions: [0 1]

Class distribution in training data:
is_root
0    0.94683
1    0.05317
Name: proportion, dtype: float64
