In [None]:
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.preprocessing import PowerTransformer, MinMaxScaler, RobustScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn import set_config
import numpy as np
import pandas as pd
import optuna
from functools import partial

set_config(transform_output="pandas")
np.random.seed(42)


In [None]:
def one_root_f1_scorer(estimator, X_with_groups_and_features, y_true, feature_columns_for_model):
    """Custom scorer. X_with_groups_and_features should be a pandas DataFrame containing
       both group columns ('language', 'sentence') and the feature_columns_for_model."""

    if not isinstance(X_with_groups_and_features, pd.DataFrame):
        raise ValueError("X_with_groups_and_features must be a pandas DataFrame for this scorer version.")

    # Make predictions using only the specified feature columns
    X_for_predict = X_with_groups_and_features[feature_columns_for_model]
    try:
        proba_values = estimator.predict_proba(X_for_predict)[:, 1]
    except Exception as e:
        print(f"Error during predict_proba in scorer: {e}. Columns expected: {feature_columns_for_model}, Columns in X_for_predict: {X_for_predict.columns.tolist() if isinstance(X_for_predict, pd.DataFrame) else 'N/A (NumPy)'}")
        return 0.0 # Return a score that indicates failure

    # Add probabilities to a copy of the DataFrame to use its index for grouping and selection
    df_copy = X_with_groups_and_features.copy()
    df_copy['__proba__'] = proba_values

    y_pred = pd.Series(np.zeros(len(df_copy)), index=df_copy.index)
    group_cols = ['language', 'sentence']

    for _, group_df in df_copy.groupby(group_cols):
        if not group_df.empty:
            if '__proba__' in group_df.columns:
                idx_max = group_df['__proba__'].idxmax()
                y_pred.loc[idx_max] = 1
            else:
                # This case should not happen if proba_values were assigned correctly
                print(f"Warning: '__proba__' column missing in group_df during scoring.")
    
    # Ensure y_true and y_pred are numpy arrays for f1_score to avoid potential index issues
    return f1_score(y_true.values, y_pred.values)

In [None]:
def create_folds(df, n_folds, group_column):
    """Create folds for GroupKFold cross-validation"""
    df_copy = df.copy()
    df_copy['kfold'] = -1
    y = df_copy['is_root']
    groups = df_copy[group_column]

    kf = GroupKFold(n_splits=n_folds)

    for fold, (train_idx, val_idx) in enumerate(kf.split(X=df_copy, y=y, groups=groups)):
        df_copy.loc[val_idx, 'kfold'] = fold
        
    return df_copy

In [None]:
from sklearn.base import clone

def run_model(df_with_folds, fold, estimator, params, feature_columns, feature_pipeline=None, trial=None):
    """Run model training and evaluation for a single fold."""
    df_train = df_with_folds[df_with_folds.kfold != fold].reset_index(drop=True)
    df_valid = df_with_folds[df_with_folds.kfold == fold].reset_index(drop=True)

    X_train_original = df_train.drop(columns=['is_root', 'kfold'])
    X_valid_original = df_valid.drop(columns=['is_root', 'kfold'])
    y_train = df_train['is_root']
    y_valid = df_valid['is_root']

    X_train_processed = X_train_original
    X_valid_processed = X_valid_original
    
    current_feature_columns = list(feature_columns)

    if feature_pipeline:
        X_train_processed = feature_pipeline.fit_transform(X_train_original)
        X_valid_processed = feature_pipeline.transform(X_valid_original)
        
        if not isinstance(X_train_processed, pd.DataFrame) and hasattr(feature_pipeline, 'get_feature_names_out'):
            try:
                current_feature_columns = feature_pipeline.get_feature_names_out()
                X_train_processed = pd.DataFrame(X_train_processed, columns=current_feature_columns)
                X_valid_processed = pd.DataFrame(X_valid_processed, columns=current_feature_columns)
            except Exception as e:
                if trial: trial.set_user_attr(f'fold_{fold}_feature_pipeline_error', str(e))
                print(f"Fold {fold} error getting feature names from pipeline: {e}")
                return float('nan')
        elif not isinstance(X_train_processed, pd.DataFrame):
            X_train_processed = pd.DataFrame(X_train_processed, columns=feature_columns)
            X_valid_processed = pd.DataFrame(X_valid_processed, columns=feature_columns)

    model = clone(estimator)
    model.set_params(**params) # random_state is already in params from objective
    
    try:
        model.fit(X_train_processed[current_feature_columns], y_train)
    except Exception as e:
        if trial: trial.set_user_attr(f'fold_{fold}_fit_error', str(e))
        print(f"Fold {fold} model fitting failed: {str(e)}. Features used: {current_feature_columns}")
        return float('nan')

    try:
        score = one_root_f1_scorer(
            model,
            X_valid_processed,
            y_valid,
            feature_columns_for_model=current_feature_columns
        )
    except Exception as e:
        if trial: trial.set_user_attr(f'fold_{fold}_score_error', str(e))
        print(f"Fold {fold} scoring failed: {str(e)}")
        return float('nan')
        
    return score

In [None]:

def evaluate_model_performance(model, X_with_groups_and_features, y_true, feature_columns_for_model):
    if not isinstance(X_with_groups_and_features, pd.DataFrame):
        raise ValueError("X_with_groups_and_features must be a pandas DataFrame for this scorer version.")

    # Make predictions using only the specified feature columns
    X_for_predict = X_with_groups_and_features[feature_columns_for_model]
    try:
        proba_values = model.predict_proba(X_for_predict)[:, 1]
    except Exception as e:
        print(f"Error during predict_proba in evaluation: {e}")
        return {'precision': 0.0, 'recall': 0.0, 'f1': 0.0}, {}

    # Add probabilities to a copy of the DataFrame
    df_copy = X_with_groups_and_features.copy()
    df_copy['__proba__'] = proba_values

    # Initialize predictions
    y_pred = pd.Series(np.zeros(len(df_copy)), index=df_copy.index)
    group_cols = ['language', 'sentence']

    # Ensure one root per sentence
    for _, group_df in df_copy.groupby(group_cols):
        if not group_df.empty:
            if '__proba__' in group_df.columns:
                idx_max = group_df['__proba__'].idxmax()
                y_pred.loc[idx_max] = 1
            else:
                print(f"Warning: '__proba__' column missing in group_df during evaluation.")

    # Calculate overall metrics
    overall_metrics = {
        'precision': precision_score(y_true.values, y_pred.values),
        'recall': recall_score(y_true.values, y_pred.values),
        'f1': f1_score(y_true.values, y_pred.values)
    }

    # Calculate per-language metrics
    per_language_metrics = {}
    for lang in X_with_groups_and_features['language'].unique():
        lang_mask = X_with_groups_and_features['language'] == lang
        lang_true = y_true[lang_mask]
        lang_pred = y_pred[lang_mask]
        
        # Only calculate metrics if there are samples for this language
        if len(lang_true) > 0:
            per_language_metrics[lang] = {
                'precision': precision_score(lang_true, lang_pred),
                'recall': recall_score(lang_true, lang_pred),
                'f1': f1_score(lang_true, lang_pred)
            }

    return overall_metrics, per_language_metrics

In [None]:
def objective(trial, df, feature_columns, feature_pipeline=None):
    try:
        # First decide which configuration to use
        config_type = trial.suggest_categorical('config_type', ['l1_l2', 'elasticnet'])
        # config_type = trial.suggest_categorical('config_type', ['l1_l2'])
        
        if config_type == 'l1_l2':
            params = {
                'penalty': trial.suggest_categorical('penalty', ['l2']),
                'C': trial.suggest_float('C', 0.0000001, 100, log=True),
                'solver': trial.suggest_categorical('solver', ['liblinear', 'saga']),
                'class_weight': trial.suggest_categorical('class_weight', ['balanced', {0:1, 1:10}]),
                'max_iter': 2000,
                'l1_ratio': None
            } 
        else:  # elasticnet
            params = {
                'penalty': 'elasticnet',
                'C': trial.suggest_float('C', 0.0000001, 0.01, log=True),
                'solver': 'saga',
                'l1_ratio': trial.suggest_float('l1_ratio', 0.1, 0.9),
                'class_weight': 'balanced',
                'max_iter': 1000
            }
        
        n_folds = 4
        df_with_folds = create_folds(df, n_folds, group_column='sentence')
        scores = []

        for fold in range(n_folds):
            trial.set_user_attr(f'fold_{fold+1}_status', 'starting')
            try:
                score = run_model(
                    df_with_folds, 
                    fold, 
                    LogisticRegression(), 
                    params, 
                    feature_columns, 
                    feature_pipeline, 
                    trial=trial
                )
                
                if np.isnan(score):
                    trial.set_user_attr(f'fold_{fold+1}_status', 'returned_nan')
                    print(f"Trial {trial.number} Fold {fold+1} returned NaN.")
                else:
                    scores.append(score)
                    trial.set_user_attr(f'fold_{fold+1}_score', score)
                    trial.set_user_attr(f'fold_{fold+1}_status', 'completed')

                # Report intermediate value for pruning
                trial.report(np.mean(scores), fold)

            except Exception as e:
                print(f"Trial {trial.number} Fold {fold+1} failed with exception: {str(e)}")
                trial.set_user_attr(f'fold_{fold+1}_status', f'failed_exception: {str(e)}')
                continue
        
        if not scores:
            trial.set_user_attr('pruned_reason', 'No valid scores obtained')
            raise optuna.TrialPruned()
        
        mean_score = np.mean([s for s in scores if not np.isnan(s)])
        if np.isnan(mean_score):
            print(f"Trial {trial.number} resulted in all NaN scores, returning -inf.")
            return float('-inf')
            
        return mean_score
    except Exception as e:
        # print(f"Trial {trial.number} failed completely with error: {str(e)}")
        trial.set_user_attr('complete_failure', str(e))
        return float('-inf')

In [None]:
# Load data
train_data = pd.read_csv('../../data/train_processed_random.csv')

In [None]:
feature_columns = ['closeness', 'degree', 'pagerank', 'voterank','is_articulation', 'betweenness', 'eigencentrality', 'eccentricity', 'is_japanese',
        'lang_group_head_final_sov', 'lang_group_romance_svo', 'lang_group_germanic_v2', 'lang_group_free_order_case', 'lang_group_analytic']

# Create study 
study = optuna.create_study(
    direction='maximize'
)

# Create objective function with fixed parameters
objective_func = partial(
    objective, 
    df=train_data, 
    feature_columns=feature_columns,
    feature_pipeline=None
)

# Optimize with more trials
study.optimize(objective_func, n_trials=50, show_progress_bar=True) 

# Print results
print("\nBest trial:")
trial = study.best_trial
print(f"  Value (F1 Score): {trial.value}")
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

# Validation

In [60]:
n_folds = 4
df_with_folds = create_folds(train_data, n_folds, group_column='sentence')

# Use the last fold as validation set
val_fold = n_folds - 1
df_train = df_with_folds[df_with_folds.kfold != val_fold].reset_index(drop=True)
df_valid = df_with_folds[df_with_folds.kfold == val_fold].reset_index(drop=True)

X_valid = df_valid.drop(columns=['is_root', 'kfold'])
y_valid = df_valid['is_root']

# Uncomment this if using Optuna
# best_model = LogisticRegression(**trial.params)

# Best parameters with C > 0.0000001
best_model = LogisticRegression(
    C=2.7145102022431556e-07, 
    l1_ratio=0.8868773763606811,
    penalty='elasticnet',
    solver='saga',
    class_weight='balanced',
    max_iter=1000
)

# Best parameters with C > 1
# best_model = LogisticRegression(
#     C=99.86981387827636, 
#     penalty='l2',
#     solver='liblinear',
#     class_weight={0: 1, 1: 10},
#     max_iter=1000
# )

feature_columns=['closeness', 'degree', 'pagerank', 'voterank','is_articulation', 'betweenness', 'eigencentrality', 'eccentricity', 'is_japanese',
        'lang_group_head_final_sov', 'lang_group_romance_svo', 'lang_group_germanic_v2', 'lang_group_free_order_case', 'lang_group_analytic']

best_model.fit(df_train[feature_columns], df_train['is_root'])

# Evaluate on validation set
print("\nModel Performance (Validation Set):")
overall_metrics, per_language_metrics = evaluate_model_performance(
    best_model, 
    X_valid, 
    y_valid,
    feature_columns
)

print("\nOverall Performance:")
print(f"Precision: {overall_metrics['precision']:.4f}")
print(f"Recall: {overall_metrics['recall']:.4f}")
print(f"F1 Score: {overall_metrics['f1']:.4f}")

print("\nPer-Language Performance:")
for lang, metrics in per_language_metrics.items():
    print(f"\n{lang}:")
    print(f"  Precision: {metrics['precision']:.4f}")
    print(f"  Recall: {metrics['recall']:.4f}")
    print(f"  F1 Score: {metrics['f1']:.4f}")


Model Performance (Validation Set):

Overall Performance:
Precision: 0.5810
Recall: 0.5810
F1 Score: 0.5810

Per-Language Performance:

Japanese:
  Precision: 0.0800
  Recall: 0.0800
  F1 Score: 0.0800

Finnish:
  Precision: 0.5440
  Recall: 0.5440
  F1 Score: 0.5440

Galician:
  Precision: 0.6080
  Recall: 0.6080
  F1 Score: 0.6080

English:
  Precision: 0.6240
  Recall: 0.6240
  F1 Score: 0.6240

Hindi:
  Precision: 0.1440
  Recall: 0.1440
  F1 Score: 0.1440

French:
  Precision: 0.5520
  Recall: 0.5520
  F1 Score: 0.5520

Italian:
  Precision: 0.6160
  Recall: 0.6160
  F1 Score: 0.6160

Indonesian:
  Precision: 0.8800
  Recall: 0.8800
  F1 Score: 0.8800

Swedish:
  Precision: 0.7120
  Recall: 0.7120
  F1 Score: 0.7120

Spanish:
  Precision: 0.5920
  Recall: 0.5920
  F1 Score: 0.5920

Icelandic:
  Precision: 0.7760
  Recall: 0.7760
  F1 Score: 0.7760

German:
  Precision: 0.6400
  Recall: 0.6400
  F1 Score: 0.6400

Korean:
  Precision: 0.2160
  Recall: 0.2160
  F1 Score: 0.2160

Pol

In [61]:
# Create a DataFrame for overall metrics
overall_df = pd.DataFrame([overall_metrics], index=['Overall'])

# Create a DataFrame for per-language metrics
per_lang_df = pd.DataFrame(per_language_metrics).T

# Combine both DataFrames
combined_metrics = pd.concat([overall_df, per_lang_df])

# Format the metrics to 4 decimal places
formatted_metrics = combined_metrics.round(4)

# Display the formatted table
print("\nPerformance Metrics:")
print(formatted_metrics.to_string())

# Save to CSV
# formatted_metrics.to_csv('results.csv')

# print("Results have been saved to results.csv")


Performance Metrics:
            precision  recall     f1
Overall         0.581   0.581  0.581
Japanese        0.080   0.080  0.080
Finnish         0.544   0.544  0.544
Galician        0.608   0.608  0.608
English         0.624   0.624  0.624
Hindi           0.144   0.144  0.144
French          0.552   0.552  0.552
Italian         0.616   0.616  0.616
Indonesian      0.880   0.880  0.880
Swedish         0.712   0.712  0.712
Spanish         0.592   0.592  0.592
Icelandic       0.776   0.776  0.776
German          0.640   0.640  0.640
Korean          0.216   0.216  0.216
Polish          0.760   0.760  0.760
Thai            0.944   0.944  0.944
Turkish         0.288   0.288  0.288
Czech           0.680   0.680  0.680
Chinese         0.248   0.248  0.248
Portuguese      0.576   0.576  0.576
Arabic          0.928   0.928  0.928
Russian         0.792   0.792  0.792


# Test

In [62]:
# Create final model with best parameters and generate predictions
print("\nTraining final model with best parameters...")

# Process training data
y_train = train_data['is_root']
X_train_processed = train_data.drop(columns=['is_root'])

# Initialize and train model
# Using pop() (does nothing if key is not present)
trial.params.pop('config_type', None)

# Uncomment this if using Optuna
# final_model = LogisticRegression(**trial.params)

# Best parameters with C > 0.0000001
final_model = LogisticRegression(
    C=2.7145102022431556e-07, 
    l1_ratio=0.8868773763606811,
    penalty='elasticnet',
    solver='saga',
    class_weight='balanced',
    max_iter=1000
)

# Best parameters with C > 1
final_model = LogisticRegression(
    C=99.86981387827636, 
    penalty='l2',
    solver='liblinear',
    class_weight={0: 1, 1: 10},
    max_iter=1000
)

final_model.fit(X_train_processed[feature_columns], y_train)


Training final model with best parameters...


In [63]:
# Load test data
test_data = pd.read_csv('../../data/test_processed_random.csv')

# Generate predictions
test_proba = final_model.predict_proba(test_data[feature_columns])[:, 1]
test_data['root_probability'] = test_proba

# Create predictions ensuring one root per sentence
predictions = pd.Series(np.zeros(len(test_data)), index=test_data.index)
group_cols = ['language', 'sentence']

for _, group_df in test_data.groupby(group_cols):
    if not group_df.empty:
        idx_max = group_df['root_probability'].idxmax()
        predictions.loc[idx_max] = 1

In [64]:
def generate_submission_file(predictions_df, original_test_data, output_path='submission.csv'):
    """Generate the final submission file in the required format"""
    # Create a mapping from (language, sentence) to original ID
    sentence_lang_to_id = {}
    
    # Create mapping from (language, sentence) to id
    for _, row in original_test_data.drop_duplicates(['language', 'sentence']).iterrows():
        sentence_lang_to_id[(row['language'], row['sentence'])] = row['id']
    
    # Extract sentence IDs and predicted root vertices
    submission = []
    
    for (lang, sent), group in predictions_df.groupby(['language', 'sentence']):
        # Get the node with highest probability for this sentence and language
        top_node = group.sort_values('root_probability', ascending=False).iloc[0]
        
        # Use the original id
        original_id = sentence_lang_to_id.get((lang, sent))
        
        submission.append({
            'id': original_id,
            'root': int(top_node['vertex'])
        })
    
    # Create and sort submission dataframe
    submission_df = pd.DataFrame(submission)
    submission_df = submission_df.sort_values('id')
    
    # Save to CSV
    submission_df.to_csv(output_path, index=False)
    print(f"Submission saved to {output_path}")
    
    return submission_df

In [65]:
# Generate predictions
test_proba = final_model.predict_proba(test_data[feature_columns])[:, 1]
test_data['root_probability'] = test_proba

# Create submission dataframe
submission = []
for (lang, sent), group in test_data.groupby(['language', 'sentence']):
    # Get the node with highest probability for this sentence and language
    top_node = group.loc[group['root_probability'].idxmax()]
    submission.append({
        'id': top_node['id'],
        'root': int(top_node['vertex'])
    })

# Create and sort submission dataframe
submission_df = pd.DataFrame(submission)
submission_df = submission_df.sort_values('id')

# Save to CSV
submission_df.to_csv('submission.csv', index=False)
print("Submission saved to submission.csv")

Submission saved to submission.csv


In [66]:
# Method 1: Using coefficients directly
# Get feature names and coefficients
feature_names = feature_columns  # or X_train_processed.columns if using pandas
coefficients = final_model.coef_[0]  # Get coefficients for first class

# Create a DataFrame for better visualization
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients,
    'Abs_Coefficient': np.abs(coefficients)
})

# Sort by absolute coefficient value
feature_importance = feature_importance.sort_values('Abs_Coefficient', ascending=False)

# Display top features
print("\nTop 10 most important features:")
print(feature_importance.head(10))


Top 10 most important features:
                       Feature  Coefficient  Abs_Coefficient
2                     pagerank    39.331878        39.331878
1                       degree   -17.742186        17.742186
5                  betweenness     3.326968         3.326968
6              eigencentrality     1.429287         1.429287
0                    closeness     1.394055         1.394055
3                     voterank     0.712704         0.712704
4              is_articulation     0.480440         0.480440
8                  is_japanese     0.300659         0.300659
11      lang_group_germanic_v2    -0.154276         0.154276
12  lang_group_free_order_case    -0.121119         0.121119


In [67]:
# Check the intercept
print("Model intercept:", final_model.intercept_)

# Check the coefficients
print("\nCoefficients shape:", final_model.coef_.shape)
print("All coefficients are zero:", np.all(final_model.coef_ == 0))

# Let's see what predictions look like
predictions = final_model.predict(X_train_processed[feature_columns])
probabilities = final_model.predict_proba(X_train_processed[feature_columns])

print("\nPrediction probabilities for first 5 samples:")
print(probabilities[:50])

Model intercept: [-3.08824134]

Coefficients shape: (1, 14)
All coefficients are zero: False

Prediction probabilities for first 5 samples:
[[0.73267468 0.26732532]
 [0.93784242 0.06215758]
 [0.58101784 0.41898216]
 [0.93417023 0.06582977]
 [0.7072357  0.2927643 ]
 [0.27392727 0.72607273]
 [0.7577484  0.2422516 ]
 [0.94271431 0.05728569]
 [0.70641903 0.29358097]
 [0.63911198 0.36088802]
 [0.72072594 0.27927406]
 [0.35271236 0.64728764]
 [0.35538885 0.64461115]
 [0.54504052 0.45495948]
 [0.33849164 0.66150836]
 [0.67500845 0.32499155]
 [0.71386899 0.28613101]
 [0.93234136 0.06765864]
 [0.53991606 0.46008394]
 [0.75159328 0.24840672]
 [0.94195132 0.05804868]
 [0.82205325 0.17794675]
 [0.93558184 0.06441816]
 [0.70060199 0.29939801]
 [0.9303193  0.0696807 ]
 [0.63822662 0.36177338]
 [0.72391175 0.27608825]
 [0.34735074 0.65264926]
 [0.6279275  0.3720725 ]
 [0.91909163 0.08090837]
 [0.25819169 0.74180831]
 [0.6279275  0.3720725 ]
 [0.91909163 0.08090837]
 [0.90778737 0.09221263]
 [0.216410

In [68]:
# Check if all predictions are the same
unique_predictions = np.unique(predictions)
print("\nUnique predictions:", unique_predictions)

# Check class distribution
print("\nClass distribution in training data:")
print(pd.Series(y_train).value_counts(normalize=True))


Unique predictions: [0 1]

Class distribution in training data:
is_root
0    0.94683
1    0.05317
Name: proportion, dtype: float64
