In [None]:
from sklearn.model_selection import GroupKFold
from sklearn.base import clone
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

from itertools import product
def one_root_f1_scorer(estimator, X, y_true, feature_columns=None):
    """F1 score after forcing exactly one predicted root per (language, sentence).

    Within every (language, sentence) group the row with the highest
    positive-class probability is predicted as 1 and all other rows as 0.

    Parameters
    ----------
    estimator : fitted classifier exposing ``predict_proba``
    X : pd.DataFrame
        Must contain the ``language`` and ``sentence`` columns plus the
        feature columns. Any index is accepted; rows are handled by position.
    y_true : array-like
        Ground-truth labels, in the same row order as ``X``.
    feature_columns : list, optional
        Columns handed to the estimator. When omitted, every column other
        than ``language`` and ``sentence`` is used.

    Returns
    -------
    float
        ``f1_score(y_true, y_pred)`` for the one-root-per-group prediction.

    Raises
    ------
    ValueError
        If ``X`` is not a DataFrame.
    """
    if not isinstance(X, pd.DataFrame):
        raise ValueError("X must be a pandas DataFrame with language and sentence columns")

    group_cols = ['language', 'sentence']  # Columns defining our groups

    # Passing None straight into X[...] would raise; fall back to the non-group columns
    if feature_columns is None:
        feature_columns = [col for col in X.columns if col not in group_cols]

    # Probability of the positive class for every row
    proba = estimator.predict_proba(X[feature_columns])[:, 1]

    # Group on a positionally indexed copy of the keys so the group indices are
    # valid offsets into `proba` / `y_pred` whatever index X carries.
    y_pred = np.zeros(len(X))
    keys = X[group_cols].reset_index(drop=True)
    for _, group in keys.groupby(group_cols):
        positions = group.index.to_numpy()
        y_pred[positions[np.argmax(proba[positions])]] = 1

    return f1_score(y_true, y_pred)

def create_folds(df, n_folds, group_column):
    """Return a copy of ``df`` with a ``kfold`` column assigned by GroupKFold.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the target column ``is_root`` and ``group_column``.
    n_folds : int
        Number of GroupKFold splits.
    group_column : str
        Column whose values define the groups that are kept together in a fold.

    Returns
    -------
    pd.DataFrame
        Copy of ``df`` (index untouched) with an integer ``kfold`` column
        holding the validation fold of each row.
    """
    df = df.copy()
    y = df['is_root']
    groups = df[group_column]

    kf = GroupKFold(n_splits=n_folds)

    # split() yields *positional* row indices, so collect the fold ids in a
    # plain array instead of writing through label-based .loc, which would
    # mis-assign (or append) rows when df does not have a 0..n-1 index.
    fold_ids = np.full(len(df), -1, dtype=np.int64)
    for fold, (_, val_idx) in enumerate(kf.split(X=df, y=y, groups=groups)):
        fold_ids[val_idx] = fold

    df['kfold'] = fold_ids
    return df

# Main grid search logic
def custom_grid_search(df, estimator, param_grid, feature_columns, 
                      group_column='sentence', scoring_columns=['language', 'sentence'],
                      n_folds=5, scoring=one_root_f1_scorer, feature_pipeline=None):
    """
    Exhaustive grid search with GroupKFold and per-fold feature transformations.

    For every parameter combination and every fold, the feature pipeline is
    fitted on the training rows, the estimator is fitted on the transformed
    training features, and the fold is scored on the *transformed* validation
    features.

    Parameters:
    -----------
    df: pd.DataFrame
        Contains all data including features and the target column 'is_root'
    estimator: sklearn estimator
        Cloned before every fit; the instance passed in is not fitted
    param_grid: dict
        Maps parameter name to the list of values to try
    feature_columns: list
        Columns to use as features
    group_column: str
        Column to use for grouping in GroupKFold
    scoring_columns: list
        Columns needed for custom scoring. Not consulted here: the scorer
        receives the whole validation frame, so these columns only have to
        be present in `df`
    n_folds: int
        Number of cross-validation folds
    scoring: callable
        Called as scoring(estimator, X_valid_frame, y_valid, feature_columns=...)
    feature_pipeline: sklearn Pipeline
        Pipeline for feature transformations; re-fitted on every training fold

    Returns:
    --------
    dict
        'best_params' (None when every combination failed) and 'best_score'
        (mean score over the folds that fitted successfully)
    """
    # Create folds
    df = create_folds(df, n_folds, group_column)

    best_score = -np.inf
    best_params = None

    # Generate all parameter combinations
    param_combinations = [dict(zip(param_grid.keys(), v))
                          for v in product(*param_grid.values())]

    for params in param_combinations:
        fold_scores = []
        print(f'Checking params:{params}')

        for fold in range(n_folds):
            print(f'Running fold:{fold+1}')
            # Split train/valid, drop the fold marker, reset to default integer index
            df_train = df[df.kfold != fold].reset_index(drop=True).drop(columns=['kfold'])
            df_valid = df[df.kfold == fold].reset_index(drop=True).drop(columns=['kfold'])

            # Separate features and target
            X_train_no_target = df_train.drop(columns=['is_root'])
            X_valid_no_target = df_valid.drop(columns=['is_root'])
            y_train = df_train['is_root']
            y_valid = df_valid['is_root']

            # Apply feature transformations (fit on the training fold only)
            if feature_pipeline is not None:
                X_train_trans = feature_pipeline.fit_transform(X_train_no_target)
                X_valid_trans = feature_pipeline.transform(X_valid_no_target)
            else:
                X_train_trans = X_train_no_target
                X_valid_trans = X_valid_no_target

            # If output is not a DataFrame, convert and restore index and column names
            if not isinstance(X_train_trans, pd.DataFrame):
                X_train_trans = pd.DataFrame(
                    X_train_trans,
                    index=X_train_no_target.index,
                    columns=feature_columns
                )
                X_valid_trans = pd.DataFrame(
                    X_valid_trans,
                    index=X_valid_no_target.index,
                    columns=feature_columns
                )

            # Select only feature columns (in case transformers add extra columns)
            X_train = X_train_trans[feature_columns]
            X_valid = X_valid_trans[feature_columns]

            # Fit a fresh clone so parameter settings never leak between runs
            current_estimator = clone(estimator)
            try:
                current_estimator.set_params(**params)
                current_estimator.fit(X_train, y_train)
            except Exception as e:
                print(f"Skipping invalid combo {params}: {str(e)}")
                continue

            # The scorer needs the grouping columns, so it gets the full
            # validation frame -- but with the feature columns replaced by their
            # transformed values. Scoring on the raw frame would feed the
            # estimator inputs on a different scale than it was trained on.
            valid_for_scoring = df_valid.copy()
            for col in feature_columns:
                valid_for_scoring[col] = X_valid[col].to_numpy()

            score = scoring(current_estimator, valid_for_scoring, y_valid,
                            feature_columns=feature_columns)
            fold_scores.append(score)

        if not fold_scores:
            print(f"All folds failed for params {params}, skipping this combination.")
            continue  # Skip to next parameter combination

        mean_score = np.mean(fold_scores)

        if mean_score > best_score:
            best_score = mean_score
            best_params = params

    return {
        'best_params': best_params,
        'best_score': best_score
    }

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PowerTransformer, MinMaxScaler, RobustScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from sklearn.model_selection import GroupKFold, GridSearchCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import PowerTransformer, RobustScaler

from sklearn.compose import ColumnTransformer
import pandas as pd
from sklearn.compose import make_column_selector
from sklearn import set_config
set_config(transform_output="pandas")

class DebugTransformer(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints the data flowing through it.

    Parameters
    ----------
    name : str
        Label included in every printed line so that several debug steps in
        one pipeline can be told apart.
    """
    def __init__(self, name=""):
        self.name = name

    def _report(self, stage, X):
        """Print column names, dtypes and the first rows of ``X`` for ``stage``."""
        print(f"\n[{self.name}] {stage} - Columns: {X.columns.tolist()}")
        print(f"\n[{self.name}] {stage} - Dtypes:\n{X.dtypes}")
        print(f"[{self.name}] Sample data:\n{X.head()}")

    def fit(self, X, y=None):
        """Report the training data; nothing is learned."""
        self._report("Fit", X)
        return self

    def transform(self, X):
        """Report the data and return it unchanged."""
        self._report("Transform", X)
        return X
    
class DataFrameColumnTransformer(ColumnTransformer):
    """ColumnTransformer that returns a DataFrame with clean names and sane dtypes.

    For DataFrame input the result keeps the input index, drops the
    ``<transformer>__`` prefix from the output names and fixes dtypes:

    * passed-through columns (and transformed columns whose input was
      non-numeric) are cast back to their input dtype;
    * transformed numeric columns keep the dtype the transformer produced --
      casting them back to e.g. an integer input dtype would truncate the
      transformed values;
    * columns that do not exist in the input are converted with
      ``pd.to_numeric(..., errors='coerce')``.

    Non-DataFrame input is returned exactly as the parent class produces it.
    """

    def transform(self, X):
        """Transform ``X`` and post-process the result as described on the class."""
        return self._restore_frame(X, super().transform(X))

    def fit_transform(self, X, y=None):
        """Fit on ``X``, transform it and post-process the result."""
        return self._restore_frame(X, super().fit_transform(X, y))

    def _passthrough_names(self, clean_names):
        """Return the output names produced by 'passthrough' transformers."""
        names = set()
        for name, trans, _ in self.transformers_:
            if isinstance(trans, str) and trans == 'passthrough':
                # output_indices_ maps a transformer name to its slice of the output
                block = self.output_indices_.get(name, slice(0, 0))
                names.update(clean_names[block])
        return names

    def _restore_frame(self, X, X_trans):
        """Shared post-processing for transform() and fit_transform()."""
        if not isinstance(X, pd.DataFrame):
            return X_trans

        feature_names = self.get_feature_names_out()
        # Remove transformer prefixes
        clean_names = [name.split('__')[-1] for name in feature_names]

        if isinstance(X_trans, pd.DataFrame):
            # Rename positionally; re-selecting by the cleaned names would
            # yield all-NaN columns whenever the output names carry a prefix.
            df_out = X_trans.copy()
            df_out.columns = clean_names
            df_out.index = X.index
        else:
            df_out = pd.DataFrame(X_trans, columns=clean_names, index=X.index)

        passthrough = self._passthrough_names(clean_names)
        for col in clean_names:
            if col not in X.columns:
                df_out[col] = pd.to_numeric(df_out[col], errors='coerce')
            elif col in passthrough or not pd.api.types.is_numeric_dtype(X[col].dtype):
                # Values are unchanged (or non-numeric): the input dtype is safe to restore
                df_out[col] = df_out[col].astype(X[col].dtype, errors='ignore')
            elif pd.api.types.is_object_dtype(df_out[col].dtype):
                # Transformed numeric column that came back as object (mixed ndarray output)
                df_out[col] = pd.to_numeric(df_out[col], errors='coerce')
        return df_out
    
def safe_log1p(x):
    """Apply ``np.log1p`` and always hand back a two-dimensional result.

    A DataFrame stays a DataFrame, a Series becomes a one-column DataFrame,
    and anything else is converted to an ndarray, with 1-D input reshaped
    into a single column.
    """
    if isinstance(x, pd.Series):
        return np.log1p(x).to_frame()
    if isinstance(x, pd.DataFrame):
        return np.log1p(x)

    values = np.asarray(x)
    logged = np.log1p(values)
    return logged.reshape(-1, 1) if values.ndim == 1 else logged

class SentenceGroupScaler(BaseEstimator, TransformerMixin):
    """Min-max scale selected features separately inside every group.

    Groups are the distinct value combinations of ``group_columns``. A fresh
    ``MinMaxScaler`` is fitted on each group at transform time, so the
    transformer keeps no fitted state.
    """

    def __init__(self, features_to_scale, group_columns=['language', 'sentence']):
        self.features_to_scale = features_to_scale
        self.group_columns = group_columns

    def fit(self, X, y=None):
        """Nothing to learn; present for pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``features_to_scale`` rescaled per group."""
        X = X.copy()
        columns = self.features_to_scale
        # Cast up front so the scaled float values can be written back
        X[columns] = X[columns].astype(np.float64)

        for _, members in X.groupby(self.group_columns):
            X.loc[members.index, columns] = MinMaxScaler().fit_transform(members[columns])
        return X

    def fit_transform(self, X, y=None):
        """Identical to ``transform``; ``y`` is ignored."""
        return self.transform(X)
    
def create_feature_pipeline(boxcox_features, log_features, sentence_scaled_features,
                            global_scale=False):
    """Build the feature-preprocessing pipeline.

    Steps, in order:

    1. ``boxcox``: Box-Cox ``PowerTransformer`` on ``boxcox_features``
       (scikit-learn requires strictly positive values for Box-Cox).
    2. ``log``: ``safe_log1p`` on ``log_features``.
    3. ``sentence_scale``: per-group min-max scaling of
       ``sentence_scaled_features`` via ``SentenceGroupScaler``.
    4. ``global_scale`` (only when ``global_scale=True``): a ``MinMaxScaler``
       over ``boxcox_features + log_features``.

    Columns not named in a step are passed through unchanged.

    Parameters
    ----------
    boxcox_features, log_features, sentence_scaled_features : list of str
        Column names routed to the respective step.
    global_scale : bool, default False
        Append the optional global min-max scaling step.

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    # Options shared by every column-wise step
    column_step_options = dict(remainder='passthrough', verbose_feature_names_out=False)

    steps = [
        ('boxcox', DataFrameColumnTransformer(
            [('boxcox', PowerTransformer(method='box-cox'), boxcox_features)],
            **column_step_options
        )),
        ('log', DataFrameColumnTransformer(
            [('log', FunctionTransformer(safe_log1p, feature_names_out="one-to-one"), log_features)],
            **column_step_options
        )),
        ('sentence_scale', SentenceGroupScaler(
            features_to_scale=sentence_scaled_features
        )),
    ]

    if global_scale:
        steps.append(('global_scale', DataFrameColumnTransformer(
            [('scale', MinMaxScaler(), boxcox_features + log_features)],
            **column_step_options
        )))

    return Pipeline(steps)

In [None]:
# Feature groups, one list per transformation
boxcox_features = ['closeness', 'degree', 'pagerank', 'eccentricity']
log_features = []
# Scaled within each sentence group ('subtree_size' is currently left out)
sentence_scaled_features = ['eccentricity']

# Assemble the preprocessing pipeline from the groups above
feature_pipeline = create_feature_pipeline(
    boxcox_features,
    log_features,
    sentence_scaled_features,
)

In [None]:
# Load the processed training file and preview its first rows
train_data = pd.read_csv('../data/train_processed.csv')
train_data.head()

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from itertools import product

# Base LDA estimator with default settings
lda = LinearDiscriminantAnalysis()

# Search space: each solver is combined with each shrinkage setting
param_grid = {
    'solver': ['lsqr', 'eigen'],
    'shrinkage': [None, 'auto', 0.1, 0.5, 0.9],
}


In [None]:
# Drop the Japanese rows and renumber the index from zero
train_data = train_data.loc[train_data['language'] != 'Japanese'].reset_index(drop=True)

# Grid search with sentence-grouped folds and the one-root-per-sentence scorer
results = custom_grid_search(
    df=train_data,
    estimator=lda,
    param_grid=param_grid,
    feature_columns=(
        boxcox_features
        + log_features
        + ['is_articulation', 'betweenness', 'eigencentrality']
    ),
    group_column='sentence',
    scoring_columns=['language', 'sentence'],
    n_folds=5,
    scoring=one_root_f1_scorer,
    feature_pipeline=feature_pipeline,
)

# Report the winning configuration
print(f"Best F1 Score: {results['best_score']:.4f}")
print(f"Best Params: {results['best_params']}")

## Submission File

In [None]:
# Columns the final model is trained on
feature_columns = (
    boxcox_features
    + log_features
    + ['is_articulation', 'betweenness', 'eigencentrality']
)

# 1. Split the full training data into inputs and target
y_train_full = train_data['is_root']
X_train_full = train_data.drop(columns=['is_root'])

# 2. Fit the feature pipeline on the full training inputs and transform them
X_train_full_trans = feature_pipeline.fit_transform(X_train_full)
X_train_full_mod = X_train_full_trans[feature_columns]

# 3. Fit the final model (hyper-parameters are fixed by hand here)
model = LinearDiscriminantAnalysis(solver='lsqr', shrinkage=0.5)
model.fit(X_train_full_mod, y_train_full)

In [None]:
# Unprocessed test file (not referenced again in the cells shown in this notebook)
original_test_data = pd.read_csv('../data/test.csv')
# Processed test file, used for prediction
test_data = pd.read_csv('../data/test_processed.csv')

In [None]:
# Give the test frame the same columns, in the same order, as the transformed
# training frame, then run it through the already-fitted pipeline
X_test = test_data.copy().reindex(columns=X_train_full_trans.columns)
X_test_trans = feature_pipeline.transform(X_test)

In [None]:
# Positive-class ("is root") probability for every test row
root_proba = model.predict_proba(X_test_trans[feature_columns])[:, 1]

# Transformed test frame with the probability attached as a new column
predictions_df = X_test_trans.assign(root_probability=root_proba)

In [None]:
def generate_submission_file(predictions_df, original_test_data, output_path='submission.csv'):
    """Write the submission CSV with one predicted root per (language, sentence).

    Parameters
    ----------
    predictions_df : pd.DataFrame
        One row per candidate node with the columns ``language``, ``sentence``,
        ``vertex`` and ``root_probability``.
    original_test_data : pd.DataFrame
        Provides the ``id`` of every (``language``, ``sentence``) pair; the
        first row of each pair is used.
    output_path : str
        Destination of the CSV file (written without the index).

    Returns
    -------
    pd.DataFrame
        Columns ``id`` and ``root``, sorted by ``id``. Pairs with no matching
        id get a missing ``id`` and trigger a warning.
    """
    import warnings

    key_cols = ['language', 'sentence']

    # Map (language, sentence) -> id without a row-by-row iterrows() pass
    id_rows = original_test_data.drop_duplicates(key_cols)
    sentence_lang_to_id = dict(
        zip(zip(id_rows['language'], id_rows['sentence']), id_rows['id'])
    )

    submission = []
    unmatched = []

    for (lang, sent), group in predictions_df.groupby(key_cols):
        # Stable sort: ties on probability are broken deterministically
        top_node = group.sort_values(
            'root_probability', ascending=False, kind='stable'
        ).iloc[0]

        original_id = sentence_lang_to_id.get((lang, sent))
        if original_id is None:
            unmatched.append((lang, sent))

        submission.append({
            'id': original_id,
            'root': int(top_node['vertex'])
        })

    if unmatched:
        warnings.warn(
            f"{len(unmatched)} (language, sentence) pairs have no id in "
            f"original_test_data, e.g. {unmatched[0]}"
        )

    # Explicit columns keep an empty prediction set from breaking the sort below
    submission_df = pd.DataFrame(submission, columns=['id', 'root'])
    submission_df = submission_df.sort_values('id')

    # Save to CSV
    submission_df.to_csv(output_path, index=False)
    print(f"Submission saved to {output_path}")

    return submission_df

In [None]:
# Write submission.csv and display the returned frame.
# NOTE(review): the processed `test_data` is passed as `original_test_data`, so it
# must carry the `id` column; the separately loaded `original_test_data` frame is
# not used here -- confirm this is intended.
generate_submission_file(predictions_df, test_data, output_path='submission.csv')

Submission saved to submission.csv


Unnamed: 0,id,root
5940,1,37
5941,2,46
5942,3,16
5943,4,11
5944,5,6
...,...,...
8410,10391,1
8411,10392,8
8412,10393,26
8413,10394,21
