# Prediction

In [None]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GroupKFold
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.ensemble import BalancedRandomForestClassifier

### Normalization and Scaling

In [None]:
def sentence_robust_scale_features(df):
    """Scale numeric features within each (language, sentence) group.

    Every numeric column except the identifier/target columns listed below
    is cast to float and transformed with a ``RobustScaler`` (default
    settings) that is fitted on the rows of that sentence only.

    Args:
        df: DataFrame containing at least ``language`` and ``sentence``
            columns plus the numeric feature columns. It is not modified.

    Returns:
        A copy of ``df`` in which the feature columns hold the per-sentence
        scaled values; all other columns are unchanged.
    """
    scaled_df = df.copy()

    # Identifiers, the target and the articulation flag are never scaled.
    excluded = {'id', 'n', 'sentence', 'is_root', 'vertex', 'is_articulation'}
    numeric_cols = scaled_df.select_dtypes(include=[np.number]).columns
    features = [col for col in numeric_cols if col not in excluded]

    # Nothing to scale: return the untouched copy.
    if not features:
        return scaled_df

    scaled_df[features] = scaled_df[features].astype(float)

    # RobustScaler treats every column independently, so fitting all feature
    # columns of a sentence at once is equivalent to fitting them one by one
    # while avoiding a refit and a .loc write per feature.
    for _, group in scaled_df.groupby(['language', 'sentence']):
        scaled_values = RobustScaler().fit_transform(group[features].to_numpy())
        scaled_df.loc[group.index, features] = scaled_values

    return scaled_df

### Data Preparation
- Create a sentence-level group ID that combines the language and the sentence number
- Apply feature scaling
- Define the features and the target variable

In [None]:
def prepare_data(df):
    """Build the feature matrix, target and sentence groups for a dataset.

    A ``sentence_id`` of the form ``"<language>_<sentence>"`` is added, the
    numeric features are scaled per sentence, and every remaining
    ``float64``/``int64`` column that is not an identifier or the target is
    selected as a feature. ``is_articulation`` is not excluded here, so it
    is used (unscaled) as a feature whenever it has one of those dtypes.

    Args:
        df: DataFrame with ``language`` and ``sentence`` columns and the
            node features. It is not modified.

    Returns:
        A tuple ``(X, y, groups, df_scaled)`` where ``X`` is the feature
        DataFrame, ``y`` is the ``is_root`` column (``None`` when the column
        is absent, e.g. for test data), ``groups`` is the ``sentence_id``
        column and ``df_scaled`` is the full scaled DataFrame.
    """
    # Build the identifier on a derived frame so the caller's DataFrame is
    # not silently given an extra column.
    sentence_id = df['language'] + '_' + df['sentence'].astype(str)
    df_scaled = sentence_robust_scale_features(df.assign(sentence_id=sentence_id))

    non_features = {'id', 'n', 'sentence', 'is_root', 'vertex', 'language', 'sentence_id'}
    feature_list = [
        col for col in df_scaled.columns
        if col not in non_features
        and df_scaled[col].dtype in [np.float64, np.int64]
    ]

    print(f"Using features: {feature_list}")

    X = df_scaled[feature_list]

    # The target only exists for labelled (training) data.
    y = df_scaled['is_root'] if 'is_root' in df_scaled.columns else None

    groups = df_scaled['sentence_id']

    return X, y, groups, df_scaled

### Train Classifier

In [None]:
def _build_root_classifier():
    """Return a fresh, unfitted classifier with the shared hyper-parameters."""
    # NOTE: an alternative configuration previously kept here as commented-out
    # code used n_estimators=200, max_depth=None, max_features='sqrt' and
    # n_jobs=-1 (without class_weight).
    return BalancedRandomForestClassifier(
        n_estimators=100,
        random_state=42,
        sampling_strategy='auto',
        class_weight='balanced',
        replacement=True,
        bootstrap=False,
    )


def train_root_classifier(train_data):
    """Train a classifier for root node detection.

    Runs a 5-fold grouped cross-validation (all nodes of a sentence stay in
    the same fold), printing a classification report and the AUC-ROC for
    each fold, then fits a final model on the whole dataset and prints its
    ten most important features.

    Args:
        train_data: Labelled DataFrame; must contain the ``is_root`` target
            in addition to the columns required by ``prepare_data``.

    Returns:
        The classifier fitted on all of ``train_data``.

    Raises:
        ValueError: If ``train_data`` has no ``is_root`` column.
    """
    X, y, groups, _ = prepare_data(train_data)
    if y is None:
        raise ValueError("train_data must contain an 'is_root' column")

    # GroupKFold keeps every sentence entirely inside one fold.
    group_kfold = GroupKFold(n_splits=5)

    splits = group_kfold.split(X, y, groups)
    for fold, (train_idx, val_idx) in enumerate(splits):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # A new estimator per fold; same settings as the final model.
        clf = _build_root_classifier()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_val)
        y_proba = clf.predict_proba(X_val)

        print(f"\nFold {fold+1} Metrics:")
        print(classification_report(y_val, y_pred, digits=4))
        print(f"AUC-ROC: {roc_auc_score(y_val, y_proba[:, 1]):.4f}")

    # Train the final model on all data.
    final_clf = _build_root_classifier()
    final_clf.fit(X, y)

    feature_importances = pd.DataFrame({
        'Feature': X.columns,
        'Importance': final_clf.feature_importances_
    }).sort_values('Importance', ascending=False)

    print("\nTop 10 Feature Importances:")
    print(feature_importances.head(10))

    return final_clf

### Prediction

In [None]:
def predict_roots(classifier, test_data):
    """Return the predicted root probability for every row of ``test_data``.

    Args:
        classifier: Fitted classifier exposing ``predict_proba``.
        test_data: DataFrame with the columns required by ``prepare_data``.

    Returns:
        A 1-D array holding, for each row, the second column of
        ``predict_proba`` (the positive class for a binary ``is_root``
        target).

    Raises:
        ValueError: If the classifier recorded its training feature names
            and some of them are absent from ``test_data``.
    """
    X_test, _, _, _ = prepare_data(test_data)

    # prepare_data selects every numeric column it finds, so extra columns in
    # test_data (for example a 'root_probability' column left over from an
    # earlier run of this cell) would otherwise be fed to the model. When the
    # classifier recorded its training columns, use exactly those, in order.
    trained_features = getattr(classifier, 'feature_names_in_', None)
    if trained_features is not None:
        missing = [name for name in trained_features if name not in X_test.columns]
        if missing:
            raise ValueError(
                f"Test data is missing features used in training: {missing}"
            )
        X_test = X_test[list(trained_features)]

    return classifier.predict_proba(X_test)[:, 1]

### Generate Submission File

In [None]:
def generate_submission_file(predictions_df, original_test_data, output_path='submission.csv'):
    """Write the submission CSV with one predicted root per sentence.

    For every (language, sentence) group in ``predictions_df`` the vertex
    with the highest ``root_probability`` is chosen; ties go to the row that
    appears first in the group. The sentence is reported under the ``id`` of
    the first matching row in ``original_test_data``.

    Args:
        predictions_df: DataFrame with ``language``, ``sentence``,
            ``vertex`` and ``root_probability`` columns.
        original_test_data: DataFrame with ``language``, ``sentence`` and
            ``id`` columns used to look up the sentence id.
        output_path: Destination of the CSV file.

    Returns:
        The submission DataFrame with columns ``id`` and ``root``, sorted by
        ``id``. It is empty (but still has both columns) when
        ``predictions_df`` has no rows.
    """
    key_cols = ['language', 'sentence']

    # Map (language, sentence) -> id using the first row of each sentence.
    first_rows = original_test_data.drop_duplicates(key_cols)
    sentence_lang_to_id = dict(
        zip(zip(first_rows['language'], first_rows['sentence']), first_rows['id'])
    )

    submission = []
    for (lang, sent), group in predictions_df.groupby(key_cols):
        # Stable sort so that equal probabilities resolve deterministically.
        top_node = group.sort_values(
            'root_probability', ascending=False, kind='stable'
        ).iloc[0]

        submission.append({
            # Sentences unknown to original_test_data keep a missing id.
            'id': sentence_lang_to_id.get((lang, sent)),
            'root': int(top_node['vertex'])
        })

    # Explicit columns keep the frame sortable/writable even with no rows.
    submission_df = pd.DataFrame(submission, columns=['id', 'root'])
    submission_df = submission_df.sort_values('id')

    submission_df.to_csv(output_path, index=False)
    print(f"Submission saved to {output_path}")

    return submission_df

### Run

In [None]:
# Load the pre-processed training and test sets.
train_data = pd.read_csv('../data/train_processed.csv')
test_data = pd.read_csv('../data/test_processed.csv')

print("Training root node classifier...")
classifier = train_root_classifier(train_data)

print("\nGenerating predictions...")
# One root probability per test row, stored next to the row it belongs to.
test_data['root_probability'] = predict_roots(classifier, test_data)

print("\nCreating submission file...")
# test_data carries both the predictions and the original sentence ids, so it
# serves as both arguments (the previous `predictions` name was never defined).
submission = generate_submission_file(test_data, test_data, 'submission.csv')