# Essay Scoring Model - CatBoost with Sentence Transformers

This notebook implements an automated essay scoring system using:
- **gte-Qwen2-7B-instruct** (`Alibaba-NLP/gte-Qwen2-7B-instruct`) for text embeddings
- **CatBoost** with **MultiRMSEWithMissingValues** loss function for multi-target regression on 4 scoring dimensions:
  - Task Achievement
  - Coherence and Cohesion  
  - Lexical Resource
  - Grammatical Range

**Key Advantage**: CatBoost's MultiRMSEWithMissingValues allows us to handle missing target values without dropping data samples.

## Import Libraries

In [None]:
# Essential imports for essay scoring feature engineering
import pandas as pd
import numpy as np
import torch
import gc
import re
import string
import math
from collections import Counter
from typing import List, Dict, Iterable
from tqdm.auto import tqdm

# NLP and text processing libraries
import nltk
import spacy
import textstat
from sentence_transformers import SentenceTransformer, util
from transformers import BitsAndBytesConfig
# The PyPI distribution is named "pyspellchecker", but the importable module
# it installs is "spellchecker".
from spellchecker import SpellChecker
import language_tool_python

# Download required NLTK data only when it is not already installed locally
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Imported after the download check so the corpus is available on first use
from nltk.corpus import stopwords

# Configuration constants
EMBEDDING_MODEL = "Alibaba-NLP/gte-Qwen2-7B-instruct"  # Main embedding model
FEATURE_EMBEDDING_MODEL = "BAAI/bge-large-en-v1.5"     # For similarity features
BATCH_SIZE = 16  # Optimized batch size
EMBEDDING_SUB_BATCH = 16

# Device configuration: prefer the GPU and fall back to CPU when CUDA is absent
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# Load Data

Load the training and test datasets containing essays with their prompts and scoring rubrics.

In [None]:
# Read the training and test essay tables from the working directory
train, test = (pd.read_csv(csv_path) for csv_path in ('df_train.csv', 'df_test.csv'))

# Quick sanity report on what was loaded
print(f"Training data shape: {train.shape}")
print(f"Test data shape: {test.shape}")
print(f"\nTraining columns: {list(train.columns)}")

# Data Preprocessing

Clean and prepare the text data for embedding generation. This includes:
- Removing escape characters and newlines
- Normalizing whitespace
- Converting to lowercase
- Combining prompt and essay text

In [None]:
# Text normalisation applied to both prompts and essays
def clean_text(text_series):
    """
    Normalise a Series of strings for embedding.

    Literal two-character escapes (backslash + n/r/t) and real newline,
    carriage-return and tab characters become spaces; runs of whitespace
    collapse to one space; the result is trimmed and lowercased.

    Args:
        text_series (pd.Series): Series containing text data to clean

    Returns:
        pd.Series: Cleaned text data
    """
    # Step 1: literal and real escape characters -> single space
    without_escapes = text_series.str.replace(r'\\[nrt]|\n|\r|\t', ' ', regex=True)
    # Step 2: collapse any run of whitespace
    single_spaced = without_escapes.str.replace(r'\s+', ' ', regex=True)
    # Step 3: trim the ends, then lowercase
    trimmed = single_spaced.str.strip()
    return trimmed.str.lower()

# Add '<column>_clean' versions of the essay and prompt text to both datasets
print("Cleaning training data...")
for column in ('essay', 'prompt'):
    train[f'{column}_clean'] = clean_text(train[column])

print("Cleaning test data...")
for column in ('essay', 'prompt'):
    test[f'{column}_clean'] = clean_text(test[column])

print("Text cleaning completed!")


In [None]:
# Preview the first cleaned essay, truncated to 200 characters
first_clean_essay = train['essay_clean'][0]
print("Sample cleaned essay (first 200 characters):")
if len(first_clean_essay) > 200:
    print(first_clean_essay[:200] + "...")
else:
    print(first_clean_essay)

# Compare raw and cleaned lengths to see how much the cleaning removed
first_raw_essay = train['essay'][0]
print(f"\nOriginal length: {len(first_raw_essay)} characters")
print(f"Cleaned length: {len(first_clean_essay)} characters")

# Status messages only: nothing is computed in this cell. The feature
# engineering helpers are defined and applied in later cells.
print("Running comprehensive feature engineering...")
print("Feature engineering setup completed")

In [None]:
# Build the embedding input as '<prompt> [SEP] <essay>'.
# The [SEP] marker lets the model tell prompt content from essay content.
print("Merging prompt and essay text...")
for frame in (train, test):
    frame['merged_text'] = frame['prompt_clean'] + ' [SEP] ' + frame['essay_clean']

mean_merged_length = train['merged_text'].str.len().mean()
print(f"Average merged text length: {mean_merged_length:.0f} characters")
print("Text merging completed!")

In [None]:
# Display sample of merged text to verify format
sample_text = str(train['merged_text'][0])
print("Sample merged text (first 300 characters):")
print(sample_text[:300] + "..." if len(sample_text) > 300 else sample_text)

# Split on the full ' [SEP] ' separator (with its surrounding spaces) so the
# reported prompt and essay lengths exclude the separator entirely.
sep_pos = sample_text.find('[SEP]')
prompt_part, separator, essay_part = sample_text.partition(' [SEP] ')
if separator:
    print(f"\n[SEP] token found at position: {sep_pos}")
    print(f"Prompt portion length: {len(prompt_part)} characters")
    print(f"Essay portion length: {len(essay_part)} characters")
else:
    # e.g. the first row had missing text and merged to NaN
    print("\n[SEP] token not found in the sample text")

In [None]:
# No row is dropped for lacking a score: missing targets are left for CatBoost
print("Data preparation for CatBoost with missing value support:")
print(f"Training samples: {len(train)}")
print(f"Test samples: {len(test)}")

# Rows are removed only when the essay or prompt text itself is missing
required_text_columns = ['essay', 'prompt']
train_clean = train.dropna(subset=required_text_columns)
test_clean = test.dropna(subset=required_text_columns)

n_train_removed = len(train) - len(train_clean)
n_test_removed = len(test) - len(test_clean)
print(f"\nAfter removing rows with missing text data:")
print(f"Training samples: {len(train_clean)} (removed {n_train_removed} due to missing text)")
print(f"Test samples: {len(test_clean)} (removed {n_test_removed} due to missing text)")

# Report how many target values are missing per scoring dimension
scoring_dimensions = ['task_achievement', 'coherence_and_cohesion', 'lexical_resource', 'grammatical_range']
missing_targets = train_clean[scoring_dimensions].isna().sum()
n_retained = len(train_clean)
print(f"\nMissing target values (will be handled by CatBoost):")
for dim in scoring_dimensions:
    missing_count = missing_targets[dim]
    print(f"  {dim}: {missing_count} missing values ({missing_count/n_retained*100:.1f}%)")

print(f"\nTotal samples retained for training: {n_retained}")
print("CatBoost MultiRMSEWithMissingValues will handle missing targets automatically")

# Advanced Feature Engineering

Extract comprehensive linguistic and textual features including:
- **Lexical Features**: Word count, vocabulary diversity, readability scores
- **Syntactic Features**: POS distribution, clause complexity, sentence structure
- **Semantic Features**: Prompt-essay similarity, topic coverage, named entity analysis
- **Stylistic Features**: Punctuation patterns, capitalization, grammar/spelling errors

In [None]:
# =========================================================
# Feature Engineering Implementation
# =========================================================

# =================================================================
# CORE FEATURE ENGINEERING SETUP
# =================================================================

# Shared resources for the text feature helpers
TOKEN_RE = re.compile(r"\b\w+\b")            # runs of word characters
PUNCTUATION = set(string.punctuation)        # ASCII punctuation characters
STOPWORDS = set(stopwords.words("english"))  # NLTK English stopword list
SPELL = SpellChecker()

# Grammar checker: _TOOL stays None when LanguageTool cannot be started
print("Initializing LanguageTool for grammar checking...")
try:
    _TOOL = language_tool_python.LanguageTool('en-US')
except Exception as init_error:
    print(f"⚠️  LanguageTool initialization failed: {init_error}")
    _TOOL = None
else:
    print("✓ LanguageTool initialized successfully")

# spaCy pipeline is loaded on first use by _get_nlp()
_SPACY_NLP = None

def clear_vram(*objs):
    """
    Run garbage collection and release cached GPU memory.

    Args:
        *objs: Accepted for backward compatibility only. A function cannot
            delete its caller's references, so callers must `del` (or
            rebind) their own variables before calling this for the
            memory to actually become reclaimable.

    Returns:
        None
    """
    # Collect unreachable Python objects first so their CUDA tensors are freed
    gc.collect()
    if torch.cuda.is_available():
        # Return cached, unused blocks from PyTorch's allocator to the driver
        torch.cuda.empty_cache()

def tokenize(text: str) -> List[str]:
    """Return the lowercase word tokens of text (empty list for None or empty input)"""
    lowered = (text or "").lower()
    return TOKEN_RE.findall(lowered)

def count_spelling_errors(text: str) -> int:
    """Count the tokens of text that the spell checker reports as unknown"""
    words = tokenize(text)
    unrecognised = SPELL.unknown(words)
    return len(unrecognised)

def count_sentences(text: str) -> int:
    """Count sentences with NLTK, falling back to counting '.', '!' and '?'"""
    content = text or ""
    try:
        sentences = nltk.sent_tokenize(content)
    except Exception:
        # Tokenizer unavailable or failed: count sentence-ending punctuation
        return sum(content.count(mark) for mark in ".!?")
    return len(sentences)

def count_stopwords(text: str) -> int:
    """Count whitespace-separated words of text that are in STOPWORDS (case-insensitive)"""
    matches = [word for word in (text or "").split() if word.lower() in STOPWORDS]
    return len(matches)

def prompt_overlap(prompt: str, essay: str) -> int:
    """Count distinct lowercase whitespace-separated words shared by prompt and essay"""
    prompt_vocab, essay_vocab = (
        set((text or "").lower().split()) for text in (prompt, essay)
    )
    return len(prompt_vocab & essay_vocab)

def grammar_error_count(text: str) -> int:
    """
    Count grammar issues reported by LanguageTool for the first 5000 characters.

    Args:
        text: Input text string (None is treated as empty)

    Returns:
        Number of LanguageTool matches; 0 when the tool is unavailable,
        the text is blank, or the check raises.
    """
    # LanguageTool failed to initialise: say so explicitly instead of relying
    # on an AttributeError being swallowed below.
    if _TOOL is None:
        return 0

    # Only a prefix is checked to keep very long essays from stalling the
    # checker; no explicit timeout is configured here.
    text_to_check = (text or "")[:5000]
    if not text_to_check.strip():
        return 0

    try:
        return len(_TOOL.check(text_to_check))
    except Exception:
        # Best effort: a checker failure must not abort feature extraction
        return 0

print("✓ Feature engineering setup completed successfully")
# Only claim every tool is ready when LanguageTool actually initialised
if _TOOL is not None:
    print("All advanced NLP tools are active and ready!")
else:
    print("⚠️  LanguageTool is unavailable - grammar_error_count will return 0")

# =========================================================
# EMBEDDING GENERATION OPTIMIZED FOR A100 80GB
# =========================================================

from sentence_transformers import SentenceTransformer
import torch
from transformers import BitsAndBytesConfig

# torch.version.cuda is printed as-is (it is None on CPU-only builds); the
# device queries are guarded because they raise when no GPU is present.
print(f"CUDA Version: {torch.version.cuda}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# 4-bit NF4 quantization with double quantization; matmuls computed in bfloat16
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

print(f"Loading embedding model: {EMBEDDING_MODEL}")
print(f"Device: {device}")

# Load the main embedding model; model_kwargs are forwarded to the underlying
# transformers model loader.
embedding_model = SentenceTransformer(
    EMBEDDING_MODEL,
    device=device,
    trust_remote_code=True,
    model_kwargs={
        "torch_dtype": torch.bfloat16,
        "quantization_config": quantization_config
    }
)

# Informational only: this checks that PyTorch exposes fused attention but
# does not configure anything on the model.
if hasattr(torch.nn.functional, 'scaled_dot_product_attention'):
    print("Flash Attention available - enabled for optimal performance")

def encode_in_batches_a100(model, texts, batch_size=BATCH_SIZE, device='cuda'):
    """
    Encode texts in fixed-size batches and return float16 embeddings on the CPU.

    Args:
        model: Object exposing a SentenceTransformer-style ``encode`` method
        texts: Sequence of strings to encode (must support len() and slicing)
        batch_size: Number of texts passed to ``model.encode`` per call
        device: Device forwarded to ``model.encode``

    Returns:
        np.ndarray of dtype float16 with one normalized embedding per text,
        or an empty (0, 0) float16 array when ``texts`` is empty.
    """
    # np.vstack rejects an empty list, so handle "nothing to encode" up front
    if len(texts) == 0:
        return np.empty((0, 0), dtype=np.float16)

    embeddings = []

    # Start from a clean allocator cache
    torch.cuda.empty_cache()

    # torch.autocast replaces the deprecated torch.cuda.amp.autocast; it is
    # only enabled when a CUDA device exists.
    autocast_enabled = torch.cuda.is_available()
    with torch.autocast(device_type="cuda", dtype=torch.bfloat16, enabled=autocast_enabled):
        for i in tqdm(range(0, len(texts), batch_size), desc=f"Encoding batches (size={batch_size})"):
            batch = texts[i:i+batch_size]

            batch_emb = model.encode(
                batch,
                batch_size=len(batch),
                show_progress_bar=False,
                device=device,
                convert_to_tensor=True,
                normalize_embeddings=True  # Unit-length vectors for similarity computation
            )

            # Move off the GPU immediately and store as float16 to limit host memory
            embeddings.append(batch_emb.cpu().half())

            # Release cached GPU blocks every fifth batch
            if (i // batch_size + 1) % 5 == 0:
                torch.cuda.empty_cache()

    # Final memory cleanup
    torch.cuda.empty_cache()
    return np.vstack([emb.numpy() for emb in embeddings])

# Encode the merged prompt + essay text of every retained training row
print(f"\nGenerating deep embeddings for {len(train_clean)} training samples...")
print(f"Using optimized batch size: {BATCH_SIZE} (A100 optimized)")

train_texts = train_clean['merged_text'].tolist()
train_embeddings = encode_in_batches_a100(
    embedding_model, train_texts, batch_size=BATCH_SIZE, device=device
)

# Same encoding for the test rows
print(f"Generating deep embeddings for {len(test_clean)} test samples...")
test_texts = test_clean['merged_text'].tolist()
test_embeddings = encode_in_batches_a100(
    embedding_model, test_texts, batch_size=BATCH_SIZE, device=device
)

# Summary of the produced arrays
print(f"\nEmbedding generation completed with A100 optimizations:")
print(f"Training embeddings: {train_embeddings.shape}")
print(f"Test embeddings: {test_embeddings.shape}")
print(f"Embedding dimension: {train_embeddings.shape[1]}")
print(f"Memory usage optimized with half precision and batch processing")

# Count NaNs in each embedding matrix as a basic integrity check
nan_train, nan_test = (
    np.isnan(matrix).sum() for matrix in (train_embeddings, test_embeddings)
)
print(f"Data integrity check - NaN values: Train={nan_train}, Test={nan_test}")

# The embedding model is no longer needed: drop it and release cached GPU memory
del embedding_model
torch.cuda.empty_cache()

print("Deep embedding generation optimized for A100 completed successfully")

## Basic Text Processing Functions

These fundamental text processing functions extract basic linguistic features from essays:

### Why These Features Matter for Essay Scoring:

1. **Tokenization & Word Counting**: Essential for measuring essay length and complexity
2. **Sentence Detection**: Helps assess structural organization and coherence
3. **Spelling & Grammar Error Detection**: Direct indicators of language proficiency
4. **Stopword Analysis**: Balance between content words and function words
5. **Prompt-Essay Overlap**: Measures how well the essay addresses the given prompt

In [None]:
def tokenize(text: str) -> List[str]:
    """
    Split text into lowercase word tokens with the shared TOKEN_RE pattern.

    Args:
        text: Input text string (None is treated as empty)

    Returns:
        List of lowercase word tokens
    """
    source = text if text else ""
    return TOKEN_RE.findall(source.lower())

def count_sentences(text: str) -> int:
    """
    Count the sentences in text.

    Args:
        text: Input text string (None is treated as empty)

    Returns:
        Number of sentences from NLTK's sentence tokenizer; if that raises,
        the combined count of '.', '!' and '?' characters instead
    """
    content = text or ""
    try:
        n_sentences = len(nltk.sent_tokenize(content))
    except Exception:
        # Tokenizer failed: approximate with sentence-ending punctuation
        n_sentences = sum(content.count(mark) for mark in (".", "!", "?"))
    return n_sentences

def count_spelling_errors(text: str) -> int:
    """
    Count the tokens of text that PySpellChecker does not recognise.

    Args:
        text: Input text string

    Returns:
        Size of the collection returned by SPELL.unknown for the tokens
        (presumably distinct words, so a repeated misspelling counts
        once - confirm against the pyspellchecker docs)
    """
    return len(SPELL.unknown(tokenize(text)))

def count_stopwords(text: str) -> int:
    """
    Count stopwords (common function words) in text.

    Words are split on whitespace and surrounding punctuation is stripped
    before the lookup, so "the," and "and." count as stopwords while
    contractions such as "don't" stay intact.

    Args:
        text: Input text string (None is treated as empty)

    Returns:
        Number of stopwords
    """
    return sum(
        1
        for word in (text or "").split()
        if word.lower().strip(string.punctuation) in STOPWORDS
    )

def prompt_overlap(prompt: str, essay: str) -> int:
    """
    Calculate word overlap between prompt and essay.

    Both texts are lowercased and split on whitespace, and surrounding
    punctuation is stripped from each word, so "technology." in the prompt
    matches "technology" in the essay.

    Args:
        prompt: The essay prompt text (None is treated as empty)
        essay: The student's essay text (None is treated as empty)

    Returns:
        Number of distinct words that appear in both prompt and essay
    """
    def _vocabulary(text):
        # Distinct lowercase words with edge punctuation removed; tokens that
        # were pure punctuation become empty strings and are dropped.
        stripped = (word.strip(string.punctuation) for word in (text or "").lower().split())
        return {word for word in stripped if word}

    return len(_vocabulary(prompt) & _vocabulary(essay))

def grammar_error_count(text: str) -> int:
    """
    Count the issues LanguageTool reports for the first 5000 characters of text.

    Args:
        text: Input text string (None is treated as empty)

    Returns:
        Number of LanguageTool matches; 0 when the tool is unavailable,
        the truncated text is blank, or the check raises
    """
    if not _TOOL:
        return 0

    try:
        # Only a prefix is checked so very long essays stay cheap to process
        excerpt = (text or "")[:5000]
        if excerpt.strip():
            return len(_TOOL.check(excerpt))
        return 0
    except Exception:
        # Best effort: a checker failure yields 0 rather than an error
        return 0

# Progress marker printed once this cell's function definitions have run
print("✓ Basic text processing functions defined")

## Advanced NLP Features with spaCy

These advanced features use spaCy's linguistic analysis capabilities to extract sophisticated features that correlate with essay quality:

### Why These Features Are Important:

1. **Capitalization Features**: 
   - Proper capitalization indicates writing maturity
   - Proper nouns show specific knowledge and detail

2. **Part-of-Speech Distribution**:
   - Noun ratio: Content density and concrete thinking
   - Verb ratio: Action and dynamic writing
   - Adjective ratio: Descriptive language quality
   - Adverb ratio: Modification and nuance

3. **Syntactic Complexity**:
   - Clause complexity: Sophisticated sentence structures
   - Shows advanced grammatical knowledge

4. **Named Entity Analysis**:
   - Entity overlap with prompt: Topic relevance
   - Entity count: Specificity and detail level

In [None]:
# -------------------------
# spaCy-based linguistic features
# -------------------------

def _get_nlp():
    """
    Return the shared spaCy pipeline, loading 'en_core_web_sm' on first use.

    Returns:
        The cached spaCy model, or None when the model is not installed
        (install instructions are printed in that case)
    """
    global _SPACY_NLP
    if _SPACY_NLP is not None:
        return _SPACY_NLP

    print("Loading spaCy model...")
    try:
        _SPACY_NLP = spacy.load("en_core_web_sm")
    except OSError:
        print("⚠️  spaCy model 'en_core_web_sm' not found. Please install it with:")
        print("python -m spacy download en_core_web_sm")
        return None

    print("✓ spaCy model loaded successfully")
    return _SPACY_NLP

def capitalization_features(doc) -> dict:
    """
    Summarise capitalisation and proper-noun usage in a processed document.

    Args:
        doc: Iterable of tokens exposing is_alpha, text and pos_
            (a spaCy processed document)

    Returns:
        Dictionary with the capitalised-word and proper-noun counts plus
        their ratios to the number of alphabetic tokens (a denominator of
        1 is used when there are none)
    """
    n_alpha = 0
    n_capitalized = 0
    n_proper = 0

    for tok in doc:
        # Proper nouns are counted over every token, alphabetic or not
        if tok.pos_ == "PROPN":
            n_proper += 1
        if tok.is_alpha:
            n_alpha += 1
            if tok.text and tok.text[0].isupper():
                n_capitalized += 1

    denominator = n_alpha or 1  # Avoid division by zero
    features = {}
    features["capitalized_word_count"] = n_capitalized
    features["proper_noun_count"] = n_proper
    features["capitalized_word_ratio"] = n_capitalized / denominator
    features["proper_noun_ratio"] = n_proper / denominator
    return features

def pos_distribution(doc) -> dict:
    """
    Compute noun, verb, adjective and adverb ratios for a processed document.

    Args:
        doc: Iterable of tokens exposing is_alpha and pos_
            (a spaCy processed document)

    Returns:
        Dictionary with each tag count divided by the number of alphabetic
        tokens (a denominator of 1 is used when there are none)
    """
    tag_totals = {"NOUN": 0, "VERB": 0, "ADJ": 0, "ADV": 0}
    n_alpha = 0

    for tok in doc:
        if tok.is_alpha:
            n_alpha += 1
        # Tags are tallied over every token, alphabetic or not
        if tok.pos_ in tag_totals:
            tag_totals[tok.pos_] += 1

    denominator = n_alpha or 1
    return {
        "noun_ratio": tag_totals["NOUN"] / denominator,
        "verb_ratio": tag_totals["VERB"] / denominator,
        "adj_ratio": tag_totals["ADJ"] / denominator,
        "adv_ratio": tag_totals["ADV"] / denominator,
    }

def clause_complexity(doc) -> float:
    """
    Measure syntactic complexity as clause-marking dependencies per sentence.

    Args:
        doc: Processed document that is iterable over tokens exposing dep_
            and provides a sents iterable (a spaCy processed document)

    Returns:
        Number of tokens whose dependency label is csubj, ccomp, xcomp,
        advcl or relcl, divided by the sentence count; 0.0 when the
        document has no sentences
    """
    n_sentences = len(list(doc.sents))
    if n_sentences == 0:
        return 0.0

    # Dependency labels treated as marking a clause
    clause_labels = ("csubj", "ccomp", "xcomp", "advcl", "relcl")
    n_clauses = len([tok for tok in doc if tok.dep_ in clause_labels])

    return n_clauses / n_sentences

def named_entity_features(prompt_doc, essay_doc) -> dict:
    """
    Compare the named entities of an essay with those of its prompt.

    Entities are compared as (lowercased text, label) pairs, so repeated
    mentions count once.

    Args:
        prompt_doc: Processed prompt document exposing ents
        essay_doc: Processed essay document exposing ents

    Returns:
        Dictionary with the distinct essay entity count, the number shared
        with the prompt, and that number as a fraction of the prompt's
        entities (0.0 when the prompt has none)
    """
    def _entity_keys(document):
        # One (text, label) key per distinct entity
        return {(ent.text.lower(), ent.label_) for ent in document.ents}

    in_essay = _entity_keys(essay_doc)
    in_prompt = _entity_keys(prompt_doc)
    shared = in_essay.intersection(in_prompt)

    if in_prompt:
        overlap_ratio = len(shared) / len(in_prompt)
    else:
        overlap_ratio = 0.0

    return {
        "ner_count": len(in_essay),
        "ner_overlap_count": len(shared),
        "ner_overlap_ratio": overlap_ratio,
    }

# Progress marker printed once the spaCy feature functions above are defined
print("✓ Advanced spaCy-based linguistic features defined")

In [None]:
# -------------------------
# Readability and stylistic features
# -------------------------

def topic_coverage(prompt: str, essay: str) -> float:
    """
    Compute the fraction of distinct prompt tokens that also occur in the essay.

    Every prompt token counts, including function words; no stopword
    filtering is applied.

    Args:
        prompt: The essay prompt text
        essay: The student's essay text

    Returns:
        Ratio between 0.0 and 1.0; 0.0 when the prompt has no tokens
    """
    prompt_vocab = set(tokenize(prompt))
    if len(prompt_vocab) == 0:
        return 0.0

    covered = prompt_vocab.intersection(tokenize(essay))
    return len(covered) / len(prompt_vocab)

def punctuation_features(text: str) -> dict:
    """
    Count punctuation usage in text.

    Args:
        text: Input text string (None is treated as empty)

    Returns:
        Dictionary with comma, period, question-mark and exclamation-mark
        counts, plus all punctuation characters divided by the token count
        (a denominator of 1 is used when there are no tokens)
    """
    marks = [ch for ch in (text or "") if ch in PUNCTUATION]
    tally = Counter(marks)
    n_words = len(tokenize(text)) or 1

    features = {
        f"{label}_count": tally[symbol]
        for label, symbol in (("comma", ","), ("period", "."), ("question", "?"), ("exclam", "!"))
    }
    features["punctuation_ratio"] = len(marks) / n_words
    return features

def syllables_per_word(text: str) -> float:
    """
    Compute the mean syllable count per token using textstat.

    Args:
        text: Input text string

    Returns:
        Average syllables per word; 0.0 when the text has no tokens
    """
    tokens = tokenize(text)
    n_tokens = len(tokens)
    if n_tokens == 0:
        return 0.0
    return sum(map(textstat.syllable_count, tokens)) / n_tokens

def readability_scores(text: str) -> dict:
    """
    Compute three textstat readability metrics for text.

    Metrics:
    - Flesch Reading Ease: higher scores mean easier text
    - Flesch-Kincaid Grade: grade level needed to understand the text
    - Dale-Chall Score: difficulty based on familiar words

    Args:
        text: Input text string (None is treated as empty)

    Returns:
        Dictionary with the three scores as floats; a metric that raises
        is reported as 0.0
    """
    content = text or ""
    scorers = (
        ("flesch_reading_ease", textstat.flesch_reading_ease),
        ("flesch_kincaid_grade", textstat.flesch_kincaid_grade),
        ("dale_chall_score", textstat.dale_chall_readability_score),
    )

    scores = {}
    for key, scorer in scorers:
        try:
            scores[key] = float(scorer(content))
        except Exception:
            # Fall back to a neutral value instead of failing the whole row
            scores[key] = 0.0
    return scores

# Progress marker printed once the readability and stylistic helpers are defined
print("✓ Readability and stylistic feature functions defined")

## Readability and Stylistic Features

These features measure the sophistication and accessibility of the writing:

### Why These Features Matter:

1. **Topic Coverage**: 
   - Measures prompt adherence (key scoring criterion)
   - Shows how well the essay addresses the given task

2. **Punctuation Analysis**:
   - Comma usage: Sentence complexity and proper grammar
   - Question/exclamation marks: Rhetorical sophistication
   - Overall punctuation ratio: Writing mechanics proficiency

3. **Syllable Complexity**:
   - Average syllables per word: Vocabulary sophistication
   - Correlates with lexical resource scoring dimension

4. **Readability Metrics**:
   - **Flesch Reading Ease**: Sentence length and syllable complexity balance
   - **Flesch-Kincaid Grade**: Appropriate complexity for target audience  
   - **Dale-Chall Score**: Vocabulary difficulty assessment

These metrics help evaluate both the **Lexical Resource** and **Grammatical Range** scoring dimensions.

In [None]:
# -------------------------
# Embedding utilities optimized for A100 80GB
# -------------------------

def chunked_iterable(iterable: Iterable, chunk_size: int):
    """
    Yield consecutive list chunks of at most chunk_size items.

    The iterable is materialised into a list when iteration starts.

    Args:
        iterable: Input iterable to chunk
        chunk_size: Size of each chunk

    Yields:
        Lists of items; the last one may be shorter than chunk_size
    """
    items = list(iterable)
    chunk_starts = range(0, len(items), chunk_size)
    yield from (items[start:start + chunk_size] for start in chunk_starts)

def _load_sentence_transformer(model_name: str, device: torch.device, use_fp16: bool = True) -> SentenceTransformer:
    """
    Load a SentenceTransformer, falling back to all-mpnet-base-v2 on failure.

    Args:
        model_name: HuggingFace model name
        device: Torch device to load the model onto
        use_fp16: Convert the model to half precision when device is CUDA

    Returns:
        The loaded SentenceTransformer, switched to eval mode
    """
    try:
        encoder = SentenceTransformer(model_name, device=device)
    except Exception as load_error:
        # The requested model could not be loaded: use a known fallback
        fallback_model = "sentence-transformers/all-mpnet-base-v2"
        print(f"⚠️  Model loading failed: {load_error}")
        print(f"Using fallback model: {fallback_model}")
        encoder = SentenceTransformer(fallback_model, device=device)
    else:
        print(f"✓ Successfully loaded model: {model_name}")

    # Half precision is only attempted on CUDA devices
    if use_fp16 and device.type == "cuda":
        try:
            encoder.half()
        except Exception:
            print("⚠️  FP16 conversion failed - using default precision")
        else:
            print("✓ Model converted to FP16 for memory optimization")

    encoder.eval()
    return encoder

def encode_with_model_once(texts: List[str],
                          model_name: str = "BAAI/bge-large-en-v1.5",
                          sub_batch_size: int = 32,
                          device: torch.device = device,
                          move_to_cpu: bool = True,
                          use_fp16: bool = True) -> torch.Tensor:
    """
    Load a sentence transformer, encode texts in sub-batches, then release it.

    Args:
        texts: List of texts to encode
        model_name: HuggingFace model name
        sub_batch_size: Number of texts passed to the model per encode call
        device: Device to use for computation
        move_to_cpu: Whether to move each sub-batch result to the CPU
        use_fp16: Whether to load the model in half precision and enable
            CUDA autocast during encoding

    Returns:
        Tensor of normalized embeddings, one row per text; an empty (0, 0)
        tensor when texts is empty (the model is not loaded in that case)
    """
    # Chunk once so the progress total and the loop use the same split
    chunks = list(chunked_iterable(texts, sub_batch_size))
    if not chunks:
        return torch.empty((0, 0))

    model = _load_sentence_transformer(model_name, device=device, use_fp16=use_fp16)
    all_embeddings: List[torch.Tensor] = []

    # torch.autocast replaces the deprecated torch.cuda.amp.autocast; it is
    # only enabled when a CUDA device is actually present.
    autocast_enabled = use_fp16 and torch.cuda.is_available()

    try:
        with torch.no_grad(), torch.autocast(device_type="cuda", enabled=autocast_enabled):
            for i, sub_batch in enumerate(tqdm(
                chunks,
                desc=f"Encoding {len(texts):,} texts ({sub_batch_size} per batch)",
                total=len(chunks)
            )):
                embeddings = model.encode(
                    sub_batch,
                    batch_size=len(sub_batch),
                    convert_to_tensor=True,
                    device=device,
                    show_progress_bar=False,
                    normalize_embeddings=True
                )

                if move_to_cpu:
                    all_embeddings.append(embeddings.cpu())
                    del embeddings
                    # Periodic memory cleanup
                    if (i + 1) % 8 == 0:
                        clear_vram()
                else:
                    all_embeddings.append(embeddings)
    finally:
        # Release the model even when encoding fails part-way through
        del model
        clear_vram()

    return torch.cat(all_embeddings, dim=0)

# Progress marker printed once the embedding helper functions above are defined
print("✓ GPU-optimized embedding utilities configured")

## Embedding Utilities for Semantic Features

These utilities handle the generation of semantic embeddings that capture deep meaning relationships:

### Why Semantic Embeddings Are Crucial:

1. **Prompt-Essay Similarity**: 
   - Measures semantic alignment between prompt and essay
   - Goes beyond simple word overlap to capture meaning
   - Critical for **Task Achievement** scoring

2. **Memory-Efficient Processing**:
   - Chunked processing prevents GPU memory overflow
   - FP16 precision halves embedding memory and usually improves GPU throughput, with little loss in embedding quality
   - Automatic fallback models ensure robustness

3. **Semantic Understanding**:
   - Captures nuanced language understanding
   - Identifies conceptual relationships and coherence
   - Supports **Coherence and Cohesion** evaluation

The semantic similarity features complement rule-based features by capturing meaning that traditional NLP metrics might miss.

In [None]:
# =================================================================
# FEATURE COLUMN DEFINITIONS
# =================================================================

# Column names of the basic linguistic features, grouped by kind
BASE_FEATURES = [
    # raw counts
    "word_count",
    "char_count",
    "sentence_count",
    "unique_words",
    "spelling_errors",
    "grammar_errors",
    "stopword_count",
    # averages and ratios
    "avg_word_length",
    "avg_sentence_length",
    "unique_word_ratio",
    "spelling_error_ratio",
    "stopword_ratio",
    # prompt relevance
    "prompt_overlap",
    "prompt_essay_similarity",
]

# Column names of the advanced linguistic features, grouped by kind
ADVANCED_FEATURES = [
    # syllables and readability
    "syllables_per_word",
    "flesch_reading_ease",
    "flesch_kincaid_grade",
    "dale_chall_score",
    # punctuation
    "comma_count",
    "period_count",
    "question_count",
    "exclam_count",
    "punctuation_ratio",
    # capitalization
    "capitalized_word_count",
    "proper_noun_count",
    "capitalized_word_ratio",
    "proper_noun_ratio",
    # part-of-speech ratios
    "noun_ratio",
    "verb_ratio",
    "adj_ratio",
    "adv_ratio",
    # syntax
    "clause_per_sentence",
    # named entities
    "ner_count",
    "ner_overlap_count",
    "ner_overlap_ratio",
    # prompt coverage
    "topic_coverage",
]

# Full tabular feature list: base features first, then advanced
ALL_TABULAR_FEATURES = [*BASE_FEATURES, *ADVANCED_FEATURES]

print(f"✓ Feature definitions loaded:")
for group_label, group_columns in (
    ("Base features", BASE_FEATURES),
    ("Advanced features", ADVANCED_FEATURES),
    ("Total tabular features", ALL_TABULAR_FEATURES),
):
    print(f"  - {group_label}: {len(group_columns)}")

# =================================================================
# MAIN FEATURE ENGINEERING FUNCTION
# =================================================================

def engineer_basic_features(df: pd.DataFrame, tqdm_bar: bool = True) -> pd.DataFrame:
    """
    Extract basic linguistic features from essays and prompts.

    Works on a copy of ``df``. Adds the count columns (word_count, char_count,
    sentence_count, unique_words, stopword_count, spelling_errors,
    grammar_errors), the derived averages/ratios and ``prompt_overlap``.
    Ratios whose denominator is zero are left as NaN here.

    Args:
        df: DataFrame with 'essay' and 'prompt' columns
        tqdm_bar: Whether to show progress bars

    Returns:
        DataFrame with basic features added
    """
    print(f"[INFO] Extracting basic features for {len(df)} samples...")
    output_df = df.copy()

    def _apply(values, func, desc, total=None):
        """Apply func to each value in order, optionally behind a progress bar."""
        iterator = tqdm(values, desc=desc, total=total) if tqdm_bar else values
        return [func(value) for value in iterator]

    # Tokenization (needed for multiple features)
    print("[step] Tokenizing essays...")
    output_df["tokens"] = _apply(output_df["essay"], tokenize, "Tokenizing")

    # Basic counts: (output column, function, input column)
    feature_steps = [
        ("word_count", lambda tokens: len(tokens), "tokens"),
        ("char_count", lambda essay: len(essay or ""), "essay"),
        ("sentence_count", count_sentences, "essay"),
        ("unique_words", lambda tokens: len(set(tokens)), "tokens"),
        ("stopword_count", count_stopwords, "essay"),
        ("spelling_errors", count_spelling_errors, "essay"),
        ("grammar_errors", grammar_error_count, "essay"),
    ]

    for feature_name, feature_func, column in feature_steps:
        print(f"[step] Computing {feature_name}...")
        output_df[feature_name] = _apply(output_df[column], feature_func, feature_name)

    # Computed ratios and averages (avoid division by zero)
    print("[step] Computing ratios and averages...")
    word_count_safe = output_df["word_count"].replace(0, np.nan)
    sentence_count_safe = output_df["sentence_count"].replace(0, np.nan)

    # char_count is the raw essay length (whitespace and punctuation included),
    # so avg_word_length is characters per token rather than mean token length.
    output_df["avg_word_length"] = output_df["char_count"] / word_count_safe
    output_df["avg_sentence_length"] = word_count_safe / sentence_count_safe
    output_df["unique_word_ratio"] = output_df["unique_words"] / word_count_safe
    output_df["spelling_error_ratio"] = output_df["spelling_errors"] / word_count_safe
    output_df["stopword_ratio"] = output_df["stopword_count"] / word_count_safe

    # Prompt-essay overlap. Pairing the two columns positionally gives the same
    # result with and without progress bars; DataFrame.apply(axis=1) returns a
    # DataFrame instead of a Series when the frame is empty, and iterrows()
    # builds a Series per row.
    print("[step] Computing prompt-essay word overlap...")
    output_df["prompt_overlap"] = _apply(
        zip(output_df["prompt"], output_df["essay"]),
        lambda pair: prompt_overlap(pair[0], pair[1]),
        "Prompt overlap",
        total=len(output_df),
    )

    # Clean up temporary columns
    output_df.drop(columns=["tokens"], inplace=True)

    return output_df

print("✓ Basic feature engineering function defined")

## Main Feature Engineering Pipeline

This section defines the complete feature engineering pipeline that transforms raw essay text into numerical features for machine learning:

### Feature Engineering Strategy:

Our approach creates **54 comprehensive features** that map to the **4 scoring dimensions**:

#### 1. **Task Achievement Features** (12 features)
- Prompt-essay word overlap and semantic similarity
- Topic coverage and keyword usage
- Named entity overlap with prompt
- Essay length and completeness metrics

#### 2. **Coherence and Cohesion Features** (15 features)
- Sentence length statistics and variation
- Discourse marker usage
- Paragraph structure analysis
- Syntactic complexity (clauses per sentence)

#### 3. **Lexical Resource Features** (15 features)
- Vocabulary diversity (TTR, hapax legomena)
- Word sophistication (syllables, readability scores)
- POS distribution (noun/verb/adjective ratios)
- Spelling error detection and frequency

#### 4. **Grammatical Range Features** (12 features)
- Grammar error detection and counting
- Punctuation usage patterns
- Capitalization and proper noun usage
- Sentence structure complexity

### Pipeline Architecture:

1. **Basic Features**: Fast, rule-based linguistic metrics
2. **Advanced Features**: spaCy-powered syntactic analysis  
3. **Semantic Features**: Transformer-based similarity computation
4. **Handcrafted Features**: Domain-specific essay scoring metrics

This multi-layered approach ensures we capture both surface-level and deep linguistic patterns that human raters consider when scoring essays.

In [None]:
# -------------------------
# Complete feature engineering implementation
# -------------------------

def engineer_features_complete(df: pd.DataFrame,
                              model_name: str = FEATURE_EMBEDDING_MODEL,
                              embedding_sub_batch: int = 16,
                              device: torch.device = device,
                              use_fp16: bool = True,
                              compute_embeddings: bool = True,
                              tqdm_bar: bool = True) -> pd.DataFrame:
    """
    Complete feature engineering with all advanced features.

    Runs ``engineer_features`` first, then adds the textstat/spaCy features
    stored under the ``ADVANCED_COLS`` keys, the prompt-essay embedding
    similarity, and finally replaces NaNs in the ``DEFAULT_TAB_COLS`` columns
    with 0.0.

    Args:
        df: DataFrame with 'essay' and 'prompt' columns
        model_name: Embedding model used for the similarity feature
        embedding_sub_batch: Sub-batch size passed to ``encode_with_model_once``
        device: Device passed to the embedding helpers
        use_fp16: Passed through to the embedding helpers
        compute_embeddings: If False, ``prompt_essay_similarity`` is set to 0.0
        tqdm_bar: Whether to show progress bars

    Returns:
        DataFrame with the advanced and similarity columns added
    """

    def _progress(iterable, desc, total=None):
        """Wrap an iterable in a tqdm bar when progress display is enabled."""
        return tqdm(iterable, desc=desc, total=total) if tqdm_bar else iterable

    def _collect(inputs, compute, keys, desc, total=None):
        """Call compute once per input and append each requested key to adv_store."""
        for item in _progress(inputs, desc, total):
            result = compute(item)
            for key in keys:
                adv_store[key].append(result[key])

    # Start with basic features.
    # NOTE(review): the sixth positional argument (False) is presumably
    # compute_embeddings, since similarity is computed below - confirm against
    # engineer_features' signature.
    out = engineer_features(df, model_name, embedding_sub_batch, device, use_fp16, False, tqdm_bar)

    # ------------ Advanced linguistic features ------------
    print("[step] Advanced linguistic features (spaCy + textstat)...")
    nlp = _get_nlp()  # Load spaCy model
    adv_store: Dict[str, list] = {k: [] for k in ADVANCED_COLS}

    # Prepare text data
    essays = out["essay"].fillna("").tolist()
    prompts = out["prompt"].fillna("").tolist()

    # Each helper below runs exactly once per essay; every key it returns is
    # stored from that single result.
    print("  Computing syllables per word...")
    for essay in _progress(essays, "Syllables/word"):
        adv_store["syllables_per_word"].append(syllables_per_word(essay))

    print("  Computing readability scores...")
    _collect(
        essays,
        readability_scores,
        ("flesch_reading_ease", "flesch_kincaid_grade", "dale_chall_score"),
        "Readability scores",
    )

    print("  Computing punctuation features...")
    _collect(
        essays,
        punctuation_features,
        ("comma_count", "period_count", "question_count", "exclam_count", "punctuation_ratio"),
        "Punctuation features",
    )

    # Topic coverage
    print("  Computing topic coverage ratios...")
    for prompt, essay in _progress(zip(prompts, essays), "Topic coverage", total=len(essays)):
        adv_store["topic_coverage"].append(topic_coverage(prompt, essay))

    # spaCy parses are reused by all document-level features below
    print("  Processing essays with spaCy...")
    essay_docs = [nlp(essay) for essay in _progress(essays, "spaCy essay processing")]

    print("  Processing prompts with spaCy...")
    prompt_docs = [nlp(prompt) for prompt in _progress(prompts, "spaCy prompt processing")]

    print("  Computing capitalization features...")
    _collect(
        essay_docs,
        capitalization_features,
        ("capitalized_word_count", "proper_noun_count",
         "capitalized_word_ratio", "proper_noun_ratio"),
        "Capitalization features",
    )

    print("  Computing POS ratios...")
    _collect(
        essay_docs,
        pos_distribution,
        ("noun_ratio", "verb_ratio", "adj_ratio", "adv_ratio"),
        "POS ratios",
    )

    # Clause complexity
    print("  Computing clause complexity...")
    for doc in _progress(essay_docs, "Clause complexity"):
        adv_store["clause_per_sentence"].append(clause_complexity(doc))

    print("  Computing named entity features...")
    _collect(
        zip(prompt_docs, essay_docs),
        lambda docs: named_entity_features(docs[0], docs[1]),
        ("ner_count", "ner_overlap_count", "ner_overlap_ratio"),
        "Named entity features",
        total=len(essay_docs),
    )

    # Add advanced features to dataframe
    print("  Adding advanced features to dataframe...")
    for col, vals in adv_store.items():
        out[col] = vals

    # ------------ Embedding similarity ------------
    if compute_embeddings:
        print("[step] Computing prompt-essay embedding similarity...")
        prompt_emb = encode_with_model_once(
            out["prompt"].fillna("").tolist(),
            model_name=model_name,
            sub_batch_size=embedding_sub_batch,
            device=device,
            use_fp16=use_fp16
        )
        clear_vram()

        essay_emb = encode_with_model_once(
            out["essay"].fillna("").tolist(),
            model_name=model_name,
            sub_batch_size=embedding_sub_batch,
            device=device,
            use_fp16=use_fp16
        )
        clear_vram()

        # Only the diagonal (prompt i vs essay i) of the similarity matrix is needed
        print("  Computing cosine similarity...")
        sims = util.pytorch_cos_sim(prompt_emb, essay_emb).diag()
        # .cpu() is a no-op for host tensors and required before .numpy() for CUDA ones
        out["prompt_essay_similarity"] = sims.cpu().numpy()
        clear_vram(prompt_emb, essay_emb, sims)
    else:
        out["prompt_essay_similarity"] = 0.0

    # Clean up temporary columns
    if "tokens" in out.columns:
        out.drop(columns=["tokens"], inplace=True)

    # Fill any remaining NaN values
    print("[step] Filling remaining NaN values...")
    for col in _progress(DEFAULT_TAB_COLS, "Fill NaNs"):
        if col in out.columns:
            out[col] = out[col].fillna(0.0)

    print(f"Feature engineering complete. Generated {len(DEFAULT_TAB_COLS)} features")
    return out

print("Complete feature engineering function defined")

def engineer_advanced_features(df: pd.DataFrame, tqdm_bar: bool = True) -> pd.DataFrame:
    """
    Add the textstat/spaCy-derived columns named in ADVANCED_FEATURES.

    Works on a copy of ``df``. If ``_get_nlp()`` returns None the copy is
    returned without any of the advanced columns.

    Args:
        df: DataFrame with 'essay' and 'prompt' columns
        tqdm_bar: Whether to show progress bars

    Returns:
        DataFrame with advanced features added
    """
    print(f"[INFO] Extracting advanced features for {len(df)} samples...")
    output_df = df.copy()

    # Without a spaCy pipeline none of the document-level features can be built
    nlp = _get_nlp()
    if nlp is None:
        print("⚠️  spaCy not available - skipping advanced features")
        return output_df

    # One value list per feature, filled row by row
    collected: Dict[str, list] = {name: [] for name in ADVANCED_FEATURES}

    def _record(scores, keys):
        """Append scores[key] to the matching feature list for each key."""
        for key in keys:
            collected[key].append(scores[key])

    essays = output_df["essay"].fillna("").tolist()
    prompts = output_df["prompt"].fillna("").tolist()
    n_rows = len(essays)

    # Parse essays first, then prompts, using spaCy's batched pipe
    print("  Processing essays with spaCy...")
    essay_stream = nlp.pipe(essays)
    if tqdm_bar:
        essay_stream = tqdm(essay_stream, desc="spaCy essay processing", total=n_rows)
    essay_docs = list(essay_stream)

    prompt_stream = nlp.pipe(prompts)
    if tqdm_bar:
        prompt_stream = tqdm(prompt_stream, desc="spaCy prompt processing", total=len(prompts))
    prompt_docs = list(prompt_stream)

    rows = zip(essays, prompts, essay_docs, prompt_docs)
    if tqdm_bar:
        rows = tqdm(rows, desc="Extracting advanced features", total=n_rows)

    for essay_text, prompt_text, essay_doc, prompt_doc in rows:
        # Text-based readability
        collected["syllables_per_word"].append(syllables_per_word(essay_text))
        _record(
            readability_scores(essay_text),
            ("flesch_reading_ease", "flesch_kincaid_grade", "dale_chall_score"),
        )

        # Text-based punctuation statistics
        _record(
            punctuation_features(essay_text),
            ("comma_count", "period_count", "question_count", "exclam_count", "punctuation_ratio"),
        )

        # Parse-based capitalization and part-of-speech statistics
        _record(
            capitalization_features(essay_doc),
            ("capitalized_word_count", "proper_noun_count",
             "capitalized_word_ratio", "proper_noun_ratio"),
        )
        _record(
            pos_distribution(essay_doc),
            ("noun_ratio", "verb_ratio", "adj_ratio", "adv_ratio"),
        )

        # Syntactic complexity
        collected["clause_per_sentence"].append(clause_complexity(essay_doc))

        # Named entities shared between prompt and essay
        _record(
            named_entity_features(prompt_doc, essay_doc),
            ("ner_count", "ner_overlap_count", "ner_overlap_ratio"),
        )

        # Topic coverage
        collected["topic_coverage"].append(topic_coverage(prompt_text, essay_text))

    # Columns are attached in ADVANCED_FEATURES order
    for column_name, column_values in collected.items():
        output_df[column_name] = column_values

    return output_df

def engineer_complete_features(df: pd.DataFrame,
                              model_name: str = FEATURE_EMBEDDING_MODEL,
                              embedding_batch_size: int = 16,
                              device: torch.device = device,
                              use_fp16: bool = True,
                              compute_embeddings: bool = True,
                              tqdm_bar: bool = True) -> pd.DataFrame:
    """
    Run the full feature pipeline: basic, advanced, then semantic similarity.

    After the three stages, NaNs in every ALL_TABULAR_FEATURES column that
    exists in the result are replaced with 0.0.

    Args:
        df: DataFrame with 'essay' and 'prompt' columns
        model_name: Embedding model for semantic features
        embedding_batch_size: Batch size for embedding computation
        device: Device for computation
        use_fp16: Passed through to the embedding helper
        compute_embeddings: If False, 'prompt_essay_similarity' is set to 0.0
        tqdm_bar: Show progress bars

    Returns:
        DataFrame with all features engineered
    """
    required_columns = ("essay", "prompt")
    assert all(column in df.columns for column in required_columns), \
           "DataFrame must contain 'essay' and 'prompt' columns"

    rule = "=" * 60
    print(rule)
    print("COMPREHENSIVE FEATURE ENGINEERING PIPELINE")
    print(rule)

    # Steps 1 and 2: basic features, then advanced features on top of them
    output_df = engineer_advanced_features(
        engineer_basic_features(df, tqdm_bar=tqdm_bar),
        tqdm_bar=tqdm_bar,
    )

    # Step 3: semantic similarity between each prompt and its essay
    if not compute_embeddings:
        output_df["prompt_essay_similarity"] = 0.0
    else:
        print(f"[INFO] Computing semantic similarity using {model_name}...")

        def _embed(column):
            """Encode one text column, treating missing values as empty strings."""
            return encode_with_model_once(
                output_df[column].fillna("").tolist(),
                model_name=model_name,
                sub_batch_size=embedding_batch_size,
                device=device,
                use_fp16=use_fp16
            )

        prompt_embeddings = _embed("prompt")
        essay_embeddings = _embed("essay")

        # Only the diagonal (prompt i vs essay i) of the similarity matrix is kept
        print("  Computing prompt-essay semantic similarity...")
        similarity_matrix = util.pytorch_cos_sim(prompt_embeddings, essay_embeddings)
        similarities = similarity_matrix.diag()
        output_df["prompt_essay_similarity"] = similarities.cpu().numpy()

        # Release the embedding tensors
        clear_vram(prompt_embeddings, essay_embeddings, similarities)

    # Step 4: zero-fill whatever is still missing in the known feature columns
    print("[INFO] Filling missing values with zeros...")
    for feature in ALL_TABULAR_FEATURES:
        if feature not in output_df.columns:
            continue
        output_df[feature] = output_df[feature].fillna(0.0)

    print("✓ Feature engineering complete!")
    print(f"  Generated {len(ALL_TABULAR_FEATURES)} features")
    print(f"  Final shape: {output_df.shape}")

    return output_df

print("✓ Complete feature engineering pipeline defined")

In [None]:
# =================================================================
# APPLY COMPLETE FEATURE ENGINEERING PIPELINE
# =================================================================
# Builds train_with_features / test_with_features from the cleaned data, adds
# the handcrafted features, then joins everything back onto the original
# frames as train_final / test_final.
# NOTE(review): add_handcrafted_features is not defined above this cell in the
# visible part of the notebook; confirm its defining cell runs first.

print("=" * 70)
print("APPLYING COMPREHENSIVE FEATURE ENGINEERING PIPELINE")
print("=" * 70)

# Apply complete feature engineering to training data
print(f"\n🔄 Processing training data ({len(train_clean)} samples)...")
print("Features will include:")
# Counts come from the feature lists so the banner cannot drift from them.
# prompt_essay_similarity is a member of BASE_FEATURES, so it is not a
# separate count.
print(f"  ✓ Basic linguistic features ({len(BASE_FEATURES)} features)")
print(f"  ✓ Advanced NLP features with spaCy ({len(ADVANCED_FEATURES)} features)")
print("  ✓ Semantic similarity feature (prompt_essay_similarity, included in the basic count)")
print("  ✓ Handcrafted essay-specific features (12+ features)")

train_with_features = engineer_complete_features(
    train_clean[['prompt', 'essay']].copy(),
    model_name=FEATURE_EMBEDDING_MODEL,
    embedding_batch_size=EMBEDDING_SUB_BATCH,
    device=device,
    use_fp16=True,
    compute_embeddings=True,
    tqdm_bar=True
)

# Apply feature engineering to test data
print(f"\n🔄 Processing test data ({len(test_clean)} samples)...")
test_with_features = engineer_complete_features(
    test_clean[['prompt', 'essay']].copy(),
    model_name=FEATURE_EMBEDDING_MODEL,
    embedding_batch_size=EMBEDDING_SUB_BATCH,
    device=device,
    use_fp16=True,
    compute_embeddings=True,
    tqdm_bar=True
)

# Add handcrafted features
print("\n🔄 Adding specialized handcrafted features...")
train_with_features = add_handcrafted_features(
    train_with_features,
    prompt_col="prompt",
    essay_col="essay",
    add_spacy_features=True
)

test_with_features = add_handcrafted_features(
    test_with_features,
    prompt_col="prompt",
    essay_col="essay",
    add_spacy_features=True
)

# Combine with original data (including target variables for training).
# Both sides are re-indexed from 0 so the column-wise concat aligns by position;
# prompt/essay are dropped from the feature side to avoid duplicate columns.
print("\n🔗 Combining with original data...")
train_final = pd.concat([
    train_clean.reset_index(drop=True),
    train_with_features.drop(columns=['prompt', 'essay']).reset_index(drop=True)
], axis=1)

test_final = pd.concat([
    test_clean.reset_index(drop=True),
    test_with_features.drop(columns=['prompt', 'essay']).reset_index(drop=True)
], axis=1)

# Feature engineering summary
print("\n" + "=" * 70)
print("FEATURE ENGINEERING COMPLETED SUCCESSFULLY")
print("=" * 70)
print(f"📊 Training data shape: {train_final.shape}")
print(f"📊 Test data shape: {test_final.shape}")

# Count features by category; anything that is neither a base/advanced feature
# nor a text column is attributed to the handcrafted step
basic_feature_count = len([f for f in BASE_FEATURES if f in train_with_features.columns])
advanced_feature_count = len([f for f in ADVANCED_FEATURES if f in train_with_features.columns])
handcrafted_feature_count = len([f for f in train_with_features.columns
                               if f not in BASE_FEATURES + ADVANCED_FEATURES + ['prompt', 'essay']])

total_engineered_features = basic_feature_count + advanced_feature_count + handcrafted_feature_count

print("\n📈 Feature Categories:")
print(f"  ✓ Basic linguistic features: {basic_feature_count}")
print(f"  ✓ Advanced NLP features: {advanced_feature_count}")
print(f"  ✓ Handcrafted features: {handcrafted_feature_count}")
print(f"  ✓ Total engineered features: {total_engineered_features}")

# Display sample feature statistics
engineered_columns = [col for col in train_with_features.columns if col not in ['prompt', 'essay']]
feature_sample = train_with_features[engineered_columns[:10]]  # First 10 features
print("\n📋 Sample feature statistics (first 10 features):")
print(feature_sample.describe().round(3))

print("\n✅ Ready for CatBoost model training!")
print("   Features are optimally engineered for the 4 scoring dimensions:")
print("   • Task Achievement • Coherence & Cohesion • Lexical Resource • Grammatical Range")

## Feature Engineering Summary & Mapping to Scoring Dimensions

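Our comprehensive feature engineering creates **54+ features** that directly map to the four essay scoring dimensions. Several features (for example `paragraph_count`, `sentence_count`, `sent_len_cv`, `clause_per_sentence` and `subordination_ratio`) inform more than one dimension, so the per-dimension counts below overlap and are approximate: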

### 🎯 **Task Achievement Features** (15 features)
- **Prompt Analysis**: `prompt_overlap`, `prompt_essay_similarity`, `prompt_keyword_coverage`
- **Completeness**: `word_count`, `sentence_count`, `paragraph_count`
- **Topic Coverage**: `topic_coverage`, `ner_overlap_ratio`, `has_conclusion`
- **Content Depth**: `unique_words`, `unique_word_ratio`, `ner_count`

### 🔗 **Coherence and Cohesion Features** (18 features)  
- **Organization**: `discourse_marker_count`, `paragraph_count`, `avg_sent_per_para`
- **Flow & Transition**: `sentence_count`, `sent_len_mean`, `sent_len_std`, `sent_len_cv`
- **Structural Complexity**: `clause_per_sentence`, `dep_tree_depth_mean`, `subordination_ratio`
- **Punctuation Flow**: `comma_count`, `period_count`, `punctuation_ratio`, `punct_entropy`

### 📚 **Lexical Resource Features** (12 features)
- **Vocabulary Richness**: `ttr`, `hapax_ratio`, `dis_ratio`, `yules_k`, `honore_r`
- **Word Sophistication**: `syllables_per_word`, `avg_word_length`
- **Readability**: `flesch_reading_ease`, `flesch_kincaid_grade`, `dale_chall_score`
- **Spelling Accuracy**: `spelling_errors`, `spelling_errors_per100`

### ⚙️ **Grammatical Range Features** (9+ features)
- **Grammar Accuracy**: `grammar_errors`, `grammar_errors_per100`
- **Sentence Variety**: `sent_len_cv`, `clause_per_sentence`, `subordination_ratio`
- **POS Distribution**: `noun_ratio`, `verb_ratio`, `adj_ratio`, `adv_ratio`
- **Capitalization**: `capitalized_word_ratio`, `proper_noun_ratio`

### 🚀 **Technical Advantages**

1. **Multi-Modal Feature Fusion**:
   - Rule-based linguistic features (fast, interpretable)
   - Deep learning embeddings (semantic understanding)  
   - Domain-specific handcrafted features (expert knowledge)

2. **Memory-Optimized Processing**:
   - GPU batch processing with FP16 precision
   - Automatic memory cleanup and garbage collection
   - Chunked processing for large datasets

3. **Robustness & Error Handling**:
   - Graceful fallbacks for failed computations
   - Missing value imputation strategies
   - Timeout handling for external tools

4. **Scalability**:
   - Vectorized operations where possible
   - Progress tracking for long-running processes
   - Modular design for easy feature addition/removal

This feature engineering approach provides CatBoost with rich, interpretable features that closely mirror how human raters evaluate essay quality across all four scoring dimensions.

In [None]:
# =================================================================
# ADDITIONAL HANDCRAFTED FEATURES FOR ESSAY SCORING
# =================================================================

import re
import math
import numpy as np
import pandas as pd
from collections import Counter

# Regex patterns for text analysis
# Words: runs of ASCII letters and apostrophes (keeps contractions such as "don't")
_WORD_RE = re.compile(r"[A-Za-z']+")
# Sentence boundaries: one or more terminal punctuation marks
_SENT_SPLIT = re.compile(r"[.!?]+")
# Paragraph boundaries: any run of CR/LF characters counts as ONE break, so
# Windows "\r\n" line endings do not produce empty paragraphs between lines
_PARA_SPLIT = re.compile(r"[\r\n]+")

# Words and phrases that signal how an essay is organized, grouped by function
DISCOURSE_MARKERS = [
    # Addition/continuation
    "moreover",
    "furthermore",
    "in addition",
    "additionally",
    "also",
    # Contrast/comparison
    "however",
    "nevertheless",
    "nonetheless",
    "on the other hand",
    "although",
    "though",
    "whereas",
    # Cause and effect
    "therefore",
    "thus",
    "hence",
    "consequently",
    "as a result",
    "so that",
    # Examples/illustration
    "for example",
    "for instance",
    "such as",
    # Sequence/conclusion
    "first",
    "second",
    "third",
    "finally",
    "in conclusion",
    "to conclude",
    "overall",
]

def _tokenize_words(text):
    """
    Extract lowercase word tokens (letters and apostrophes) from text.

    Args:
        text: Input text; non-string values such as None or NaN (the usual
            markers for a missing essay in a DataFrame) are treated as empty

    Returns:
        List of lowercase tokens matched by _WORD_RE, [] for missing text
    """
    if not isinstance(text, str):
        # Missing text has no words; avoids AttributeError on None/float NaN
        return []
    return _WORD_RE.findall(text.lower())

def _safe_divide(numerator, denominator):
    """Return numerator / denominator as a float, or 0.0 when the denominator is falsy."""
    if not denominator:
        return 0.0
    return float(numerator) / denominator

def lexical_diversity_metrics(words):
    """
    Calculate advanced lexical diversity measures.

    With N = number of tokens, V = number of distinct tokens, V1 = number of
    tokens occurring exactly once and V2 = number occurring exactly twice:

    - ttr:         V / N   (Type-Token Ratio, vocabulary richness)
    - hapax_ratio: V1 / N  (hapax legomena, vocabulary breadth)
    - dis_ratio:   V2 / N  (dis legomena, vocabulary control)
    - yules_k:     1e4 * (sum(freq^2) - N) / N^2  (Yule's K)
    - honore_r:    100 * ln(N) * V / (V - V1)     (Honore's R)

    Honore's R is canonically 100 * ln(N) / (1 - V1 / V), i.e. the divisor is
    based on the number of *types* (V), not the number of tokens (N). When
    every type is a hapax (V == V1) the statistic is undefined, so the divisor
    is clamped to 1 to keep the feature finite.

    Args:
        words: List of word tokens

    Returns:
        Dictionary with keys "ttr", "hapax_ratio", "dis_ratio", "yules_k" and
        "honore_r" (all floats; all 0.0 for an empty token list)
    """
    word_count = len(words)
    if word_count == 0:
        # No tokens: every ratio is defined as 0.0 rather than dividing by zero
        return {
            "ttr": 0.0,
            "hapax_ratio": 0.0,
            "dis_ratio": 0.0,
            "yules_k": 0.0,
            "honore_r": 0.0,
        }

    word_frequencies = Counter(words)
    vocabulary_size = len(word_frequencies)
    hapax = sum(1 for count in word_frequencies.values() if count == 1)
    dis = sum(1 for count in word_frequencies.values() if count == 2)

    # Yule's K (vocabulary distribution complexity)
    token_total = float(word_count)
    squared_frequency_sum = sum(count * count for count in word_frequencies.values())
    yules_k = 1e4 * ((squared_frequency_sum - token_total) / (token_total * token_total))

    # Honore's R (advanced vocabulary richness); divisor is V - V1, clamped to >= 1
    honore_r = 100 * math.log(word_count) * (
        vocabulary_size / max(1, vocabulary_size - hapax)
    )

    return {
        "ttr": vocabulary_size / word_count,
        "hapax_ratio": hapax / word_count,
        "dis_ratio": dis / word_count,
        "yules_k": yules_k,
        "honore_r": honore_r,
    }

def punctuation_entropy(text):
    """
    Measure how varied the punctuation of a text is.

    Only the characters , . ; : ! ? are considered. The result is the
    entropy (natural log) of their relative frequencies, with a 1e-12
    offset inside the logarithm; a text without any of these characters
    scores 0.0.

    Args:
        text: Input text string

    Returns:
        Punctuation entropy value
    """
    mark_counts = Counter(char for char in text if char in ",.;:!?")
    total = sum(mark_counts.values())
    if total == 0:
        return 0.0

    shares = [count / total for count in mark_counts.values()]
    return -sum(share * math.log(share + 1e-12) for share in shares)

def discourse_marker_count(text, markers=None):
    """
    Count discourse markers indicating essay organization.
    Higher counts suggest better coherence and cohesion.

    Each marker is matched case-insensitively as a whole word or phrase using
    regex word boundaries, so occurrences followed by punctuation
    ("However, ..."), preceded by a newline, or at the very start or end of
    the text are counted, while markers embedded in longer words
    ("though" inside "although") are not.

    Args:
        text: Input text string
        markers: Optional iterable of marker words/phrases; defaults to the
            module-level DISCOURSE_MARKERS list

    Returns:
        Total number of marker occurrences found (summed over all markers)
    """
    if markers is None:
        markers = DISCOURSE_MARKERS

    text_lower = text.lower()
    count = 0
    for marker in markers:
        # \b on both sides gives true word-boundary matching; re.escape keeps
        # the marker literal even if it ever contains regex metacharacters.
        pattern = r"\b" + re.escape(marker.lower()) + r"\b"
        count += len(re.findall(pattern, text_lower))

    return count

def sentence_length_statistics(text):
    """
    Summarize sentence lengths (in word tokens) for a text.

    The text is cut at runs of '.', '!' or '?'; fragments that are blank
    after stripping are dropped. Each remaining fragment's length is its
    number of word tokens. For a text with no fragments the statistics are
    computed over a single length of 0.

    Args:
        text: Input text string

    Returns:
        Dictionary with:
        - "sent_len_mean": mean sentence length
        - "sent_len_std": population standard deviation of sentence length
        - "sent_len_cv": std / mean (0.0 when the mean is not positive)
        - "sentence_count": number of non-blank fragments
    """
    sentences = []
    for fragment in _SENT_SPLIT.split(text):
        stripped = fragment.strip()
        if stripped:
            sentences.append(stripped)

    sentence_lengths = [len(_tokenize_words(sentence)) for sentence in sentences]
    if not sentence_lengths:
        # Keep mean/std well-defined for texts without any sentence
        sentence_lengths = [0]

    mean_length = float(np.mean(sentence_lengths))
    std_length = float(np.std(sentence_lengths))

    if mean_length > 0:
        cv = std_length / mean_length
    else:
        cv = 0.0

    stats = {}
    stats["sent_len_mean"] = mean_length
    stats["sent_len_std"] = std_length
    stats["sent_len_cv"] = cv
    stats["sentence_count"] = len(sentences)
    return stats

def structural_organization_metrics(text):
    """
    Analyze essay structure and organization.

    Paragraphs are the non-blank pieces between newline / carriage-return
    runs; sentences are the non-blank pieces between runs of '.', '!' or '?'.
    Blank pieces are excluded from both counts: splitting on a trailing
    terminator yields an empty final piece, which would otherwise inflate the
    sentence count by one for almost every essay.

    Args:
        text: Input text string

    Returns:
        Dictionary with:
        - "paragraph_count": number of non-blank paragraphs
        - "avg_sent_per_para": sentences / paragraphs (0.0 without paragraphs)
        - "has_conclusion": 1.0 if a concluding phrase occurs, else 0.0
    """
    paragraphs = [para.strip() for para in _PARA_SPLIT.split(text) if para.strip()]
    # Same filtering rule as sentence_length_statistics so both agree on the count
    sentences = [piece for piece in _SENT_SPLIT.split(text) if piece.strip()]

    # Check for conclusion indicators
    has_conclusion = 1.0 if re.search(
        r"\b(in conclusion|to conclude|overall|in summary|finally)\b",
        text.lower()
    ) else 0.0

    return {
        "paragraph_count": len(paragraphs),
        # _safe_divide already yields 0.0 when there are no paragraphs
        "avg_sent_per_para": _safe_divide(len(sentences), len(paragraphs)),
        "has_conclusion": has_conclusion
    }

def prompt_keyword_coverage_ratio(prompt_text, essay_text, top_k=20):
    """
    Measure the share of the prompt's main keywords that the essay uses.

    Prompt keywords are the prompt's word tokens longer than three
    characters, ranked by frequency in the prompt; the ``top_k`` most
    frequent ones are looked up among the essay's word tokens.

    Args:
        prompt_text: The essay prompt
        essay_text: Student's essay
        top_k: Number of top prompt keywords to consider

    Returns:
        Fraction of those keywords present in the essay (0.0 when the prompt
        yields no keywords)
    """
    keyword_frequencies = Counter(
        token for token in _tokenize_words(prompt_text) if len(token) > 3
    )
    essay_vocabulary = set(_tokenize_words(essay_text))

    ranked_keywords = keyword_frequencies.most_common(top_k)
    if not ranked_keywords:
        return 0.0

    hits = 0
    for keyword, _frequency in ranked_keywords:
        if keyword in essay_vocabulary:
            hits += 1

    return _safe_divide(hits, len(ranked_keywords))

# Notebook progress message: confirms the helper definitions above were executed
print("✓ Handcrafted feature functions defined")

def add_handcrafted_features(df, prompt_col="prompt_clean", essay_col="essay_clean",
                             misspell_col="spelling_errors", grammar_col="grammar_errors",
                             add_spacy=False, spacy_model="en_core_web_sm"):
    """
    Return a copy of ``df`` with handcrafted essay-scoring features appended.

    Added columns: ttr, hapax_ratio, dis_ratio, yules_k, honore_r,
    punct_entropy, discourse_marker_count, sent_len_mean2, sent_len_std2,
    sent_len_cv, sentence_count2, paragraph_count, avg_sent_per_para,
    has_conclusion, prompt_keyword_coverage; plus spelling_errors_per100 /
    grammar_errors_per100 when the corresponding error-count column exists,
    and dep_tree_depth_mean / subordination_ratio when ``add_spacy`` is True
    and the spaCy model can be loaded.

    NOTE: a later definition in this file reuses the name
    ``add_handcrafted_features`` with different keyword names
    (``spelling_col``, ``add_spacy_features``).

    Args:
        df: Input dataframe; must contain ``prompt_col``, ``essay_col`` and a
            "word_count" column
        prompt_col: Column name for prompt text (missing values -> "")
        essay_col: Column name for essay text (missing values -> "")
        misspell_col: Column name for spelling error count (optional column)
        grammar_col: Column name for grammar error count (optional column)
        add_spacy: Whether to add spaCy-based syntactic features
        spacy_model: spaCy model name to load

    Returns:
        New DataFrame with the feature columns added; ``df`` is not modified
    """
    out = df.copy()

    lex_ttr, lex_hapax, lex_dis, lex_yule, lex_hon = [], [], [], [], []
    punct_ent, disc_ct = [], []
    s_mean, s_std, s_cv, s_cnt = [], [], [], []
    para_cnt, sent_per_para, has_concl = [], [], []
    kw_cover = []

    for p, e in zip(out[prompt_col].fillna(""), out[essay_col].fillna("")):
        words = _tokenize_words(e)
        lm = lexical_diversity_metrics(words)
        lex_ttr.append(lm["ttr"])
        lex_hapax.append(lm["hapax_ratio"])
        lex_dis.append(lm["dis_ratio"])
        lex_yule.append(lm["yules_k"])
        lex_hon.append(lm["honore_r"])

        punct_ent.append(punctuation_entropy(e))
        disc_ct.append(discourse_marker_count(e))

        sstats = sentence_length_statistics(e)
        s_mean.append(sstats["sent_len_mean"])
        s_std.append(sstats["sent_len_std"])
        s_cv.append(sstats["sent_len_cv"])
        s_cnt.append(sstats["sentence_count"])

        sm = structural_organization_metrics(e)
        para_cnt.append(sm["paragraph_count"])
        sent_per_para.append(sm["avg_sent_per_para"])
        has_concl.append(sm["has_conclusion"])

        kw_cover.append(prompt_keyword_coverage_ratio(p, e, top_k=20))

    out["ttr"] = lex_ttr
    out["hapax_ratio"] = lex_hapax
    out["dis_ratio"] = lex_dis
    out["yules_k"] = lex_yule
    out["honore_r"] = lex_hon

    out["punct_entropy"] = punct_ent
    out["discourse_marker_count"] = disc_ct

    # The "2" suffix keeps these from overwriting same-named columns that may
    # already exist in the dataframe (e.g. an existing sentence_count).
    out["sent_len_mean2"] = s_mean
    out["sent_len_std2"] = s_std
    out["sent_len_cv"] = s_cv
    out["sentence_count2"] = s_cnt

    out["paragraph_count"] = para_cnt
    out["avg_sent_per_para"] = sent_per_para
    out["has_conclusion"] = has_concl

    out["prompt_keyword_coverage"] = kw_cover

    # Normalize existing error counts by length (per 100 words);
    # zero word counts are replaced by 1 to avoid division by zero.
    wcounts = out["word_count"].replace(0, 1)
    if misspell_col in out.columns:
        out["spelling_errors_per100"] = 100.0 * out[misspell_col] / wcounts
    if grammar_col in out.columns:
        out["grammar_errors_per100"] = 100.0 * out[grammar_col] / wcounts

    # OPTIONAL: spaCy syntactic complexity feats
    if add_spacy:
        try:
            import spacy
            nlp = spacy.load(spacy_model, disable=["ner", "textcat"])
        except Exception as exc:
            # Best-effort: the spaCy features are optional, so report the
            # problem and continue without them instead of failing silently.
            print(f"⚠️  spaCy features skipped: {exc}")
            nlp = None
        if nlp is not None:
            dep_depths, subord_ratio = [], []
            for e in out[essay_col].fillna(""):
                doc = nlp(e)
                # avg dependency tree depth per sentence (approx)
                depths = []
                sub_tokens = 0
                for sent in doc.sents:
                    # longest path to a root within the sentence
                    d = max((len(list(tok.ancestors)) for tok in sent), default=0)
                    depths.append(d)
                    sub_tokens += sum(
                        1 for tok in sent
                        if tok.dep_ in ("mark", "advcl", "ccomp", "xcomp", "acl")
                    )
                dep_depths.append(float(np.mean(depths)) if depths else 0.0)
                total_tokens = len(doc)
                # Was `_safe_div`, an undefined name that raised NameError here
                subord_ratio.append(_safe_divide(sub_tokens, total_tokens))
            out["dep_tree_depth_mean"] = dep_depths
            out["subordination_ratio"] = subord_ratio

    return out

# ---- APPLY to your dataframes (train/test) ----
# Assumes train/test are already loaded and contain the 'prompt_clean',
# 'essay_clean' and 'word_count' columns read by add_handcrafted_features.
# The `add_spacy` keyword belongs to the definition directly above; set it to
# False if spaCy or the 'en_core_web_sm' model is not installed.
train = add_handcrafted_features(train, add_spacy=True)  # rebinds train to the augmented copy
test  = add_handcrafted_features(test, add_spacy=True)

## Specialized Handcrafted Features for Essay Scoring

These domain-specific features are designed specifically for automated essay scoring based on research in writing assessment:

### Advanced Lexical Diversity Measures

**Why lexical diversity matters for scoring:**
- **Type-Token Ratio (TTR)**: Basic vocabulary richness indicator
- **Hapax Legomena Ratio**: Measures vocabulary breadth (words used once)
- **Dis Legomena Ratio**: Indicates vocabulary control (words used twice)  
- **Yule's K**: Sophisticated measure of vocabulary distribution
- **Honore's R**: Advanced vocabulary richness accounting for text length

These metrics directly correlate with the **Lexical Resource** scoring dimension.

### Discourse Organization Features

**Why discourse matters:**
- **Discourse Markers**: Words and phrases like "however", "therefore", "in conclusion"
- Their use indicates essay organization and logical flow
- They directly support **Coherence and Cohesion** evaluation
- They show the writer's awareness of rhetorical structure

### Syntactic Sophistication

**Why sentence variation matters:**
- **Sentence Length Statistics**: Mean, standard deviation, coefficient of variation
- Good writers vary sentence length for rhythm and emphasis
- Supports **Grammatical Range** scoring dimension
- Shows command of complex sentence structures

### Structural Analysis

**Why essay structure matters:**
- **Paragraph Organization**: Proper essay formatting
- **Conclusion Detection**: Shows that the writer brings the essay to an explicit close
- **Sentence-to-Paragraph Ratio**: Balanced development
- Critical for **Task Achievement** and **Coherence and Cohesion**

These handcrafted features complement automated NLP features by incorporating domain expertise about what makes effective academic writing.

In [None]:
def add_handcrafted_features(df,
                             prompt_col="prompt_clean",
                             essay_col="essay_clean",
                             spelling_col="spelling_errors",
                             grammar_col="grammar_errors",
                             add_spacy_features=False,
                             spacy_model="en_core_web_sm"):
    """
    Add specialized handcrafted features to the dataframe.

    Always added: ttr, hapax_ratio, dis_ratio, yules_k, honore_r,
    punct_entropy, discourse_marker_count, sent_len_mean2, sent_len_std2,
    sent_len_cv, sentence_count2, paragraph_count, avg_sent_per_para,
    has_conclusion, prompt_keyword_coverage.
    Added when the matching error-count column exists:
    spelling_errors_per100, grammar_errors_per100.
    Added when ``add_spacy_features`` is True and spaCy processing succeeds:
    dep_tree_depth_mean, subordination_ratio.

    Args:
        df: Input dataframe; must contain ``prompt_col``, ``essay_col`` and a
            "word_count" column
        prompt_col: Column name for prompt text
        essay_col: Column name for essay text
        spelling_col: Column name for spelling error count
        grammar_col: Column name for grammar error count
        add_spacy_features: Whether to add spaCy-based syntactic features
        spacy_model: spaCy model name to use

    Returns:
        DataFrame (a copy of ``df``) with handcrafted features added

    Raises:
        KeyError: If a required column is missing from ``df``
    """
    # Fail fast: these columns are read unconditionally below, so report a
    # missing one before spending time on per-essay feature extraction.
    missing_columns = [
        column for column in (prompt_col, essay_col, "word_count")
        if column not in df.columns
    ]
    if missing_columns:
        raise KeyError(
            f"add_handcrafted_features: missing required column(s): {missing_columns}"
        )

    print(f"[INFO] Adding handcrafted features to {len(df)} samples...")
    output_df = df.copy()
    prompts = output_df[prompt_col].fillna("")
    essays = output_df[essay_col].fillna("")

    # Initialize feature storage lists
    feature_lists = {
        'ttr': [], 'hapax_ratio': [], 'dis_ratio': [], 'yules_k': [], 'honore_r': [],
        'punct_entropy': [], 'discourse_marker_count': [],
        'sent_len_mean2': [], 'sent_len_std2': [], 'sent_len_cv': [], 'sentence_count2': [],
        'paragraph_count': [], 'avg_sent_per_para': [], 'has_conclusion': [],
        'prompt_keyword_coverage': []
    }

    # Process each essay-prompt pair
    for prompt_text, essay_text in tqdm(zip(prompts, essays),
                                        desc="Extracting handcrafted features",
                                        total=len(output_df)):

        # Lexical diversity features
        words = _tokenize_words(essay_text)
        lexical_metrics = lexical_diversity_metrics(words)
        for key in ("ttr", "hapax_ratio", "dis_ratio", "yules_k", "honore_r"):
            feature_lists[key].append(lexical_metrics[key])

        # Punctuation and discourse features
        feature_lists['punct_entropy'].append(punctuation_entropy(essay_text))
        feature_lists['discourse_marker_count'].append(discourse_marker_count(essay_text))

        # Sentence statistics (stored under "2"-suffixed names where noted)
        sentence_stats = sentence_length_statistics(essay_text)
        feature_lists['sent_len_mean2'].append(sentence_stats["sent_len_mean"])
        feature_lists['sent_len_std2'].append(sentence_stats["sent_len_std"])
        feature_lists['sent_len_cv'].append(sentence_stats["sent_len_cv"])
        feature_lists['sentence_count2'].append(sentence_stats["sentence_count"])

        # Structural features
        structure_metrics = structural_organization_metrics(essay_text)
        for key in ("paragraph_count", "avg_sent_per_para", "has_conclusion"):
            feature_lists[key].append(structure_metrics[key])

        # Prompt coverage
        feature_lists['prompt_keyword_coverage'].append(
            prompt_keyword_coverage_ratio(prompt_text, essay_text, top_k=20)
        )

    # Add all features to dataframe
    for feature_name, values in feature_lists.items():
        output_df[feature_name] = values
    # Track every column actually written so the summary below is accurate
    added_features = list(feature_lists)

    # Normalize error counts by word length (per 100 words)
    word_counts = output_df["word_count"].replace(0, 1)  # Avoid division by zero
    for count_col, rate_col in ((spelling_col, "spelling_errors_per100"),
                                (grammar_col, "grammar_errors_per100")):
        if count_col in output_df.columns:
            output_df[rate_col] = 100.0 * output_df[count_col] / word_counts
            added_features.append(rate_col)

    # Optional: Add spaCy syntactic complexity features
    if add_spacy_features:
        print("  Adding spaCy syntactic complexity features...")
        try:
            import spacy
            nlp = spacy.load(spacy_model, disable=["ner", "textcat"])

            dependency_depths = []
            subordination_ratios = []

            # nlp.pipe processes the texts in batches and yields one Doc per
            # input text, in input order.
            for doc in tqdm(nlp.pipe(essays, batch_size=64),
                            desc="spaCy syntactic analysis",
                            total=len(essays)):
                # Calculate average dependency tree depth per sentence
                sentence_depths = []
                subordinate_tokens = 0

                for sentence in doc.sents:
                    # Find maximum depth to root in this sentence
                    max_depth = max((len(list(token.ancestors)) for token in sentence), default=0)
                    sentence_depths.append(max_depth)

                    # Count subordinate constructions
                    subordinate_tokens += sum(
                        1 for token in sentence
                        if token.dep_ in ("mark", "advcl", "ccomp", "xcomp", "acl")
                    )

                avg_depth = float(np.mean(sentence_depths)) if sentence_depths else 0.0
                subord_ratio = _safe_divide(subordinate_tokens, len(doc))

                dependency_depths.append(avg_depth)
                subordination_ratios.append(subord_ratio)

            output_df["dep_tree_depth_mean"] = dependency_depths
            output_df["subordination_ratio"] = subordination_ratios
            added_features.extend(["dep_tree_depth_mean", "subordination_ratio"])

        except Exception as e:
            # Deliberate best-effort: spaCy features are optional, so report
            # the failure and return the dataframe without them.
            print(f"⚠️  spaCy features failed: {e}")

    print(f"✓ Added {len(added_features)} handcrafted features")
    return output_df

# Example usage (commented out - will be applied in the main pipeline).
# These calls use the `add_spacy_features` keyword of the definition directly above.
# train_with_handcrafted = add_handcrafted_features(train, add_spacy_features=True)
# test_with_handcrafted = add_handcrafted_features(test, add_spacy_features=True)

# Notebook progress message: confirms this cell's definition was executed
print("✓ Handcrafted feature application function defined")