In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import zipfile
import gc

from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
)
from safetensors.torch import load_file
# We need these from camel_tools for the feature calculation placeholder
from camel_tools.tokenizers.word import simple_word_tokenize

# =====================================================================================
# 1. CONFIGURATION
# =====================================================================================
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
TARGET_CLASSES = 19
NUM_FEATURES = 7 # This MUST match your hybrid model's training

# --- MODIFICATION: Set paths for your local machine ---
# Use a raw string (r"...") for Windows paths to handle backslashes correctly
CHECKPOINT_PATH = r"D:\arabic_readability_project\results\hybrid_regression_readability-arabertv2-d3tok-reg\checkpoint-48944"

# Assuming your 'data' and 'submission' folders are in the same project directory
BASE_DIR = r"D:\arabic_readability_project" 
DATA_DIR = os.path.join(BASE_DIR, "data")
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")

DOC_BLIND_TEST_PATH = os.path.join(DATA_DIR, 'doc_blind_test_data.csv')
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_document.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_document.zip")

os.makedirs(SUBMISSION_DIR, exist_ok=True)
print(f"✔️ Configuration loaded. Using checkpoint: {CHECKPOINT_PATH}")


# =====================================================================================
# 2. FEATURE CALCULATION - !! ACTION REQUIRED !!
# =====================================================================================
def calculate_lexical_features(sentence_text):
    """
    Calculates the 7 lexical features for a single sentence.

    !!! IMPORTANT !!!
    You MUST replace the logic in this function with the EXACT same feature
    calculations you used to create your 'train_processed_full.csv' file.
    The order and type of features must be identical.
    
    This placeholder function demonstrates the concept.
    """
    # --- START OF PLACEHOLDER LOGIC ---
    
    # Example Feature 1: Word Count
    words = simple_word_tokenize(sentence_text)
    word_count = len(words)
    
    # Example Feature 2: Character Count
    char_count = len(sentence_text)
    
    # Example Feature 3: Average Word Length
    avg_word_len = np.mean([len(w) for w in words]) if words else 0

    # ... and so on for the other 4 features.
    
    # The final list MUST contain exactly NUM_FEATURES (7) items.
    # The dummy values below ensure the script runs. Replace them.
    feature_vector = [
        word_count,
        char_count,
        avg_word_len,
        0, # Dummy feature 4
        0, # Dummy feature 5
        0, # Dummy feature 6
        0, # Dummy feature 7
    ]
    
    # --- END OF PLACEHOLDER LOGIC ---

    return feature_vector


# =====================================================================================
# 3. HYBRID MODEL AND DATASET CLASSES
# These definitions MUST match the ones used for training your hybrid model.
# =====================================================================================

class HybridRegressionModel(nn.Module):
    """This is the architecture from your successful notebook."""
    def __init__(self, model_name, num_extra_features):
        super(HybridRegressionModel, self).__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        transformer_output_dim = self.transformer.config.hidden_size
        # This complex head must match your trained model EXACTLY
        self.head = nn.Sequential(
            nn.Linear(transformer_output_dim + num_extra_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1)
        )

    def forward(self, input_ids, attention_mask, features, labels=None):
        transformer_outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = transformer_outputs.last_hidden_state[:, 0, :]
        combined_features = torch.cat([cls_embedding, features], dim=1)
        logits = self.head(combined_features).squeeze(-1)

        if labels is not None:
            loss_fn = nn.MSELoss()
            loss = loss_fn(logits, labels.float())
            return (loss, logits)
        return logits

class ReadabilityDataset(TorchDataset):
    """Custom Dataset to format text and features for the hybrid model."""
    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        self.texts = texts
        self.features = features
        self.labels = labels
        self.tokenizer = tokenizer_obj
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        feature_vec = torch.tensor(self.features[idx], dtype=torch.float)
        inputs = self.tokenizer.encode_plus(
            text, None, add_special_tokens=True, max_length=self.max_len,
            padding='max_length', truncation=True, return_token_type_ids=True
        )
        item = {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'features': feature_vec
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


# =====================================================================================
# 4. PREDICTION LOGIC
# =====================================================================================

def generate_document_predictions(checkpoint_path):
    print("\n===== 🚀 STARTING HYBRID DOCUMENT PREDICTION PIPELINE =====\n")
    try:
        # --- 1. Load Model and Tokenizer ---
        print("Initializing Tokenizer...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        
        print(f"Loading model weights from checkpoint: {checkpoint_path}")
        model_weights_path = os.path.join(checkpoint_path, "model.safetensors")
        if not os.path.exists(model_weights_path):
            raise FileNotFoundError(f"'model.safetensors' not found in the checkpoint directory.")

        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        state_dict = load_file(model_weights_path)
        model.load_state_dict(state_dict)
        print("✔ Model loaded successfully.")
        
        # --- 2. Load and Process Document Data ---
        print(f"Loading document test data from: {DOC_BLIND_TEST_PATH}")
        doc_test_df = pd.read_csv(DOC_BLIND_TEST_PATH)
        doc_test_df.dropna(subset=['ID', 'Sentences'], inplace=True)
        
        print("Processing documents: splitting sentences and calculating lexical features...")
        all_doc_ids = []
        all_sentences = []
        all_features = []
        for _, row in doc_test_df.iterrows():
            doc_id = row['ID']
            full_text = row['Sentences']
            
            if isinstance(full_text, str) and full_text.strip():
                sentences_list = [s.strip() for s in full_text.split('\n') if s.strip()]
                for sentence in sentences_list:
                    # This is the new, critical step
                    lexical_features = calculate_lexical_features(sentence)
                    all_doc_ids.append(doc_id)
                    all_sentences.append(sentence)
                    all_features.append(lexical_features)

        sentence_df = pd.DataFrame({
            'doc_id': all_doc_ids,
            'sentence_text': all_sentences,
            'features': all_features
        })
        print(f"✔ Successfully created {len(sentence_df):,} sentences with features from {len(doc_test_df):,} documents.")

        # --- 3. Run Prediction ---
        trainer = Trainer(model=model, args=TrainingArguments(output_dir="./temp_results", per_device_eval_batch_size=32, report_to="none"))
        
        print("\nGenerating predictions for all sentences...")
        test_dataset = ReadabilityDataset(
            texts=sentence_df['sentence_text'].tolist(),
            features=sentence_df['features'].tolist(),
            tokenizer_obj=tokenizer
        )
        raw_predictions = trainer.predict(test_dataset)
        sentence_df['raw_prediction'] = raw_predictions.predictions.flatten()

        # --- 4. Aggregate and Save Submission ---
        print("Aggregating results: finding the max readability score per document...")
        doc_predictions = sentence_df.groupby('doc_id')['raw_prediction'].max()
        
        clipped_preds = np.clip(np.round(doc_predictions.values), 0, TARGET_CLASSES - 1)
        final_labels = (clipped_preds + 1).astype(int)
        
        submission_df = pd.DataFrame({'Sentence ID': doc_predictions.index, 'Prediction': final_labels})
        
        # Merge to ensure all original IDs are present
        final_submission_df = pd.DataFrame({'Sentence ID': doc_test_df['ID']}).merge(submission_df, on='Sentence ID', how='left')
        final_submission_df['Prediction'].fillna(1, inplace=True) # Default for docs with no text
        final_submission_df['Prediction'] = final_submission_df['Prediction'].astype(int)

        print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
        final_submission_df.to_csv(SUBMISSION_PATH, index=False)
        
        print(f"Compressing into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))
        
        print(f"\n--- ✅ SUCCESS! Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' created successfully. ---")

    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")
    finally:
        gc.collect()
        if 'model' in locals(): del model
        if 'trainer' in locals(): del trainer
        torch.cuda.empty_cache()


# =====================================================================================
# 5. EXECUTE THE SCRIPT
# =====================================================================================
if __name__ == "__main__":
    generate_document_predictions(CHECKPOINT_PATH)

✔️ Configuration loaded. Using checkpoint: D:\arabic_readability_project\results\hybrid_regression_readability-arabertv2-d3tok-reg\checkpoint-48944

===== 🚀 STARTING HYBRID DOCUMENT PREDICTION PIPELINE =====

Initializing Tokenizer...
Loading model weights from checkpoint: D:\arabic_readability_project\results\hybrid_regression_readability-arabertv2-d3tok-reg\checkpoint-48944
✔ Model loaded successfully.
Loading document test data from: D:\arabic_readability_project\data\doc_blind_test_data.csv
Processing documents: splitting sentences and calculating lexical features...
✔ Successfully created 3,420 sentences with features from 100 documents.

Generating predictions for all sentences...


Aggregating results: finding the max readability score per document...

Saving prediction file to: D:\arabic_readability_project\submission\submission_hybrid_document.csv
Compressing into submission_hybrid_document.zip...

--- ✅ SUCCESS! Submission file 'submission_hybrid_document.zip' created successfully. ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_submission_df['Prediction'].fillna(1, inplace=True) # Default for docs with no text


In [2]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import zipfile
import gc

from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
)
from safetensors.torch import load_file
from camel_tools.tokenizers.word import simple_word_tokenize

# =====================================================================================
# 1. CONFIGURATION
# =====================================================================================
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
TARGET_CLASSES = 19
NUM_FEATURES = 7
CHECKPOINT_PATH = r"D:\arabic_readability_project\results\hybrid_regression_readability-arabertv2-d3tok-reg\checkpoint-48944"
BASE_DIR = r"D:\arabic_readability_project" 
DATA_DIR = os.path.join(BASE_DIR, "data")
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")
DOC_BLIND_TEST_PATH = os.path.join(DATA_DIR, 'doc_blind_test_data.csv')
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_document.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_document.zip")

os.makedirs(SUBMISSION_DIR, exist_ok=True)
print(f"✔️ Configuration loaded. Using checkpoint: {CHECKPOINT_PATH}")


# =====================================================================================
# 2. FEATURE CALCULATION - !! THIS IS THE MOST LIKELY PROBLEM AREA !!
# =====================================================================================
def calculate_lexical_features(sentence_text):
    """
    You MUST replace this with the EXACT feature calculation logic from your training notebook.
    """
    words = simple_word_tokenize(sentence_text)
    word_count = len(words)
    char_count = len(sentence_text)
    avg_word_len = np.mean([len(w) for w in words]) if words else 0
    
    # The dummy values below ensure the script runs. Replace them.
    feature_vector = [word_count, char_count, avg_word_len, 0, 0, 0, 0]
    return feature_vector


# =====================================================================================
# 3. HYBRID MODEL AND DATASET CLASSES (These must match your training setup)
# =====================================================================================

class HybridRegressionModel(nn.Module):
    def __init__(self, model_name, num_extra_features):
        super(HybridRegressionModel, self).__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        transformer_output_dim = self.transformer.config.hidden_size
        self.head = nn.Sequential(
            nn.Linear(transformer_output_dim + num_extra_features, 512),
            nn.BatchNorm1d(512), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(256, 1)
        )
    def forward(self, input_ids, attention_mask, features, labels=None):
        transformer_outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = transformer_outputs.last_hidden_state[:, 0, :]
        combined_features = torch.cat([cls_embedding, features], dim=1)
        logits = self.head(combined_features).squeeze(-1)
        if labels is not None:
            loss = nn.MSELoss()(logits, labels.float())
            return (loss, logits)
        return logits

class ReadabilityDataset(TorchDataset):
    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        self.texts = texts; self.features = features; self.labels = labels
        self.tokenizer = tokenizer_obj; self.max_len = max_len
    def __len__(self):
        return len(self.texts)
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        feature_vec = torch.tensor(self.features[idx], dtype=torch.float)
        inputs = self.tokenizer.encode_plus(text, None, add_special_tokens=True, max_length=self.max_len, padding='max_length', truncation=True)
        item = {
            'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'features': feature_vec
        }
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

# Dummy d3tok function since it's not available locally but was part of your training
# We will use the raw sentence text instead, which is a better test anyway.
def dummy_preprocess_d3tok(text):
    return text


# =====================================================================================
# 4. PREDICTION LOGIC WITH DEBUGGING OUTPUT
# =====================================================================================

def generate_document_predictions(checkpoint_path):
    print("\n===== 🚀 STARTING HYBRID DOCUMENT PREDICTION PIPELINE =====\n")
    try:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        print(f"Loading model weights from checkpoint: {checkpoint_path}")
        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        model.load_state_dict(load_file(os.path.join(checkpoint_path, "model.safetensors")))
        print("✔ Model loaded successfully.")
        
        doc_test_df = pd.read_csv(DOC_BLIND_TEST_PATH)
        doc_test_df.dropna(subset=['ID', 'Sentences'], inplace=True)
        
        print("Processing documents: splitting sentences and calculating features...")
        rows_for_df = []
        for _, row in doc_test_df.iterrows():
            doc_id = row['ID']
            full_text = row['Sentences']
            if isinstance(full_text, str) and full_text.strip():
                sentences_list = [s.strip() for s in full_text.split('\n') if s.strip()]
                for sentence in sentences_list:
                    features = calculate_lexical_features(sentence)
                    # For debugging, we use the raw text, not a d3tok version
                    processed_text = dummy_preprocess_d3tok(sentence)
                    rows_for_df.append({
                        'doc_id': doc_id,
                        'sentence_text': sentence,
                        'features': features,
                        'processed_text': processed_text
                    })

        if not rows_for_df:
            raise ValueError("No sentences could be extracted from the document file.")
            
        sentence_df = pd.DataFrame(rows_for_df)
        print(f"✔ Successfully created {len(sentence_df)} sentences with features.")

        # --- NEW: Save the comprehensive review file ---
        review_path = 'review_full_pipeline_output.csv'
        print(f"\n--- Saving ALL processed data for review to: {review_path} ---")
        sentence_df.to_csv(review_path, index=False, encoding='utf-8-sig')
        print("✔ Review file saved. Please inspect it for errors in 'features' and 'processed_text'.")

        trainer = Trainer(model=model, args=TrainingArguments(output_dir="./temp_results", per_device_eval_batch_size=32, report_to="none"))
        
        print("\nGenerating predictions for all sentences...")
        test_dataset = ReadabilityDataset(
            texts=sentence_df['processed_text'].tolist(),
            features=sentence_df['features'].tolist(),
            tokenizer_obj=tokenizer
        )
        raw_predictions = trainer.predict(test_dataset)
        sentence_df['raw_prediction'] = raw_predictions.predictions.flatten()

        print("Aggregating results...")
        doc_predictions = sentence_df.groupby('doc_id')['raw_prediction'].max()
        
        clipped_preds = np.clip(np.round(doc_predictions.values), 0, TARGET_CLASSES - 1)
        final_labels = (clipped_preds + 1).astype(int)
        
        submission_df = pd.DataFrame({'Sentence ID': doc_predictions.index, 'Prediction': final_labels})
        
        final_submission_df = pd.DataFrame({'Sentence ID': doc_test_df['ID']}).merge(submission_df, on='Sentence ID', how='left')
        # FIX: Correct way to handle fillna to avoid FutureWarnings
        final_submission_df['Prediction'] = final_submission_df['Prediction'].fillna(1)
        final_submission_df['Prediction'] = final_submission_df['Prediction'].astype(int)

        print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
        final_submission_df.to_csv(SUBMISSION_PATH, index=False)
        
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))
        
        print(f"\n--- ✅ SUCCESS! Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' created. ---")

    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")
    finally:
        gc.collect()
        if 'model' in locals(): del model
        if 'trainer' in locals(): del trainer
        torch.cuda.empty_cache()


# =====================================================================================
# 5. EXECUTE THE SCRIPT
# =====================================================================================
if __name__ == "__main__":
    generate_document_predictions(CHECKPOINT_PATH)

✔️ Configuration loaded. Using checkpoint: D:\arabic_readability_project\results\hybrid_regression_readability-arabertv2-d3tok-reg\checkpoint-48944

===== 🚀 STARTING HYBRID DOCUMENT PREDICTION PIPELINE =====

Loading model weights from checkpoint: D:\arabic_readability_project\results\hybrid_regression_readability-arabertv2-d3tok-reg\checkpoint-48944
✔ Model loaded successfully.
Processing documents: splitting sentences and calculating features...
✔ Successfully created 3420 sentences with features.

--- Saving ALL processed data for review to: review_full_pipeline_output.csv ---
✔ Review file saved. Please inspect it for errors in 'features' and 'processed_text'.

Generating predictions for all sentences...


Aggregating results...

Saving prediction file to: D:\arabic_readability_project\submission\submission_hybrid_document.csv

--- ✅ SUCCESS! Submission file 'submission_hybrid_document.zip' created. ---


In [None]:
# Evaluating the document on the Hybrid Model trained on samer + samer lexicon

In [7]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import zipfile
import gc

from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModel,
    TrainingArguments,
    Trainer,
)
from safetensors.torch import load_file
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar
from tqdm.auto import tqdm

# =====================================================================================
# 1. CONFIGURATION
# =====================================================================================
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
TARGET_CLASSES = 19
NUM_FEATURES = 7 

CHECKPOINT_PATH = r"D:\arabic_readability_project\results\hybrid_regression_readability-arabertv2-d3tok-reg\checkpoint-48944"
BASE_DIR = r"D:\arabic_readability_project" 
DATA_DIR = os.path.join(BASE_DIR, "data")
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")

DOC_BLIND_TEST_PATH = os.path.join(DATA_DIR, 'doc_blind_test_data.csv')
SAMER_LEXICON_PATH = os.path.join(DATA_DIR, 'samer_lexicon.tsv') 

SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_document_final.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_hybrid_document_final.zip")

os.makedirs(SUBMISSION_DIR, exist_ok=True)
print(f"✔️ Configuration loaded. Using checkpoint: {CHECKPOINT_PATH}")


# =====================================================================================
# 2. FEATURE CALCULATION - WITH THE FINAL API FIX
# =====================================================================================
def calculate_features_and_d3tok(sentence_text, disambiguator, lexicon_map):
    if not isinstance(sentence_text, str) or not sentence_text.strip():
        return ([0.0] * NUM_FEATURES, "")

    try:
        # --- THE FINAL FIX IS HERE ---
        # Pass the flat list of tokens directly to the disambiguator.
        tokens = simple_word_tokenize(sentence_text)
        disambiguated_sentence = disambiguator.disambiguate(tokens)

        d3tok_forms = []
        for da in disambiguated_sentence:
            if da.analyses and 'd3tok' in da.analyses[0][1]:
                d3tok_value = da.analyses[0][1]['d3tok']
                if isinstance(d3tok_value, str):
                    d3tok_forms.append(dediac_ar(d3tok_value).replace("_+", " +").replace("+_", "+ "))
                elif isinstance(da.word, str): d3tok_forms.append(da.word)
            elif isinstance(da.word, str): d3tok_forms.append(da.word)
        d3tok_text = " ".join(d3tok_forms)

        scores = []
        for dw in disambiguated_sentence:
            if dw.analyses:
                analysis = dw.analyses[0][1]
                lemma, pos = analysis.get('lex'), analysis.get('pos')
                if pos and isinstance(lemma, str):
                    score = lexicon_map.get(f"{dediac_ar(lemma)}#{pos}")
                    if score is not None: scores.append(score)
        
        avg_readability = np.mean(scores) if scores else 0.0
        max_readability = np.max(scores) if scores else 0.0

        # !!! ACTION REQUIRED: Add your other 5 features here !!!
        feature_3, feature_4, feature_5, feature_6, feature_7 = 0.0, 0.0, 0.0, 0.0, 0.0
        feature_vector = [avg_readability, max_readability, feature_3, feature_4, feature_5, feature_6, feature_7]
        
        return feature_vector, d3tok_text

    except TypeError as e:
        error_message = f"A TypeError occurred processing sentence: >>>{sentence_text}<<< Original error: {e}"
        raise TypeError(error_message)
    except Exception as e:
        print(f"Warning: An error '{e}' occurred on sentence: '{sentence_text}'. Skipping.")
        return ([0.0] * NUM_FEATURES, "")


# =====================================================================================
# 3. HYBRID MODEL AND DATASET CLASSES (Copied from your notebook)
# =====================================================================================

class HybridRegressionModel(nn.Module):
    def __init__(self, model_name, num_extra_features):
        super(HybridRegressionModel, self).__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        transformer_output_dim = self.transformer.config.hidden_size
        self.head = nn.Sequential(
            nn.Linear(transformer_output_dim + num_extra_features, 512),
            nn.BatchNorm1d(512), nn.ReLU(), nn.Dropout(0.3),
            nn.Linear(512, 256), nn.ReLU(), nn.Dropout(0.2),
            nn.Linear(256, 1)
        )
    def forward(self, input_ids, attention_mask, features, labels=None):
        transformer_outputs = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = transformer_outputs.last_hidden_state[:, 0, :]
        combined_features = torch.cat([cls_embedding, features], dim=1)
        logits = self.head(combined_features).squeeze(-1)
        if labels is not None:
            loss = nn.MSELoss()(logits, labels.float())
            return (loss, logits)
        return logits

class ReadabilityDataset(TorchDataset):
    def __init__(self, texts, features, labels=None, tokenizer_obj=None, max_len=256):
        self.texts=texts; self.features=features; self.labels=labels
        self.tokenizer=tokenizer_obj; self.max_len=max_len
    def __len__(self): return len(self.texts)
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        feature_vec = torch.tensor(self.features[idx], dtype=torch.float)
        inputs = self.tokenizer.encode_plus(text, None, add_special_tokens=True, max_length=self.max_len, padding='max_length', truncation=True)
        item = {'input_ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
                'attention_mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
                'features': feature_vec}
        if self.labels is not None: item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item


# =====================================================================================
# 4. PREDICTION LOGIC
# =====================================================================================

def generate_document_predictions(checkpoint_path):
    print("\n===== 🚀 STARTING HYBRID DOCUMENT PREDICTION PIPELINE =====\n")
    try:
        print("Initializing Tokenizer and Disambiguator...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        disambiguator = BERTUnfactoredDisambiguator.pretrained('msa')
        
        print(f"Loading SAMER Lexicon from: {SAMER_LEXICON_PATH}")
        lexicon_df = pd.read_csv(SAMER_LEXICON_PATH, sep='\t')
        lexicon_map = lexicon_df.set_index('lemma#pos')['readability (rounded average)'].to_dict()
        
        print(f"Loading model weights from checkpoint: {checkpoint_path}")
        model = HybridRegressionModel(MODEL_NAME, num_extra_features=NUM_FEATURES)
        model.load_state_dict(load_file(os.path.join(checkpoint_path, "model.safetensors")))
        print("✔ All models and data loaded successfully.")
        
        doc_test_df = pd.read_csv(DOC_BLIND_TEST_PATH)
        doc_test_df.dropna(subset=['ID', 'Sentences'], inplace=True)
        
        print("\nProcessing documents: this will take time...")
        rows_for_df = []
        for _, row in tqdm(doc_test_df.iterrows(), total=len(doc_test_df), desc="Processing Documents"):
            doc_id = row['ID']
            full_text = row['Sentences']
            if isinstance(full_text, str) and full_text.strip():
                sentences_list = [s.strip() for s in full_text.split('\n') if s.strip()]
                for sentence in sentences_list:
                    features, processed_text = calculate_features_and_d3tok(sentence, disambiguator, lexicon_map)
                    rows_for_df.append({'doc_id': doc_id, 'features': features, 'processed_text': processed_text})

        if not rows_for_df: raise ValueError("No sentences could be extracted.")
        sentence_df = pd.DataFrame(rows_for_df)
        print(f"✔ Successfully created {len(sentence_df)} sentences with features.")

        trainer = Trainer(model=model, args=TrainingArguments(output_dir="./temp_results", per_device_eval_batch_size=32, report_to="none"))
        
        print("\nGenerating predictions for all sentences...")
        test_dataset = ReadabilityDataset(texts=sentence_df['processed_text'].tolist(), features=sentence_df['features'].tolist(), tokenizer_obj=tokenizer)
        raw_predictions = trainer.predict(test_dataset)
        sentence_df['raw_prediction'] = raw_predictions.predictions.flatten()

        print("Aggregating results...")
        doc_predictions = sentence_df.groupby('doc_id')['raw_prediction'].max()
        
        clipped_preds = np.clip(np.round(doc_predictions.values), 0, TARGET_CLASSES - 1)
        final_labels = (clipped_preds + 1).astype(int)
        
        submission_df = pd.DataFrame({'Sentence ID': doc_predictions.index, 'Prediction': final_labels})
        final_submission_df = pd.DataFrame({'Sentence ID': doc_test_df['ID']}).merge(submission_df, on='Sentence ID', how='left')
        final_submission_df['Prediction'] = final_submission_df['Prediction'].fillna(1).astype(int)

        print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
        final_submission_df.to_csv(SUBMISSION_PATH, index=False)
        
        with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
            zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))
        
        print(f"\n--- ✅ SUCCESS! Submission file '{os.path.basename(ZIPPED_SUBMISSION_PATH)}' created. ---")

    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")
    finally:
        gc.collect()
        if 'model' in locals(): del model
        if 'trainer' in locals(): del trainer
        torch.cuda.empty_cache()

# =====================================================================================
# 5. EXECUTE THE SCRIPT
# =====================================================================================
if __name__ == "__main__":
    generate_document_predictions(CHECKPOINT_PATH)

✔️ Configuration loaded. Using checkpoint: D:\arabic_readability_project\results\hybrid_regression_readability-arabertv2-d3tok-reg\checkpoint-48944

===== 🚀 STARTING HYBRID DOCUMENT PREDICTION PIPELINE =====

Initializing Tokenizer and Disambiguator...


Some weights of the model checkpoint at C:\Users\Fatima\AppData\Roaming\camel_tools\data\disambig_bert_unfactored\msa were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading SAMER Lexicon from: D:\arabic_readability_project\data\samer_lexicon.tsv
Loading model weights from checkpoint: D:\arabic_readability_project\results\hybrid_regression_readability-arabertv2-d3tok-reg\checkpoint-48944
✔ All models and data loaded successfully.

Processing documents: this will take time...


Processing Documents:   0%|          | 0/100 [00:00<?, ?it/s]

✔ Successfully created 3420 sentences with features.

Generating predictions for all sentences...


Aggregating results...

Saving prediction file to: D:\arabic_readability_project\submission\submission_hybrid_document_final.csv

--- ✅ SUCCESS! Submission file 'submission_hybrid_document_final.zip' created. ---
