In [1]:
# --- MODIFICATION ---
# Added 'ast' to safely parse string representations of lists
import pandas as pd
import numpy as np
import os
import torch
import zipfile
import ast
from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar

In [2]:
# --- Configuration ---
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
NUM_LABELS = 1
TARGET_CLASSES = 19
BASE_DIR = '.'
DATA_DIR = os.path.join(BASE_DIR, "data")
# --- MODIFICATION ---
# The checkpoint directory can remain the same as we are using the same sentence-level model
CHECKPOINT_DIR = os.path.join(BASE_DIR, "results", f"regression_{MODEL_NAME.split('/')[-1]}")
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")

os.makedirs(CHECKPOINT_DIR, exist_ok=True)
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# --- File Paths ---
# These are the original SENTENCE-LEVEL files for training the model
BAREC_TRAIN_PATH = os.path.join(DATA_DIR, 'train.csv')
BAREC_DEV_PATH = os.path.join(DATA_DIR, 'dev.csv')

# --- MODIFICATION ---
# New path for the DOCUMENT-LEVEL blind test file
# Assuming your document test file is named 'doc_blind_test_data.csv'
DOC_BLIND_TEST_PATH = os.path.join(DATA_DIR, 'doc_blind_test.csv') 

# --- MODIFICATION ---
# Updated submission file names for clarity
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_document_regression_final.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_document_regression_final.zip")

# Preprocessed sentence file paths (these do not change as training data is the same)
TRAIN_PREPROCESSED_PATH = os.path.join(DATA_DIR, 'train_preprocessedv2.csv')
DEV_PREPROCESSED_PATH = os.path.join(DATA_DIR, 'dev_preprocessedv2.csv')

In [3]:



# --- DATA LOADING AND PREPROCESSING ---
# This function remains UNCHANGED as it works on sentence strings
def preprocess_d3tok(text, disambiguator):
    """
    Preprocesses text into the D3Tok format using BERTUnfactoredDisambiguator.
    This version includes robust error handling for missing 'd3tok' keys.
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    tokens = simple_word_tokenize(text)
    disambiguated_sentence = disambiguator.disambiguate(tokens)
    d3tok_forms = []
    for disambig_word in disambiguated_sentence:
        if disambig_word.analyses:
            analysis_dict = disambig_word.analyses[0][1]
            if 'd3tok' in analysis_dict:
                d3tok = dediac_ar(analysis_dict['d3tok']).replace("_+", " +").replace("+_", "+ ")
                d3tok_forms.append(d3tok)
            else:
                d3tok_forms.append(disambig_word.word)
        else:
            d3tok_forms.append(disambig_word.word)
    return " ".join(d3tok_forms)


# This function remains UNCHANGED as it prepares the SENTENCE-LEVEL training data
def load_or_preprocess_data(disambiguator):
    """
    Loads preprocessed data if it exists, otherwise, it runs preprocessing.
    """
    print("--- Loading BAREC SENTENCE-LEVEL Data for Training ---")
    if os.path.exists(TRAIN_PREPROCESSED_PATH) and os.path.exists(DEV_PREPROCESSED_PATH):
        print("✔ Found preprocessed sentence files. Loading them directly...")
        train_df = pd.read_csv(TRAIN_PREPROCESSED_PATH)
        val_df = pd.read_csv(DEV_PREPROCESSED_PATH)
        train_df['text'] = train_df['text'].astype(str)
        val_df['text'] = val_df['text'].astype(str)
        print(f"Successfully loaded {len(train_df)} training and {len(val_df)} validation records.")
        return train_df, val_df
    else:
        print("Preprocessed files not found. Starting one-time preprocessing on sentence data...")
        try:
            train_df = pd.read_csv(BAREC_TRAIN_PATH)
            val_df = pd.read_csv(BAREC_DEV_PATH)
            train_df = train_df[['Sentence', 'Readability_Level_19']].rename(
                columns={'Sentence': 'text', 'Readability_Level_19': 'label'})
            val_df = val_df[['Sentence', 'Readability_Level_19']].rename(
                columns={'Sentence': 'text', 'Readability_Level_19': 'label'})
            train_df.dropna(subset=['text', 'label'], inplace=True)
            val_df.dropna(subset=['label', 'text'], inplace=True)
            train_df['text'] = train_df['text'].astype(str)
            val_df['text'] = val_df['text'].astype(str)
            train_df['label'] = train_df['label'].astype(int) - 1
            val_df['label'] = val_df['label'].astype(int) - 1
            train_df['label'] = train_df['label'].astype(float)
            val_df['label'] = val_df['label'].astype(float)
            print(f"Successfully loaded raw sentence data: {len(train_df)} training and {len(val_df)} validation records.")
            print("\n--- Preprocessing Text to D3Tok format (this will only run once) ---")
            train_df['text'] = train_df['text'].apply(lambda x: preprocess_d3tok(x, disambiguator))
            val_df['text'] = val_df['text'].apply(lambda x: preprocess_d3tok(x, disambiguator))
            print("✔ Text preprocessing finished.")
            print("\n--- Saving preprocessed data for future use... ---")
            train_df.to_csv(TRAIN_PREPROCESSED_PATH, index=False)
            val_df.to_csv(DEV_PREPROCESSED_PATH, index=False)
            print(f"** Saved preprocessed files to {TRAIN_PREPROCESSED_PATH} and {DEV_PREPROCESSED_PATH} **")
            return train_df, val_df
        except FileNotFoundError:
            print(f"! ERROR: Raw file not found. Make sure sentence-level 'train.csv' and 'dev.csv' are in the '{DATA_DIR}' directory.")
            return None, None
        except Exception as e:
            print(f"! ERROR during initial processing: {e}")
            return None, None


print("Initializing BERT Disambiguator for preprocessing...")
bert_disambiguator = BERTUnfactoredDisambiguator.pretrained('msa')

train_df, val_df = load_or_preprocess_data(bert_disambiguator)

if train_df is not None:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
else:
    print("Stopping script due to data loading failure.")
    exit()

# --- DATASET AND METRICS ---
# This class remains UNCHANGED
class ReadabilityDataset(TorchDataset):
    def __init__(self, texts, labels=None):
        self.encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=256)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.encodings.get('input_ids', []))

# This function remains UNCHANGED
def compute_metrics(p):
    preds = p.predictions.flatten()
    rounded_preds = np.round(preds)
    clipped_preds = np.clip(rounded_preds, 0, TARGET_CLASSES - 1).astype(int)
    labels = p.label_ids.astype(int)
    qwk = cohen_kappa_score(labels, clipped_preds, weights='quadratic')
    return {"qwk": qwk}


Initializing BERT Disambiguator for preprocessing...


Some weights of the model checkpoint at C:\Users\Fatima\AppData\Roaming\camel_tools\data\disambig_bert_unfactored\msa were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


--- Loading BAREC SENTENCE-LEVEL Data for Training ---
✔ Found preprocessed sentence files. Loading them directly...
Successfully loaded 54845 training and 7310 validation records.

===== INITIALIZING REGRESSION MODEL AND TRAINER =====





Starting sentence-level model training...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:

# --- MODEL TRAINING ---
# This entire block remains UNCHANGED. We are training the sentence-level model.
print("\n===== INITIALIZING REGRESSION MODEL AND TRAINER =====\n")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)
train_dataset = ReadabilityDataset(train_df['text'].tolist(), train_df['label'].tolist())
val_dataset = ReadabilityDataset(val_df['text'].tolist(), val_df['label'].tolist())

training_args = TrainingArguments(
    output_dir=CHECKPOINT_DIR,
    num_train_epochs=6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="qwk",
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print("Starting sentence-level model training...")
# Check if a trained model already exists to avoid re-training
if not os.path.exists(os.path.join(CHECKPOINT_DIR, "pytorch_model.bin")):
    trainer.train()
    print("✔ Training finished.")
    trainer.save_model(CHECKPOINT_DIR)
    print(f"Model saved to {CHECKPOINT_DIR}")
else:
    print(f"✔ Found existing trained model in {CHECKPOINT_DIR}. Skipping training.")
    # We still need to load the best model into the trainer for prediction
    # The `Trainer` class loads the best checkpoint automatically if `load_best_model_at_end=True`
    # and training was completed in a previous run. If not, we re-instantiate it.
    model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_DIR)
    trainer = Trainer(model=model)

In [2]:
# --- MODIFICATION: DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION ---
# This entire block is rewritten to handle documents.
print("\n===== DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION =====\n")
try:
    # 1. Load the DOCUMENT test file
    print(f"Loading document test data from {DOC_BLIND_TEST_PATH}...")
    doc_test_df = pd.read_csv(DOC_BLIND_TEST_PATH)
    doc_test_df.dropna(subset=['ID', 'Sentences'], inplace=True)
    
    # 2. Explode documents into a long list of sentences
    # We create a new DataFrame where each row is a single sentence,
    # but we keep track of which document it came from.
    print("Processing documents: exploding into individual sentences...")
    all_sentences = []
    doc_ids = []
    for _, row in doc_test_df.iterrows():
        doc_id = row['ID']
        # The 'Sentences' column is a string representation of a list, e.g., "['sent1', 'sent2']"
        # We use ast.literal_eval to safely parse it into a Python list.
        try:
            sentences_list = ast.literal_eval(row['Sentences'])
            if sentences_list: # Only add if the list is not empty
                all_sentences.extend(sentences_list)
                doc_ids.extend([doc_id] * len(sentences_list))
        except (ValueError, SyntaxError):
            print(f"Warning: Could not parse sentences for document ID {doc_id}. Skipping.")
            continue
            
    sentence_df = pd.DataFrame({
        'doc_id': doc_ids,
        'sentence_text': all_sentences
    })
    print(f"Successfully created {len(sentence_df)} sentences from {len(doc_test_df)} documents.")

    # 3. Preprocess all sentences at once
    print("\nPreprocessing all sentences to D3Tok format...")
    sentence_df['processed_text'] = sentence_df['sentence_text'].apply(lambda x: preprocess_d3tok(x, bert_disambiguator))
    
    # 4. Get predictions for ALL sentences in a single batch
    print("Generating predictions for all sentences...")
    test_dataset = ReadabilityDataset(sentence_df['processed_text'].tolist())
    predictions = trainer.predict(test_dataset)
    sentence_df['raw_prediction'] = predictions.predictions.flatten()
    
    # 5. Aggregate results: find the MAX prediction for each document
    # This is the key step based on the hint.
    print("Aggregating results: finding the max readability score per document...")
    doc_predictions = sentence_df.groupby('doc_id')['raw_prediction'].max()
    
    # 6. Post-process the final document predictions
    rounded_preds = np.round(doc_predictions.values)
    clipped_preds = np.clip(rounded_preds, 0, TARGET_CLASSES - 1)
    
    # 7. Create the final submission file
    submission_df = pd.DataFrame({
        'Sentence ID': doc_predictions.index, # The column name is 'Sentence ID' in the competition
        'Prediction': (clipped_preds + 1).astype(int)
    })
    
    print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
    submission_df.to_csv(SUBMISSION_PATH, index=False)
    
    print(f"Compressing {os.path.basename(SUBMISSION_PATH)} into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
    with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))
        
    print(f"✔ Submission file {os.path.basename(ZIPPED_SUBMISSION_PATH)} created successfully.")

except FileNotFoundError:
    print(f"! ERROR: Test file not found. Make sure 'doc_blind_test_data.csv' is in the '{DATA_DIR}' directory.")
except Exception as e:
    print(f"An error occurred during final document prediction: {e}")

print("\n--- Script Finished ---")

In [6]:
import pandas as pd
import numpy as np
import os
import torch
import zipfile
import ast # To parse string representations of lists
from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar

# --- Configuration ---
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
NUM_LABELS = 1
TARGET_CLASSES = 19
BASE_DIR = '.'
DATA_DIR = os.path.join(BASE_DIR, "data")
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")

os.makedirs(SUBMISSION_DIR, exist_ok=True)

# --- File Paths ---
# Document test file
DOC_BLIND_TEST_PATH = os.path.join(DATA_DIR, 'doc_blind_test_data.csv') 
# Submission files
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_document_regression_final.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_document_regression_final.zip")


# --- DATA PREPROCESSING (UNCHANGED) ---
# This function preprocesses a single sentence.
def preprocess_d3tok(text, disambiguator):
    """
    Preprocesses text into the D3Tok format using BERTUnfactoredDisambiguator.
    """
    if not isinstance(text, str) or not text.strip():
        return ""
    tokens = simple_word_tokenize(text)
    disambiguated_sentence = disambiguator.disambiguate(tokens)
    d3tok_forms = []
    for disambig_word in disambiguated_sentence:
        if disambig_word.analyses:
            analysis_dict = disambig_word.analyses[0][1]
            if 'd3tok' in analysis_dict:
                d3tok = dediac_ar(analysis_dict['d3tok']).replace("_+", " +").replace("+_", "+ ")
                d3tok_forms.append(d3tok)
            else:
                d3tok_forms.append(disambig_word.word)
        else:
            d3tok_forms.append(disambig_word.word)
    return " ".join(d3tok_forms)

print("Initializing BERT Disambiguator for preprocessing...")
bert_disambiguator = BERTUnfactoredDisambiguator.pretrained('msa')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


# --- DATASET CLASS (UNCHANGED) ---
class ReadabilityDataset(TorchDataset):
    def __init__(self, texts, labels=None):
        self.encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=256)
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item

    def __len__(self):
        return len(self.encodings.get('input_ids', []))


# --- MODEL LOADING ---
# --- THIS IS THE KEY MODIFIED SECTION ---
print("\n===== LOADING PRE-TRAINED SENTENCE-LEVEL MODEL =====\n")

# --- CHANGE 1: Set this variable to your exact checkpoint path.
# Using r"..." (raw string) is the best practice for Windows paths.
CHECKPOINT_DIR = r"D:\arabic_readability_project\results\regression_readability-arabertv2-d3tok-reg\checkpoint-10284"

# --- CHANGE 2: Check for "model.safetensors" in the specified directory.
if os.path.exists(os.path.join(CHECKPOINT_DIR, "model.safetensors")):
    print(f"✔ Found checkpoint at: {CHECKPOINT_DIR}")
    print("Loading model from checkpoint...")
    
    # Load the already trained model from the specified checkpoint directory
    model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_DIR)
    
    # We only need a minimal trainer object for the .predict() method
    trainer = Trainer(model=model)
    print("✔ Model loaded successfully.")

else:
    # If the checkpoint isn't found, print a clear error message and exit.
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    print(f"! ERROR: Checkpoint not found at the specified path.")
    print(f"! Searched for 'model.safetensors' inside: {CHECKPOINT_DIR}")
    print("! Please ensure the CHECKPOINT_DIR variable in the script is correct.")
    print("!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
    exit() # Stop the script


# --- DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION ---
print("\n===== DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION =====\n")
try:
    # 1. Load the DOCUMENT test file
    print(f"Loading document test data from {DOC_BLIND_TEST_PATH}...")
    doc_test_df = pd.read_csv(DOC_BLIND_TEST_PATH)
    doc_test_df.dropna(subset=['ID', 'Sentences'], inplace=True)
    
    # 2. Explode documents into a long list of sentences
    print("Processing documents: breaking them down into individual sentences...")
    all_sentences = []
    doc_ids = []
    for _, row in doc_test_df.iterrows():
        doc_id = row['ID']
        # The 'Sentences' column is a string like "['sent1', 'sent2']"
        # ast.literal_eval safely converts this string into a Python list
        try:
            sentences_list = ast.literal_eval(row['Sentences'])
            if sentences_list: # Only process if the list is not empty
                all_sentences.extend(sentences_list)
                doc_ids.extend([doc_id] * len(sentences_list))
        except (ValueError, SyntaxError):
            print(f"Warning: Could not parse sentences for document ID {doc_id}. Skipping this document.")
            continue
            
    sentence_df = pd.DataFrame({
        'doc_id': doc_ids,
        'sentence_text': all_sentences
    })
    print(f"Successfully created {len(sentence_df):,} sentences from {len(doc_test_df):,} documents.")

    # 3. Preprocess all sentences
    print("\nPreprocessing all sentences to D3Tok format (this may take a moment)...")
    sentence_df['processed_text'] = sentence_df['sentence_text'].apply(lambda x: preprocess_d3tok(x, bert_disambiguator))
    
    # 4. Get predictions for ALL sentences using the loaded model
    print("Generating predictions for all sentences...")
    test_dataset = ReadabilityDataset(sentence_df['processed_text'].tolist())
    predictions = trainer.predict(test_dataset)
    sentence_df['raw_prediction'] = predictions.predictions.flatten()
    
    # 5. Aggregate results: find the MAX prediction for each document
    print("Aggregating results: finding the max readability score per document...")
    doc_predictions = sentence_df.groupby('doc_id')['raw_prediction'].max()
    
    # 6. Post-process the final document predictions
    rounded_preds = np.round(doc_predictions.values)
    clipped_preds = np.clip(rounded_preds, 0, TARGET_CLASSES - 1)
    
    # 7. Create the final submission file
    submission_df = pd.DataFrame({
        'Sentence ID': doc_predictions.index, # The column name is 'Sentence ID' in the competition
        'Prediction': (clipped_preds + 1).astype(int)
    })
    
    print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
    submission_df.to_csv(SUBMISSION_PATH, index=False)
    
    print(f"Compressing {os.path.basename(SUBMISSION_PATH)} into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
    with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))
        
    print(f"✔ Submission file {os.path.basename(ZIPPED_SUBMISSION_PATH)} created successfully.")

except FileNotFoundError:
    print(f"! ERROR: Test file not found. Make sure 'doc_blind_test_data.csv' is in the '{DATA_DIR}' directory.")
except Exception as e:
    print(f"An error occurred during final document prediction: {e}")

print("\n--- Script Finished ---")

Initializing BERT Disambiguator for preprocessing...


Some weights of the model checkpoint at C:\Users\Fatima\AppData\Roaming\camel_tools\data\disambig_bert_unfactored\msa were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



===== LOADING PRE-TRAINED SENTENCE-LEVEL MODEL =====

✔ Found checkpoint at: D:\arabic_readability_project\results\regression_readability-arabertv2-d3tok-reg\checkpoint-10284
Loading model from checkpoint...
✔ Model loaded successfully.

===== DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION =====

Loading document test data from .\data\doc_blind_test_data.csv...
Processing documents: breaking them down into individual sentences...
Successfully created 0 sentences from 100 documents.

Preprocessing all sentences to D3Tok format (this may take a moment)...
Generating predictions for all sentences...
An error occurred during final document prediction: list index out of range

--- Script Finished ---


In [11]:
import pandas as pd
import numpy as np
import os
import torch
import zipfile

from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
# --- MODIFICATION: Use pyarabic for sentence splitting ---
import pyarabic.araby as araby

# --- These camel-tools imports are correct and unchanged ---
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar

# --- Configuration ---
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
NUM_LABELS = 1
TARGET_CLASSES = 19
BASE_DIR = '.'
DATA_DIR = os.path.join(BASE_DIR, "data")
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# --- File Paths ---
DOC_BLIND_TEST_PATH = os.path.join(DATA_DIR, 'doc_blind_test_data.csv') 
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_document_regression_final.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_document_regression_final.zip")


# --- DATA PREPROCESSING (UNCHANGED) ---
def preprocess_d3tok(text, disambiguator):
    if not isinstance(text, str) or not text.strip(): return ""
    tokens = simple_word_tokenize(text)
    disambiguated_sentence = disambiguator.disambiguate(tokens)
    d3tok_forms = []
    for disambig_word in disambiguated_sentence:
        if disambig_word.analyses:
            analysis_dict = disambig_word.analyses[0][1]
            if 'd3tok' in analysis_dict:
                d3tok = dediac_ar(analysis_dict['d3tok']).replace("_+", " +").replace("+_", "+ ")
                d3tok_forms.append(d3tok)
            else: d3tok_forms.append(disambig_word.word)
        else: d3tok_forms.append(disambig_word.word)
    return " ".join(d3tok_forms)

# --- Initialize Tools (camel-tools only, pyarabic is used directly) ---
print("Initializing CAMeL Tools...")
bert_disambiguator = BERTUnfactoredDisambiguator.pretrained('msa')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("✔ Tools initialized.")


# --- DATASET CLASS (UNCHANGED) ---
class ReadabilityDataset(TorchDataset):
    def __init__(self, texts, labels=None):
        self.encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=256)
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None: item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
    def __len__(self):
        return len(self.encodings.get('input_ids', []))


# --- MODEL LOADING (UNCHANGED) ---
print("\n===== LOADING PRE-TRAINED SENTENCE-LEVEL MODEL =====\n")
CHECKPOINT_DIR = r"D:\arabic_readability_project\results\regression_readability-arabertv2-d3tok-reg\checkpoint-10284"
if os.path.exists(os.path.join(CHECKPOINT_DIR, "model.safetensors")):
    print(f"✔ Found checkpoint at: {CHECKPOINT_DIR}")
    print("Loading model from checkpoint...")
    model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_DIR)
    trainer = Trainer(model=model)
    print("✔ Model loaded successfully.")
else:
    print(f"! ERROR: Checkpoint not found at '{CHECKPOINT_DIR}'. Please check the path.")
    exit()


# --- DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION (FIXED) ---
print("\n===== DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION =====\n")
try:
    print(f"Loading document test data from {DOC_BLIND_TEST_PATH}...")
    doc_test_df = pd.read_csv(DOC_BLIND_TEST_PATH)
    doc_test_df.dropna(subset=['ID', 'Document'], inplace=True)
    
    print("Processing documents: tokenizing into sentences using pyarabic...")
    all_sentences, doc_ids = [], []
    for _, row in doc_test_df.iterrows():
        doc_id = row['ID']
        full_document_text = row['Document']
        
        if isinstance(full_document_text, str) and full_document_text.strip():
            # --- FIX: Use pyarabic.araby.sentence_tokenize ---
            sentences_list = araby.sentence_tokenize(full_document_text)
            
            if sentences_list:
                all_sentences.extend(sentences_list)
                doc_ids.extend([doc_id] * len(sentences_list))
        else:
            print(f"Warning: Document ID {doc_id} has empty or invalid text. Skipping.")
            continue

    if not all_sentences:
        print("\n! ERROR: No sentences were extracted. Check the 'Document' column in your CSV.")
        exit()

    sentence_df = pd.DataFrame({'doc_id': doc_ids, 'sentence_text': all_sentences})
    print(f"Successfully created {len(sentence_df):,} sentences from {len(doc_test_df):,} documents.")

    print("\nPreprocessing all sentences to D3Tok format (this may take a moment)...")
    sentence_df['processed_text'] = sentence_df['sentence_text'].apply(lambda x: preprocess_d3tok(x, bert_disambiguator))
    
    print("Generating predictions for all sentences...")
    test_dataset = ReadabilityDataset(sentence_df['processed_text'].tolist())
    predictions = trainer.predict(test_dataset)
    sentence_df['raw_prediction'] = predictions.predictions.flatten()
    
    print("Aggregating results: finding the max readability score per document...")
    doc_predictions = sentence_df.groupby('doc_id')['raw_prediction'].max()
    
    rounded_preds = np.round(doc_predictions.values)
    clipped_preds = np.clip(rounded_preds, 0, TARGET_CLASSES - 1)
    
    final_submission_df = pd.DataFrame({'Sentence ID': doc_test_df['ID']})
    pred_df = pd.DataFrame({
        'Sentence ID': doc_predictions.index, 
        'Prediction': (clipped_preds + 1).astype(int)
    })
    final_submission_df = final_submission_df.merge(pred_df, on='Sentence ID', how='left')
    final_submission_df['Prediction'].fillna(1, inplace=True)
    final_submission_df['Prediction'] = final_submission_df['Prediction'].astype(int)

    print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
    final_submission_df.to_csv(SUBMISSION_PATH, index=False)
    
    print(f"Compressing {os.path.basename(SUBMISSION_PATH)} into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
    with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))
        
    print(f"✔ Submission file {os.path.basename(ZIPPED_SUBMISSION_PATH)} created successfully.")

except FileNotFoundError:
    print(f"! ERROR: Test file not found. Make sure 'doc_blind_test_data.csv' is in the '{DATA_DIR}' directory.")
except Exception as e:
    print(f"An error occurred during final document prediction: {e}")

print("\n--- Script Finished ---")

Initializing CAMeL Tools...


Some weights of the model checkpoint at C:\Users\Fatima\AppData\Roaming\camel_tools\data\disambig_bert_unfactored\msa were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


✔ Tools initialized.

===== LOADING PRE-TRAINED SENTENCE-LEVEL MODEL =====

✔ Found checkpoint at: D:\arabic_readability_project\results\regression_readability-arabertv2-d3tok-reg\checkpoint-10284
Loading model from checkpoint...
✔ Model loaded successfully.

===== DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION =====

Loading document test data from .\data\doc_blind_test_data.csv...
Processing documents: tokenizing into sentences using pyarabic...
Successfully created 100 sentences from 100 documents.

Preprocessing all sentences to D3Tok format (this may take a moment)...
An error occurred during final document prediction: 'LFUCache' object has no attribute '_LFUCache__links'

--- Script Finished ---


In [1]:
import pandas as pd
import numpy as np
import os
import torch
import zipfile

from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
# --- MODIFICATION: Use pyarabic for sentence splitting ---
import pyarabic.araby as araby

# --- These camel-tools imports are correct and unchanged ---
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar

# --- Configuration ---
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
NUM_LABELS = 1
TARGET_CLASSES = 19
BASE_DIR = '.'
DATA_DIR = os.path.join(BASE_DIR, "data")
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# --- File Paths ---
DOC_BLIND_TEST_PATH = os.path.join(DATA_DIR, 'doc_blind_test_data.csv') 
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_document_regression_final.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_document_regression_final.zip")


# --- DATA PREPROCESSING (UNCHANGED) ---
def preprocess_d3tok(text, disambiguator):
    if not isinstance(text, str) or not text.strip(): return ""
    tokens = simple_word_tokenize(text)
    disambiguated_sentence = disambiguator.disambiguate(tokens)
    d3tok_forms = []
    for disambig_word in disambiguated_sentence:
        if disambig_word.analyses:
            analysis_dict = disambig_word.analyses[0][1]
            if 'd3tok' in analysis_dict:
                d3tok = dediac_ar(analysis_dict['d3tok']).replace("_+", " +").replace("+_", "+ ")
                d3tok_forms.append(d3tok)
            else: d3tok_forms.append(disambig_word.word)
        else: d3tok_forms.append(disambig_word.word)
    return " ".join(d3tok_forms)

# --- Initialize Tools (camel-tools only, pyarabic is used directly) ---
print("Initializing CAMeL Tools...")
bert_disambiguator = BERTUnfactoredDisambiguator.pretrained('msa')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("✔ Tools initialized.")


# --- DATASET CLASS (UNCHANGED) ---
class ReadabilityDataset(TorchDataset):
    def __init__(self, texts, labels=None):
        self.encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=256)
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None: item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
    def __len__(self):
        return len(self.encodings.get('input_ids', []))


# --- MODEL LOADING (UNCHANGED) ---
print("\n===== LOADING PRE-TRAINED SENTENCE-LEVEL MODEL =====\n")
CHECKPOINT_DIR = r"D:\arabic_readability_project\results\regression_readability-arabertv2-d3tok-reg\checkpoint-10284"
if os.path.exists(os.path.join(CHECKPOINT_DIR, "model.safetensors")):
    print(f"✔ Found checkpoint at: {CHECKPOINT_DIR}")
    print("Loading model from checkpoint...")
    model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_DIR)
    trainer = Trainer(model=model)
    print("✔ Model loaded successfully.")
else:
    print(f"! ERROR: Checkpoint not found at '{CHECKPOINT_DIR}'. Please check the path.")
    exit()


# --- DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION (FIXED) ---
print("\n===== DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION =====\n")
try:
    print(f"Loading document test data from {DOC_BLIND_TEST_PATH}...")
    doc_test_df = pd.read_csv(DOC_BLIND_TEST_PATH)
    doc_test_df.dropna(subset=['ID', 'Document'], inplace=True)
    
    print("Processing documents: tokenizing into sentences using pyarabic...")
    all_sentences, doc_ids = [], []
    for _, row in doc_test_df.iterrows():
        doc_id = row['ID']
        full_document_text = row['Document']
        
        if isinstance(full_document_text, str) and full_document_text.strip():
            # --- FIX: Use pyarabic.araby.sentence_tokenize ---
            sentences_list = araby.sentence_tokenize(full_document_text)
            
            if sentences_list:
                all_sentences.extend(sentences_list)
                doc_ids.extend([doc_id] * len(sentences_list))
        else:
            print(f"Warning: Document ID {doc_id} has empty or invalid text. Skipping.")
            continue

    if not all_sentences:
        print("\n! ERROR: No sentences were extracted. Check the 'Document' column in your CSV.")
        exit()

    sentence_df = pd.DataFrame({'doc_id': doc_ids, 'sentence_text': all_sentences})
    print(f"Successfully created {len(sentence_df):,} sentences from {len(doc_test_df):,} documents.")

    print("\nPreprocessing all sentences to D3Tok format (this may take a moment)...")
    sentence_df['processed_text'] = sentence_df['sentence_text'].apply(lambda x: preprocess_d3tok(x, bert_disambiguator))
    
    print("Generating predictions for all sentences...")
    test_dataset = ReadabilityDataset(sentence_df['processed_text'].tolist())
    predictions = trainer.predict(test_dataset)
    sentence_df['raw_prediction'] = predictions.predictions.flatten()
    
    print("Aggregating results: finding the max readability score per document...")
    doc_predictions = sentence_df.groupby('doc_id')['raw_prediction'].max()
    
    rounded_preds = np.round(doc_predictions.values)
    clipped_preds = np.clip(rounded_preds, 0, TARGET_CLASSES - 1)
    
    final_submission_df = pd.DataFrame({'Sentence ID': doc_test_df['ID']})
    pred_df = pd.DataFrame({
        'Sentence ID': doc_predictions.index, 
        'Prediction': (clipped_preds + 1).astype(int)
    })
    final_submission_df = final_submission_df.merge(pred_df, on='Sentence ID', how='left')
    final_submission_df['Prediction'].fillna(1, inplace=True)
    final_submission_df['Prediction'] = final_submission_df['Prediction'].astype(int)

    print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
    final_submission_df.to_csv(SUBMISSION_PATH, index=False)
    
    print(f"Compressing {os.path.basename(SUBMISSION_PATH)} into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
    with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))
        
    print(f"✔ Submission file {os.path.basename(ZIPPED_SUBMISSION_PATH)} created successfully.")

except FileNotFoundError:
    print(f"! ERROR: Test file not found. Make sure 'doc_blind_test_data.csv' is in the '{DATA_DIR}' directory.")
except Exception as e:
    print(f"An error occurred during final document prediction: {e}")

print("\n--- Script Finished ---")

Initializing CAMeL Tools...


Some weights of the model checkpoint at C:\Users\Fatima\AppData\Roaming\camel_tools\data\disambig_bert_unfactored\msa were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


✔ Tools initialized.

===== LOADING PRE-TRAINED SENTENCE-LEVEL MODEL =====

✔ Found checkpoint at: D:\arabic_readability_project\results\regression_readability-arabertv2-d3tok-reg\checkpoint-10284
Loading model from checkpoint...
✔ Model loaded successfully.

===== DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION =====

Loading document test data from .\data\doc_blind_test_data.csv...
Processing documents: tokenizing into sentences using pyarabic...
Successfully created 100 sentences from 100 documents.

Preprocessing all sentences to D3Tok format (this may take a moment)...
Generating predictions for all sentences...


Aggregating results: finding the max readability score per document...

Saving prediction file to: .\submission\submission_document_regression_final.csv


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_submission_df['Prediction'].fillna(1, inplace=True)


Compressing submission_document_regression_final.csv into submission_document_regression_final.zip...
✔ Submission file submission_document_regression_final.zip created successfully.

--- Script Finished ---


In [2]:
import pandas as pd
import numpy as np
import os
import torch
import zipfile

from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
# --- pyarabic is no longer needed ---

# --- These camel-tools imports are correct and unchanged ---
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar

# --- Configuration ---
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
NUM_LABELS = 1
TARGET_CLASSES = 19
BASE_DIR = '.'
DATA_DIR = os.path.join(BASE_DIR, "data")
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# --- File Paths ---
DOC_BLIND_TEST_PATH = os.path.join(DATA_DIR, 'doc_blind_test_data.csv') 
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_document_regression_final.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_document_regression_final.zip")


# --- DATA PREPROCESSING (UNCHANGED) ---
def preprocess_d3tok(text, disambiguator):
    if not isinstance(text, str) or not text.strip(): return ""
    tokens = simple_word_tokenize(text)
    disambiguated_sentence = disambiguator.disambiguate(tokens)
    d3tok_forms = []
    for disambig_word in disambiguated_sentence:
        if disambig_word.analyses:
            analysis_dict = disambig_word.analyses[0][1]
            if 'd3tok' in analysis_dict:
                d3tok = dediac_ar(analysis_dict['d3tok']).replace("_+", " +").replace("+_", "+ ")
                d3tok_forms.append(d3tok)
            else: d3tok_forms.append(disambig_word.word)
        else: d3tok_forms.append(disambig_word.word)
    return " ".join(d3tok_forms)

# --- Initialize Tools ---
print("Initializing CAMeL Tools...")
bert_disambiguator = BERTUnfactoredDisambiguator.pretrained('msa')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("✔ Tools initialized.")


# --- DATASET CLASS (UNCHANGED) ---
class ReadabilityDataset(TorchDataset):
    def __init__(self, texts, labels=None):
        self.encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=256)
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None: item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
    def __len__(self):
        return len(self.encodings.get('input_ids', []))


# --- MODEL LOADING (UNCHANGED) ---
print("\n===== LOADING PRE-TRAINED SENTENCE-LEVEL MODEL =====\n")
CHECKPOINT_DIR = r"D:\arabic_readability_project\results\regression_readability-arabertv2-d3tok-reg\checkpoint-10284"
if os.path.exists(os.path.join(CHECKPOINT_DIR, "model.safetensors")):
    print(f"✔ Found checkpoint at: {CHECKPOINT_DIR}")
    print("Loading model from checkpoint...")
    model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_DIR)
    trainer = Trainer(model=model)
    print("✔ Model loaded successfully.")
else:
    print(f"! ERROR: Checkpoint not found at '{CHECKPOINT_DIR}'. Please check the path.")
    exit()


# --- DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION ---
print("\n===== DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION =====\n")
try:
    print(f"Loading document test data from {DOC_BLIND_TEST_PATH}...")
    doc_test_df = pd.read_csv(DOC_BLIND_TEST_PATH)
    doc_test_df.dropna(subset=['ID', 'Document'], inplace=True)
    
    print("Processing documents: splitting into sentences by newline characters...")
    all_sentences, doc_ids = [], []
    for _, row in doc_test_df.iterrows():
        doc_id = row['ID']
        full_document_text = row['Document']
        
        if isinstance(full_document_text, str) and full_document_text.strip():
            # --- CHANGE: Use the more robust split-by-newline method ---
            sentences_list = full_document_text.split('\n')
            
            # Clean up any empty strings that result from multiple newlines
            sentences_list = [s.strip() for s in sentences_list if s.strip()]

            if sentences_list:
                all_sentences.extend(sentences_list)
                doc_ids.extend([doc_id] * len(sentences_list))
        else:
            print(f"Warning: Document ID {doc_id} has empty or invalid text. Skipping.")
            continue

    if not all_sentences:
        print("\n! ERROR: No sentences were extracted. Check the 'Document' column in your CSV.")
        exit()

    sentence_df = pd.DataFrame({'doc_id': doc_ids, 'sentence_text': all_sentences})
    
    # --- NEW: Save the created sentences for you to review ---
    print("\n--- Saving split sentences for manual review ---")
    review_path = 'review_split_sentences.csv'
    # Use utf-8 encoding to correctly save Arabic characters
    sentence_df.to_csv(review_path, index=False, encoding='utf-8')
    print(f"✔ Sentences saved to {review_path}")

    print(f"\nSuccessfully created {len(sentence_df):,} sentences from {len(doc_test_df):,} documents.")

    print("\nPreprocessing all sentences to D3Tok format (this may take a moment)...")
    sentence_df['processed_text'] = sentence_df['sentence_text'].apply(lambda x: preprocess_d3tok(x, bert_disambiguator))
    
    print("Generating predictions for all sentences...")
    test_dataset = ReadabilityDataset(sentence_df['processed_text'].tolist())
    predictions = trainer.predict(test_dataset)
    sentence_df['raw_prediction'] = predictions.predictions.flatten()
    
    print("Aggregating results: finding the max readability score per document...")
    doc_predictions = sentence_df.groupby('doc_id')['raw_prediction'].max()
    
    rounded_preds = np.round(doc_predictions.values)
    clipped_preds = np.clip(rounded_preds, 0, TARGET_CLASSES - 1)
    
    final_submission_df = pd.DataFrame({'Sentence ID': doc_test_df['ID']})
    pred_df = pd.DataFrame({
        'Sentence ID': doc_predictions.index, 
        'Prediction': (clipped_preds + 1).astype(int)
    })
    # --- BUG FIX: Corrected variable name ---
    final_submission_df = final_submission_df.merge(pred_df, on='Sentence ID', how='left')
    final_submission_df['Prediction'].fillna(1, inplace=True)
    final_submission_df['Prediction'] = final_submission_df['Prediction'].astype(int)

    print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
    final_submission_df.to_csv(SUBMISSION_PATH, index=False)
    
    print(f"Compressing {os.path.basename(SUBMISSION_PATH)} into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
    with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))
        
    print(f"✔ Submission file {os.path.basename(ZIPPED_SUBMISSION_PATH)} created successfully.")

except FileNotFoundError:
    print(f"! ERROR: Test file not found. Make sure 'doc_blind_test_data.csv' is in the '{DATA_DIR}' directory.")
except Exception as e:
    print(f"An error occurred during final document prediction: {e}")

print("\n--- Script Finished ---")

Initializing CAMeL Tools...


Some weights of the model checkpoint at C:\Users\Fatima\AppData\Roaming\camel_tools\data\disambig_bert_unfactored\msa were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


✔ Tools initialized.

===== LOADING PRE-TRAINED SENTENCE-LEVEL MODEL =====

✔ Found checkpoint at: D:\arabic_readability_project\results\regression_readability-arabertv2-d3tok-reg\checkpoint-10284
Loading model from checkpoint...
✔ Model loaded successfully.

===== DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION =====

Loading document test data from .\data\doc_blind_test_data.csv...
Processing documents: splitting into sentences by newline characters...

--- Saving split sentences for manual review ---
✔ Sentences saved to review_split_sentences.csv

Successfully created 100 sentences from 100 documents.

Preprocessing all sentences to D3Tok format (this may take a moment)...
Generating predictions for all sentences...


Aggregating results: finding the max readability score per document...

Saving prediction file to: .\submission\submission_document_regression_final.csv
Compressing submission_document_regression_final.csv into submission_document_regression_final.zip...
✔ Submission file submission_document_regression_final.zip created successfully.

--- Script Finished ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_submission_df['Prediction'].fillna(1, inplace=True)


In [12]:
pip install cachetools==4.2.4

Collecting cachetools==4.2.4
  Using cached cachetools-4.2.4-py3-none-any.whl.metadata (4.8 kB)
Using cached cachetools-4.2.4-py3-none-any.whl (10 kB)
Installing collected packages: cachetools
  Attempting uninstall: cachetools
    Found existing installation: cachetools 6.1.0
    Uninstalling cachetools-6.1.0:
      Successfully uninstalled cachetools-6.1.0
Successfully installed cachetools-4.2.4
Note: you may need to restart the kernel to use updated packages.


In [9]:
import camel_tools
import camel_tools.tokenizers

# This will print out all the available modules in the tokenizers directory
# for your specific version of camel-tools.
print("--- Inspecting camel_tools.tokenizers ---")
print(dir(camel_tools.tokenizers))
print("-----------------------------------------")

--- Inspecting camel_tools.tokenizers ---
['__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__path__', '__spec__', 'word']
-----------------------------------------


In [10]:
pip install pyarabic Tashaphyne

Collecting Tashaphyne
  Downloading Tashaphyne-0.3.6-py3-none-any.whl.metadata (18 kB)
Downloading Tashaphyne-0.3.6-py3-none-any.whl (251 kB)
Installing collected packages: Tashaphyne
Successfully installed Tashaphyne-0.3.6
Note: you may need to restart the kernel to use updated packages.


In [7]:
pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting click (from nltk)
  Downloading click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ------ --------------------------------- 0.3/1.5 MB ? eta -:--:--
   ------------- -------------------------- 0.5/1.5 MB 1.3 MB/s eta 0:00:01
   -------------------- ------------------- 0.8/1.5 MB 1.0 MB/s eta 0:00:01
   -------------------- ------------------- 0.8/1.5 MB 1.0 MB/s eta 0:00:01
   --------------------------- ------------ 1.0/1.5 MB 868.0 kB/s eta 0:00:01
   ---------------------------------- ----- 1.3/1.5 MB 932.1 kB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 925.8 kB/s eta 0:00:00
Downloading click-8.2.1-py3-none-any.whl (102 k

In [5]:
import pandas as pd
import numpy as np
import os
import torch
import zipfile

from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset as TorchDataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)

from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar

# --- Configuration ---
MODEL_NAME = "CAMeL-Lab/readability-arabertv2-d3tok-reg"
NUM_LABELS = 1
TARGET_CLASSES = 19
BASE_DIR = '.'
DATA_DIR = os.path.join(BASE_DIR, "data")
SUBMISSION_DIR = os.path.join(BASE_DIR, "submission")
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# --- File Paths ---
DOC_BLIND_TEST_PATH = os.path.join(DATA_DIR, 'doc_blind_test_data.csv') 
SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_document_regression_final.csv")
ZIPPED_SUBMISSION_PATH = os.path.join(SUBMISSION_DIR, "submission_document_regression_final.zip")


# --- DATA PREPROCESSING (UNCHANGED) ---
def preprocess_d3tok(text, disambiguator):
    if not isinstance(text, str) or not text.strip(): return ""
    tokens = simple_word_tokenize(text)
    disambiguated_sentence = disambiguator.disambiguate(tokens)
    d3tok_forms = []
    for disambig_word in disambiguated_sentence:
        if disambig_word.analyses:
            analysis_dict = disambig_word.analyses[0][1]
            if 'd3tok' in analysis_dict:
                d3tok = dediac_ar(analysis_dict['d3tok']).replace("_+", " +").replace("+_", "+ ")
                d3tok_forms.append(d3tok)
            else: d3tok_forms.append(disambig_word.word)
        else: d3tok_forms.append(disambig_word.word)
    return " ".join(d3tok_forms)

# --- Initialize Tools ---
print("Initializing CAMeL Tools...")
bert_disambiguator = BERTUnfactoredDisambiguator.pretrained('msa')
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print("✔ Tools initialized.")


# --- DATASET CLASS (UNCHANGED) ---
class ReadabilityDataset(TorchDataset):
    def __init__(self, texts, labels=None):
        self.encodings = tokenizer(texts, truncation=True, padding="max_length", max_length=256)
        self.labels = labels
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None: item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float)
        return item
    def __len__(self):
        return len(self.encodings.get('input_ids', []))


# --- MODEL LOADING (UNCHANGED) ---
print("\n===== LOADING PRE-TRAINED SENTENCE-LEVEL MODEL =====\n")
CHECKPOINT_DIR = r"D:\arabic_readability_project\results\regression_readability-arabertv2-d3tok-reg\checkpoint-10284"
if os.path.exists(os.path.join(CHECKPOINT_DIR, "model.safetensors")):
    print(f"✔ Found checkpoint at: {CHECKPOINT_DIR}")
    print("Loading model from checkpoint...")
    model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_DIR)
    trainer = Trainer(model=model)
    print("✔ Model loaded successfully.")
else:
    print(f"! ERROR: Checkpoint not found at '{CHECKPOINT_DIR}'. Please check the path.")
    exit()


# --- DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION ---
print("\n===== DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION =====\n")
try:
    print(f"Loading document test data from {DOC_BLIND_TEST_PATH}...")
    doc_test_df = pd.read_csv(DOC_BLIND_TEST_PATH)
    # --- CHANGE: Drop rows if 'Sentences' column is empty, not 'Document' ---
    doc_test_df.dropna(subset=['ID', 'Sentences'], inplace=True)
    
    print("Processing documents: splitting into sentences by newline characters...")
    all_sentences, doc_ids = [], []
    for _, row in doc_test_df.iterrows():
        doc_id = row['ID']
        # --- THE CRITICAL FIX: Read text from the 'Sentences' column ---
        full_document_text = row['Sentences']
        
        if isinstance(full_document_text, str) and full_document_text.strip():
            sentences_list = full_document_text.split('\n')
            sentences_list = [s.strip() for s in sentences_list if s.strip()]
            if sentences_list:
                all_sentences.extend(sentences_list)
                doc_ids.extend([doc_id] * len(sentences_list))
        else:
            print(f"Warning: Document ID {doc_id} has empty or invalid text in 'Sentences' column. Skipping.")
            continue

    if not all_sentences:
        print("\n! ERROR: No sentences were extracted. Check the 'Sentences' column in your CSV.")
        exit()

    sentence_df = pd.DataFrame({'doc_id': doc_ids, 'sentence_text': all_sentences})
    
    # Save split sentences for review
    review_split_path = 'review_split_sentences.csv'
    sentence_df.to_csv(review_split_path, index=False, encoding='utf-8-sig') # Use utf-8-sig for Excel
    print(f"\n✔ Raw split sentences saved to {review_split_path}")
    print(f"Successfully created {len(sentence_df):,} sentences from {len(doc_test_df):,} documents.")

    print("\nPreprocessing all sentences to D3Tok format (this may take a moment)...")
    sentence_df['processed_text'] = sentence_df['sentence_text'].apply(lambda x: preprocess_d3tok(x, bert_disambiguator))
    
    # Save D3tok output for review
    review_d3tok_path = 'review_d3tok_processed_output.csv'
    sentence_df[['sentence_text', 'processed_text']].to_csv(review_d3tok_path, index=False, encoding='utf-8-sig')
    print(f"✔ D3tok processed output saved to {review_d3tok_path}")
    
    print("\nGenerating predictions for all sentences...")
    test_dataset = ReadabilityDataset(sentence_df['processed_text'].tolist())
    predictions = trainer.predict(test_dataset)
    sentence_df['raw_prediction'] = predictions.predictions.flatten()
    
    print("Aggregating results: finding the max readability score per document...")
    doc_predictions = sentence_df.groupby('doc_id')['raw_prediction'].max()
    
    rounded_preds = np.round(doc_predictions.values)
    clipped_preds = np.clip(rounded_preds, 0, TARGET_CLASSES - 1)
    
    final_submission_df = pd.DataFrame({'Sentence ID': doc_test_df['ID']})
    pred_df = pd.DataFrame({
        'Sentence ID': doc_predictions.index, 
        'Prediction': (clipped_preds + 1).astype(int)
    })
    final_submission_df = final_submission_df.merge(pred_df, on='Sentence ID', how='left')
    final_submission_df['Prediction'].fillna(1, inplace=True)
    final_submission_df['Prediction'] = final_submission_df['Prediction'].astype(int)

    print(f"\nSaving prediction file to: {SUBMISSION_PATH}")
    final_submission_df.to_csv(SUBMISSION_PATH, index=False)
    
    print(f"Compressing {os.path.basename(SUBMISSION_PATH)} into {os.path.basename(ZIPPED_SUBMISSION_PATH)}...")
    with zipfile.ZipFile(ZIPPED_SUBMISSION_PATH, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(SUBMISSION_PATH, arcname=os.path.basename(SUBMISSION_PATH))
        
    print(f"✔ Submission file {os.path.basename(ZIPPED_SUBMISSION_PATH)} created successfully.")

except FileNotFoundError:
    print(f"! ERROR: Test file not found at '{DOC_BLIND_TEST_PATH}'.")
except Exception as e:
    print(f"An error occurred during final document prediction: {e}")

print("\n--- Script Finished ---")

Initializing CAMeL Tools...


Some weights of the model checkpoint at C:\Users\Fatima\AppData\Roaming\camel_tools\data\disambig_bert_unfactored\msa were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


✔ Tools initialized.

===== LOADING PRE-TRAINED SENTENCE-LEVEL MODEL =====

✔ Found checkpoint at: D:\arabic_readability_project\results\regression_readability-arabertv2-d3tok-reg\checkpoint-10284
Loading model from checkpoint...
✔ Model loaded successfully.

===== DOCUMENT-LEVEL FINAL PREDICTION AND SUBMISSION =====

Loading document test data from .\data\doc_blind_test_data.csv...
Processing documents: splitting into sentences by newline characters...

✔ Raw split sentences saved to review_split_sentences.csv
Successfully created 3,420 sentences from 100 documents.

Preprocessing all sentences to D3Tok format (this may take a moment)...
✔ D3tok processed output saved to review_d3tok_processed_output.csv

Generating predictions for all sentences...


Aggregating results: finding the max readability score per document...

Saving prediction file to: .\submission\submission_document_regression_final.csv
Compressing submission_document_regression_final.csv into submission_document_regression_final.zip...
✔ Submission file submission_document_regression_final.zip created successfully.

--- Script Finished ---


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_submission_df['Prediction'].fillna(1, inplace=True)
