In [1]:
# -*- coding: utf-8 -*-
"""
sent_strict_local.ipynb

This script has been modified to run locally in a Jupyter Notebook.
Key changes:
1.  Model changed to 'CAMeL-Lab/readability-arabertv02-word-CE'.
2.  Google Drive mounting and specific paths have been removed.
3.  File paths are now relative, assuming a specific local folder structure.
"""

# =====================================================================================
# 1. INSTALLATIONS & IMPORTS
# =====================================================================================
# Make sure to run this cell in your Jupyter Notebook to install the required libraries.
# You can uncomment the line below or run it separately in your terminal.
# !pip install transformers[torch] pandas numpy scikit-learn pyarrow torch

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import cohen_kappa_score
from torch.utils.data import Dataset as TorchDataset
from transformers import AutoModel, TrainingArguments, Trainer, EarlyStoppingCallback
import os

print("✅ Libraries imported successfully.")
# Report the compute device up front: warn when CUDA is unavailable,
# otherwise show the name of the first visible GPU.
if not torch.cuda.is_available():
    print("❗️ WARNING: No GPU found. Training will run on the CPU, which will be very slow.")
else:
    print(f"GPU found: {torch.cuda.get_device_name(0)}")









✅ Libraries imported successfully.
GPU found: NVIDIA GeForce RTX 2060


In [4]:

# =====================================================================================
# 2. CONFIGURATION
# =====================================================================================
# --- Model & training parameters ---
MODEL_NAME = "CAMeL-Lab/readability-arabertv02-word-CE"
NUM_LABELS = 19   # size of the classification head's output layer
NUM_FEATURES = 7  # number of extra numerical features concatenated to the encoder output

# --- Project root ---
# Every data/output folder below is resolved against this path, so it should stay
# relative to the notebook's working directory ('./'). Roots such as '/', 'C:/' or
# '/doc strict' point somewhere else and will cause file-not-found errors.
BASE_PATH = './'

# --- File paths (local environment) ---
ORIGINAL_DATA_PATH = BASE_PATH
PREPROCESSED_DATA_PATH = os.path.join(BASE_PATH, 'preprocessed-arabic-readability')
OUTPUT_DIR = os.path.join(BASE_PATH, 'output_results')

# Make sure the output folder exists before anything tries to write into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Preprocessed data is expected in: {PREPROCESSED_DATA_PATH}")
print(f"Original test data is expected in: {ORIGINAL_DATA_PATH}")
print(f"Model checkpoints and outputs will be saved to: {OUTPUT_DIR}")


Preprocessed data is expected in: ./preprocessed-arabic-readability
Original test data is expected in: ./
Model checkpoints and outputs will be saved to: ./output_results


In [5]:
# =====================================================================================
# 3. DATA LOADING & DATASET CLASS
# =====================================================================================
print("\n--- Loading Preprocessed Data from Local Machine ---")
try:
    train_df = pd.read_feather(os.path.join(PREPROCESSED_DATA_PATH, 'train_processed.feather'))
    val_df = pd.read_feather(os.path.join(PREPROCESSED_DATA_PATH, 'val_processed.feather'))
    # Load the preprocessed test data
    test_processed_df = pd.read_feather(os.path.join(PREPROCESSED_DATA_PATH, 'test_processed.feather'))
    print(f"Loaded {len(train_df)} training, {len(val_df)} validation, and {len(test_processed_df)} test samples.")

    # IMPORTANT: the original test file is needed to map sentence predictions back to document IDs.
    test_original_df = pd.read_csv(os.path.join(ORIGINAL_DATA_PATH, 'sentnse_blind_test.csv'))
    # Split each 'Sentence' entry on newlines and explode to one row per piece,
    # dropping rows whose text is missing. Built as a chain (no in-place mutation)
    # so the cell is safe to re-run.
    sentence_to_doc_map = (
        test_original_df
        .assign(text=test_original_df['Sentence'].str.split('\n'))
        .explode('text')
        .dropna(subset=['text'])
    )
except FileNotFoundError as e:
    print("❗️ ERROR: Could not find preprocessed files. Please check your folder structure.")
    print(f"Missing file or directory: {e.filename}")
    # Bare raise keeps the original traceback (raising is preferable to exit() in a notebook).
    raise

# The mapping is positional, so both frames must have exactly the same number of rows.
# Fail fast here: without 'doc_id' the aggregation step after training cannot run,
# and discovering that only after a full training run wastes hours.
if len(sentence_to_doc_map) != len(test_processed_df):
    raise ValueError(
        "Mismatch between preprocessed test set "
        f"({len(test_processed_df)} rows) and original doc_id mapping "
        f"({len(sentence_to_doc_map)} rows); cannot attach document IDs."
    )

test_processed_df['doc_id'] = sentence_to_doc_map['ID'].values
print("✅ Successfully mapped document IDs to the preprocessed test set.")

# MODIFIED DATASET CLASS for pre-tokenized data
class ReadabilityDataset(TorchDataset):
    """Torch dataset over a dataframe of already-tokenized samples.

    The dataframe must provide the columns ``input_ids``, ``attention_mask`` and
    ``features``; a ``label`` column is additionally required unless
    ``is_test`` is true.

    Args:
        dataframe: Source dataframe, one row per sample.
        is_test: When true, no labels are read or returned.
    """

    def __init__(self, dataframe, is_test=False):
        self.is_test = is_test
        # Materialise the columns as plain Python lists for cheap per-index access.
        self.input_ids = dataframe['input_ids'].tolist()
        self.attention_mask = dataframe['attention_mask'].tolist()
        self.features = dataframe['features'].tolist()
        if not is_test:
            self.labels = dataframe['label'].tolist()

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the tensors for sample ``idx`` as a dict keyed by model argument name."""
        fields = (
            ('input_ids', self.input_ids, torch.long),
            ('attention_mask', self.attention_mask, torch.long),
            ('numerical_features', self.features, torch.float),
        )
        sample = {
            key: torch.tensor(column[idx], dtype=dtype)
            for key, column, dtype in fields
        }
        if self.is_test:
            return sample
        # Stored labels are shifted down by one to obtain the class index used for training.
        sample['labels'] = torch.tensor(self.labels[idx] - 1, dtype=torch.long)
        return sample



--- Loading Preprocessed Data from Local Machine ---
Loaded 97874 training, 7310 validation, and 3420 test samples.
✅ Successfully mapped document IDs to the preprocessed test set.


In [6]:
# =====================================================================================
# 4. HYBRID MODEL
# =====================================================================================
class HybridReadabilityModel(nn.Module):
    """Pretrained transformer encoder combined with extra numerical features.

    The first-token vector of the encoder's last hidden state is concatenated
    with the numerical feature vector and passed through a small MLP head that
    emits ``num_labels`` logits.

    Args:
        model_name: Checkpoint identifier handed to ``AutoModel.from_pretrained``.
        num_extra_features: Width of the numerical feature vector per sample.
        num_labels: Number of output classes.
    """

    def __init__(self, model_name, num_extra_features, num_labels):
        super().__init__()
        self.transformer = AutoModel.from_pretrained(model_name)
        # Take the encoder width from its config so the head fits whichever checkpoint is loaded.
        head_input_dim = self.transformer.config.hidden_size + num_extra_features
        self.head = nn.Sequential(
            nn.Linear(head_input_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_labels),
        )
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, numerical_features, labels=None):
        """Run the encoder and head.

        Returns:
            ``logits`` alone when ``labels`` is None, otherwise the tuple
            ``(loss, logits)`` where ``loss`` is the cross-entropy of the logits
            against ``labels``.

        NOTE(review): without labels a bare tensor (not a tuple) is returned;
        confirm downstream consumers such as ``Trainer.predict`` handle that
        form for every batch size.
        """
        encoded = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the sequence axis serves as the sentence representation.
        first_token = encoded.last_hidden_state[:, 0, :]
        logits = self.head(torch.cat((first_token, numerical_features), dim=1))
        if labels is None:
            return logits
        return self.loss_fn(logits, labels), logits

def compute_metrics(p):
    """Compute the quadratic weighted kappa (QWK) between labels and argmax predictions.

    Args:
        p: Object exposing ``predictions`` (logits, one row per sample and one
            column per class) and ``label_ids`` (integer class indices).

    Returns:
        dict: ``{"qwk": score}``.
    """
    logits, labels = np.asarray(p.predictions), p.label_ids
    preds = np.argmax(logits, axis=1)
    # Pin the label set to every class index the model can emit. Without it,
    # cohen_kappa_score indexes its weight matrix by the classes that happen to
    # appear in labels/preds, so a missing class shrinks the ordinal distances
    # between its neighbours and distorts the quadratic weighting.
    all_classes = np.arange(logits.shape[1])
    return {"qwk": cohen_kappa_score(labels, preds, labels=all_classes, weights='quadratic')}


In [7]:
# =====================================================================================
# 5. TRAINING AND EVALUATION
# =====================================================================================
print("\n===== 🚀 PREPARING FOR HYBRID MODEL TRAINING RUN =====\n")

# --- Reproducibility ---
# The classification head is randomly initialised when the model object is built,
# which happens BEFORE the Trainer exists. Seed the RNGs here so that initialisation
# (and therefore the whole run) is repeatable after Restart & Run All.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

# --- Create Datasets ---
train_dataset = ReadabilityDataset(train_df)
val_dataset = ReadabilityDataset(val_df)
test_dataset = ReadabilityDataset(test_processed_df, is_test=True)  # no labels for the blind test set

# --- Initialize Hybrid Model ---
model = HybridReadabilityModel(MODEL_NAME, num_extra_features=NUM_FEATURES, num_labels=NUM_LABELS)

# --- Define Training Arguments ---
training_args = TrainingArguments(
    output_dir=os.path.join(OUTPUT_DIR, "results_hybrid_model_large_camel"),  # checkpoint folder for this run
    num_train_epochs=8,
    per_device_train_batch_size=8,   # small per-device batch to limit GPU memory use
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,   # 8 x 2 -> effective training batch size of 16
    learning_rate=1e-5,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=100,
    eval_strategy="epoch",
    save_strategy="epoch",           # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="qwk",     # key returned by compute_metrics
    greater_is_better=True,
    save_total_limit=2,
    fp16=torch.cuda.is_available(),  # use mixed precision only when a GPU is available
    seed=SEED,
    report_to="none"
)

# --- Initialize Trainer ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    # Stop once validation QWK has failed to improve for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print(f"Starting model training with {MODEL_NAME}...")
trainer.train()
print("✅ Training finished.")


===== 🚀 PREPARING FOR HYBRID MODEL TRAINING RUN =====



config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/541M [00:00<?, ?B/s]

Starting model training with CAMeL-Lab/readability-arabertv02-word-CE...


Epoch,Training Loss,Validation Loss,Qwk
1,1.8778,2.196482,0.519
2,1.7104,2.050585,0.557014
3,1.6683,1.864119,0.591343
4,1.5409,1.842514,0.664796
5,1.4866,1.893837,0.676805
6,1.4493,1.888345,0.662472
7,1.3648,1.884651,0.664064


✅ Training finished.


In [8]:
# =====================================================================================
# 6. PREDICTION AND SUBMISSION FILE GENERATION
# =====================================================================================
print("\n===== 🏆 GENERATING PREDICTIONS ON THE TEST SET =====\n")

# Aggregation below groups by 'doc_id'. Check for it before the slow inference
# pass so a missing mapping is reported immediately and with a clear message.
if 'doc_id' not in test_processed_df.columns:
    raise KeyError(
        "'doc_id' column is missing from test_processed_df; the document-ID mapping "
        "in the data-loading cell must succeed before predictions can be aggregated."
    )

predictions = trainer.predict(test_dataset)
test_logits = predictions.predictions

# Convert logits to final predictions (1-19): argmax gives the class index,
# +1 undoes the label shift applied by the dataset during training.
sentence_level_preds = np.argmax(test_logits, axis=1) + 1
test_processed_df['prediction'] = sentence_level_preds

# --- Aggregate sentence-level predictions to document-level ---
print("Aggregating sentence predictions to document-level using MAX rule...")
# Group by the document ID and take the max readability score
doc_level_preds = test_processed_df.groupby('doc_id')['prediction'].max()

submission_df = pd.DataFrame({
    'Document ID': doc_level_preds.index,
    'Prediction': doc_level_preds.values
})

# --- Save the submission file ---
SUBMISSION_FILE_NAME = "submission_large_model.csv"
SUBMISSION_PATH = os.path.join(OUTPUT_DIR, SUBMISSION_FILE_NAME)

print(f"Saving prediction file to: {SUBMISSION_PATH}")
submission_df.to_csv(SUBMISSION_PATH, index=False)

print(f"✅ Submission file '{SUBMISSION_FILE_NAME}' created successfully in {OUTPUT_DIR}.")
print("\n--- Script Finished ---")


===== 🏆 GENERATING PREDICTIONS ON THE TEST SET =====



Aggregating sentence predictions to document-level using MAX rule...
Saving prediction file to: ./output_results\submission_large_model.csv
✅ Submission file 'submission_large_model.csv' created successfully in ./output_results.

--- Script Finished ---
