In [1]:
import pandas as pd
import os
from tqdm.auto import tqdm
from camel_tools.disambig.bert import BERTUnfactoredDisambiguator
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.dediac import dediac_ar
import numpy as np # Imported numpy for efficient conditional logic
import warnings

# Silence every warning category so the console output stays readable
warnings.simplefilter('ignore')

# =====================================================================================
# 1. CONFIGURATION
# =====================================================================================
# NOTE: Adjust both directories below to match your machine before running.
# Raw strings (r"...") stop Windows backslashes from being read as escape sequences.
OUTPUT_DIR = r"D:\arabic_readability_project\data\samer"  # where the processed CSV is written
DATA_DIR = r"D:\arabic_readability_project\data\samer\og"  # directory holding samer_train.tsv

# --- Output File Path ---
# The output directory is created up front; an already-existing directory is not an error.
os.makedirs(OUTPUT_DIR, exist_ok=True)
OUTPUT_CSV_PATH = os.path.join(OUTPUT_DIR, "samer_d3tok_processed.csv")

# --- Input File Path ---
# Single tab-separated SAMER training file read by main()
SAMER_TRAIN_PATH = os.path.join(DATA_DIR, "samer_train.tsv")

# =====================================================================================
# 2. HELPER FUNCTION FOR D3TOK PREPROCESSING
# =====================================================================================
def preprocess_d3tok(text, disambiguator):
    """
    Convert one piece of text into a space-joined sequence of D3Tok forms.

    Args:
        text: The raw text to convert. Anything that is not a string, or a
            string containing only whitespace, yields "".
        disambiguator: Object whose ``disambiguate(tokens)`` method returns
            one entry per token, each exposing ``.word`` and ``.analyses``
            (main() passes a BERTUnfactoredDisambiguator).

    Returns:
        The per-token forms joined by single spaces. A token contributes the
        dediacritized 'd3tok' value of its first analysis when one exists,
        otherwise the token itself. Any exception raised while processing is
        printed and "" is returned instead.
    """
    # Guard clause: only strings with visible content are processed.
    if not (isinstance(text, str) and text.strip()):
        return ""

    try:
        words = simple_word_tokenize(text)

        pieces = []
        for entry in disambiguator.disambiguate(words):
            candidates = entry.analyses
            if candidates and 'd3tok' in candidates[0][1]:
                # Strip diacritics, then turn the "_+" / "+_" clitic joints
                # into space-separated markers.
                bare = dediac_ar(candidates[0][1]['d3tok'])
                pieces.append(bare.replace("_+", " +").replace("+_", "+ "))
            else:
                # No d3tok available for this token: keep the surface word.
                pieces.append(entry.word)

        return " ".join(pieces)
    except Exception as e:
        # Best-effort: report the failing text and fall back to an empty result.
        print(f"An error occurred while processing text: '{text[:50]}...'. Error: {e}")
        return ""

# =====================================================================================
# 3. MAIN SCRIPT EXECUTION
# =====================================================================================
def main():
    """
    Load the SAMER training TSV, add 'ID', 'label' and 'd3tok_text', save a CSV.

    Steps:
        1. Read SAMER_TRAIN_PATH as a tab-separated file. Malformed lines are
           skipped; parser warnings that pandas emits through the ``warnings``
           module are captured and reported even though warnings are silenced
           globally at module level.
        2. Check that every column used later exists, then create 'ID'
           (1-based row number) and 'label' (4 / 10 / 16 when L3 / L4 / L5
           equals 'x', otherwise 0).
        3. Load the pretrained 'msa' BERTUnfactoredDisambiguator.
        4. Run preprocess_d3tok over the 'Line' column into 'd3tok_text'.
        5. Write the selected columns to OUTPUT_CSV_PATH (UTF-8 with BOM).

    Every failure is reported with a printed message followed by an early
    return; nothing is raised to the caller. Rows left with label 0 and rows
    whose 'd3tok_text' is empty are reported as warnings so that a run which
    produced no usable data is not mistaken for a successful one.
    """
    print("--- 🚀 Starting SAMER Dataset D3Tok Pre-processing Script ---")

    # --- Step 1: Load Data ---
    print(f"\n[1/5] Loading SAMER dataset from: {SAMER_TRAIN_PATH}")
    try:
        # Warnings are ignored globally in this file, which would also hide
        # the ones requested by on_bad_lines='warn'. Record them locally so
        # skipped rows are surfaced instead of disappearing silently.
        with warnings.catch_warnings(record=True) as caught:
            warnings.simplefilter('always')
            # Load the single TSV file. 'sep=\t' specifies it's tab-separated.
            df = pd.read_csv(SAMER_TRAIN_PATH, sep='\t', on_bad_lines='warn')
        print(f"✔️  Data loaded successfully. Total records: {len(df)}")

        parser_warnings = [
            w for w in caught if issubclass(w.category, pd.errors.ParserWarning)
        ]
        if parser_warnings:
            print(f"⚠️  pandas reported {len(parser_warnings)} parser warning(s); malformed lines were skipped:")
            for w in parser_warnings[:5]:
                print(f"   - {str(w.message).strip()}")

    except FileNotFoundError as e:
        print(f"❌ ERROR: File not found. Please check your file path in the CONFIGURATION section.")
        print(f"   - Missing file: {e.filename}")
        return
    except Exception as e:
        print(f"❌ ERROR loading source data: {e}")
        return

    # Fail fast on a schema mismatch: without this check a missing column
    # raises an uncaught KeyError, possibly only after the slow D3Tok pass.
    required_columns = ['Novel', 'Chapter', 'L3', 'L4', 'L5', 'Line']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        print(f"❌ ERROR: Input file is missing required column(s): {missing_columns}")
        print(f"   - Columns found: {list(df.columns)}")
        return

    # --- Step 2: Create 'ID' and 'label' Columns ---
    print("\n[2/5] Creating 'ID' and 'label' columns...")

    # Create a unique ID for each row. We'll use the dataframe index.
    df['ID'] = df.index + 1

    # Define the mapping from SAMER levels to BAREC scale
    # L3 -> 4, L4 -> 10, L5 -> 16
    # NOTE(review): the run output captured after this script shows L3/L4
    # holding sentence text and every label equal to 0 — confirm that the
    # 'x'-marker assumption matches the actual TSV schema.
    conditions = [
        df['L3'] == 'x',
        df['L4'] == 'x',
        df['L5'] == 'x'
    ]
    choices = [4, 10, 16]

    # np.select takes the first matching condition per row;
    # 'default=0' is used when none of the columns contains 'x'.
    df['label'] = np.select(conditions, choices, default=0)

    print("✔️ 'ID' and 'label' columns created successfully.")

    unlabelled = int((df['label'] == 0).sum())
    if unlabelled:
        print(f"⚠️  {unlabelled} of {len(df)} rows have no 'x' in L3/L4/L5 and kept the default label 0.")

    # --- Step 3: Initialize BERT Disambiguator ---
    print("\n[3/5] Initializing BERT Disambiguator for preprocessing (CPU)...")
    print("   (This may take a moment as the model is loaded into memory)")
    try:
        bert_disambiguator = BERTUnfactoredDisambiguator.pretrained('msa')
        print("✔️ Disambiguator loaded successfully.")
    except Exception as e:
        print(f"❌ ERROR: Could not load the CAMeL Tools model. Please ensure it's installed correctly.")
        print(f"   - To install: pip install camel-tools")
        print(f"   - Error details: {e}")
        return

    # --- Step 4: Process the Text Data ---
    print(f"\n[4/5] Applying D3Tok analysis to {len(df)} text records from the 'Line' column...")
    # Register progress_apply on pandas objects so a progress bar is shown
    tqdm.pandas(desc="Analyzing Text with D3Tok")

    # Apply the preprocessing function to the 'Line' column
    df['d3tok_text'] = df['Line'].progress_apply(
        lambda text: preprocess_d3tok(text, bert_disambiguator)
    )
    print("✔️ D3Tok analysis complete.")

    # preprocess_d3tok returns "" for non-string or blank input and on any
    # error, so a wrong source column shows up here as empty results.
    empty_results = int((df['d3tok_text'] == "").sum())
    if empty_results:
        print(f"⚠️  {empty_results} of {len(df)} rows produced an empty 'd3tok_text' "
              f"('Line' column dtype: {df['Line'].dtype}).")

    # --- Step 5: Reorder Columns and Save the Processed Data ---
    print(f"\n[5/5] Saving processed data to CSV file...")

    # Define the desired order for the final CSV file
    final_columns = [
        'ID',
        'label',
        'd3tok_text',
        'Novel',
        'Chapter',
        'L3',
        'L4',
        'L5',
        'Line'
    ]
    df_final = df[final_columns]

    try:
        # 'utf-8-sig' writes a BOM at the start of the file
        df_final.to_csv(OUTPUT_CSV_PATH, index=False, encoding='utf-8-sig')
        print(f"✔️  Successfully saved the file to: {OUTPUT_CSV_PATH}")
    except Exception as e:
        print(f"❌ ERROR saving the output file: {e}")
        return

    print("\n--- ✅ Script Finished Successfully ---")
    print("\nPreview of the first 5 rows of the processed data:")
    print(df_final.head())


if __name__ == "__main__":
    main()


--- 🚀 Starting SAMER Dataset D3Tok Pre-processing Script ---

[1/5] Loading SAMER dataset from: D:\arabic_readability_project\data\samer\og\samer_train.tsv
✔️  Data loaded successfully. Total records: 14343

[2/5] Creating 'ID' and 'label' columns...
✔️ 'ID' and 'label' columns created successfully.

[3/5] Initializing BERT Disambiguator for preprocessing (CPU)...
   (This may take a moment as the model is loaded into memory)


Some weights of the model checkpoint at C:\Users\Fatima\AppData\Roaming\camel_tools\data\disambig_bert_unfactored\msa were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


✔️ Disambiguator loaded successfully.

[4/5] Applying D3Tok analysis to 14343 text records from the 'Line' column...


Analyzing Text with D3Tok:   0%|          | 0/14343 [00:00<?, ?it/s]

✔️ D3Tok analysis complete.

[5/5] Saving processed data to CSV file...
✔️  Successfully saved the file to: D:\arabic_readability_project\data\samer\samer_d3tok_processed.csv

--- ✅ Script Finished Successfully ---

Preview of the first 5 rows of the processed data:
   ID  label d3tok_text  Novel  Chapter  \
0   1      0                 0      2.0   
1   2      0                 0      2.0   
2   3      0                 0      2.0   
3   4      0                 0      2.0   
4   5      0                 0      2.0   

                                                  L3  \
0                                       الفصل الثاني   
1                                       «وكان صباح..   
2                                        يوما واحدا»   
3  قضى فتانا إبراهيم — وهذا اسمه — ليلة هادئة عمي...   
4                     ينحدر على أحد جانبيه نهر هائج،   

                                                  L4  \
0                                       الفصل الثاني   
1                        