In [5]:
import pandas as pd

# 1. Define the file paths for your input and output files
#    Using raw strings (r'...') is recommended for Windows paths.
dares_file_path = r'D:\arabic_readability_project\data\dares\dares_d3tok_processed_FULL.csv'
train_file_path = r'D:\arabic_readability_project\data\strict\prepros\train_preprocessed.csv'
output_file_path = r'D:\arabic_readability_project\data\strict\prepros\merged_data.csv'

# 2. Load both CSV files into pandas DataFrames
#    On a missing file the error is reported and then re-raised. Re-raising is
#    used instead of exit() because, inside a Jupyter kernel, exit() requests a
#    kernel shutdown rather than simply stopping this cell.
try:
    df_dares = pd.read_csv(dares_file_path)
    df_train = pd.read_csv(train_file_path)
    print("✅ Files loaded successfully.")
except FileNotFoundError as e:
    print(f"❌ Error: File not found. Please check the path: {e}")
    raise


# 3. Prepare the first DataFrame (dares)
#    - Select only the 'd3tok_text' and 'label' columns.
#    - Rename 'd3tok_text' to 'text' to match the second DataFrame.
df_dares_subset = df_dares[['d3tok_text', 'label']].rename(columns={'d3tok_text': 'text'})
print("Processed the first DataFrame (dares).")
print(f"Columns selected and renamed: {df_dares_subset.columns.tolist()}")


# 4. Prepare the second DataFrame (train)
#    - Select the 'text' and 'label' columns to ensure the correct order.
df_train_subset = df_train[['text', 'label']]
print("Processed the second DataFrame (train).")
print(f"Columns selected: {df_train_subset.columns.tolist()}")


# 5. Concatenate (merge) the two DataFrames one under the other
#    The data from df_dares_subset will be appended to the end of df_train_subset.
#    ignore_index=True resets the index of the new DataFrame.
merged_df = pd.concat([df_train_subset, df_dares_subset], ignore_index=True)
print("\nDataFrames merged successfully.")
print(f"Total rows in new DataFrame: {len(merged_df)}")
#    Because the train rows come first, the DARES rows begin at row index
#    len(df_train_subset). Print it so the boundary does not have to be guessed
#    when the merged file is split again later.
print(f"DARES rows start at row index: {len(df_train_subset)}")


# 6. Save the final merged DataFrame to a new CSV file
#    - index=False prevents pandas from writing the DataFrame index as a column.
#    - encoding='utf-8-sig' is important for compatibility with Arabic text, especially in Excel.
merged_df.to_csv(output_file_path, index=False, encoding='utf-8-sig')

print(f"\n🎉 Success! Merged file has been saved to:\n{output_file_path}")

✅ Files loaded successfully.
Processed the first DataFrame (dares).
Columns selected and renamed: ['text', 'label']
Processed the second DataFrame (train).
Columns selected: ['text', 'label']

DataFrames merged successfully.
Total rows in new DataFrame: 68715

🎉 Success! Merged file has been saved to:
D:\arabic_readability_project\data\strict\prepros\merged_data.csv


# Split DARES between train and dev so the dev set also contains DARES samples

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

# --- Configuration ---
# All inputs and outputs of the split live in the same preprocessing folder,
# so the folder is written once and each path is derived from it.
_PREPROS_DIR = 'D:/arabic_readability_project/data/strict/prepros'

# Inputs: the merged data file (BAREC + DARES) and the original, untouched
# development file (BAREC only).
MERGED_DATA_PATH = f'{_PREPROS_DIR}/merged_data.csv'
ORIGINAL_DEV_PATH = f'{_PREPROS_DIR}/dev_preprocessed.csv'

# Outputs: the new, augmented training and development files.
NEW_TRAIN_CSV_PATH = f'{_PREPROS_DIR}/train_augmented.csv'
NEW_DEV_CSV_PATH = f'{_PREPROS_DIR}/dev_augmented.csv'

# Row index in the merged file at which the DARES dataset begins.
DARES_START_INDEX = 54847

# Column used to stratify the split.
STRATIFY_COLUMN = 'label'

# Proportion of the DARES data that is moved to the development set.
DARES_DEV_SET_SIZE = 0.15

# Seed that makes the split reproducible.
RANDOM_STATE = 42

# --- Main Splitting Logic ---
def create_augmented_split(merged_path, original_dev_path, new_train_path, new_dev_path, dares_start_index, stratify_col, test_size, random_state):
    """
    Move a stratified sample of the DARES rows from the merged file to the dev set.

    The merged file is split by position into a leading BAREC part and a trailing
    DARES part. A stratified fraction of the DARES part is appended to the original
    dev set; the remaining DARES rows stay with the BAREC part as the new training
    set. Both results are written as CSV files.

    Args:
        merged_path (str): Path to the merged (BAREC + DARES) data file.
        original_dev_path (str): Path to the original development data file.
        new_train_path (str): Path to save the new, augmented training data CSV.
        new_dev_path (str): Path to save the new, augmented development data CSV.
        dares_start_index (int): Zero-based row position where the DARES data
            starts. Must be non-negative.
        stratify_col (str): The name of the column to stratify the split by.
        test_size (float): The proportion of the DARES dataset to move to the dev set.
        random_state (int): Seed for the random number generator for reproducibility.

    Returns:
        None. Progress and failures are reported with print(); on any failure the
        function returns early (a save failure may leave the training file already
        written).
    """
    print("--- Starting Data Augmentation Process ---")

    # --- Load Data ---
    try:
        print(f"Loading merged data from '{merged_path}'...")
        merged_df = pd.read_csv(merged_path)
        print(f"Successfully loaded {len(merged_df)} total records.")

        print(f"Loading original development data from '{original_dev_path}'...")
        original_dev_df = pd.read_csv(original_dev_path)
        print(f"Successfully loaded {len(original_dev_df)} original dev records.")
    except FileNotFoundError as e:
        print(f"Error: Could not find a required data file. {e}")
        return
    except Exception as e:
        print(f"Error loading CSV files: {e}")
        return

    # --- Validate inputs before partitioning ---
    # A negative start index cannot describe "where DARES begins" and would
    # silently produce a wrong partition, so reject it explicitly.
    if dares_start_index < 0:
        print(f"Error: dares_start_index must be non-negative, got {dares_start_index}.")
        return

    # pd.concat aligns on column names and fills missing columns with NaN, so a
    # schema mismatch would otherwise go unnoticed in the saved dev file.
    if set(original_dev_df.columns) != set(merged_df.columns):
        print(
            "Warning: the original dev file and the merged file do not have the same columns "
            f"({original_dev_df.columns.tolist()} vs {merged_df.columns.tolist()}); "
            "missing values will be filled with NaN."
        )

    # --- Separate BAREC and DARES data from the merged file ---
    # Positional slicing: rows before dares_start_index are BAREC, the rest DARES.
    barec_train_df = merged_df.iloc[:dares_start_index]
    dares_df = merged_df.iloc[dares_start_index:]
    print(f"\nIdentified {len(barec_train_df)} BAREC records and {len(dares_df)} DARES records in the merged file.")

    # --- Stratified Split on DARES data ---
    print(f"\nPerforming stratified split on the DARES data to select {test_size*100}% for the dev set...")

    if dares_df.empty:
        print("Warning: No DARES data to split. The dev set will not be augmented.")
        # Empty slices keep the original columns and dtypes, so the concatenations
        # below cannot be influenced by a column-less placeholder frame.
        dares_to_dev = dares_df.iloc[:0]
        dares_to_train = dares_df.iloc[:0]
    else:
        try:
            # Split whole rows directly; stratification uses the label column.
            dares_to_train, dares_to_dev = train_test_split(
                dares_df,
                test_size=test_size,
                random_state=random_state,
                stratify=dares_df[stratify_col]
            )
            print("✔ DARES data split successfully!")
            print(f"  - {len(dares_to_dev)} records will be added to the dev set.")
            print(f"  - {len(dares_to_train)} records will remain in the training set.")

        except Exception as e:
            print(f"An error occurred during the split of DARES data: {e}")
            return

    # --- Create New Augmented Datasets ---
    # The new training set is the original BAREC part + the part of DARES that was not moved.
    new_train_df = pd.concat([barec_train_df, dares_to_train], ignore_index=True)

    # The new dev set is the original BAREC dev set + the part of DARES that was moved.
    new_dev_df = pd.concat([original_dev_df, dares_to_dev], ignore_index=True)

    # --- Saving the New Files ---
    # utf-8-sig matches the encoding used when the merged file was written, which
    # keeps the Arabic text readable in tools such as Excel.
    print("\n--- Saving New Augmented Datasets ---")
    try:
        new_train_df.to_csv(new_train_path, index=False, encoding='utf-8-sig')
        print(f"New training set saved to '{new_train_path}' ({len(new_train_df)} records)")

        new_dev_df.to_csv(new_dev_path, index=False, encoding='utf-8-sig')
        print(f"New development set saved to '{new_dev_path}' ({len(new_dev_df)} records)")
    except Exception as e:
        print(f"Error saving files: {e}")
        return

    print("\n--- Script Finished ---")


# --- Run the script ---
# Gather the configured values into one keyword mapping, then hand them to the
# splitting function in a single call.
if __name__ == "__main__":
    split_settings = {
        "merged_path": MERGED_DATA_PATH,
        "original_dev_path": ORIGINAL_DEV_PATH,
        "new_train_path": NEW_TRAIN_CSV_PATH,
        "new_dev_path": NEW_DEV_CSV_PATH,
        "dares_start_index": DARES_START_INDEX,
        "stratify_col": STRATIFY_COLUMN,
        "test_size": DARES_DEV_SET_SIZE,
        "random_state": RANDOM_STATE,
    }
    create_augmented_split(**split_settings)


--- Starting Data Augmentation Process ---
Loading merged data from 'D:/arabic_readability_project/data/strict/prepros/merged_data.csv'...
Successfully loaded 68715 total records.
Loading original development data from 'D:/arabic_readability_project/data/strict/prepros/dev_preprocessed.csv'...
Successfully loaded 7310 original dev records.

Identified 54847 BAREC records and 13868 DARES records in the merged file.

Performing stratified split on the DARES data to select 15.0% for the dev set...
✔ DARES data split successfully!
  - 2081 records will be added to the dev set.
  - 11787 records will remain in the training set.

--- Saving New Augmented Datasets ---
New training set saved to 'D:/arabic_readability_project/data/strict/prepros/train_augmented.csv' (66634 records)
New development set saved to 'D:/arabic_readability_project/data/strict/prepros/dev_augmented.csv' (9391 records)

--- Script Finished ---
