In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from spellchecker import SpellChecker
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import difflib
import re
import torch
import textwrap
from datasets import load_dataset, Dataset

In [None]:
# Input location: name of the source CSV and the directory that holds it.
# The next cell joins them as folder + '/' + file.
file = 'Books_cleaned.csv'
folder = './Data_processed'

In [None]:
# Read the source CSV through the `datasets` CSV loader.
# Later cells reach the rows through dataset['train'].
dataset = load_dataset("csv", data_files=folder + '/' + file)


In [None]:
# --- 1. Define the Helper Functions for text cleaning ---
def pre_clean_for_model(text: str) -> str:
    """
    Scrub raw OCR text down to a plain, single-spaced string for the model.

    Newlines and '<NEWPAGE>' markers become spaces, every character other
    than ASCII letters, digits, whitespace and . , ! ? - is deleted, and
    whitespace runs are collapsed to one space with the ends trimmed.

    Args:
        text: The raw text. Any non-string value is accepted.

    Returns:
        The scrubbed string, or "" when `text` is not a string.
    """
    if isinstance(text, str):
        flattened = text
        # Turn line breaks and page markers into plain separators first, so
        # the marker's angle brackets are not stripped piecemeal below.
        for marker in ('\n', '<NEWPAGE>'):
            flattened = flattened.replace(marker, ' ')
        scrubbed = re.sub(r'[^A-Za-z0-9\s.,!?-]', '', flattened)
        return re.sub(r'\s+', ' ', scrubbed).strip()
    return ""

# --- 2. Recoverable Mapping Function ---
def apply_cleaning_in_true_batch(examples, indices, model_pipeline, chunk_size, overlap):
    """
    Restore a batch of texts with a text-generation pipeline, chunking long texts.

    Written for `Dataset.map(batched=True, with_indices=True)`. Each text is
    pre-cleaned, split into overlapping word windows, and all windows of the
    batch are sent to the model in one call. The restored windows are then
    stitched back together per input row.

    Args:
        examples: Batch dict; `examples['Full_text']` is the list of raw texts.
        indices: Row indices of the batch (only used for progress/log messages).
        model_pipeline: Callable invoked as
            `model_pipeline(chunks, max_length=512, batch_size=4)` that returns
            one dict with a 'generated_text' key per chunk.
        chunk_size: Number of words per window; must be positive.
        overlap: Number of words shared by consecutive windows; must satisfy
            `0 <= overlap < chunk_size`.

    Returns:
        `{'Cleaned_text': [...]}` with one string per input row. Rows with no
        usable text map to "". If the model call hits a CUDA out-of-memory
        error, every row of the batch is set to the placeholder "__FAILED__".

    Raises:
        ValueError: If `chunk_size`/`overlap` would keep the window from advancing.
    """
    # With overlap >= chunk_size the window step is <= 0 and the chunking loop
    # would never terminate; a negative overlap would skip words entirely.
    if chunk_size <= 0 or not 0 <= overlap < chunk_size:
        raise ValueError(
            f"Expected 0 <= overlap < chunk_size, got chunk_size={chunk_size}, overlap={overlap}"
        )

    batch_start = indices[0] if indices else "N/A"
    print(f"Processing batch starting with index: {batch_start}") # Progress indicator
    pre_cleaned_texts = [pre_clean_for_model(text) for text in examples['Full_text']]
    all_chunks, chunks_per_review = [], []
    step = chunk_size - overlap

    for text in pre_cleaned_texts:
        words = text.split()
        if not words:
            chunks_per_review.append(0)
            continue
        review_chunks = []
        start = 0
        while True:
            end = start + chunk_size
            review_chunks.append(" ".join(words[start:end]))
            # Stop as soon as a window reaches the end of the text. Advancing
            # once more would emit a chunk made only of overlap words: a wasted
            # model call whose output could leak duplicated text into the result.
            if end >= len(words):
                break
            start += step
        all_chunks.extend(review_chunks)
        chunks_per_review.append(len(review_chunks))

    if not all_chunks:
        return {'Cleaned_text': ['' for _ in examples['Full_text']]}

    try:
        restored_chunks_results = model_pipeline(all_chunks, max_length=512, batch_size=4)
        restored_chunks_text = [result['generated_text'] for result in restored_chunks_results]
    except torch.cuda.OutOfMemoryError:
        print(f"\n---!!! WARNING: CUDA OutOfMemoryError caught on batch starting at index {batch_start}. Marking as failed. !!!---\n")
        return {'Cleaned_text': ["__FAILED__" for _ in examples['Full_text']]}
    finally:
        # Release cached GPU memory after every batch, successful or not.
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    final_cleaned_texts = []
    chunk_idx_start = 0
    for num_chunks in chunks_per_review:
        if num_chunks == 0:
            final_cleaned_texts.append("")
            continue

        review_restored_chunks = restored_chunks_text[chunk_idx_start : chunk_idx_start + num_chunks]
        full_text_words = []
        for i, chunk_text in enumerate(review_restored_chunks):
            chunk_words = chunk_text.split()
            if i == 0:
                full_text_words.extend(chunk_words)
            else:
                # Heuristic stitch: drop the first `overlap` words of each later
                # chunk. This lines up exactly only if the model kept the word
                # count of the overlapping region unchanged.
                full_text_words.extend(chunk_words[overlap:])
        final_cleaned_texts.append(" ".join(full_text_words))
        chunk_idx_start += num_chunks

    return {'Cleaned_text': final_cleaned_texts}

# --- 3. Load Model and Apply the Mapping ---
print("Loading the text restoration model...")
# Grammar-correction T5 checkpoint wrapped in a text2text pipeline.
# device_map="auto" leaves device placement to the library.
restorer = pipeline(
    task="text2text-generation",
    model="pszemraj/flan-t5-large-grammar-synthesis",
    device_map="auto",
)
print("Model loaded successfully.")

# --- 4. Run the Main Pipeline ---
print("\n--- Applying the recoverable cleaning pipeline ---")
# Rows are handed to the cleaning function 16 at a time together with their
# indices; long texts are windowed into 300-word chunks sharing 50 words.
dataset = dataset.map(
    function=apply_cleaning_in_true_batch,
    with_indices=True,
    batched=True,
    batch_size=16,
    fn_kwargs=dict(model_pipeline=restorer, chunk_size=300, overlap=50),
)
print("Initial processing complete.")



In [None]:

# --- 5. Save the Intermediate Results ---
# Write the 'train' split to disk as-is, including any rows whose
# 'Cleaned_text' is the "__FAILED__" placeholder, so the recovery cell can
# reload it from this file.
print("\n--- Saving intermediate results (with any failures) ---")
dataset['train'].to_csv("cleaned_reviews_with_failures.csv", index=False)
print("File saved.")



In [None]:

# --- 6. THE RECOVERY PROCESS ---

print("\n--- Starting Recovery Process ---")
# Load the dataset that contains the '__FAILED__' markers
recovery_dataset = load_dataset("csv", data_files="cleaned_reviews_with_failures.csv")

# Filter to get only the rows that failed
failed_examples = recovery_dataset['train'].filter(
    lambda example: example['Cleaned_text'] == "__FAILED__"
)

# The final file is derived from the intermediate one in BOTH branches below,
# so 'cleaned_reviews_final.csv' (loaded by the next cell) also exists when
# the initial run had no failures.
main_df = pd.read_csv("cleaned_reviews_with_failures.csv")

if len(failed_examples) > 0:
    print(f"Found {len(failed_examples)} failed examples to re-process.")

    # Re-run the pipeline on the small, failed dataset with a smaller batch
    # and smaller chunks to lower peak GPU memory.
    reprocessed_failures = failed_examples.map(
        apply_cleaning_in_true_batch,
        batched=True,
        with_indices=True,
        batch_size=4, # Smaller batch size for safety
        fn_kwargs={'model_pipeline': restorer, 'chunk_size': 200, 'overlap': 30}
    )

    print("Re-processing complete. Now merging results.")
    reprocessed_df = reprocessed_failures.to_pandas()

    # Map original text -> re-processed text. Only string keys are usable:
    # a missing 'Full_text' is NaN/None and never matches on lookup.
    update_map = {
        original: cleaned
        for original, cleaned in zip(reprocessed_df['Full_text'], reprocessed_df['Cleaned_text'])
        if isinstance(original, str)
    }

    # Only touch rows that are actually marked as failed.
    failed_mask = main_df['Cleaned_text'] == "__FAILED__"
    main_df.loc[failed_mask, 'Cleaned_text'] = main_df.loc[failed_mask, 'Full_text'].map(
        # Non-string text cleans to "" (same result pre_clean_for_model gives);
        # a string without a re-processed result keeps its failure marker.
        lambda text: update_map.get(text, "__FAILED__") if isinstance(text, str) else ""
    )

    # Surface rows that ran out of memory a second time instead of merging
    # them silently into the "final" file.
    still_failed = int((main_df['Cleaned_text'] == "__FAILED__").sum())
    if still_failed:
        print(f"WARNING: {still_failed} rows are still marked '__FAILED__' after re-processing.")

else:
    print("No failed examples found. Your initial run was successful!")

main_df.to_csv("cleaned_reviews_final.csv", index=False)
print("Final, fully cleaned file saved as 'cleaned_reviews_final.csv'")


In [None]:
# --- 1. Load the final cleaned CSV as a `datasets` object ---
# (not a DataFrame: rows are reached below through dataset['train'])
dataset = load_dataset("csv", data_files="cleaned_reviews_final.csv")

# --- 2. Define the SequenceMatcher comparison function ---
def calculate_similarity(text_a: str, text_b: str) -> float:
    """
    Character-level similarity ratio between two strings.

    Args:
        text_a: First string.
        text_b: Second string.

    Returns:
        A float between 0.0 (nothing in common) and 1.0 (identical), as
        computed by `difflib.SequenceMatcher.ratio()`. Returns 0.0 when either
        argument is not a string (e.g. a missing value read back from CSV).
    """
    # Ensure inputs are strings to avoid errors
    if not isinstance(text_a, str) or not isinstance(text_b, str):
        return 0.0

    # autojunk must be off for character-level comparison. With the default
    # (True), any character making up more than 1% of a 200+ character string
    # is dropped from matching. In ordinary prose that is nearly every letter,
    # so scores are badly deflated and identical texts can even score 0.0.
    return difflib.SequenceMatcher(None, text_a, text_b, autojunk=False).ratio()

# --- 3. Define the Mapping Function for Batch Processing ---
def add_similarity_in_batch(examples):
    """
    Batched `.map` callback that scores each row's restored text.

    Args:
        examples: Batch dict whose 'Full_text' and 'Cleaned_text' entries are
            parallel lists.

    Returns:
        `{'similarity_score': [...]}` with one score per row, comparing the
        pre-cleaned original against the cleaned text.
    """
    scores = []
    for raw_text, restored_text in zip(examples['Full_text'], examples['Cleaned_text']):
        # Compare against the pre-cleaned original so the score is computed on
        # the scrubbed form of the text rather than the raw OCR input.
        baseline = pre_clean_for_model(raw_text)
        scores.append(calculate_similarity(baseline, restored_text))

    # New columns must be returned as a list, one entry per row of the batch.
    return {'similarity_score': scores}


# --- 4. Apply the Mapping ---
print("--- Applying the similarity calculation using batching ---")

# Scoring runs on the CPU, so a large batch (500 rows) is fine here.
dataset = dataset.map(function=add_similarity_in_batch, batched=True, batch_size=500)

print("\n--- Dataset after adding the 'similarity_score' column ---")
print(dataset['train'].to_pandas())


In [None]:
# Distribution of similarity scores over all rows, in 50 bins.
plt.hist(dataset['train']['similarity_score'], bins=50, color='skyblue', edgecolor='black')
# Label the figure so it can be read on its own when the notebook is skimmed.
plt.title("Similarity between pre-cleaned and restored text")
plt.xlabel("similarity_score")
plt.ylabel("Number of reviews")
# Render the figure explicitly instead of echoing the return value of hist().
plt.show()

In [None]:
# Convert the 'train' split to a DataFrame ordered by similarity score.
# sort_values is ascending by default, so the lowest scores come first.
df_sorted = dataset['train'].to_pandas().sort_values(by='similarity_score')

In [None]:
# Keep only the rows whose similarity score falls under the 0.01 cut-off,
# i.e. the texts the model changed the most.
suspicious_reviews = df_sorted.loc[df_sorted['similarity_score'].lt(0.01)]

print(f"Found {suspicious_reviews.shape[0]} reviews with a similarity score below 0.01 to inspect.")

In [None]:
# --- The Side-by-Side Pretty Print Function ---
def pretty_print_side_by_side(review_series, total_width=120):
    """
    Prints a side-by-side comparison of the pre-cleaned original text and
    the final T5-restored text for a single review.

    Args:
        review_series (pd.Series): A single row with 'Full_text' and
            'Cleaned_text' entries, and optionally 'similarity_score'.
        total_width (int): The total width of the output in characters.

    Returns:
        None. The comparison is written to standard output.
    """
    # Calculate the width for each text column
    divider = "   |   "
    col_width = (total_width - len(divider)) // 2

    # Get the two versions of the text
    original_pre_cleaned = pre_clean_for_model(review_series['Full_text'])
    t5_cleaned = review_series['Cleaned_text']
    # An empty cleaned text comes back from CSV as NaN (a float), which
    # textwrap.wrap cannot handle. Those rows score 0.0, so they are the ones
    # most likely to be inspected here; show them as an empty column.
    if not isinstance(t5_cleaned, str):
        t5_cleaned = ""

    # Wrap the text in each column into lists of lines
    original_lines = textwrap.wrap(original_pre_cleaned, width=col_width)
    cleaned_lines = textwrap.wrap(t5_cleaned, width=col_width)

    # --- Start Printing ---
    print("=" * total_width)
    # Use .get() to avoid an error if 'similarity_score' doesn't exist
    score = review_series.get('similarity_score', 'N/A')
    if isinstance(score, float):
        score = f"{score:.4f}"
    print(f"Similarity Score: {score}".center(total_width))
    print("-" * total_width)

    # Print headers
    header_original = "Pre-Cleaned Original".center(col_width)
    header_cleaned = "T5 Restored Text".center(col_width)
    print(f"{header_original}{divider}{header_cleaned}")
    print("-" * total_width)

    # Print the lines side-by-side
    max_lines = max(len(original_lines), len(cleaned_lines))
    for i in range(max_lines):
        # Get the line for each side, or an empty string if one side is shorter
        left_line = original_lines[i] if i < len(original_lines) else ""
        right_line = cleaned_lines[i] if i < len(cleaned_lines) else ""

        # Print the formatted line with padding
        print(f"{left_line:<{col_width}}{divider}{right_line:<{col_width}}")

    print("=" * total_width + "\n")

In [None]:
print("--- This is the dataset we want to save ---")
print(dataset)
print(f"\n{'=' * 50}\n")


# Destination of the exported CSV.
output_file_path = "cleaned_reviews.csv"


# Export through pandas: take the 'train' split, turn it into a DataFrame,
# and write it as UTF-8 without the index column.
print("--- Saving using the recommended Pandas method ---")
try:
    train_split = dataset['train']
    df = train_split.to_pandas()
    df.to_csv(path_or_buf=output_file_path, encoding='utf-8', index=False)

    print(f"Successfully saved the 'train' split to '{output_file_path}'.")
    print("You can now open this file to begin your manual labelling.")

except Exception as err:
    # Report the failure and carry on so the rest of the notebook still runs.
    print(f"An error occurred with the Pandas method: {err}")

print(f"\n{'=' * 50}\n")

In [None]:
# Show the row at position 1 (the second row) of the low-similarity subset.
# .iloc[1] raises IndexError when the subset has fewer than two rows.
pretty_print_side_by_side(suspicious_reviews.iloc[1])