In [None]:
# Install the required packages
%pip install datasets pandas
from datasets import load_dataset
import re
import psutil
import sys
import time
import pandas as pd

In [None]:
# Load the dataset; later cells index the result by split name (ds['train'])
DATASET_ID = "jordiclive/wikipedia-summary-dataset"
ds = load_dataset(DATASET_ID)

In [None]:
# Normalise every string field of a record: flatten whitespace, strip symbols, lowercase
def clean_text(example):
    """Clean all string-valued fields of ``example`` in place and return it.

    For each value that is a ``str``: every newline or tab is replaced by a
    space, every character other than ASCII letters, digits and whitespace is
    removed, and the result is lowercased. Non-string values are left as-is.

    Args:
        example: Mapping of field name to value (one dataset row).

    Returns:
        The same ``example`` object, with its string fields rewritten.
    """
    string_fields = [name for name, content in example.items() if isinstance(content, str)]
    for name in string_fields:
        flattened = example[name].replace('\n', ' ').replace('\t', ' ')
        stripped = re.sub(r'[^a-zA-Z0-9\s]', '', flattened)
        example[name] = stripped.lower()
    return example


In [None]:
# Drop rows without a usable 'full_text', normalise the text fields, return a DataFrame
def preprocess_data(ds):
    """Filter, clean and convert ``ds`` to a pandas DataFrame.

    Rows whose 'full_text' is None or NaN are discarded, ``clean_text`` is
    applied to every remaining row, and the result of ``to_pandas()`` is
    returned.

    Args:
        ds: Dataset-like object exposing ``filter``, ``map`` and ``to_pandas``.

    Returns:
        The value returned by ``to_pandas()`` on the filtered, cleaned data.
    """
    def has_full_text(row):
        text = row['full_text']
        # A NaN value compares unequal to itself, so the self-comparison rejects it
        return text is not None and text == text

    kept = ds.filter(has_full_text)
    cleaned = kept.map(clean_text)
    return cleaned.to_pandas()

In [None]:
# Estimate the in-memory size of the dataset by extrapolating from the first row
def _deep_sizeof(obj, _seen=None):
    """Return the approximate size in bytes of ``obj`` plus everything it contains.

    ``sys.getsizeof`` is shallow: for a dict it reports only the container, not
    the keys and values it refers to. This walks dicts, lists, tuples and sets
    and adds up the sizes of their contents, counting each object only once.
    """
    if _seen is None:
        _seen = set()
    if id(obj) in _seen:
        return 0
    _seen.add(id(obj))

    size = sys.getsizeof(obj)
    if isinstance(obj, dict):
        size += sum(_deep_sizeof(k, _seen) + _deep_sizeof(v, _seen) for k, v in obj.items())
    elif isinstance(obj, (list, tuple, set, frozenset)):
        size += sum(_deep_sizeof(item, _seen) for item in obj)
    return size


total_rows = ds['train'].num_rows
# An empty split has no row 0 to index; fall back to an empty record
first_row = ds['train'][0] if total_rows else {}
# NOTE(review): a single row is only a rough proxy for the average row size;
# consider averaging over a sample of rows if the estimate needs to be tighter.
estimated_row_size = _deep_sizeof(first_row)

# Find available memory to either load the entire dataset or a subset
available_memory = psutil.virtual_memory().available
estimated_memory_required = estimated_row_size * total_rows
available_on_disk_memory = psutil.disk_usage('/').free
MEM_THRESHOLD = 1024**3 // 2  # 0.5 GB, threshold for loading by chunks

print(f"Estimated memory required: {estimated_memory_required} bytes ~ {estimated_memory_required / 1024**3} GB")
print(f"Available RAM: {available_memory} bytes ~ {available_memory / 1024**3} GB")
print(f"Available on-disk memory: {available_on_disk_memory} bytes ~ {available_on_disk_memory / 1024**3} GB")

if estimated_memory_required >= available_on_disk_memory:
    print("Estimated memory required exceeds available on-disk memory. Exiting.")
    # Non-zero status so a scripted run of this notebook reports the failure
    sys.exit(1)

# Chunk when the estimate reaches either the free RAM or the fixed 0.5 GB threshold
load_by_chunks = estimated_memory_required >= min(available_memory, MEM_THRESHOLD)
if load_by_chunks:
    print("Estimated memory required exceeds available memory. Loading by chunks...")

In [None]:
# Preprocess the 'train' split and save it to a CSV file, in chunks when required
CHUNK_SIZE = 100000 # 100k rows/chunk
save_file = 'processed.csv'

print(f'Total rows: {total_rows}')

start_time = time.perf_counter()
if load_by_chunks:
    # Ceiling division: when total_rows is an exact multiple of CHUNK_SIZE,
    # `total_rows // CHUNK_SIZE + 1` would announce one chunk too many.
    total_chunks = (total_rows + CHUNK_SIZE - 1) // CHUNK_SIZE

    # Process the dataset in chunks
    for i in range(0, total_rows, CHUNK_SIZE):
        print(f'Processing chunk {i // CHUNK_SIZE + 1} of {total_chunks}...')
        chunk = ds['train'].select(range(i, min(i + CHUNK_SIZE, total_rows)))

        df = preprocess_data(chunk)

        # The first chunk (re)creates the file and writes the header;
        # every later chunk is appended without a header.
        is_first_chunk = i == 0
        df.to_csv(
            save_file,
            index=False,
            mode='w' if is_first_chunk else 'a',
            header=is_first_chunk,
        )
else:
    # Process the entire dataset at once
    df = preprocess_data(ds['train'])
    df.to_csv(save_file, index=False)

# Read the clock once so the seconds and minutes figures describe the same instant
elapsed_seconds = time.perf_counter() - start_time
print(f'Finished processing in {elapsed_seconds} seconds ~ {elapsed_seconds / 60} minutes')