# 2. Data Cleaning & Preprocessing

Import Libraries

In [1]:
import pyarrow.dataset as ds
import pandas as pd
import re
from pathlib import Path
import os
import pyarrow.parquet as pq
import pyarrow as pa
import gc

# Cleaning in Batches
This section processes the merged dataset for a specific category (e.g., Magazine_Subscriptions) in chunks so that large files can be handled efficiently. The code loads the input Parquet file, keeps only the selected columns, and applies the cleaning steps: dropping rows with invalid or missing ratings, dropping empty reviews, extracting brand information, removing duplicates (within each chunk), and deriving new columns (review length and year). Each cleaned chunk is saved as a separate Parquet file to keep memory usage low, since the dataset includes some large files.

In [None]:
# Define paths
input_file = Path("C:/Users/zachr/Downloads/merged_Magazine_Subscriptions.parquet")
output_dir = Path("C:/Users/zachr/Magazine_Subscriptions")
output_dir.mkdir(parents=True, exist_ok=True)

# Fail fast if the merged input is missing
if not input_file.exists():
    raise FileNotFoundError(f"Input file {input_file} does not exist")

# Clear chunk files left over from a previous run so we start fresh.
# The pattern has to match the names the chunk loop writes
# ("Magazine_Subscriptions<N>.parquet"). The previous pattern
# ("cleaned__Magazine_Subscriptions*.parquet") matched none of them, so stale
# chunks from a longer earlier run would survive and be merged later.
for stale_file in output_dir.glob("Magazine_Subscriptions*.parquet"):
    os.remove(stale_file)
print(f"Cleared existing files in {output_dir}")

# Create a dataset for streaming (batches are read lazily, not all at once)
dataset = ds.dataset(input_file, format="parquet")

# Specify columns to keep; everything else is never loaded
columns_to_keep = [
    "parent_asin", "rating", "text", "details", "store", "user_id", "asin",
    "timestamp", "categories", "main_category", "helpful_vote",
    "verified_purchase", "title_y", "average_rating", "rating_number", "price",
]

# Scanner for chunked reading: at most 50,000 rows per record batch
scanner = dataset.scanner(columns=columns_to_keep, batch_size=50_000)

# Define brand extraction logic
def extract_brand(details, store):
    """Return a brand name for one review row.

    Preference order:
      1. ``details["brand"]`` when ``details`` is a dict and the value is a
         non-blank string,
      2. ``store`` when it is a non-blank string,
      3. the literal ``"Unknown"``.

    The returned value is always a stripped, non-empty string. Non-string or
    whitespace-only brand values are treated as missing instead of raising
    ``AttributeError`` or yielding an empty brand.
    """
    if isinstance(details, dict):
        brand = details.get("brand")
        if isinstance(brand, str):
            brand = brand.strip()
            if brand:
                return brand
    if isinstance(store, str):
        store = store.strip()
        if store:
            return store
    return "Unknown"

# Process in chunks
total_rows_processed = 0
chunk_index = 1

# Compiled once and reused for every review in every chunk
word_pattern = re.compile(r'\w+')

for batch in scanner.to_batches():
    # Convert batch to Pandas DataFrame
    df = batch.to_pandas()

    # Keep rows with a valid 1-5 rating and a non-empty review text.
    # .copy() makes the result an independent frame so the column
    # assignments below never write into a slice of the original batch.
    valid_rating = df["rating"].isin([1, 2, 3, 4, 5])
    has_text = df["text"].notnull() & (df["text"].str.strip() != "")
    df = df[valid_rating & has_text].copy()

    # Drop duplicates (note: only within this chunk, not across chunks)
    df = df.drop_duplicates(subset=["user_id", "asin", "text"], keep="first")

    # Nothing survived cleaning: skip instead of writing an empty file
    if df.empty:
        continue

    # Extract brand, then drop the source columns that are no longer needed.
    # A plain comprehension avoids the overhead of a row-wise DataFrame.apply.
    df["brand"] = [
        extract_brand(details, store)
        for details, store in zip(df["details"], df["store"])
    ]
    df = df.drop(columns=["details", "store"], errors='ignore')

    # Derived column: review length (number of \w+ tokens)
    df["review_length"] = df["text"].map(lambda text: len(word_pattern.findall(text)))

    # Derived column: year from timestamp (epoch milliseconds).
    # Unparseable timestamps become missing; casting to nullable Int64 keeps
    # the column's type identical in every chunk (otherwise it is an integer
    # type in clean chunks and float64 in chunks with a bad timestamp), so
    # the per-chunk Parquet schemas stay compatible when combined.
    df["year"] = (
        pd.to_datetime(df["timestamp"], unit="ms", errors="coerce")
        .dt.year
        .astype("Int64")
    )

    # Write cleaned chunk to a separate Parquet file
    output_file = output_dir / f"Magazine_Subscriptions{chunk_index}.parquet"
    table = pa.Table.from_pandas(df, preserve_index=False)
    pq.write_table(table, output_file, compression='snappy')

    total_rows_processed += len(df)
    print(f"✅ Processed and saved chunk {chunk_index}: {len(df):,} rows to {output_file}")

    # Increment chunk index (only advanced for chunks actually written)
    chunk_index += 1

    # Release the chunk before the next batch is materialised
    del df
    del table
    gc.collect()

print(f"\n Cleaning complete! {chunk_index - 1} files saved to {output_dir}")
print(f"Total rows processed: {total_rows_processed:,}")

Cleared existing files in C:\Users\zachr\Magazine_Subscriptions
✅ Processed and saved chunk 1: 19,940 rows to C:\Users\zachr\Magazine_Subscriptions\Magazine_Subscriptions1.parquet
✅ Processed and saved chunk 2: 19,916 rows to C:\Users\zachr\Magazine_Subscriptions\Magazine_Subscriptions2.parquet
✅ Processed and saved chunk 3: 19,846 rows to C:\Users\zachr\Magazine_Subscriptions\Magazine_Subscriptions3.parquet
✅ Processed and saved chunk 4: 11,224 rows to C:\Users\zachr\Magazine_Subscriptions\Magazine_Subscriptions4.parquet

 Cleaning complete! 4 files saved to C:\Users\zachr\Magazine_Subscriptions
Total rows processed: 70,926


# Combining Cleaned Parquet Files
This section combines all cleaned Parquet files for a specific category (e.g., Magazine_Subscriptions) into a single unified Parquet file. The code iterates through the chunked Parquet files, checks each file's schema against that of the first file, and appends them to a final output file using PyArrow's ParquetWriter for efficient handling. Finally, it reports the total row count and the columns of the combined dataset, confirming it is ready for downstream tasks like EDA, sentiment analysis, and clustering.

In [None]:
import pyarrow.parquet as pq
import pyarrow as pa
from pathlib import Path
import glob
import gc

# Define paths
input_dir = Path("C:/Users/zachr/Magazine_Subscriptions")
output_file = Path("C:/Users/zachr/final_cleaned_Magazine_Subscriptions.parquet")

# Ensure input directory exists
if not input_dir.exists():
    raise FileNotFoundError(f"Input directory {input_dir} does not exist")

# Get list of all cleaned Parquet files
input_files = sorted(glob.glob(str(input_dir / "Magazine_Subscriptions*.parquet")))
if not input_files:
    raise FileNotFoundError(f"No files found matching {input_dir / 'Magazine_Subscriptions*.parquet'}")

print(f"Found {len(input_files)} cleaned Parquet files to combine")

# Initialize ParquetWriter with the schema from the first file
first_file = input_files[0]
first_table = pq.read_table(first_file)
schema = first_table.schema
writer = pq.ParquetWriter(output_file, schema, compression='snappy')

total_rows_written = 0

# Iterate over each input file
for file in input_files:
    # Read the Parquet file
    table = pq.read_table(file)
    
    # Verify schema consistency
    if table.schema != schema:
        print(f"Warning: Schema mismatch in {file}. Expected {schema}, got {table.schema}")
    
    # Write the table to the output file
    writer.write_table(table)
    
    total_rows_written += table.num_rows
    print(f"Processed {file}: {table.num_rows:,} rows")
    
    # Clean up memory
    del table
    gc.collect()

# Close the ParquetWriter
writer.close()

print(f"\n🎉 Combining complete! Saved to {output_file}")
print(f"Total rows written: {total_rows_written:,}")

# Verify the final file
final_table = pq.read_table(output_file)
print(f"Final file shape: ({final_table.num_rows}, {final_table.num_columns})")
print(f"Final file columns: {final_table.column_names}")

Found 4 cleaned Parquet files to combine
Processed C:\Users\zachr\Magazine_Subscriptions\Magazine_Subscriptions1.parquet: 19,940 rows
Processed C:\Users\zachr\Magazine_Subscriptions\Magazine_Subscriptions2.parquet: 19,916 rows
Processed C:\Users\zachr\Magazine_Subscriptions\Magazine_Subscriptions3.parquet: 19,846 rows
Processed C:\Users\zachr\Magazine_Subscriptions\Magazine_Subscriptions4.parquet: 11,224 rows

🎉 Combining complete! Saved to C:\Users\zachr\final_cleaned_Magazine_Subscriptions.parquet
Total rows written: 70,926
Final file shape: (70926, 17)
Final file columns: ['parent_asin', 'rating', 'text', 'user_id', 'asin', 'timestamp', 'categories', 'main_category', 'helpful_vote', 'verified_purchase', 'title_y', 'average_rating', 'rating_number', 'price', 'brand', 'review_length', 'year']
