In [1]:
import pandas as pd

from data_pipeline import ReviewDataPipeline, create_sample_pipeline
from utils.helpers.summary import validate_review_data, get_data_summary
from utils.helpers.sampling import print_sampling_summary

## Data Pipeline

In [2]:
# Build the pipeline object; its process_all_categories() and merge_all_data()
# methods drive the processing and merge cells that follow.
# NOTE(review): presumably a ReviewDataPipeline with sample defaults — confirm in data_pipeline.
pipeline = create_sample_pipeline()

In [3]:
# Category name -> raw TSV path for every category to process in this run.
# Disabled categories stay as commented-out pairs so they can be re-enabled.
data_config = dict([
    # ("ebooks", "data/ebooks.tsv"),
    # ("music", "data/music.tsv"),
    ("software", "data/software.tsv"),
    ("games", "data/games.tsv"),
    # ("videos", "data/videos.tsv"),
])

# Run the pipeline over every configured category and keep the results.
processed_data = pipeline.process_all_categories(data_config)


=== Processing software ===
Loaded 102084 reviews from data/software.tsv
  - 5 reviews have empty review_body
  - 2098 reviews are very short (< 10 characters)
  - 2 reviews have missing dates
Found 5602 reviews from 2012
Sampled 4000 reviews for software
Saved processed data to processed_data/software.csv

=== Processing games ===
Loaded 145431 reviews from data/games.tsv
  - 3 reviews have empty review_body
  - 8508 reviews are very short (< 10 characters)
  - 11 reviews have missing dates
Found 16624 reviews from 2012
Sampled 4000 reviews for games
Saved processed data to processed_data/games.csv


In [None]:
# Merge the per-category results returned by process_all_categories().
# NOTE(review): "final_dataset.csv" is presumably the output file name; which
# directory it is written to is decided inside merge_all_data — confirm it
# matches the path the Dataset Summary section reads back.
merged_df = pipeline.merge_all_data(processed_data, "final_dataset.csv")

## Dataset Summary

In [None]:
# Read the final dataset from disk into `df`, which every summary cell below uses.
# NOTE(review): this is an absolute path under /data, while the merge step is
# given the bare name "final_dataset.csv" — confirm both point at the same file,
# otherwise a fresh Restart & Run All may read a stale or missing dataset.
df = pd.read_csv("/data/final_dataset.csv")
print(f"  Loaded {len(df):,} reviews")  # row count with thousands separators

In [None]:
# Run the data-quality checks, then either confirm a clean result or list
# each warning on its own line.
validation_results = validate_review_data(df)
if not validation_results['warnings']:
    print("  ✅ No data quality issues found")
else:
    print("  ❌ Warnings:")
    for warning in validation_results['warnings']:
        print(f"     - {warning}")

In [None]:
# Compute the dataset summary. The reporting cell below reads its
# 'total_reviews', 'date_range', 'verified_purchase_rate' and
# 'rating_distribution' entries.
summary = get_data_summary(df)
summary  # bare last expression so the notebook displays the full result

In [None]:
# Headline figures: total count with thousands separators, the earliest/latest
# dates, and the verified-purchase rate rendered as a percentage with one decimal.
print(f"   Total reviews: {summary['total_reviews']:,}")
print(f"   Date range: {summary['date_range']['earliest']} to {summary['date_range']['latest']}")
print(f"   Verified purchase rate: {summary['verified_purchase_rate']:.1%}")

# One line per rating value, in whatever order the mapping yields them.
# NOTE(review): no explicit sort here — confirm get_data_summary returns the
# ratings in the intended order (e.g. 1..5) rather than by frequency.
print("\n   Rating distribution:")
for rating, count in summary['rating_distribution'].items():
    print(f"     {rating} stars: {count:,}")

In [None]:
# Report on the sampling of the loaded dataset via the project helper.
# NOTE(review): presumably prints to stdout and returns nothing — confirm in utils.helpers.sampling.
print_sampling_summary(df)