In [1]:
import pandas as pd

from data_pipeline import ReviewDataPipeline, create_sample_pipeline
from utils.helpers.summary import validate_review_data, get_data_summary
from utils.helpers.sampling import print_sampling_summary

## Data Pipeline

In [2]:
# Create pipeline instance
pipeline = create_sample_pipeline()

In [3]:
# Example: Process multiple categories
data_config = {
    "ebooks": "data/Digital_Ebook.tsv",
    "music": "data/Digital_Music.tsv", 
    "software": "data/Digital_Software.tsv",
    "games": "data/Digital_Video_Games.tsv",
    "videos": "data/Digital_Video.tsv"
}
processed_data = pipeline.process_all_categories(data_config)


=== Processing ebooks ===
Loaded 5101693 reviews from data/Digital_Ebook.tsv
  - 70 reviews are very short (< 10 characters)
  - 94 reviews have missing dates
Found 1503561 reviews from 2012
Sampled 400 reviews for ebooks
Saved processed data to processed_data/ebooks.csv

=== Processing music ===
Loaded 1688884 reviews from data/Digital_Music.tsv
  - 114 reviews have empty review_body
  - 85531 reviews are very short (< 10 characters)
  - 43 reviews have missing dates
Found 201359 reviews from 2012
Sampled 400 reviews for music
Saved processed data to processed_data/music.csv

=== Processing software ===
Loaded 102084 reviews from data/Digital_Software.tsv
  - 5 reviews have empty review_body
  - 2098 reviews are very short (< 10 characters)
  - 2 reviews have missing dates
Found 5602 reviews from 2012
Sampled 400 reviews for software
Saved processed data to processed_data/software.csv

=== Processing games ===
Loaded 145431 reviews from data/Digital_Video_Games.tsv
  - 3 reviews have

In [4]:
merged_df = pipeline.merge_all_data(processed_data, "final_dataset2.csv")


=== Merging all data ===
Merged 2000 total reviews from 5 categories
Saved merged data to final_dataset2.csv


## Dataset Summary

In [5]:
# Load your existing data
df = pd.read_csv("final_dataset2.csv")
print(f"  Loaded {len(df):,} reviews")
df

  Loaded 2,000 reviews


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,review_length,length_bin,month,category
0,US,45450659,R2X2VCD6881DZG,B0079W9WZ6,318707260,And the Moonbeams Kissed the Sea,Digital_Video_Download,5,31,33,False,False,Cerebral crime procedural featuring lots of li...,Note: Amazon first made Series 3&4 available v...,2012-04-08,855,extra_long,2012-04,videos
1,US,10366214,R3DXVCE0HEUVLC,B0054W4JQK,465996227,Before I Go To Sleep,Digital_Ebook_Purchase,4,0,1,False,True,Surprising End,"The start of the book was interesting, then ha...",2012-05-26,38,short,2012-05,ebooks
2,US,15355503,RQZMZFDSAGADS,B00822X7OY,810428534,Square Enix Ultimate Collection [Download],Digital_Video_Games,5,1,3,False,True,$7.49 for 5 games? Amazo.com is THE BEST!,this is the Biggest deal on amazon.com about g...,2012-05-19,61,medium,2012-05,games
3,US,15526727,RECG1YZ46MM0V,B005S4Y13K,70285996,TurboTax Deluxe Federal + E-file + State 2011 ...,Digital_Software,4,0,0,False,True,No issues with download/install,Just downloaded and installed and it took 5 mi...,2012-01-27,43,short,2012-01,software
4,US,38097415,R1SKCVKCY3F8CD,B0064TYRAU,771851250,Mass Effect 3,Digital_Video_Games,1,11,17,False,False,please do not support,"This game is, without a doubt, absolutely bril...",2012-03-17,154,medium,2012-03,games
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,US,43498454,RUBQWETGCIWFV,B0068TJ7OC,740565112,H&R Block At Home 2011 Deluxe + State,Digital_Software,4,2,2,False,True,Great tax prep software,I switched to TurboTax last year and switched ...,2012-02-04,49,short,2012-02,software
1996,US,46785098,R304SNPYUR2SE5,B007977HM6,151163064,LA Noire,Digital_Video_Games,2,0,0,False,True,"Good game, HORRIBLE DRM!!!",I was very excited to get this game on PC beca...,2012-05-08,201,long,2012-05,games
1997,US,24803828,R1YCE7ULKKTH4X,B0068TJ7OC,740565112,H&R Block At Home 2011 Deluxe + State,Digital_Software,1,0,0,False,True,"Good program, bad pricing structure",I used H&R Block at Home Deluxe + State for my...,2012-04-29,162,medium,2012-04,software
1998,US,50994027,R2M5ELQIUI5F4D,B006ULENFG,445862452,Crusader Kings II [Download],Digital_Video_Games,5,1,2,False,True,Top Notch Strategy Game,Crusader Kings II is a game fit for kings! Or...,2012-05-15,201,long,2012-05,games


In [6]:
# Validate the data
validation_results = validate_review_data(df)
if validation_results['warnings']:
    print("  ❌ Warnings:")
    for warning in validation_results['warnings']:
        print(f"     - {warning}")
else:
    print("  ✅ No data quality issues found")

  ✅ No data quality issues found


In [7]:
# Get data summary
summary = get_data_summary(df)
summary

{'total_reviews': 2000,
 'date_range': {'earliest': '2012-01-01', 'latest': '2012-12-31'},
 'rating_distribution': {1: 406, 2: 387, 3: 395, 4: 396, 5: 416},
 'categories': {'Digital_Video_Download': 400,
  'Digital_Ebook_Purchase': 400,
  'Digital_Video_Games': 400,
  'Digital_Software': 400,
  'Digital_Music_Purchase': 400},
 'verified_purchase_rate': 0.572}

In [8]:
print(f"   Total reviews: {summary['total_reviews']:,}")
print(f"   Date range: {summary['date_range']['earliest']} to {summary['date_range']['latest']}")
print(f"   Verified purchase rate: {summary['verified_purchase_rate']:.1%}")

print("\n   Rating distribution:")
for rating, count in summary['rating_distribution'].items():
    print(f"     {rating} stars: {count:,}")

   Total reviews: 2,000
   Date range: 2012-01-01 to 2012-12-31
   Verified purchase rate: 57.2%

   Rating distribution:
     1 stars: 406
     2 stars: 387
     3 stars: 395
     4 stars: 396
     5 stars: 416


In [9]:
print_sampling_summary(df)

Total samples: 2000
Length bin distribution:
  short: 528
  medium: 517
  long: 502
  extra_long: 453
Month distribution:
  2012-01: 195
  2012-02: 199
  2012-03: 195
  2012-04: 196
  2012-05: 197
  2012-06: 190
  2012-07: 191
  2012-08: 193
  2012-09: 117
  2012-10: 119
  2012-11: 104
  2012-12: 104
Rating distribution:
  1: 406
  2: 387
  3: 395
  4: 396
  5: 416
