In [19]:
import pandas as pd
from collections import Counter

In [20]:
file = "Toys_and_Games.jsonl"

In [121]:
asin_counts = Counter()
parent_counts = Counter()

for chunk in pd.read_json(file, lines=True, chunksize=50000):
    asin_counts.update(chunk['asin'].dropna())
    parent_counts.update(chunk['parent_asin'].dropna())

# now after full pass:
# print(asin_counts.most_common(250))

In [122]:
most_common = asin_counts.most_common(250)

In [130]:
for i in range(250):
    asin, _ = most_common[i]
    print(f"{i}: https://www.amazon.com/dp/{asin}")

0: https://www.amazon.com/dp/B004S8F7QM
1: https://www.amazon.com/dp/B000YDDF6O
2: https://www.amazon.com/dp/B010TQY7A8
3: https://www.amazon.com/dp/B00005C5H4
4: https://www.amazon.com/dp/B004V3PS72
5: https://www.amazon.com/dp/B01MRG7T0D
6: https://www.amazon.com/dp/B01AC7SF7K
7: https://www.amazon.com/dp/B00ABA0ZOA
8: https://www.amazon.com/dp/B000B6ACGA
9: https://www.amazon.com/dp/B0157IHJMQ
10: https://www.amazon.com/dp/B0120XRWLE
11: https://www.amazon.com/dp/B00AU56C5W
12: https://www.amazon.com/dp/B001CJVTLC
13: https://www.amazon.com/dp/B00000IV35
14: https://www.amazon.com/dp/B00A80X19E
15: https://www.amazon.com/dp/0975277324
16: https://www.amazon.com/dp/B00000IVAK
17: https://www.amazon.com/dp/B003I64OT6
18: https://www.amazon.com/dp/B00D8STBHY
19: https://www.amazon.com/dp/B00APVXSM6
20: https://www.amazon.com/dp/B00MNG37C2
21: https://www.amazon.com/dp/8499000606
22: https://www.amazon.com/dp/B00NQQTZCO
23: https://www.amazon.com/dp/B01C5A2WJO
24: https://www.amazon.com

In [170]:
expensive_asins = ['B01C49MCCS',
                  'B004A2QTRC',
                  'B00005KBVD',
                  'B074YYVXQH',
                  'B085HNMHKJ',
                  'B0107H5FJ6',
                  'B002PEGT9U',
                  'B000CBSNRY',
                  'B07F38CPM1',
                  'B07V3CLLCV',
                  'B07VCD1SRL',
                  'B07G3ZNK4Y',
                  'B003NSBMUI',
                  'B0042F99PG',
                  'B00592BOAO']

In [171]:
for chunk in pd.read_json(file, lines=True, chunksize=50000):
    df = chunk
    break

In [190]:
import os, json
from collections import Counter, defaultdict
import pandas as pd

path = file   # <-- your JSONL path

def stream_json_objects(path):
    """Yield dict objects from a JSONL file; skip blanks, 'false', arrays, malformed, etc."""
    with open(path, "r", encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            s = line.strip()
            if not s:
                continue
            try:
                obj = json.loads(s)
            except Exception:
                # malformed line; skip
                continue
            if isinstance(obj, dict):
                yield obj  # only objects are valid rows

# 1) First pass: find top 25 ASINs (robust to bad lines)
asin_counts = Counter()
for obj in stream_json_objects(path):
    a = obj.get("asin")
    if a:
        asin_counts[a] += 1

retro_top_25 = [a for a, _ in asin_counts.most_common(25)]
top25 = expensive_asins + retro_top_25#[a for a, _ in asin_counts.most_common(25)]
top25_set = set(top25)#set(top25)

print("Top 25 ASINs:", top25)

# 2) Second pass: write rows for those ASINs to per-ASIN files incrementally (memory-safe)
buffers = defaultdict(list)
flush_every = 10000  # adjust as you like

def flush(asin):
    """Append buffer to CSV for this ASIN, then clear buffer."""
    out = f"asin_{asin}.csv"
    df = pd.DataFrame.from_records(buffers[asin])
    df.to_csv(out, mode="a", index=False, header=not os.path.exists(out))
    buffers[asin].clear()

# remove old outputs if you’re re-running (optional)
for a in top25:
    out = f"asin_{a}.csv"
    if os.path.exists(out):
        os.remove(out)

for obj in stream_json_objects(path):
    a = obj.get("asin")
    if a in top25_set:
        buffers[a].append(obj)
        if len(buffers[a]) >= flush_every:
            flush(a)

# final flush
for a in list(buffers.keys()):
    if buffers[a]:
        flush(a)

# 3) If you want in-memory DataFrames now (optional; uses RAM):
dfs = {a: pd.read_csv(f"asin_{a}.csv") for a in top25 if os.path.exists(f"asin_{a}.csv")}

Top 25 ASINs: ['B01C49MCCS', 'B004A2QTRC', 'B00005KBVD', 'B074YYVXQH', 'B085HNMHKJ', 'B0107H5FJ6', 'B002PEGT9U', 'B000CBSNRY', 'B07F38CPM1', 'B07V3CLLCV', 'B07VCD1SRL', 'B07G3ZNK4Y', 'B003NSBMUI', 'B0042F99PG', 'B00592BOAO', 'B004S8F7QM', 'B000YDDF6O', 'B010TQY7A8', 'B00005C5H4', 'B004V3PS72', 'B01MRG7T0D', 'B01AC7SF7K', 'B00ABA0ZOA', 'B000B6ACGA', 'B0157IHJMQ', 'B0120XRWLE', 'B00AU56C5W', 'B001CJVTLC', 'B00000IV35', 'B00A80X19E', '0975277324', 'B00000IVAK', 'B003I64OT6', 'B00D8STBHY', 'B00APVXSM6', 'B00MNG37C2', '8499000606', 'B00NQQTZCO', 'B01C5A2WJO', 'B007EA4UBY']


In [191]:
dfs.keys()

dict_keys(['B01C49MCCS', 'B004A2QTRC', 'B00005KBVD', 'B074YYVXQH', 'B085HNMHKJ', 'B0107H5FJ6', 'B002PEGT9U', 'B000CBSNRY', 'B07F38CPM1', 'B07V3CLLCV', 'B07VCD1SRL', 'B07G3ZNK4Y', 'B003NSBMUI', 'B0042F99PG', 'B00592BOAO', 'B004S8F7QM', 'B000YDDF6O', 'B010TQY7A8', 'B00005C5H4', 'B004V3PS72', 'B01MRG7T0D', 'B01AC7SF7K', 'B00ABA0ZOA', 'B000B6ACGA', 'B0157IHJMQ', 'B0120XRWLE', 'B00AU56C5W', 'B001CJVTLC', 'B00000IV35', 'B00A80X19E', '0975277324', 'B00000IVAK', 'B003I64OT6', 'B00D8STBHY', 'B00APVXSM6', 'B00MNG37C2', '8499000606', 'B00NQQTZCO', 'B01C5A2WJO', 'B007EA4UBY'])

In [192]:
# df = dfs['B004S8F7QM']
# df.to_csv("CardsAgainstHumanityAmazonReviewData.csv")

In [263]:
blocked_asins = ['B0157IHJMQ', '8499000606']

In [264]:
top25 = [x for x in top25 if x not in blocked_asins]

In [265]:
top_asins = top25#['B004S8F7QM', 'B000YDDF6O', 'B010TQY7A8', 'B00005C5H4', 'B004V3PS72', 'B01MRG7T0D', 'B01AC7SF7K', 'B00ABA0ZOA', 'B000B6ACGA', 'B0120XRWLE', 'B00AU56C5W', 'B001CJVTLC', 'B00000IV35', 'B00A80X19E', '0975277324', 'B00000IVAK', 'B003I64OT6', 'B00D8STBHY', 'B00APVXSM6', 'B00MNG37C2', 'B00NQQTZCO', 'B01C5A2WJO', 'B007EA4UBY']

In [266]:
# top_asins = expensive_asins

In [267]:
for asin in top_asins:
    print(f"https://www.amazon.com/dp/{asin}")

https://www.amazon.com/dp/B01C49MCCS
https://www.amazon.com/dp/B004A2QTRC
https://www.amazon.com/dp/B00005KBVD
https://www.amazon.com/dp/B074YYVXQH
https://www.amazon.com/dp/B085HNMHKJ
https://www.amazon.com/dp/B0107H5FJ6
https://www.amazon.com/dp/B002PEGT9U
https://www.amazon.com/dp/B000CBSNRY
https://www.amazon.com/dp/B07F38CPM1
https://www.amazon.com/dp/B07V3CLLCV
https://www.amazon.com/dp/B07VCD1SRL
https://www.amazon.com/dp/B07G3ZNK4Y
https://www.amazon.com/dp/B003NSBMUI
https://www.amazon.com/dp/B0042F99PG
https://www.amazon.com/dp/B00592BOAO
https://www.amazon.com/dp/B004S8F7QM
https://www.amazon.com/dp/B000YDDF6O
https://www.amazon.com/dp/B010TQY7A8
https://www.amazon.com/dp/B00005C5H4
https://www.amazon.com/dp/B004V3PS72
https://www.amazon.com/dp/B01MRG7T0D
https://www.amazon.com/dp/B01AC7SF7K
https://www.amazon.com/dp/B00ABA0ZOA
https://www.amazon.com/dp/B000B6ACGA
https://www.amazon.com/dp/B0120XRWLE
https://www.amazon.com/dp/B00AU56C5W
https://www.amazon.com/dp/B001CJVTLC
h

In [268]:
products = [
    "Ultimate kitchen playset",
    "Play kitchen",
    "Kid roller coaster",
    "HD drone",
    "4k drone",
    "BB-8 Toy",
    "Mini John Deere",
    "Magna Tiles",
    "Remote control monster truck",
    "GPS drone",
    "Camera drone",
    "Vector robot",
    "Mini bounce house",
    "Jumbo doll house",
    "Mini play kitchen",
    "Cards Against Humanity",
    "Baby Einstein Toy",
    "Exploding Kittens",
    "baby stacking cups",
    "dyson vacuum toy",
    "what do you meme",
    "stuffed elephant",
    "jenga",
    "automatic card shuffler",
    "princess castle tent",
    "magnet tiles",
    "pokemon cards",
    "five crowns",
    "gumby toy",
    "ticket to ride",
    "sequence game",
    "left right center",
    "connect 4",
    "magnet tiles",
    "drone toy",
    "monopoly deal",
    "little tikes basketball",
    "suspend game"
]

In [269]:
len(products)

38

In [270]:
len(top25)

38

In [271]:
top25

['B01C49MCCS',
 'B004A2QTRC',
 'B00005KBVD',
 'B074YYVXQH',
 'B085HNMHKJ',
 'B0107H5FJ6',
 'B002PEGT9U',
 'B000CBSNRY',
 'B07F38CPM1',
 'B07V3CLLCV',
 'B07VCD1SRL',
 'B07G3ZNK4Y',
 'B003NSBMUI',
 'B0042F99PG',
 'B00592BOAO',
 'B004S8F7QM',
 'B000YDDF6O',
 'B010TQY7A8',
 'B00005C5H4',
 'B004V3PS72',
 'B01MRG7T0D',
 'B01AC7SF7K',
 'B00ABA0ZOA',
 'B000B6ACGA',
 'B0120XRWLE',
 'B00AU56C5W',
 'B001CJVTLC',
 'B00000IV35',
 'B00A80X19E',
 '0975277324',
 'B00000IVAK',
 'B003I64OT6',
 'B00D8STBHY',
 'B00APVXSM6',
 'B00MNG37C2',
 'B00NQQTZCO',
 'B01C5A2WJO',
 'B007EA4UBY']

In [272]:
for i in range(len(top25)):
    print(products[i])
    print(f"https://www.amazon.com/dp/{top25[i]}")
    print()

Ultimate kitchen playset
https://www.amazon.com/dp/B01C49MCCS

Play kitchen
https://www.amazon.com/dp/B004A2QTRC

Kid roller coaster
https://www.amazon.com/dp/B00005KBVD

HD drone
https://www.amazon.com/dp/B074YYVXQH

4k drone
https://www.amazon.com/dp/B085HNMHKJ

BB-8 Toy
https://www.amazon.com/dp/B0107H5FJ6

Mini John Deere
https://www.amazon.com/dp/B002PEGT9U

Magna Tiles
https://www.amazon.com/dp/B000CBSNRY

Remote control monster truck
https://www.amazon.com/dp/B07F38CPM1

GPS drone
https://www.amazon.com/dp/B07V3CLLCV

Camera drone
https://www.amazon.com/dp/B07VCD1SRL

Vector robot
https://www.amazon.com/dp/B07G3ZNK4Y

Mini bounce house
https://www.amazon.com/dp/B003NSBMUI

Jumbo doll house
https://www.amazon.com/dp/B0042F99PG

Mini play kitchen
https://www.amazon.com/dp/B00592BOAO

Cards Against Humanity
https://www.amazon.com/dp/B004S8F7QM

Baby Einstein Toy
https://www.amazon.com/dp/B000YDDF6O

Exploding Kittens
https://www.amazon.com/dp/B010TQY7A8

baby stacking cups
https://

In [282]:
# products = [
#     "Cards Against Humanity",
#     "Baby Einstein Toy",
#     "Exploding Kittens",
#     "baby stacking cups",
#     "dyson vacuum toy",
#     "what do you meme",
#     "stuffed elephant",
#     "jenga",
#     "automatic card shuffler",
#     "princess castle tent",
#     "magnet tiles",
#     "pokemon cards",
#     "five crowns",
#     "gumby toy",
#     "ticket to ride",
#     "sequence game",
#     "left right center",
#     "connect 4",
#     "magnet tiles",
#     "drone toy",
#     "monopoly deal",
#     "little tikes basketball",
#     "suspend game"
# ]

In [283]:
asin_to_prod = {}
for i in range(len(top_asins)):
    asin_to_prod[top_asins[i]] = products[i]

In [291]:
a2p = pd.DataFrame.from_dict(asin_to_prod, orient='index', columns=['product_name']).reset_index()
a2p = a2p.rename(columns={'index': 'asin'})

# Add is_expensive column
a2p['is_expensive'] = a2p['asin'].isin(expensive_asins)

a2p.to_csv("asin_to_product_name_and_is_expensive.csv", index=False)

In [285]:
for asin in top_asins:
    df = dfs[asin]
    name = asin_to_prod[asin]
    df.to_csv(name.replace(' ', '_') + '.csv')

In [286]:
dfs.keys()

dict_keys(['B01C49MCCS', 'B004A2QTRC', 'B00005KBVD', 'B074YYVXQH', 'B085HNMHKJ', 'B0107H5FJ6', 'B002PEGT9U', 'B000CBSNRY', 'B07F38CPM1', 'B07V3CLLCV', 'B07VCD1SRL', 'B07G3ZNK4Y', 'B003NSBMUI', 'B0042F99PG', 'B00592BOAO', 'B004S8F7QM', 'B000YDDF6O', 'B010TQY7A8', 'B00005C5H4', 'B004V3PS72', 'B01MRG7T0D', 'B01AC7SF7K', 'B00ABA0ZOA', 'B000B6ACGA', 'B0157IHJMQ', 'B0120XRWLE', 'B00AU56C5W', 'B001CJVTLC', 'B00000IV35', 'B00A80X19E', '0975277324', 'B00000IVAK', 'B003I64OT6', 'B00D8STBHY', 'B00APVXSM6', 'B00MNG37C2', '8499000606', 'B00NQQTZCO', 'B01C5A2WJO', 'B007EA4UBY'])

In [287]:
dummy_df = pd.concat(
    [
        df.sample(n=25, replace=False, random_state=None)
        if len(df) >= 25
        else df.sample(n=25, replace=True, random_state=None)
        for df in dfs.values()
    ],
    ignore_index=True
)

In [288]:
dummy_df.columns

Index(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase'],
      dtype='object')

In [289]:
for asin, name in asin_to_prod.items():
    print(f"""{asin}:{name}""")

B01C49MCCS:Ultimate kitchen playset
B004A2QTRC:Play kitchen
B00005KBVD:Kid roller coaster
B074YYVXQH:HD drone
B085HNMHKJ:4k drone
B0107H5FJ6:BB-8 Toy
B002PEGT9U:Mini John Deere
B000CBSNRY:Magna Tiles
B07F38CPM1:Remote control monster truck
B07V3CLLCV:GPS drone
B07VCD1SRL:Camera drone
B07G3ZNK4Y:Vector robot
B003NSBMUI:Mini bounce house
B0042F99PG:Jumbo doll house
B00592BOAO:Mini play kitchen
B004S8F7QM:Cards Against Humanity
B000YDDF6O:Baby Einstein Toy
B010TQY7A8:Exploding Kittens
B00005C5H4:baby stacking cups
B004V3PS72:dyson vacuum toy
B01MRG7T0D:what do you meme
B01AC7SF7K:stuffed elephant
B00ABA0ZOA:jenga
B000B6ACGA:automatic card shuffler
B0120XRWLE:princess castle tent
B00AU56C5W:magnet tiles
B001CJVTLC:pokemon cards
B00000IV35:five crowns
B00A80X19E:gumby toy
0975277324:ticket to ride
B00000IVAK:sequence game
B003I64OT6:left right center
B00D8STBHY:connect 4
B00APVXSM6:magnet tiles
B00MNG37C2:drone toy
B00NQQTZCO:monopoly deal
B01C5A2WJO:little tikes basketball
B007EA4UBY:suspe

In [189]:
dummy_df

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,4.0,The directions were not the best. But it's so ...,It took 3 hours to put together. The direction...,[],B01C49MCCS,B01LALE67U,AHYPQYEGDQ4PBE3J7XX2E5HRZ4EA,1522142471477,0,True
1,5.0,You’ll have to read…..,Ok first off. Great little set. That said. It ...,[],B01C49MCCS,B01LALE67U,AHXNLMWRFXQKHJUSAH6SRAFZKVAA,1641252798573,0,True
2,4.0,Nice design but pointy edges!,Fun design but not the safest for a toddler. V...,[{'small_image_url': 'https://m.media-amazon.c...,B01C49MCCS,B01LALE67U,AEQKS6DGPEFKN3HPKCS365IZY3SA,1672057601201,0,True
3,4.0,Super cute,Super cute and my kids love it. I know it was ...,[],B01C49MCCS,B01LALE67U,AFTO77SDFFZS5BCKOEUCS2DVWENA,1580649384265,0,True
4,5.0,They play with it all the time,The kids love it,[],B01C49MCCS,B01LALE67U,AE7GZBYNJ6Z2ZWR6DSP2AAN2QJHA,1577454625188,0,True
...,...,...,...,...,...,...,...,...,...,...
370,5.0,Best little kitchen ever!,"After a year, still in great shape and used ev...",[],B00592BOAO,B00ELRFLZM,AFA3W5SUTFLU4C32NNKFRUJR45UA,1481837754000,1,True
371,5.0,Amazing children's kitchen!!,We bought this kitchen for my 18 month old son...,[],B00592BOAO,B00ELRFLZM,AGVMVQGYGCC62JNUKPEY23Z4PEPQ,1355290132000,1,True
372,5.0,Five Stars,Awesome product at an awesome price!!!,[],B00592BOAO,B00ELRFLZM,AF7OLVQ25SFSEK5YSIOIVBUFSL2A,1419540946000,0,True
373,1.0,Frustration,After one horrible incident with a replacement...,[],B00592BOAO,B00ELRFLZM,AFJ7RSEK6GLTCIIHB3LV7FNIAYCA,1463182323000,2,True


In [115]:
# df = dfs['B01MRG7T0D'].copy()
# df = df[df['rating'] == 1]

# for t in df['text']:
#     print(t)
#     print()

In [298]:
prod_names = [asin_to_prod[key] for key in asin_to_prod]
csv_names = [name.replace(' ', '_') + '.csv' for name in prod_names]
dfs = [pd.read_csv(name) for name in csv_names]
combined_df = pd.concat(dfs, axis=0, ignore_index=True)
combined_df = combined_df.drop(columns=['Unnamed: 0'])
print(f"Combined shape: {combined_df.shape}")

Combined shape: (177960, 10)


In [299]:
combined_df.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,5.0,A lot of fun for Toddlers,It’s awesome and my grandson (2) loves it. He...,[],B01C49MCCS,B01LALE67U,AFTEZMTDJTMY4YIU4HJQEQY35BXQ,1609892936645,0,True
1,5.0,Very nice product for all those future chefs!,Very nice product! Item was packed very well a...,[{'small_image_url': 'https://images-na.ssl-im...,B01C49MCCS,B01LALE67U,AHRU3HJGLE5IC4VQFCGT5P4CGOLA,1521960365136,2,True
2,1.0,Came missing a ton of parts,"Wanted to love this, so many parts and screws....",[],B01C49MCCS,B01LALE67U,AGRXOSYV2XJIBYVOHKL4TTXSRHLQ,1657390116692,2,True
3,4.0,Good toy for grandkids,Got this for my two grandchildren for Christma...,[],B01C49MCCS,B01LALE67U,AGCGSPZP6XTN5LTOFCAZK2E6MVFQ,1550940421664,0,True
4,5.0,Five Stars,Great,[],B01C49MCCS,B01LALE67U,AEXGVJXXCE4R4Y377KYNB4K2GFQA,1501690697903,0,True


In [300]:
combined_df.to_csv("combined_csv.csv")