In [1]:
import gzip
import json
import os
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd
from loguru import logger

from utils import extract_gz, load_and_process_data

## Dataset

In [2]:
# Gzipped JSONL metadata archive; expanduser() resolves the leading "~" to the
# current user's home directory.
dataset_path = Path(
    "~/Datasets/Amazon_Reviews_23/meta_data/meta_Clothing_Shoes_and_Jewelry.jsonl.gz"
).expanduser()
# Fail fast when the archive is missing so later cells do not run against a bad path.
assert dataset_path.exists(), "There is no dataset file!"

Unpack the data

In [11]:
# Decompress the archive with the project helper from utils; its return value is
# used as the path of the extracted file.
# NOTE(review): presumably the .jsonl is written next to the .gz (the printed path
# shares its directory) — confirm in utils.extract_gz.
extract_path = extract_gz(path=dataset_path)
print(f"File extracted to: {extract_path}")

File extracted to: /Users/studeni/Datasets/Amazon_Reviews_23/meta_data/meta_Clothing_Shoes_and_Jewelry.jsonl


In [2]:
# Path of the already-extracted JSONL file. It is written relative to the home
# directory (like dataset_path) instead of one user's absolute path, so the
# notebook also runs on other machines.
extract_path = Path(
    "~/Datasets/Amazon_Reviews_23/meta_data/meta_Clothing_Shoes_and_Jewelry.jsonl"
).expanduser()

In [3]:
extract_path.exists()

True

Load JSONL to DataFrame

In [44]:
# Build the working DataFrame from the extracted JSONL file.
# NOTE(review): lines=50_000 presumably caps how many records are read (the helper's
# log line reports 50000 products collected) — confirm in utils.load_and_process_data.
df = load_and_process_data(file_path=extract_path, lines=50_000)

[32m2024-08-05 18:44:24.468[0m | [1mINFO    [0m | [36mutils[0m:[36mload_and_process_data[0m:[36m63[0m - [1mProcessed file /Users/studeni/Datasets/Amazon_Reviews_23/meta_data/meta_Clothing_Shoes_and_Jewelry.jsonl successfully, collected: 50000 products.[0m


In [47]:
df.head(3)

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,AMAZON FASHION,BALEAF Women's Long Sleeve Zip Beach Coverup U...,4.2,422,"[90% Polyester, 10% Spandex, Zipper closure, M...",[],31.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Women's UPF 50+ Front Zip Beach Co...,BALEAF,"[Clothing, Shoes & Jewelry, Women, Clothing, S...","{'Department': 'womens', 'Date First Available...",B09X1MRDN6,,,
1,AMAZON FASHION,Merrell Work Moab 2 Vent Waterproof SR Boulder,2.7,4,[Rubber sole],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Merrell,"[Clothing, Shoes & Jewelry, Women, Shoes, Outd...",{'Package Dimensions': '14.02 x 9.29 x 4.8 inc...,B073C4Q7W8,,,
2,AMAZON FASHION,"SAS Women's, Relaxed Sandal",4.7,618,"[Made in the USA, Suede sole, Heel measures ap...","[Unwind, leave your worries behind, and simply...",188.95,[{'thumb': 'https://m.media-amazon.com/images/...,[],SAS,"[Clothing, Shoes & Jewelry, Women, Shoes, Sand...",{'Product Dimensions': '10 x 15 x 6 inches; 2 ...,B0944VG4Y4,,,


# Check bought_together data

In [None]:
def load_and_process_data_bt(file_path: Path, lines: Optional[int] = None) -> pd.DataFrame:
    """Load product records from a JSONL file into a DataFrame.

    Each line of the file is parsed as one JSON object and kept whole, so every
    field present in the record (including any ``bought_together`` field) ends
    up as a column of the result.

    Args:
        file_path: Path to the input file; its suffix must be ``.jsonl``.
        lines: Maximum number of lines to read from the start of the file.
            ``None`` (the default) reads the whole file; ``0`` reads nothing.
            Lines that fail to parse still count towards this limit.

    Returns:
        A DataFrame with one row per successfully parsed record. An empty
        DataFrame is returned when the file cannot be opened or decoded.

    Raises:
        ValueError: If ``file_path`` does not have a ``.jsonl`` suffix.
    """
    if file_path.suffix != ".jsonl":
        logger.info(f"Expected a .jsonl file, got {file_path.suffix} instead")
        raise ValueError(f"Expected a .jsonl file, got {file_path.suffix} instead")

    products: List[dict] = []
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            for i, line in enumerate(file):
                # Check the limit before parsing so lines=None means "no limit"
                # and never takes part in arithmetic.
                if lines is not None and i >= lines:
                    break

                try:
                    record = json.loads(line)
                except json.JSONDecodeError as e:
                    # A malformed line is skipped; the rest of the file is still read.
                    logger.warning(f"Exception occurred while loading data: {e}")
                    continue

                if not isinstance(record, dict):
                    # Valid JSON that is not an object cannot become a DataFrame row.
                    logger.warning(f"Skipping non-object JSON record on line {i + 1}")
                    continue

                products.append(record)
    except (OSError, UnicodeError) as e:
        # Only I/O and decoding failures are treated as "could not process the file";
        # programming errors are allowed to surface instead of being hidden.
        logger.error(f"Failed to process file {file_path}: {e}")
        return pd.DataFrame()

    logger.info(
        f"Processed file {file_path} successfully, collected: {len(products)} products."
    )
    return pd.DataFrame(data=products)

In [None]:
with open(file_path, "r", encoding="utf-8") as file:
    for i, line in enumerate(file):

Extract First Available Date

In [48]:
def extract_and_pop_date(details):
    """Remove and return the product's availability date from ``details``.

    "Date First Available" is tried first; "Release date" is only consulted
    when the first key is missing or holds None. Every key that is looked up
    is removed from ``details`` in place.

    Returns:
        The first non-None value found, or None when neither key yields one.
    """
    for key in ("Date First Available", "Release date"):
        found = details.pop(key, None)
        if found is not None:
            return found
    return None

In [49]:
# Move the availability date out of each row's details dict into its own column.
# extract_and_pop_date pops keys from the dicts in place, so this cell is not
# idempotent: a second run on the same df no longer finds the values extracted the
# first time and overwrites the column. Reload df before re-running.
df["date_first_available"] = df["details"].apply(extract_and_pop_date)

Extract Color

In [57]:
def extract_and_pop_color(details):
    """Remove the "Color" entry from ``details`` in place and return its value.

    Returns:
        The stored value, or None when ``details`` has no "Color" key.
    """
    return details.pop("Color", None)

In [58]:
# Move the "Color" entry out of each row's details dict into its own column.
# extract_and_pop_color pops the key in place, so this cell is not idempotent:
# a second run on the same df returns None for every row. Reload df before re-running.
df["color"] = df["details"].apply(extract_and_pop_color)

In [56]:
df.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author,date_first_available,color
0,AMAZON FASHION,BALEAF Women's Long Sleeve Zip Beach Coverup U...,4.2,422,"[90% Polyester, 10% Spandex, Zipper closure, M...",[],31.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Women's UPF 50+ Front Zip Beach Co...,BALEAF,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",{'Department': 'womens'},B09X1MRDN6,,,,"April 3, 2022",
1,AMAZON FASHION,Merrell Work Moab 2 Vent Waterproof SR Boulder,2.7,4,[Rubber sole],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Merrell,"[Clothing, Shoes & Jewelry, Women, Shoes, Outd...",{'Package Dimensions': '14.02 x 9.29 x 4.8 inc...,B073C4Q7W8,,,,"November 2, 2017",
2,AMAZON FASHION,"SAS Women's, Relaxed Sandal",4.7,618,"[Made in the USA, Suede sole, Heel measures ap...","[Unwind, leave your worries behind, and simply...",188.95,[{'thumb': 'https://m.media-amazon.com/images/...,[],SAS,"[Clothing, Shoes & Jewelry, Women, Shoes, Sand...",{'Product Dimensions': '10 x 15 x 6 inches; 2 ...,B0944VG4Y4,,,,"March 22, 2017",
3,AMAZON FASHION,SheIn Women's Basic Stretch Plaid Mini Bodycon...,3.8,999,"[Zipper closure, Fabric has some stretch; Plea...",[],12.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'SUPER CUTE plaid skirt!', 'url': '...",SheIn,"[Clothing, Shoes & Jewelry, Novelty & More, Cl...",{'Department': 'womens'},B08JGGF5TJ,,,,"September 24, 2020",
4,AMAZON FASHION,"Michael Kors Cindy, Women’s Cross-Body Bag",4.5,6,"[Leather, 9-1/2"" W x 5-1/4"" H x 2-1/4"" D, Inte...","[Adjustable crossbody strap with 24""-26"" drop ...",,[{'thumb': 'https://m.media-amazon.com/images/...,[],Michael Kors,"[Clothing, Shoes & Jewelry, Women, Handbags & ...","{'Is Discontinued By Manufacturer': 'No', 'Pro...",B00ZQMM6BI,,,,"June 16, 2015",


In [59]:
df["color"].isna().sum()

47966

In [60]:
# Inspect a small sample of the details dicts that had no "Color" key.
# The sample is capped with head(10): dumping every such row (tens of thousands of
# dicts) floods the cell output and bloats the saved notebook.
df.loc[df["color"].isna(), "details"].head(10).tolist()

[{'Department': 'womens'},
 {'Package Dimensions': '14.02 x 9.29 x 4.8 inches; 1 Pounds',
  'Item model number': '00-3C4Q7W8I-AX',
  'Department': 'womens'},
 {'Product Dimensions': '10 x 15 x 6 inches; 2 Pounds',
  'Item model number': '1570-891',
  'Department': 'womens',
  'Manufacturer': 'SAS'},
 {'Department': 'womens'},
 {'Is Discontinued By Manufacturer': 'No',
  'Product Dimensions': '8.66 x 3.54 x 6.69 inches; 2.1 Pounds',
  'Item model number': '32H4GCPC7L',
  'Department': 'womens',
  'Manufacturer': 'Vista Trade Finance & Services S.A.'},
 {'Package Dimensions': '6.4 x 5 x 3.8 inches; 1 Pounds',
  'Item model number': 'RB154873D',
  'Department': 'womens',
  'Manufacturer': 'Jewelili'},
 {'Item Weight': '8 Ounces',
  'Item model number': 'G2941M1',
  'Department': 'womens',
  'Manufacturer': "Dr. Scholl's Shoes"},
 {'Department': 'boys'},
 {'Department': 'womens'},
 {'Item model number': 'AE1932668',
  'Department': 'womens',
  'Manufacturer': 'Amazon Essentials'},
 {'Packa

In [24]:
df.head()

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author,date_first_available
0,AMAZON FASHION,BALEAF Women's Long Sleeve Zip Beach Coverup U...,4.2,422,"[90% Polyester, 10% Spandex, Zipper closure, M...",[],31.99,[{'thumb': 'https://m.media-amazon.com/images/...,[{'title': 'Women's UPF 50+ Front Zip Beach Co...,BALEAF,"[Clothing, Shoes & Jewelry, Women, Clothing, S...",{'Department': 'womens'},B09X1MRDN6,,,,"April 3, 2022"
1,AMAZON FASHION,Merrell Work Moab 2 Vent Waterproof SR Boulder,2.7,4,[Rubber sole],[],,[{'thumb': 'https://m.media-amazon.com/images/...,[],Merrell,"[Clothing, Shoes & Jewelry, Women, Shoes, Outd...",{'Package Dimensions': '14.02 x 9.29 x 4.8 inc...,B073C4Q7W8,,,,"November 2, 2017"
2,AMAZON FASHION,"SAS Women's, Relaxed Sandal",4.7,618,"[Made in the USA, Suede sole, Heel measures ap...","[Unwind, leave your worries behind, and simply...",188.95,[{'thumb': 'https://m.media-amazon.com/images/...,[],SAS,"[Clothing, Shoes & Jewelry, Women, Shoes, Sand...",{'Product Dimensions': '10 x 15 x 6 inches; 2 ...,B0944VG4Y4,,,,"March 22, 2017"
3,AMAZON FASHION,SheIn Women's Basic Stretch Plaid Mini Bodycon...,3.8,999,"[Zipper closure, Fabric has some stretch; Plea...",[],12.99,[{'thumb': 'https://m.media-amazon.com/images/...,"[{'title': 'SUPER CUTE plaid skirt!', 'url': '...",SheIn,"[Clothing, Shoes & Jewelry, Novelty & More, Cl...",{'Department': 'womens'},B08JGGF5TJ,,,,"September 24, 2020"
4,AMAZON FASHION,"Michael Kors Cindy, Women’s Cross-Body Bag",4.5,6,"[Leather, 9-1/2"" W x 5-1/4"" H x 2-1/4"" D, Inte...","[Adjustable crossbody strap with 24""-26"" drop ...",,[{'thumb': 'https://m.media-amazon.com/images/...,[],Michael Kors,"[Clothing, Shoes & Jewelry, Women, Handbags & ...","{'Is Discontinued By Manufacturer': 'No', 'Pro...",B00ZQMM6BI,,,,"June 16, 2015"


In [21]:
# Scratch check of dict.pop with a default: the dict only holds the key
# "Date First Availabl", so popping "Date First Available" with default None
# returns None instead of raising KeyError and leaves the dict unchanged.
# NOTE(review): the truncated key looks deliberate for this check — confirm.
test_date = {"Date First Availabl": "2012-05-01"}
test_date.pop("Date First Available", None)

In [22]:
test_date

{'Date First Availabl': '2012-05-01'}