# BDA Assignment #3: Streaming Data Insights

### Frequent Itemset Analysis on Amazon Metadata

## Data Sampling and Preprocessing


#### Group members:

- Aaqib Ahmed Nazir (i22-1920),
- Arhum Khan (i22-1967),
- Ammar Khasif (i22-1968)

##### Section: DS-D


#### Libraries Used:


In [1]:
import json
from tqdm import tqdm
import pandas as pd
import re

### Sampling the Data

#### Extracting a 15GB sample from the original 105GB dataset


In [1]:
def sample_json(input_file, output_file, target_size_gb, filter_key='also_buy'):
    target_size_bytes = target_size_gb * 1024**3
    current_size_bytes = 0

    # Reading the input file and writing the output file
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for line in tqdm(infile, desc=f"Processing {input_file}"):
            record = json.loads(line)
            # Filtering out records
            if record.get(filter_key):
                outfile.write(json.dumps(record) + '\n')
                current_size_bytes += len(line.encode('utf-8'))

            if current_size_bytes >= target_size_bytes:
                break

    print(f"Finished sampling. Output size: {current_size_bytes / 1024**3:.2f} GB")

sample_json('H:\\All_Amazon_Meta.json', 'Sampled_Amazon_Meta.json', 15)

Processing H:\All_Amazon_Meta.json: 9519286it [16:27, 9640.48it/s] 

Finished sampling. Output size: 15.00 GB





### Fixing JSON Errors


In [2]:
def fix_common_json_errors(line):
    # Removing extra commas
    line = line.strip()
    line = line.rstrip(",")
    # Adding missing closing brackets
    if not line.endswith("}") and not line.endswith("]"):
        if "{" in line and "}" not in line:
            line += "}"
        elif "[" in line and "]" not in line:
            line += "]"
    return line

#### Fixing the JSON errors and saving the fixed data in a new file


In [3]:
def process_line_by_line(input_path, output_path):
    errors_count = 0
    with open(input_path, "r", encoding="utf-8") as file, open(
        output_path, "w", encoding="utf-8"
    ) as outfile:
        # Process the file line by line
        for line_number, line in enumerate(file, 1):
            try:
                corrected_line = fix_common_json_errors(line)
                data = json.loads(corrected_line)
                # Writing fixed JSON object to the file
                json.dump(data, outfile)
                outfile.write("\n")  
            except json.JSONDecodeError as e:
                errors_count += 1
                print(f"Error in line {line_number}: {e}")
    print(f"Finished processing. Total errors: {errors_count}")

In [4]:
input_file_path = "Sampled_Amazon_Meta.json"
output_file_path = "Corrected_Sample_Amazon_Meta.json"

process_line_by_line(input_file_path, output_file_path)

Finished processing. Total errors: 0


### Preprocessing and Cleaning the Dataset


#### Removing unwanted HTML tags, URLs, and normalizing the whitespace


In [5]:
def clean_text(text):
    unwanted_patterns = [
        r"<[^>]*>",  # HTML tags
        r"https?:\/\/\S+",  # URLs
        r"P\.when\(.*?\);",  # JS snippets
        r"span class\w+",  # span classes
    ]

    if any(re.search(pattern, text) for pattern in unwanted_patterns):
        return ""

    # Removing back slashes, extra whitespaces, and punctuation
    text = re.sub(r"\\", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    text = re.sub(r"[^\w\s]", "", text)

    return text

#### Extracting and cleaning the relevant info


In [6]:
def process_item_data(item_data):
    # Extracting selected columns
    asin = item_data.get("asin", "")
    title = clean_text(item_data.get("title", "No Title Available"))
    brand = item_data.get("brand", "Unknown Brand")
    categories = item_data.get("category", [])

    # Extracting related products
    related_products = set(item_data.get("also_buy", [])) | set(
        item_data.get("also_viewed", [])
    )
    related = list(related_products) if related_products else []

    # Preparing preprocessed item data with selected cols
    preprocessed_item = {
        "asin": asin,
        "title": title,
        "related": related,
        "brand": brand,
        "categories": categories,
    }

    return preprocessed_item

In [7]:
input_file_path = "Sampled_Amazon_Meta.json"
output_file_path = "preprocessed_dataset.json"

# Processing the input file and writing the preprocessed data to the output file
item_data_collection = []
with open(input_file_path, "r") as file_input, open(
    output_file_path, "w"
) as file_output:
    for line in file_input:
        # Loading JSON object from the file
        raw_data = json.loads(line)
        processed_item = process_item_data(raw_data)
        item_data_collection.append(processed_item)
        if len(item_data_collection) == 100:
            break
    json.dump(item_data_collection, file_output, indent=4)

### Data Exploration

In [12]:
Amazon_data = pd.read_json('preprocessed_dataset.json')

display(Amazon_data.head(5))

Unnamed: 0,asin,title,related,brand,categories
19,8742240611,Leather and Jute Adjustable Believe Inspiratio...,"[B018135A52, B00VOXQRAO, B00UVC6YAC, B01DP6GJD...","Gifts by Lulee, LLC","[Clothing, Shoes & Jewelry, Men, Jewelry, Brac..."
20,9543894027,Blue Simulated Sapphire Zirconia Austrian Crys...,"[B00N4TIYGQ, B00VHTTN0G, B072HGDY91, B019HCYGZ...",Crystalline Azuria,"[Clothing, Shoes & Jewelry, Women, Jewelry, Je..."
83,B000074RL3,Enell Womens WireFree Sports Bra,"[B00XK08M6Q, B001013YQ0, B07GRFH27C, B0035WTMF...",Enell,"[Clothing, Shoes & Jewelry, Women, Clothing, L..."
97,B0000862R1,Amoena Womens Frances FrontClosure Leisure Bra,"[B00EJFEUPI, B01AQW9BV0, B0773WC9VP, B01J91GA4...",Amoena,"[Clothing, Shoes & Jewelry, Women, Clothing, L..."
98,B0000866GL,Goddess Crepeset Soft Cup Bra,[B001JQLOWC],Goddess,"[Clothing, Shoes & Jewelry, Women, Clothing, L..."


Unnamed: 0,asin,title,related,brand,categories
0,6342509379,QIBOE Mens Baggy Jeans Denim Sweatpants Loose ...,"[B072XTTTK9, B07CP532DP, B010V0WTP2, B07C3FNYF...",QIBOE,"[Clothing, Shoes & Jewelry, Men, Clothing, Jea..."
1,6342502315,Crazy Womens Voile Crinkle Scarf Shawl,"[B0169ZHJDK, B01LYDMB6U, B01LLOUFRQ, B00G9EMVI...",Crazy,"[Clothing, Shoes & Jewelry, Women, Accessories..."
2,6342522545,FQQ Women Sexy Lingerie Lace Dress Sheer Babyd...,[B00VBVXVPI],FQQ,"[Clothing, Shoes & Jewelry, Women, Clothing, L..."
3,6342522898,Crazy Womens Sexy Leather Backless Bodycon Clu...,"[B0748C68ZX, B01AHZSZ9A, B01H43Z5GY, B01I809NC...",Crazy,"[Clothing, Shoes & Jewelry, Women, Clothing, D..."
4,6342523002,FQQ Womens Sexy Lingerie Babydoll Dress Sleepw...,"[B01JOVOFRE, B01EKRMG8C, B01LY4VKTL, B004SLKRY...",FQQ,"[Clothing, Shoes & Jewelry, Women, Clothing, L..."
