# BDA Assignment #3: Streaming Data Insights

### Frequent Itemset Analysis on Amazon Metadata

## Data Sampling and Preprocessing


#### Group members:

- Aaqib Ahmed Nazir (i22-1920)
- Arhum Khan (i22-1967)
- Ammar Khasif (i22-1968)

##### Section: DS-D


#### Libraries Used:


In [2]:
import re
import json
import itertools 
import pandas as pd
from tqdm import tqdm


### Sampling the Data

#### Extracting a 15GB sample from the original 105GB dataset (keeping only records with a non-empty `also_buy` list)


In [None]:
def sample_json(input_file, output_file, target_size_gb, filter_key='also_buy'):
    """Copy records with a non-empty ``filter_key`` until a size target is reached.

    The input is read as JSON Lines (one JSON value per line). Every record
    that is a JSON object with a truthy ``filter_key`` value is re-serialized
    with ``json.dumps`` and written to ``output_file``. Reading stops as soon
    as the accumulated size of the written records reaches the target.

    Args:
        input_file: Path of the JSON Lines file to sample from.
        output_file: Path of the JSON Lines file to create (overwritten).
        target_size_gb: Size budget for the sample, in GiB (1024**3 bytes).
        filter_key: Key that must hold a truthy value for a record to be kept.

    Returns:
        None. A summary is printed when sampling finishes.
    """
    target_size_bytes = target_size_gb * 1024**3
    current_size_bytes = 0
    skipped_lines = 0

    # Reading the input file and writing the output file
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for line in tqdm(infile, desc=f"Processing {input_file}"):
            if line.strip():
                # A single malformed line must not abort a multi-hour sampling run.
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    record = None

                if not isinstance(record, dict):
                    # Unparseable, or valid JSON that is not an object (no .get()).
                    skipped_lines += 1
                elif record.get(filter_key):
                    # Measure the text that is written, not the input line:
                    # json.dumps may escape or re-space the record differently.
                    serialized = json.dumps(record) + '\n'
                    outfile.write(serialized)
                    current_size_bytes += len(serialized.encode('utf-8'))

            if current_size_bytes >= target_size_bytes:
                break

    print(f"Finished sampling. Output size: {current_size_bytes / 1024**3:.2f} GB")
    if skipped_lines:
        print(f"Skipped {skipped_lines} malformed or non-object lines.")

# Draw the 15 GB working sample from the full metadata dump; the default
# filter_key ('also_buy') is used.
sample_json(
    input_file='H:\\All_Amazon_Meta.json',
    output_file='Sampled_Amazon_Meta.json',
    target_size_gb=15,
)

### Fixing JSON Errors


In [None]:
def fix_common_json_errors(line):
    """Repair a trailing comma and unclosed brackets in one JSON line.

    The line is stripped of surrounding whitespace and trailing commas, then
    scanned once to find which ``{`` / ``[`` were opened but never closed.
    The missing closers are appended innermost-first. Brackets that appear
    inside JSON string values are ignored, so a title such as ``"a [b"`` does
    not affect the balance.

    Args:
        line: One line of (possibly damaged) JSON text.

    Returns:
        The repaired text. It is not guaranteed to be valid JSON; callers
        should still parse it and handle ``json.JSONDecodeError``.
    """
    # Removing surrounding whitespace and extra trailing commas
    line = line.strip().rstrip(",")

    closer_for = {"{": "}", "[": "]"}
    pending = []  # closers still owed, outermost first
    in_string = False
    escaped = False

    for ch in line:
        if in_string:
            if escaped:
                # Character after a backslash is literal (e.g. \" does not end the string).
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch in closer_for:
            pending.append(closer_for[ch])
        elif ch in "}]":
            # Only a closer matching the innermost open bracket is consumed;
            # a stray or mismatched closer is left for the parser to report.
            if pending and pending[-1] == ch:
                pending.pop()

    # Adding missing closing brackets, innermost first
    return line + "".join(reversed(pending))

#### Fixing the JSON errors and saving the fixed data in a new file


In [None]:
def process_line_by_line(input_path, output_path):
    """Repair a JSON Lines file and write the parseable records to a new file.

    Each input line is passed through ``fix_common_json_errors`` and parsed.
    Records that parse are written to ``output_path`` as one compact JSON
    document per line. Lines that still fail to parse are reported with their
    1-based line number and left out of the output.

    Args:
        input_path: Path of the JSON Lines file to read.
        output_path: Path of the repaired JSON Lines file to create.

    Returns:
        None. Per-line errors and a final error total are printed.
    """
    failed = 0
    with open(input_path, "r", encoding="utf-8") as source, \
            open(output_path, "w", encoding="utf-8") as sink:
        position = 0
        for raw_line in source:
            position += 1
            try:
                record = json.loads(fix_common_json_errors(raw_line))
            except json.JSONDecodeError as exc:
                failed += 1
                print(f"Error in line {position}: {exc}")
                continue
            # One repaired record per output line
            sink.write(json.dumps(record))
            sink.write("\n")
    print(f"Finished processing. Total errors: {failed}")

In [None]:
# Run the repair pass over the sampled file; records that parse are written
# to a separate "corrected" file so the sample itself is left untouched.
input_file_path = "Sampled_Amazon_Meta.json"
output_file_path = "Corrected_Sample_Amazon_Meta.json"

process_line_by_line(input_path=input_file_path, output_path=output_file_path)

### Preprocessing and Cleaning the Dataset


#### Discarding text that contains HTML tags, URLs, or JS snippets; stripping punctuation and normalizing whitespace in the rest


In [3]:
# loading only the column names from the sampled file
# Peek at the first record only, to learn the field names without loading the
# whole sample into memory.
# NOTE(review): the sampling cell writes 'Sampled_Amazon_Meta.json' relative to
# the working directory, while this reads from H:\ - confirm it is the same file.
input_file = "H:\\Sampled_Amazon_Meta.json"
with open(input_file, mode='r', encoding='utf-8') as infile:
    first_line = infile.readline()

# The single line read above is all that is needed, so parse it after closing the file.
column_names = [key for key in json.loads(first_line).keys()]

# displaying the column names
print(column_names)

['category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'image', 'tech2', 'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat', 'similar_item', 'date', 'price', 'asin']


In [4]:
def clean_text(text):
    """Return ``text`` without punctuation and with normalized whitespace.

    Text that contains an HTML tag, a URL, a ``P.when(...);`` JS snippet or a
    ``span class...`` fragment is treated as scraped markup and discarded
    entirely: the function returns an empty string instead of trying to
    salvage it.

    Args:
        text: The raw string to clean. Non-string values (e.g. ``None`` for a
            missing field) are accepted and yield an empty string.

    Returns:
        The cleaned string: only word characters separated by single spaces,
        with no leading or trailing whitespace; ``""`` if the input was
        discarded or was not a string.
    """
    if not isinstance(text, str):
        return ""

    unwanted_patterns = [
        r"<[^>]*>",  # HTML tags
        r"https?:\/\/\S+",  # URLs
        r"P\.when\(.*?\);",  # JS snippets
        r"span class\w+",  # span classes
    ]

    # Any trace of markup disqualifies the whole text.
    if any(re.search(pattern, text) for pattern in unwanted_patterns):
        return ""

    # Strip punctuation first (backslashes are punctuation too) ...
    text = re.sub(r"[^\w\s]", "", text)
    # ... then collapse whitespace, so gaps left by removed punctuation
    # ("a - b", "- hello") do not survive as double or leading spaces.
    text = re.sub(r"\s+", " ", text).strip()

    return text

#### Extracting and cleaning the relevant info


In [5]:
def process_item_data(item_data):
    """Reduce one raw metadata record to the fields used downstream.

    The sampled dataset stores the category path under ``category`` and the
    co-viewed items under ``also_view``. The older spellings ``categories``
    and ``also_viewed`` are still honoured as fallbacks so records in either
    layout are handled.

    Args:
        item_data: A dict parsed from one line of the metadata file.

    Returns:
        A dict with the keys ``asin``, ``title`` (cleaned with
        ``clean_text``), ``brand``, ``categories`` (list) and ``related``
        (sorted, de-duplicated list built from the also-bought and
        also-viewed items).
    """
    # Extracting selected columns
    asin = item_data.get("asin", "")
    raw_title = item_data.get("title", "No Title Available")
    # A null / non-string title has nothing to clean.
    title = clean_text(raw_title) if isinstance(raw_title, str) else ""
    brand = item_data.get("brand", "Unknown Brand")
    categories = item_data.get("category") or item_data.get("categories") or []

    # Extracting related products; `or []` guards against explicit nulls.
    related_products = set()
    for key in ("also_buy", "also_view", "also_viewed"):
        related_products.update(item_data.get(key) or [])
    # Sorted so the output does not depend on set iteration order.
    related = sorted(related_products)

    # Preparing preprocessed item data with selected cols
    preprocessed_item = {
        "asin": asin,
        "title": title,
        "brand": brand,
        "categories": categories,
        "related": related,
    }

    return preprocessed_item

In [6]:
# NOTE(review): this reads the raw sample from H:\ rather than the repaired
# "Corrected_Sample_Amazon_Meta.json" produced by the JSON-fixing step - confirm
# which file is intended.
input_file_path = "H:\\Sampled_Amazon_Meta.json"
# NOTE(review): the Data Exploration cell loads 'preprocessed_dataset.json',
# not this file name - confirm which one is intended.
output_file_path = "preprocessed_dataset1.json"

item_data_collection = []
# Explicit UTF-8, matching every other cell that touches these files; the
# platform default encoding is not UTF-8 on all systems.
with open(input_file_path, "r", encoding="utf-8") as file_input:
    for line in file_input:
        # Blank lines are not records; skip them instead of reporting an error.
        if not line.strip():
            continue
        try:
            raw_data = json.loads(line)
        except json.JSONDecodeError:
            print(f"Error decoding JSON from line: {line}")
            continue
        processed_item = process_item_data(raw_data)
        item_data_collection.append(processed_item)

# Write all processed data to a JSON file (a single JSON array of records)
with open(output_file_path, "w", encoding="utf-8") as file_output:
    json.dump(item_data_collection, file_output, indent=4)

print("Dataset processing complete.")

Dataset processing complete.


### Data Exploration

In [5]:
# Load the preprocessed records into a DataFrame for exploration.
# NOTE(review): the preprocessing cell writes "preprocessed_dataset1.json";
# confirm that 'preprocessed_dataset.json' is the intended input here.
Amazon_data = pd.read_json('preprocessed_dataset.json')

# First five rows (head() defaults to 5) as a quick sanity check.
display(Amazon_data.head())

Unnamed: 0,asin,title,related,brand,categories
0,6342509379,QIBOE Mens Baggy Jeans Denim Sweatpants Loose ...,"[B071PFP967, B010V0WTP2, B0156SZQ5O, B01FVRKZ4...",QIBOE,"[Clothing, Shoes & Jewelry, Men, Clothing, Jea..."
1,6342502315,Crazy Womens Voile Crinkle Scarf Shawl,"[B01LLOUFRQ, B01LYDMB6U, B019ZAYUB0, B00NV1VFP...",Crazy,"[Clothing, Shoes & Jewelry, Women, Accessories..."
2,6342522545,FQQ Women Sexy Lingerie Lace Dress Sheer Babyd...,[B00VBVXVPI],FQQ,"[Clothing, Shoes & Jewelry, Women, Clothing, L..."
3,6342522898,Crazy Womens Sexy Leather Backless Bodycon Clu...,"[B07219C7LQ, B015W134LS, B06ZZBQMT4, B01AHZSZ9...",Crazy,"[Clothing, Shoes & Jewelry, Women, Clothing, D..."
4,6342523002,FQQ Womens Sexy Lingerie Babydoll Dress Sleepw...,"[B0723CQH2L, B01LY4VKTL, B06XKWCGTT, B074Z3QGM...",FQQ,"[Clothing, Shoes & Jewelry, Women, Clothing, L..."
