In [1]:
import json
from tqdm import tqdm
import pandas as pd
import re

In [None]:
# def sample_json(input_file, output_file, target_size_gb, filter_key='also_buy'):
#     target_size_bytes = target_size_gb * 1024**3
#     current_size_bytes = 0

#     with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
#         for line in tqdm(infile, desc=f"Processing {input_file}"):  
#             record = json.loads(line)
#             if record.get(filter_key):
#                 outfile.write(json.dumps(record) + '\n')
#                 current_size_bytes += len(line.encode('utf-8'))
            
#             if current_size_bytes >= target_size_bytes:
#                 break

#     print(f"Finished sampling. Output size: {current_size_bytes / 1024**3:.2f} GB")

# sample_json('H:\\All_Amazon_Meta.json', 'Sampled_Amazon_Meta.json', 15)

In [None]:
# Input and output of the JSON repair pass run at the end of this cell.
# NOTE(review): the later cells read "H:\\Sampled_Amazon_Meta.json" rather than the
# corrected file written here - confirm which file downstream steps should consume.
input_file_path = 'Sampled_Amazon_Meta.json'
output_file_path = 'Corrected_Sample_Amazon_Meta.json'

def fix_common_json_errors(line):
    """Best-effort repair of a single, possibly truncated, JSON Lines record.

    Two repairs are attempted:

    * surrounding whitespace and trailing commas at the end of the line are removed;
    * if the line does not already end with ``}`` or ``]``, every object/array that
      is still open is closed, innermost first.

    Brackets that appear inside JSON string literals are ignored, so a valid string
    such as ``"a{b"`` is left alone. If the line ends in the middle of a string, or
    contains a closer that does not match the innermost open container, no closers
    are appended because the damage cannot be repaired by balancing brackets.

    Args:
        line: One raw line of text read from a JSON Lines file.

    Returns:
        The repaired text. It is not guaranteed to be valid JSON; callers are still
        expected to parse it and handle ``json.JSONDecodeError``.
    """
    line = line.strip().rstrip(',')

    # Fast path: the overwhelmingly common well-formed record needs no scan.
    if line.endswith(('}', ']')):
        return line

    closer_for = {'{': '}', '[': ']'}
    pending_closers = []   # closers still owed, innermost container last
    in_string = False
    escaped = False

    for char in line:
        if in_string:
            if escaped:
                escaped = False          # this character is escaped; take it literally
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = False
        elif char == '"':
            in_string = True
        elif char in closer_for:
            pending_closers.append(closer_for[char])
        elif char in ('}', ']'):
            if not pending_closers or pending_closers[-1] != char:
                # Stray or mismatched closer: appending more brackets cannot help.
                return line
            pending_closers.pop()

    if in_string:
        # Truncated inside a string literal; closing brackets would not fix it.
        return line

    return line + ''.join(reversed(pending_closers))

def process_line_by_line(input_path, output_path):
    """Validate a JSON Lines file and copy the records that parse to a new file.

    Every non-blank line is passed through ``fix_common_json_errors`` and parsed.
    Records that parse are re-serialised, one per line, to ``output_path``. Lines
    that still fail to parse are reported with their 1-based line number and left
    out of the output. Blank lines are skipped silently instead of being counted
    as JSON errors.

    Args:
        input_path: Path of the JSON Lines file to read (opened as UTF-8).
        output_path: Path of the file to write (opened as UTF-8, truncated first).

    Returns:
        None. Per-line errors and the final error count are printed.
    """
    errors_count = 0
    with open(input_path, 'r', encoding='utf-8') as file, \
         open(output_path, 'w', encoding='utf-8') as outfile:
        for line_number, line in enumerate(file, 1):
            if not line.strip():
                # An empty line is not a broken record; do not report it as one.
                continue
            try:
                data = json.loads(fix_common_json_errors(line))
            except json.JSONDecodeError as e:
                errors_count += 1
                print(f"Error in line {line_number}: {e}")
                continue
            # Serialise first, then write record and newline together so a record
            # is never left half-written on the output.
            outfile.write(json.dumps(data) + '\n')
    print(f"Finished processing. Total errors: {errors_count}")

# Run the repair pass; per-line parse errors and a final error count are printed.
process_line_by_line(input_file_path, output_file_path)

In [None]:
# Peek at the first record of the sampled file to learn which fields it carries;
# only one line is read, so the rest of the file is never loaded.
input_file = "H:\\Sampled_Amazon_Meta.json"
with open(input_file, mode='r', encoding='utf-8') as infile:
    first_line = infile.readline()
    # The keys of the first JSON object are taken as the column names.
    column_names = [*json.loads(first_line).keys()]

# Show the discovered field names.
print(column_names)

In [None]:
import json
import re

def clean_text(text):
    """Return a plain-text version of ``text``, or ``""`` if it looks like markup.

    Text containing an HTML tag, a URL, a ``P.when(...);`` JavaScript snippet or a
    ``span class...`` fragment is treated as junk and dropped entirely: an empty
    string is returned rather than a partially cleaned one.

    Otherwise every character that is not a word character (letters, digits and
    underscore, Unicode-aware) or whitespace is removed, and runs of whitespace
    are collapsed to a single space with leading/trailing whitespace stripped.

    Args:
        text: The string to clean. ``None`` is accepted and yields ``""``.

    Returns:
        The cleaned string, or ``""`` when the input is ``None`` or is dropped.
    """
    if text is None:
        return ""

    unwanted_patterns = (
        r"<[^>]*>",                  # HTML tags
        r"https?:\/\/\S+",           # URLs
        r"P\.when\(.*?\);",          # JS snippets
        r"span class\w+",             # span classes
    )

    # Drop the text altogether if any unwanted pattern is found.
    if any(re.search(pattern, text) for pattern in unwanted_patterns):
        return ""

    # Strip punctuation (this also removes backslashes) BEFORE normalising
    # whitespace; doing it afterwards leaves double or leading spaces behind
    # wherever a punctuation mark stood between two spaces.
    text = re.sub(r"[^\w\s]", "", text)
    return re.sub(r"\s+", " ", text).strip()

def process_item_data(item_data):
    """Reduce one raw metadata record to the fields kept in the preprocessed set.

    Args:
        item_data: Dict parsed from one line of the metadata file. The keys read
            are ``asin``, ``title``, ``brand``, ``categories``, ``also_buy`` and
            ``also_viewed``; all are optional.

    Returns:
        A dict with the keys ``asin``, ``title``, ``brand``, ``categories`` and
        ``related``:

        * ``title`` is passed through ``clean_text``; a missing or ``null`` title
          is replaced by ``"No Title Available"`` first.
        * ``related`` is the de-duplicated union of ``also_buy`` and
          ``also_viewed``, in first-seen order (``also_buy`` entries first), so
          the output is identical from run to run.
        * the remaining fields are copied as-is, with defaults when absent.
    """
    raw_title = item_data.get("title")
    if raw_title is None:
        # Covers both a missing key and an explicit JSON null.
        raw_title = "No Title Available"

    # `or []` also tolerates explicit nulls in the source record.
    also_buy = item_data.get("also_buy") or []
    also_viewed = item_data.get("also_viewed") or []

    # dict.fromkeys de-duplicates while keeping insertion order; a set would make
    # the order depend on string hashing, which varies between interpreter runs.
    related = list(dict.fromkeys([*also_buy, *also_viewed]))

    return {
        "asin": item_data.get("asin", ""),
        "title": clean_text(raw_title),
        "brand": item_data.get("brand", "Unknown Brand"),
        "categories": item_data.get("categories", []),
        "related": related,
    }

# Convert the sampled JSON Lines file into a single JSON array of preprocessed items.
# NOTE(review): this reads the raw sample, not the 'Corrected_Sample_Amazon_Meta.json'
# file produced by the repair cell above - confirm that this is intended.
input_file_path = "H:\\Sampled_Amazon_Meta.json"
output_file_path = "H:\\preprocessed_dataset.json"

# The encoding is stated explicitly: without it open() uses the platform locale
# encoding, which is not guaranteed to be UTF-8 and would mis-decode or reject
# non-ASCII characters in the input.
with open(input_file_path, "r", encoding="utf-8") as file_input, \
     open(output_file_path, "w", encoding="utf-8") as file_output:
    file_output.write("[")  # Start JSON array
    separator = ""          # nothing before the first item, ",\n" before every later one
    for line in file_input:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline) instead of failing to parse them.
            continue
        raw_data = json.loads(line)
        processed_item = process_item_data(raw_data)
        file_output.write(separator)
        json.dump(processed_item, file_output)
        separator = ",\n"
    file_output.write("]")  # End JSON array

print("Dataset processing complete.")