In [2]:
import json
import os

# Define input and output file paths
input_file_path = 'data_A2.json'
output_file_path = 'output.json'

# Define the size of data to extract in bytes (15 GB)
data_to_extract = 15 * 1024 * 1024 * 1024

# Define a function to read the file and extract data
def extract_data(input_file_path, output_file_path, data_to_extract):
    current_size = 0
    with open(input_file_path, 'r') as f_in, open(output_file_path, 'w') as f_out:
        # Read the file line by line
        for line in f_in:
            # Load the line as JSON
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                continue
            # Calculate the size of the current line
            line_size = len(line)
            # Check if appending the line exceeds the data to extract
            if current_size + line_size <= data_to_extract:
                # Append the data to the output file
                f_out.write(line)
                current_size += line_size
            else:
                break

# Call the function to extract the data
extract_data(input_file_path, output_file_path, data_to_extract)

print("Extraction complete.")


Extraction complete.


In [1]:
import json
import re

def remove_html_tags(text):
    # Remove HTML tags from text
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def preprocess_record(record):
    # Remove HTML tags from the 'description' field
    if 'description' in record:
        if isinstance(record['description'], str):
            record['description'] = remove_html_tags(record['description'])
        elif isinstance(record['description'], list):
            # Join the list items into a single string
            record['description'] = ' '.join(record['description'])

    # Convert 'price' to float if it exists
    if 'price' in record:
        try:
            # Convert price range to average if it is represented as a range
            if isinstance(record['price'], str) and '-' in record['price']:
                price_range = record['price'].split('-')
                record['price'] = (float(price_range[0]) + float(price_range[1])) / 2
            else:
                record['price'] = float(record['price'])
        except ValueError:
            record['price'] = None  # Unable to convert to float

    return record

def preprocess_json(input_file, output_file):
    asins = set()
    titles = set()
    features = set()
    descriptions = set()
    related_products = set()
    categories = set()
    brands = set()
    tech1 = set()
    tech2 = set()
    similar_products = set()
    image_urls = set()
    high_resolution_image_urls = set()

    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Load the line as JSON
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue

            # Preprocess the record
            record = preprocess_record(record)

            # Remove duplicate URLs in the 'imageURL' and 'highResolutionImageURL' columns
            if 'imageURL' in record:
                image_urls.add(record['imageURL'])
                del record['imageURL']
            if 'highResolutionImageURL' in record:
                high_resolution_image_urls.add(record['highResolutionImageURL'])
                del record['highResolutionImageURL']

            # Remove duplicate ASINs
            if 'asin' in record:
                if record['asin'] in asins:
                    continue
                asins.add(record['asin'])

            # Remove duplicate titles
            if 'title' in record:
                if record['title'] in titles:
                    continue
                titles.add(record['title'])

            # Remove duplicate features
            if 'feature' in record:
                if isinstance(record['feature'], list):
                    record['feature'] = ','.join(record['feature'])
                if record['feature'] in features:
                    continue
                features.add(record['feature'])

            # Remove duplicate descriptions
            if 'description' in record:
                if record['description'] in descriptions:
                    continue
                descriptions.add(record['description'])

            # Remove duplicate related products
            if 'related' in record:
                if record['related'] in related_products:
                    continue
                related_products.add(record['related'])

            # Convert 'categories' to a list if it exists
            if 'categories' in record:
                if isinstance(record['categories'], str):
                    record['categories'] = record['categories'].split(',')
                elif isinstance(record['categories'], list):
                    record['categories'] = record['categories']

                # Remove duplicate categories
                record['categories'] = list(set(record['categories']))

            # Remove duplicate brand names
            if 'brand' in record:
                if record['brand'] in brands:
                    continue
                brands.add(record['brand'])

            # Remove duplicate tech1 details
            if 'tech1' in record:
                if record['tech1'] in tech1:
                    continue
                tech1.add(record['tech1'])

            # Remove duplicate tech2 details
            if 'tech2' in record:
                if record['tech2'] in tech2:
                    continue
                tech2.add(record['tech2'])

            # Remove duplicate similar products
            if 'similar' in record:
                if record['similar'] in similar_products:
                    continue
                similar_products.add(record['similar'])

            # Write the preprocessed record to the output file
            json.dump(record, outfile)
            outfile.write('\n')

    print("Preprocessing completed.")

preprocess_json('E:\Big data\Assignment 3\output.json', 'preprocessed.json')


  preprocess_json('E:\Big data\Assignment 3\output.json', 'preprocessed.json')
