In [14]:
import json
import os
from tqdm import tqdm

def sampleData(inputFile, outputFile, targetSize, filterKey='also_buy', bytesPerUnit=1024 * 3):
    """Copy records with a truthy ``filterKey`` from a JSON-lines file until a size budget is hit.

    Every non-blank line of ``inputFile`` is parsed as JSON. A record whose
    ``filterKey`` value is truthy is re-serialised with ``json.dumps`` and
    written to ``outputFile`` on its own line. The UTF-8 byte length of each
    kept *input* line is added to a running total, and reading stops as soon
    as that total reaches ``targetSize * bytesPerUnit``.

    Args:
        inputFile: Path of the JSON-lines file to read (UTF-8).
        outputFile: Path of the file to create or overwrite (UTF-8).
        targetSize: Size budget, expressed in units of ``bytesPerUnit``.
        filterKey: Key that must map to a truthy value for a record to be kept.
        bytesPerUnit: Number of bytes one unit of ``targetSize`` stands for.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # NOTE(review): the default multiplier is 1024 * 3 (= 3072 bytes), kept so
    # existing calls behave as before. If targetSize was meant to be in GiB the
    # caller should pass bytesPerUnit=1024 ** 3 -- confirm the intended unit.
    byteBudget = targetSize * bytesPerUnit
    keptBytes = 0

    # `source`/`sink` instead of `input`/`output` so the builtin input() is not shadowed.
    with open(inputFile, 'r', encoding='utf-8') as source, \
            open(outputFile, 'w', encoding='utf-8') as sink:
        for line in tqdm(source):
            # Blank or whitespace-only lines are not records; json.loads would raise on them.
            if not line.strip():
                continue

            record = json.loads(line)
            if record.get(filterKey):
                sink.write(json.dumps(record) + '\n')
                keptBytes += len(line.encode('utf-8'))

            # Checked after every parsed line, whether or not it was kept.
            if keptBytes >= byteBudget:
                break

def print_head(file_path, n=5):
    """Print at most the first ``n`` lines of a UTF-8 text file.

    The lines are concatenated unchanged (each keeps its own newline) and
    handed to ``print`` in a single call. A file shorter than ``n`` lines is
    printed in full.

    Args:
        file_path: Path of the file to preview.
        n: Maximum number of lines to show.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # zip() stops at whichever runs out first: the counter or the file.
        leading_lines = [text for _, text in zip(range(n), handle)]
        print(''.join(leading_lines))


# Build the filtered sample (size budget of one unit), then preview what was written.
sampleData('Sample_Amazon_Meta.json', 'Sample_Amazon_Meta2.json', targetSize=1)

print("Head of sampled data:")
print_head('Sample_Amazon_Meta2.json', n=5)


0it [00:00, ?it/s]

Head of sampled data:
{"category": ["Clothing, Shoes & Jewelry", "Men", "Clothing", "Jeans", "Denim", "Zipper closure", "Material: cotton", "Style: hip pop", "Two side slant pockets and two back pockets", "Straight fit long pants"], "tech1": "", "description": ["<b>pant size(Unit:inch)</b><br> W30(tag30) Waist: 30.0 Hip: 41.7 Length: 43.3 Thigh: 26.8 Leg opening: 16.5 <br> W32(tag32) Waist: 32.0 Hip: 43.7 Length: 43.7 Thigh: 27.6 Leg opening: 16.9 <br> W33(tag34) Waist: 33.9 Hip: 45.7 Length: 44.1 Thigh: 28.3 Leg opening: 17.3 <br> W34(tag36) Waist: 35.8 Hip: 47.6 Length: 44.5 Thigh: 29.1 Leg opening: 17.7 <br> W36(tag38) Waist: 37.8 Hip: 49.6 Length: 44.9 Thigh: 29.9 Leg opening: 18.1 <br> W38(tag40) Waist: 40.0 Hip: 51.6 Length: 45.3 Thigh: 30.7 Leg opening: 18.5 <br> W40(tag42) Waist: 42.0 Hip: 53.5 Length: 45.7 Thigh: 31.5 Leg opening: 18.9 <br> W42(tag44) Waist: 44.0 Hip: 55.5 Length: 46.1 Thigh: 32.3 Leg opening: 19.3 <br> W44(tag46) Waist: 46.0 Hip: 57.5 Length: 46.5 Thigh: 33.1




In [20]:
import json
import re

def preprocess_data(data):
    """Reduce one product record to the fields used downstream, with HTML tags stripped.

    The function is idempotent: feeding its own output back in returns an
    equal dict. That matters because records are sometimes read back from an
    already-preprocessed file, where ``description`` is a plain string and the
    category list lives under ``categories`` rather than ``category``.

    Args:
        data: Mapping for a single record. ``description`` may be a list of
            strings, a single string, ``None`` or absent.

    Returns:
        A new dict with the keys ``title``, ``description``, ``price``,
        ``brand``, ``categories`` and ``also_buy`` (in that order). Missing
        scalar fields become ``None``, missing list fields become ``[]`` and
        a missing description becomes ``''``.
    """
    description_parts = data.get('description') or []
    if isinstance(description_parts, str):
        # Joining a bare string would insert a space between every character.
        description_parts = [description_parts]
    description_text = ' '.join(description_parts)

    return {
        'title': data.get('title'),
        # Anything of the form <...> is treated as an HTML tag and removed.
        'description': re.sub(r'<[^>]+>', '', description_text),
        'price': data.get('price'),
        'brand': data.get('brand'),
        # Raw records use 'category'; fall back to the key this function itself emits.
        'categories': data.get('category', data.get('categories', [])),
        'also_buy': data.get('also_buy', []),
    }


def preprocess_file(input_file, output_file):
    """Run ``preprocess_data`` over a JSON-lines file and write the results as JSON lines.

    Args:
        input_file: Path of the UTF-8 JSON-lines file to read.
        output_file: Path of the UTF-8 file to create or overwrite; it receives
            one ``json.dumps`` line per input line.
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        with open(output_file, 'w', encoding='utf-8') as sink:
            # Lazily transform each line so only one record is held at a time.
            sink.writelines(
                json.dumps(preprocess_data(json.loads(raw_line))) + '\n'
                for raw_line in source
            )

def stream_processing(input_file, target_size, on_batch=None, bytes_per_unit=1024 * 3):
    """Read a JSON-lines file and group its preprocessed records into size-bounded batches.

    Each line is parsed, passed through ``preprocess_data`` and appended to
    the current batch. The UTF-8 byte length of the line is added to a running
    total; once the total reaches ``target_size * bytes_per_unit`` the batch is
    handed to ``on_batch`` and a fresh batch is started. Any records left over
    when the file ends are delivered as a final, smaller batch instead of
    being dropped.

    Args:
        input_file: Path of the UTF-8 JSON-lines file to read.
        target_size: Batch size budget, in units of ``bytes_per_unit``.
        on_batch: Optional callable invoked with each completed batch (a list
            of preprocessed records), e.g. to run frequent-itemset mining.
            With the default ``None`` the batches are built and discarded.
        bytes_per_unit: Number of bytes one unit of ``target_size`` stands for.

    Returns:
        None.
    """
    # NOTE(review): the default multiplier is 1024 * 3 (= 3072 bytes), kept so
    # existing calls behave as before. If target_size was meant to be in GiB
    # pass bytes_per_unit=1024 ** 3 -- confirm the intended unit.
    byte_budget = target_size * bytes_per_unit
    current_size = 0
    batch = []

    with open(input_file, 'r', encoding='utf-8') as f:
        for line in tqdm(f):
            batch.append(preprocess_data(json.loads(line)))
            current_size += len(line.encode('utf-8'))

            if current_size >= byte_budget:
                if on_batch is not None:
                    on_batch(batch)
                current_size = 0
                # Rebind rather than clear(): the consumer may keep the list it was given.
                batch = []

    # Deliver the trailing partial batch so the last records are not lost.
    if batch and on_batch is not None:
        on_batch(batch)

def batch_processing(input_file):
    """Parse every line of a JSON-lines file and run ``preprocess_data`` on it.

    The preprocessed records are not stored or returned; this is a scaffold
    for batch-style analysis.

    Args:
        input_file: Path of the UTF-8 JSON-lines file to read.
    """
    with open(input_file, 'r', encoding='utf-8') as handle:
        for raw_line in tqdm(handle):
            preprocess_data(json.loads(raw_line))
            # Perform batch processing steps here

# Write the cleaned records to a new JSON-lines file
preprocess_file(
    input_file='Sample_Amazon_Meta2.json',
    output_file='Preprocessed_Amazon_Meta.json',
)

# Streaming pass over the sampled file (size budget of one unit)
stream_processing('Sample_Amazon_Meta2.json', 1)

# Batch pass over the same sampled file
batch_processing(input_file='Sample_Amazon_Meta2.json')


1it [00:00, 3744.91it/s]
1it [00:00, 6502.80it/s]


In [21]:
import json
from kafka import KafkaProducer

def produce_data(input_file, topic):
    """Preprocess every record of a JSON-lines file and publish it to a Kafka topic.

    A ``KafkaProducer`` is created against ``localhost:9092``. Each line of
    ``input_file`` is parsed, passed through ``preprocess_data``, serialised
    with ``json.dumps`` and sent UTF-8 encoded to ``topic``.

    ``send`` only queues a message, so the producer is flushed after the last
    record and closed in all cases; otherwise queued messages can be lost when
    the process exits.

    Args:
        input_file: Path of the UTF-8 JSON-lines file to read. Every line is
            run through ``preprocess_data`` before being sent.
        topic: Name of the Kafka topic to publish to.
    """
    producer = KafkaProducer(bootstrap_servers=['localhost:9092'])
    try:
        with open(input_file, 'r', encoding='utf-8') as f:
            for line in f:
                record = preprocess_data(json.loads(line))
                producer.send(topic, json.dumps(record).encode('utf-8'))
        # Block until everything queued by send() has actually been delivered.
        producer.flush()
    finally:
        # Release the producer's resources even if reading or sending failed.
        producer.close()

if __name__ == "__main__":
    # produce_data() applies preprocess_data() to every line itself, so it must
    # be fed the raw sampled records. Pointing it at the already-preprocessed
    # file would preprocess each record a second time.
    input_file = 'Sample_Amazon_Meta2.json'
    topic = 'preprocessed_data_topic'
    produce_data(input_file, topic)