Dataset Loading

In [1]:
import json
import os
from tqdm import tqdm

def sampleData(inputFile, outputFile, targetSize, filterKey='also_buy'):
    """Copy records with a truthy ``filterKey`` field until a size budget is met.

    The input is read as JSON Lines (one JSON object per line). Each record
    for which ``data.get(filterKey)`` is truthy is re-serialized with
    ``json.dumps`` and appended to ``outputFile``. Copying stops once the
    bytes written reach the budget, or when the input is exhausted.

    Args:
        inputFile: Path of the JSON Lines file to read (UTF-8).
        outputFile: Path of the file to create or overwrite (UTF-8).
        targetSize: Size budget in GiB (multiplied by 1024**3); may be fractional.
        filterKey: Key that must hold a truthy value for a record to be kept.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    targetBytes = targetSize * 1024 * 1024 * 1024
    writtenBytes = 0

    # `source`/`sink` instead of `input`/`output` so the builtin input() is not shadowed.
    with open(inputFile, 'r', encoding='utf-8') as source, \
            open(outputFile, 'w', encoding='utf-8') as sink:
        for line in tqdm(source):
            # Check the budget before doing any work so a zero/negative
            # target produces an empty sample instead of one stray record.
            if writtenBytes >= targetBytes:
                break

            # Blank lines (e.g. a trailing newline) are not records.
            if not line.strip():
                continue

            data = json.loads(line)
            if data.get(filterKey):
                record = json.dumps(data) + '\n'
                sink.write(record)
                # Count what was actually written: json.dumps may escape
                # non-ASCII characters or change spacing, so the input
                # line's length is not a reliable measure of output size.
                writtenBytes += len(record.encode('utf-8'))

# Build a sample capped at 0.5 GiB, keeping only records whose default
# filter key ('also_buy') holds a non-empty value.
sampleData(
    inputFile='Sampled_Amazon_Meta.json',
    outputFile='Sample_Amazon_Meta3.json',
    targetSize=0.5,
)


0it [00:00, ?it/s]

54093it [00:04, 11735.30it/s]


Pre-processing

In [5]:
# import json 
# import re
# from tqdm import tqdm

# def preprocessing(inputFile, outputFile):
#     processedData = []

#     with open(inputFile, 'r', encoding='utf-8') as inputFilee:
#         # Count the total number of lines in the input file for tqdm progress tracking
#         total_lines = sum(1 for _ in open(inputFile, 'r', encoding='utf-8'))

#         with tqdm(total=total_lines, desc='Processing Data') as pbar:
#             for line in inputFilee:
#                 data = json.loads(line)

#                 if 'title' in data and 'related' in data and 'also_bought' in data['related']:
#                     title = data.get('title', '')
#                     titleCleaned = re.sub(r'[\d\W_ ]+', '', title).lower()

#                     # Preprocess other fields if needed
#                     feature = data.get('feature', '')
#                     featureCleaned = re.sub(r'[\d\W_ ]+', '', feature).lower()

#                     description = data.get('description', '')
#                     descriptionCleaned = re.sub(r'[\d\W_ ]+', '', description).lower()

#                     brand = data.get('brand', '')
#                     brandCleaned = re.sub(r'[\d\W_ ]+', '', brand).lower()

#                     # Create a transaction with preprocessed data
#                     transaction = {
#                         'title': titleCleaned,
#                         'feature': featureCleaned,
#                         'description': descriptionCleaned,
#                         'brand': brandCleaned,
#                         'related': data['related']['also_bought']
#                     }
#                     processedData.append(transaction)

#                 # Update tqdm progress bar
#                 pbar.update(1)

#     with open(outputFile, 'w', encoding='utf-8') as outputFilee:
#         for item in processedData:
#             outputFilee.write(json.dumps(item) + '\n')

# preprocessing('Sample_Amazon_Meta2.json', 'Preprocessed_Amazon_Meta222.json')

import json 
import re
from tqdm import tqdm

def preprocess_text(text):
    """Normalize a text field to lowercase words separated by single spaces.

    Every run of digits, non-word characters and underscores is replaced by
    one space, the result is lowercased, and leading/trailing whitespace is
    stripped.

    Args:
        text: The value to clean. A ``str`` is cleaned directly. ``None``
            yields an empty string. A list or tuple has its non-``None``
            items converted with ``str`` and joined with spaces before
            cleaning. Any other value is converted with ``str`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    if text is None:
        return ''

    # Metadata fields are not guaranteed to be plain strings; re.sub raises
    # TypeError on anything else, so coerce to a single string first.
    if isinstance(text, (list, tuple)):
        text = ' '.join(str(part) for part in text if part is not None)
    elif not isinstance(text, str):
        text = str(text)

    # Collapse digits, special characters and underscores into one space.
    cleaned_text = re.sub(r'[\d\W_]+', ' ', text)
    return cleaned_text.lower().strip()

def preprocessing(inputFile, outputFile, verbose=False):
    """Clean the text fields of a JSON Lines file and write the result.

    For each input line the JSON object is parsed, the ``title``,
    ``feature``, ``description`` and ``brand`` values are passed through
    ``preprocess_text``, and ``categories`` and ``related`` are copied
    unchanged (defaulting to ``''`` and ``{}`` when absent). All other keys
    are dropped. Records are collected in memory and written to
    ``outputFile`` as JSON Lines once the whole input has been read.

    A line that fails to parse or process is reported on stdout and skipped;
    processing continues with the next line.

    NOTE(review): only the six keys above survive. If downstream steps need
    other fields (for example the key used to sample the input), confirm
    they are not expected in the output.

    Args:
        inputFile: Path of the JSON Lines file to read (UTF-8).
        outputFile: Path of the file to create or overwrite (UTF-8).
        verbose: When true, print every preprocessed record. Off by default
            because one print per record floods the cell output and breaks
            up the progress bar.
    """
    # Count lines up front so tqdm can show a total. Use a context manager
    # so this second handle on the input is closed instead of leaked.
    with open(inputFile, 'r', encoding='utf-8') as lineCounter:
        total_lines = sum(1 for _ in lineCounter)

    processedData = []

    with open(inputFile, 'r', encoding='utf-8') as source:
        with tqdm(total=total_lines, desc='Processing Data') as pbar:
            for line in source:
                try:
                    data = json.loads(line)

                    transaction = {
                        # Free-text fields are normalized.
                        'title': preprocess_text(data.get('title', '')),
                        'feature': preprocess_text(data.get('feature', '')),
                        'description': preprocess_text(data.get('description', '')),
                        'brand': preprocess_text(data.get('brand', '')),
                        # Structured fields are passed through untouched.
                        'categories': data.get('categories', ''),
                        'related': data.get('related', {}),
                    }
                    if verbose:
                        print("Preprocessed data:", transaction)
                    processedData.append(transaction)
                except Exception as e:
                    # Deliberate best-effort: report the bad line and keep going.
                    print(f"Error processing line: {line}")
                    print(e)

                # Advance the bar for every line, including skipped ones,
                # so it reaches the precomputed total.
                pbar.update(1)

    with open(outputFile, 'w', encoding='utf-8') as sink:
        for item in processedData:
            sink.write(json.dumps(item) + '\n')

# Clean the sampled records and store them as a new JSON Lines file.
preprocessing(
    inputFile='Sample_Amazon_Meta3.json',
    outputFile='Preprocessed_Amazon_Meta2.json',
)