In [1]:
import json

# Paths to the input and output files.
# data_file and solution_file are both read with load_jsonl, so each is
# expected to hold one JSON object per line; records from the two files are
# matched on their 'indoml_id' field.
# NOTE(review): "attrebute" looks like a misspelling of "attribute" - confirm
# against the actual file names on disk before changing these strings.
data_file = 'attrebute_train.data'
solution_file = 'attrebute_train.solution'
# Destination for the generated instruction/input/output examples (JSONL).
output_file = 'atc.jsonl'

# Load JSONL data
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed objects.

    Args:
        file_path: Path of the file to read. Every non-blank line must hold
            exactly one JSON document.

    Returns:
        list: The parsed objects, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing empty line) carry no record, and
            # json.loads('') would raise on them, so skip them.
            if not line:
                continue
            data.append(json.loads(line))
    return data

# Create prompt-completion pairs for fine-tuning
def create_fine_tuning_dataset(data, solution):
    """Build instruction/input/output examples by joining records with solutions.

    Records and solutions are matched on their ``'indoml_id'`` value. A record
    whose id has no entry in ``solution`` is skipped. If several solution
    entries share an id, the last one wins.

    Args:
        data: Iterable of dicts providing ``'indoml_id'``, ``'title'`` and
            ``'store'``; ``'details_Manufacturer'`` is optional and rendered
            as an empty string when absent.
        solution: Iterable of dicts providing ``'indoml_id'``,
            ``'details_Brand'`` and ``'L0_category'`` .. ``'L4_category'``.

    Returns:
        list[dict]: One ``{"instruction", "input", "output"}`` dict per
        matched record, in the order of ``data``.

    Raises:
        KeyError: If a required key is missing from a record or its solution.
    """
    # The instruction text is identical for every example, so build it once.
    instruction = "Given a product title and optional details (store and manufacturer), predict the brand and 5 hierarchical ecommerce product categories (L0 to L4),some times categories can be na as well. The values may or may not be present in the title."

    # Index the solutions by id for constant-time lookup.
    solution_dict = {item['indoml_id']: item for item in solution}

    fine_tuning_pairs = []
    for item in data:
        indoml_id = item['indoml_id']
        if indoml_id not in solution_dict:
            continue
        solution_item = solution_dict[indoml_id]

        # Named prompt_input rather than `input` to avoid shadowing the builtin.
        prompt_input = (
            f"Title: {item['title']}\n"
            f"Store: {item['store']}\n"
            f"Manufacturer: {item.get('details_Manufacturer', '')}"
        )

        # Target text: the brand followed by the five category levels L0..L4.
        category_text = ", ".join(
            f"L{level}_category: {solution_item[f'L{level}_category']}"
            for level in range(5)
        )
        output = f"Brand: {solution_item['details_Brand']}, {category_text}"

        fine_tuning_pairs.append(
            {"instruction": instruction, "input": prompt_input, "output": output}
        )

    return fine_tuning_pairs

# Save fine-tuning dataset to JSONL format
def save_jsonl(data, file_path):
    """Write every element of ``data`` to ``file_path``, one JSON document per line.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(file_path, 'w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in data)

# Read the raw records and their matching solution records from disk
attribute_train_data = load_jsonl(file_path=data_file)
attribute_train_solution = load_jsonl(file_path=solution_file)

# Join records with solutions into instruction/input/output examples
fine_tuning_dataset = create_fine_tuning_dataset(
    data=attribute_train_data,
    solution=attribute_train_solution,
)

# Persist the examples, one JSON object per line
save_jsonl(data=fine_tuning_dataset, file_path=output_file)

print(f"Fine-tuning dataset saved to {output_file}")


Fine-tuning dataset saved to atc.jsonl


In [2]:
import json
import random

# File paths
# NOTE(review): output_file is also assigned in the first cell ('atc.jsonl');
# the assignment below rebinds that same global name for this cell.
input_file = "atc.jsonl"  # Input JSONL file
output_file = 'atc2.jsonl'  # Output sample JSONL file

# Parameters
sample_size = 20  # Upper bound on the number of records to extract
random_sampling = True  # Set to False to take the first 'n' samples

# Load JSONL data
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed objects.

    Args:
        file_path: Path of the file to read. Every non-blank line must hold
            exactly one JSON document.

    Returns:
        list: The parsed objects, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing empty line) carry no record, and
            # json.loads('') would raise on them, so skip them.
            if not line:
                continue
            data.append(json.loads(line))
    return data

# Save JSONL data
def save_jsonl(data, file_path):
    """Write every element of ``data`` to ``file_path``, one JSON document per line.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(file_path, 'w') as sink:
        sink.writelines(json.dumps(record) + '\n' for record in data)

# Sample data
def sample_jsonl_data(data, sample_size, random_sampling, seed=None):
    """Pick up to ``sample_size`` records from ``data``.

    Args:
        data: Sequence of records to draw from.
        sample_size: Maximum number of records to return; must be >= 0.
        random_sampling: If true, draw at random without replacement;
            otherwise return the leading ``sample_size`` records.
        seed: Optional seed for a private ``random.Random`` instance so the
            random draw is repeatable. With the default ``None`` the
            module-level generator is used, as before.

    Returns:
        A new sequence holding ``min(sample_size, len(data))`` records.

    Raises:
        ValueError: If ``sample_size`` is negative.
    """
    # Reject negatives up front: random.sample would raise anyway, while the
    # slice below would silently return all but the last |sample_size| records.
    if sample_size < 0:
        raise ValueError("sample_size must be non-negative")

    if not random_sampling:
        return data[:sample_size]

    # Never request more items than exist; random.sample raises otherwise.
    count = min(sample_size, len(data))
    rng = random if seed is None else random.Random(seed)
    return rng.sample(data, count)

# Seed the module-level RNG so that the random sample (and therefore the
# written file) is the same on every Restart & Run All. Change the value to
# draw a different sample.
random_seed = 42
random.seed(random_seed)

# Load the large JSONL file
data = load_jsonl(input_file)

# Take a sample of the data
sampled_data = sample_jsonl_data(data, sample_size, random_sampling)

# Save the sampled data to a new JSONL file
save_jsonl(sampled_data, output_file)

print(f"Sampled {len(sampled_data)} records saved to {output_file}")


Sampled 20 records saved to atc2.jsonl


In [3]:
import json

# Source (JSON Lines) and destination (single JSON array) paths
input_jsonl_file = 'atc2.jsonl'
output_json_file = 'atc2.json'

# Accumulates one parsed object per input line, in file order
json_list = []

# Parse every line of the JSONL file
with open(input_jsonl_file, 'r') as jsonl_in:
    json_list.extend(json.loads(raw_line.strip()) for raw_line in jsonl_in)

# Emit the whole list as one pretty-printed JSON array
with open(output_json_file, 'w') as json_out:
    json.dump(json_list, json_out, indent=4)

print(f"Converted {input_jsonl_file} to {output_json_file}")


Converted atc2.jsonl to atc2.json


In [6]:
import json

# Path to the input JSONL file and the output JSON file
input_jsonl_file = 'ola_ins.jsonl'
output_json_file = 'ola_final_train.json'

# Upper bound on how many leading records are copied to the output
max_records = 200000

# Holds at most max_records parsed objects, in file order
json_list = []

# Read the JSONL file line by line, stopping once the cap is reached
with open(input_jsonl_file, 'r') as f:
    for i, line in enumerate(f):
        if i >= max_records:
            break
        json_obj = json.loads(line.strip())
        json_list.append(json_obj)

# Write the list of JSON objects to a single JSON file
with open(output_json_file, 'w') as f:
    json.dump(json_list, f, indent=4)

# Report the number of records actually written: the input may hold fewer
# than max_records lines. The ',' format spec renders 200000 as "200,000".
print(f"Converted the first {len(json_list):,} records from {input_jsonl_file} to {output_json_file}")


Converted the first 200,000 records from ola_ins.jsonl to ola_final_train.json
