# In [1]:
import pandas as pd
import json
import itertools

# Read the pre-cleaned CSV into a DataFrame; each row is later expanded
# into one or more input-output pairs.
cleaned_file_path = './data/processed_extracted_code_data.csv'  # Replace with your cleaned file path
cleaned_data = pd.read_csv(filepath_or_buffer=cleaned_file_path)

# Function to create input-output pairs
def create_input_output_pairs(row):
    """Build prompt/completion pairs for a single dataset row.

    The completion is the row's ``code`` text when that is a non-blank
    string; otherwise the ``details`` text is used (``details`` is only read
    when ``code`` is unusable). One prompt is produced for every non-empty
    combination of the usable ``device_detail``, ``category`` and
    ``platform`` values, so a row with all three present yields seven pairs.

    Args:
        row: Record indexable by column name (e.g. a pandas Series or a
            dict) providing ``device_detail``, ``category``, ``platform``,
            ``code`` and, when needed, ``details``.

    Returns:
        A list of ``{"input": ..., "output": ...}`` dicts. The list is empty
        when neither ``code`` nor ``details`` holds non-blank text, or when
        no prompt field is usable.
    """
    # Collect the usable prompt fields in a fixed order. Missing (NaN/None),
    # falsy, and whitespace-only values are skipped so they cannot produce a
    # degenerate prompt such as "Generate code for".
    field_values = []
    for column in ("device_detail", "category", "platform"):
        value = row[column]
        if pd.isna(value) or not value:
            continue
        text = str(value).strip()
        if text:
            field_values.append(text)

    # Prefer 'code', fall back to 'details'. A whitespace-only string counts
    # as missing: otherwise a blank 'code' cell would block the fallback and
    # emit a pair with an empty training target.
    output_text = None
    for column in ("code", "details"):
        candidate = row[column]
        if isinstance(candidate, str) and candidate.strip():
            output_text = candidate.strip()
            break
    if output_text is None:
        return []  # Skip rows with invalid or missing code and details

    # Every combination of 1..N usable fields becomes one prompt, all sharing
    # the same completion text.
    return [
        {
            "input": "Generate code for " + " and ".join(combination),
            "output": output_text,
        }
        for size in range(1, len(field_values) + 1)
        for combination in itertools.combinations(field_values, size)
    ]

# Expand every row of the cleaned dataset into its input-output pairs and
# flatten the per-row lists into a single list.
all_pairs = list(
    itertools.chain.from_iterable(
        create_input_output_pairs(record)
        for _, record in cleaned_data.iterrows()
    )
)

# Report how many pairs were produced
print(f"Total input-output pairs generated: {len(all_pairs)}")

# Persist the pairs as pretty-printed JSON
output_file = './data/codegen_finetune_pairs.json'  # Save as JSON
with open(output_file, 'w') as f:
    json.dump(all_pairs, f, indent=4)



# Output: Total input-output pairs generated: 4459
