# Parsing LLM Output
This notebook takes the output of a fine-tuned Llama 65B model and parses it to prepare a dataset that will be used for RLHF.

## Load in output and parse it
This block loads the output, parses each generation into its question and solution, and executes the solution code. It includes `try`/`except` blocks for cases where the model produces an incomplete output.

In [6]:
file_path = 'questions_5shot.txt'

# Text that precedes every generation in the raw model output.
GENERATION_MARKER = "Prompting Approach: few shot. Generated Text: ### Response:"
# Each solution is cut off after its first occurrence of this line, so any
# text the model kept generating after the function is never executed.
RETURN_LINE = '    return result'


def _split_question_and_solution(section):
    """Return ``(question, solution_code)`` extracted from one generation.

    Raises:
        IndexError: if the section has no "Question:" or no "Solution:" marker.
    """
    question = section.split("Question:")[1].strip()
    question = question.split("Solution:")[0].strip()
    code = section.split("Solution:")[1].strip()
    code = code.split('\n' + RETURN_LINE)[0] + '\n' + RETURN_LINE
    return question, code


def _indent_body(code):
    """Return ``code`` with every body line indented by four extra spaces.

    Used as a repair for generations that emitted the function body without
    indentation; the ``def solution():`` line and the return line are kept as is.
    """
    fixed = ""
    for line in code.split('\n'):
        if line.startswith('def solution():'):
            fixed += line + '\n'
        elif line.startswith(RETURN_LINE):
            fixed += line
        else:
            fixed += '    ' + line + '\n'
    return fixed


def _run_solution(code):
    """Execute ``code`` in a fresh namespace and return the value of ``solution()``.

    SECURITY NOTE: this runs model-generated code with ``exec``. Only use it on
    output you trust, ideally in a sandboxed environment; nothing here guards
    against long-running or malicious code.
    """
    # A private namespace keeps the generated `solution` function from
    # overwriting notebook variables (and from seeing stale notebook state).
    namespace = {}
    exec(code, namespace)
    return namespace['solution']()


data = []
n_unparsed = 0  # generations without a question/solution pair
n_failed = 0    # generations whose code could not be executed, even after re-indenting

with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

# Everything before the first marker is preamble, not a generation.
sections = text.split(GENERATION_MARKER)[1:]
total_gens = len(sections)

for section in sections:
    try:
        question, solution = _split_question_and_solution(section)
    except IndexError:
        n_unparsed += 1
        continue

    # Try the code as generated first, then the re-indented variant.
    for candidate in (solution, _indent_body(solution)):
        try:
            answer = _run_solution(candidate)
        except (Exception, SystemExit):
            # Best effort: broken generations are skipped. KeyboardInterrupt is
            # deliberately not caught so the cell can still be interrupted.
            continue
        data.append({"question": question, "solution": candidate, 'answer': answer})
        break
    else:
        n_failed += 1

print(f"Kept {len(data)} of {total_gens} generations "
      f"({n_unparsed} without a solution, {n_failed} failed to execute).")

In [7]:
# Give every kept record an identifier based on its position in `data`.
for position, record in enumerate(data):
    record['question_number'] = f"question{position}"

In [9]:
# Number of generations that were parsed and executed successfully
# (generations that failed either step were skipped when building `data`).
len(data)

3222

In [15]:
# Reduced copy of each record for Toloka: identifier, question text and
# computed answer only — the solution code is left out.
data_toloka = [
    {key: record[key] for key in ('question_number', 'question', 'answer')}
    for record in data
]

## Export as JSON and CSV

In [30]:
import json
import os

import pandas as pd

# One name per output file, so it is explicit which JSON the CSV is built from.
FULL_JSON_PATH = 'data/questions_run1.json'
TOLOKA_JSON_PATH = 'data/questions_run1_toloka.json'
CSV_PATH = 'data/questions_run1.csv'
# Only the first MAX_CSV_ROWS rows with an answer are written to the CSV.
MAX_CSV_ROWS = 2000

# Create the output directory up front so a fresh checkout does not fail on open().
os.makedirs(os.path.dirname(FULL_JSON_PATH), exist_ok=True)

# Full records (question, solution code, answer, question number).
with open(FULL_JSON_PATH, 'w') as json_file:
    json.dump(data, json_file, indent=4)

print("Full data saved to JSON successfully.")

# Reduced records without the solution code.
with open(TOLOKA_JSON_PATH, 'w') as json_file:
    json.dump(data_toloka, json_file, indent=4)

print("Toloka data saved to JSON successfully.")

# The CSV is derived from the Toloka JSON: drop rows with a missing answer,
# then keep the first MAX_CSV_ROWS rows. The frame index is written as the
# first CSV column, as before.
toloka_df = pd.read_json(TOLOKA_JSON_PATH)
df = toloka_df.dropna(subset=['answer'])[:MAX_CSV_ROWS]
df.to_csv(CSV_PATH)
print("Data saved to CSV successfully.")

Full data saved to JSON successfully.
Toloka data saved to JSON successfully.
Data saved to CSV successfully.


In [21]:
# Inspect a single parsed record by its position in `data`.
data[3200]

{'question': 'A cake of soap weighs 4 ounces.  A box of soap powder weighs 16 ounces.  A bottle of detergent weighs 20 ounces.  A tote bag can hold 100 ounces of laundry supplies.  How many tote bags are needed to carry 12 cakes of soap, 8 boxes of soap powder, and 6 bottles of detergent?',
 'solution': 'def solution():\n    #A cake of soap weighs 4 ounces\n    soap_cake_weight = 4\n    #A box of soap powder weighs 16 ounces\n    soap_powder_weight = 16\n    #A bottle of detergent weighs 20 ounces\n    detergent_weight = 20\n    #A tote bag can hold 100 ounces\n    tote_bag_capacity = 100\n    #12 cakes of soap weigh\n    soap_cake_total_weight = soap_cake_weight * 12\n    #8 boxes of soap powder weigh\n    soap_powder_total_weight = soap_powder_weight * 8\n    #6 bottles of detergent weigh\n    detergent_total_weight = detergent_weight * 6\n    #The total weight of all the laundry supplies is\n    total_weight = soap_cake_total_weight + soap_powder_total_weight + detergent_total_weigh

In [22]:
# Inspect another parsed record by its position in `data`.
data[3206]

{'question': 'A cake of soap weighs 4 ounces.  A box of soap powder weighs 16 ounces.  A bottle of detergent weighs 20 ounces.  A tote bag full of laundry weighs 40 pounds.  How many ounces of soap and detergent, in total, does a load of laundry weigh?',
 'solution': 'def solution():\n    #A cake of soap weighs 4 ounces\n    soap_cake_weight = 4\n    #A box of soap powder weighs 16 ounces\n    soap_powder_weight = 16\n    #A bottle of detergent weighs 20 ounces\n    detergent_weight = 20\n    #A tote bag full of laundry weighs 40 pounds\n    laundry_weight = 40\n    #A pound is 16 ounces\n    pound_to_ounces = 16\n    #The laundry weighs 40 * 16 = <<40*16=640>>640 ounces\n    laundry_ounces = laundry_weight * pound_to_ounces\n    #The soap and detergent weigh 4 + 16 + 20 = <<4+16+20=40>>40 ounces\n    soap_and_detergent_weight = soap_cake_weight + soap_powder_weight + detergent_weight\n    #The answer is\n    result = laundry_ounces + soap_and_detergent_weight\n    return result',
 'an

In [26]:
# Inspect the record at position 3185 (question, solution code, answer, question number).
data[3185]

{'question': 'A 20-ounce soda has 12 ounces of sugar. A 12-ounce soda has 8 ounces of sugar. How much more sugar is in a 20-ounce soda than in a 12-ounce soda?',
 'solution': 'def solution():\n    #A 20-ounce soda has 12 ounces of sugar\n    sugar_20_oz = 12\n    #A 12-ounce soda has 8 ounces of sugar\n    sugar_12_oz = 8\n    #The answer is\n    result = sugar_20_oz - sugar_12_oz\n    return result',
 'answer': 4,
 'question_number': 'question3185'}

In [25]:
# Inspect the record at position 3186 (question, solution code, answer, question number).
data[3186]

{'question': 'A 20-ounce soda has 10 ounces of sugar. How much sugar is in a 6-pack of soda?',
 'solution': 'def solution():\n    #A soda has 10 ounces of sugar\n    sugar_per_soda = 10\n    #A 6-pack has 6 sodas\n    sodas_in_6_pack = 6\n    #The answer is\n    result = sugar_per_soda * sodas_in_6_pack\n    return result',
 'answer': 60,
 'question_number': 'question3186'}