In [5]:
import json
import pandas as pd

# Load the JSONL file
def load_jsonl(file_path):
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            data.append(json.loads(line))
    return data

# Extract relevant information
def extract_data(data):
    extracted_data = []
    for entry in data:
        instruction = entry.get("instruction", "")
        input_data = entry.get("input", "")
        response = entry.get("generated_response", "")

        # Extracting question, choices, and answer from the response
        question = response.split("question: ")[1].split(",")[0] if "question: " in response else None
        choices_start = response.find("choices: ")
        choices_end = response.find("answer: ")
        choices = response[choices_start:choices_end].replace("choices: ", "").strip() if choices_start != -1 and choices_end != -1 else None
        answer = response.split("answer: ")[1].strip() if "answer: " in response else None

        if question and choices and answer:
            extracted_data.append({
                "instruction": instruction,
                "input": input_data,
                "question": question,
                "choices": choices,
                "answer": answer
            })
    return extracted_data

# File path
file_path = 'generated_test_results_few_shot_LLaMA.jsonl'

# Load and process the data
data = load_jsonl(file_path)
extracted_data = extract_data(data)

# Convert to DataFrame
extracted_df = pd.DataFrame(extracted_data)

# Save the DataFrame to a CSV file for user access
output_xlsx_path = 'formatted_generated_test_results_few_shot_LLaMA.xlsx'
extracted_df.to_excel(output_xlsx_path, index=False)

print(f"The extracted data has been saved to: {output_xlsx_path}")


The extracted data has been saved to: formatted_generated_test_results_few_shot_LLaMA.xlsx


In [3]:
import json
import pandas as pd

# Load the JSONL file
def load_jsonl(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line))
    return data

# Extract relevant information
def extract_data(data):
    extracted_data = []

    for entry in data:
        instruction = entry.get("instruction", "")
        input_data = entry.get("input", "")
        response = entry.get("generated_response", "")

        # Extracting question
        question = None
        if "question: " in response:
            try:
                question = response.split("question: ")[1].split(",")[0]
            except IndexError:
                question = None  # If split fails, set to None

        # Extracting choices
        choices_start = response.find("choices: ")
        choices_end = response.find("answer: ")
        choices = None
        if choices_start != -1 and choices_end != -1:
            choices = response[choices_start:choices_end].replace("choices: ", "").strip()
        
        # Extracting answer
        answer = None
        if "answer: " in response:
            try:
                answer = response.split("answer: ")[1].strip()
            except IndexError:
                answer = None  # If split fails, set to None

        # Append all data, even if some values are missing
        extracted_data.append({
            "instruction": instruction if instruction else "",
            "input": input_data if input_data else "",
            "question": question if question else "",
            "choices": choices if choices else "",
            "answer": answer if answer else ""
        })

    return extracted_data

# File path
file_path = 'generated_test_results_few_shot_LLaMA.jsonl'

# Load and process the data
data = load_jsonl(file_path)
extracted_data = extract_data(data)

# Convert to DataFrame
extracted_df = pd.DataFrame(extracted_data)

# Save the DataFrame to an Excel file
output_xlsx_path = 'formatted_generated_test_results_few_shot_LLaMA.xlsx'
extracted_df.to_excel(output_xlsx_path, index=False)

print(f"The extracted data has been saved to: {output_xlsx_path}")


The extracted data has been saved to: formatted_generated_test_results_few_shot_LLaMA.xlsx


In [None]:
import pandas as pd
import re

# Load the Excel file
file_path = "formatted_generated_test_results_few_shot_LLaMA.xlsx"  # Change this to your actual file path
df = pd.read_excel(file_path)

# Function to remove everything after the first number
def keep_first_number(cell):
    if pd.isna(cell):  # Check if the cell is NaN
        return cell
    match = re.search(r'^\d+', str(cell).strip())  # Find the first number at the beginning
    return match.group(0) if match else ""

# Apply the function only to the "Answer" column
if "answer" in df.columns:
    df["answer"] = df["answer"].apply(keep_first_number)
else:
    print("Column 'answer' not found in the file.")

# Save the modified file
output_path = "formatted_generated_test_results_few_shot_LLaMA.xlsx"
df.to_excel(output_path, index=False)

print(f"Processed file saved as: {output_path}")


Processed file saved as: formatted_generated_test_results_few_shot.xlsx


In [7]:
import json
import pandas as pd
import re

# Load the JSONL file
def load_jsonl(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            data.append(json.loads(line))
    return data

# Extract everything after "### Response:"
def extract_response_data(data):
    extracted_data = []
    for entry in data:
        instruction = entry.get("instruction", "")
        input_data = entry.get("input", "")
        response = entry.get("generated_response", "")

        # Extract everything after "### Response:"
        match = re.search(r"### Response:\s*(.*)", response, re.DOTALL)
        extracted_response = match.group(1).strip() if match else response  # Default to full response if no match

        extracted_data.append({
            "instruction": instruction,
            "input": input_data,
            "extracted_response": extracted_response
        })
    
    return extracted_data

# File path
file_path = 'generated_test_results_FULL_LLaMA.jsonl'

# Load and process the data
data = load_jsonl(file_path)
extracted_data = extract_response_data(data)

# Convert to DataFrame
extracted_df = pd.DataFrame(extracted_data)

# Save the DataFrame to an Excel file
output_xlsx_path = 'formatted_generated_test_results_FULL_LLaMA.xlsx'
extracted_df.to_excel(output_xlsx_path, index=False)

print(f"The extracted data has been saved to: {output_xlsx_path}")


The extracted data has been saved to: formatted_generated_test_results_FULL_LLaMA.xlsx


In [12]:
import pandas as pd
import json

# Define the input JSONL file
input_file = 'generated_test_results_few_shot_LLaMA.jsonl' # Replace with your actual JSONL file

# Initialize a list to store extracted data
parsed_data = []

# Read JSONL file line by line
with open(input_file, "r", encoding="utf-8") as file:
    for line_number, line in enumerate(file, start=1):
        try:
            # Ignore empty lines
            if not line.strip():
                continue
            
            # Parse each line as a JSON object
            data = json.loads(line.strip())

            # Extract 'generated_response' and ensure it's valid JSON
            response_text = data.get("generated_response", "").strip()
            if not response_text:
                print(f"Skipping empty response at line {line_number}")
                continue
            
            try:
                # Convert 'generated_response' from string to dictionary
                response_dict = json.loads(response_text)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON in 'generated_response' at line {line_number}")
                continue

            # Extract relevant fields
            question = response_dict.get("question", "").strip()
            choices = response_dict.get("choices", [])
            answer = response_dict.get("answer", "").strip()

            # Validate extracted data
            if question and choices and answer:
                parsed_data.append({
                    "Question": question,
                    "Choices": ", ".join(choices),  # Store choices as comma-separated text
                    "Answer": answer
                })
            else:
                print(f"Skipping incomplete data at line {line_number}")

        except json.JSONDecodeError:
            print(f"Skipping invalid JSON at line {line_number}")
        except Exception as e:
            print(f"Error processing line {line_number}: {e}")

# Convert extracted data to DataFrame
parsed_df = pd.DataFrame(parsed_data)

# Save extracted data to an Excel file
output_excel = 'formatted_generated_test_results_few_shot_LLaMA.xlsx'
parsed_df.to_excel(output_excel, index=False)

print(f"✅ Extraction completed. The Excel file is saved as '{output_excel}'")


Skipping invalid JSON in 'generated_response' at line 1
Skipping invalid JSON in 'generated_response' at line 2
Skipping invalid JSON in 'generated_response' at line 3
Skipping invalid JSON in 'generated_response' at line 4
Skipping invalid JSON in 'generated_response' at line 5
Skipping invalid JSON in 'generated_response' at line 6
Skipping invalid JSON in 'generated_response' at line 7
Skipping invalid JSON in 'generated_response' at line 8
Skipping invalid JSON in 'generated_response' at line 9
Skipping invalid JSON in 'generated_response' at line 10
Skipping invalid JSON in 'generated_response' at line 11
Skipping invalid JSON in 'generated_response' at line 12
Skipping invalid JSON in 'generated_response' at line 13
Skipping invalid JSON in 'generated_response' at line 14
Skipping invalid JSON in 'generated_response' at line 15
Skipping invalid JSON in 'generated_response' at line 16
Skipping invalid JSON in 'generated_response' at line 17
Skipping invalid JSON in 'generated_resp

In [14]:
import pandas as pd
import json
import re

# Load the Excel file
file_path = "generated_test_results_few_shot_LLaMA.csv"  # Replace with your actual file path
df = pd.read_csv(file_path)

# Define a function to extract question, choices, and answer
def extract_question_data(response_text):
    try:
        # Parse the JSON from the response
        response_dict = json.loads(response_text)
        
        # Extract required fields
        question = response_dict.get("question", "").strip()
        choices = response_dict.get("choices", [])
        answer = response_dict.get("answer", "").strip()
        
        return question, choices, answer
    except Exception as e:
        return None, None, None

# Initialize lists to store extracted data
parsed_data = []

# Iterate over each row in the dataset
for index, row in df.iterrows():
    response_text = row["generated_response"]  # Adjust column name if necessary
    question, choices, answer = extract_question_data(response_text)

    # Store only if extraction was successful
    if question and choices and answer:
        parsed_data.append({
            "Question": question,
            "Choices": choices,
            "Answer": answer
        })

# Convert to DataFrame
parsed_df = pd.DataFrame(parsed_data)

# Save extracted data to a new Excel file
output_file = "formatted_generated_test_results_few_shot_LLaMA.xlsx"
parsed_df.to_excel(output_file, index=False)

print(f"✅ Extraction completed. The Excel file is saved as '{output_excel}'")



✅ Extraction completed. The Excel file is saved as 'formatted_generated_test_results_few_shot_LLaMA.xlsx'


In [16]:
import pandas as pd
import json

# Load the CSV file
file_path = "generated_test_results_few_shot_LLaMA.csv"  # Replace with your actual CSV file path
df = pd.read_csv(file_path)

# Check if the expected column exists
if "generated_response" not in df.columns:
    raise ValueError("The column 'generated_response' is missing in the CSV file. Check column names.")

# Function to extract question, choices, and answer
def extract_question_data(response_text):
    try:
        # Ensure text is not empty or NaN
        if pd.isna(response_text) or not isinstance(response_text, str) or response_text.strip() == "":
            return None, None, None

        # Parse JSON safely
        response_dict = json.loads(response_text)

        # Extract required fields
        question = response_dict.get("question", "").strip()
        choices = response_dict.get("choices", [])
        answer = response_dict.get("answer", "").strip()

        # Validate extracted data
        if not question or not choices or not answer:
            return None, None, None

        return question, choices, answer
    except (json.JSONDecodeError, TypeError, KeyError) as e:
        print(f"Skipping row due to error: {e}")  # Debugging statement
        return None, None, None

# Initialize a list to store extracted data
parsed_data = []

# Iterate over each row in the dataset
for index, row in df.iterrows():
    question, choices, answer = extract_question_data(row["generated_response"])

    # Store only if extraction was successful
    if question and choices and answer:
        parsed_data.append({
            "Question": question,
            "Choices": ", ".join(choices),  # Convert list to string for better readability
            "Answer": answer
        })

# Convert to DataFrame
parsed_df = pd.DataFrame(parsed_data)

# Check if extracted data is empty
if parsed_df.empty:
    print("No valid data was extracted. Please check the input CSV file for formatting issues.")
else:
    # Save extracted data to an Excel file
    output_file = "formatted_generated_test_results_few_shot_LLaMA.xlsx"
    parsed_df.to_excel(output_file, index=False)

# Print sample extracted data for debugging
print(parsed_df.head())


Skipping row due to error: Expecting value: line 1 column 1 (char 0)
Skipping row due to error: Expecting value: line 1 column 1 (char 0)
Skipping row due to error: Expecting value: line 1 column 1 (char 0)
Skipping row due to error: Expecting value: line 1 column 1 (char 0)
Skipping row due to error: Expecting value: line 1 column 1 (char 0)
Skipping row due to error: Expecting value: line 1 column 1 (char 0)
Skipping row due to error: Expecting value: line 1 column 1 (char 0)
Skipping row due to error: Expecting value: line 1 column 1 (char 0)
Skipping row due to error: Expecting value: line 1 column 1 (char 0)
Skipping row due to error: Expecting value: line 1 column 1 (char 0)
Skipping row due to error: Expecting value: line 1 column 1 (char 0)
Skipping row due to error: Expecting value: line 1 column 1 (char 0)
Skipping row due to error: Expecting value: line 1 column 1 (char 0)
Skipping row due to error: Expecting value: line 1 column 1 (char 0)
Skipping row due to error: Expecti

In [17]:
import pandas as pd
import json
import re

# Load the CSV file
file_path = "generated_test_results_few_shot_LLaMA.csv" # Adjust file name if needed
df = pd.read_csv(file_path)

# Function to extract question, choices, and answer using regex to capture the JSON block
def extract_question_data(response_text):
    try:
        # Ensure text is valid
        if pd.isna(response_text) or not isinstance(response_text, str) or response_text.strip() == "":
            return None, None, None

        # Use regex to find a JSON object (assumes the JSON starts with { and ends with })
        pattern = re.compile(r'(\{.*\})', re.DOTALL)
        match = pattern.search(response_text)
        if not match:
            return None, None, None  # No JSON block found

        json_str = match.group(1).strip()

        # Attempt to parse the JSON string
        response_dict = json.loads(json_str)

        # Extract required fields
        question = response_dict.get("question", "").strip()
        choices = response_dict.get("choices", [])
        answer = response_dict.get("answer", "").strip()

        # Validate extracted data
        if not question or not choices or not answer:
            return None, None, None

        return question, choices, answer
    except Exception as e:
        print(f"Error extracting row: {e}")
        return None, None, None

# Initialize list to store extracted data
parsed_data = []

# Iterate over each row in the dataset
for index, row in df.iterrows():
    question, choices, answer = extract_question_data(row["generated_response"])
    # Only add if extraction is successful
    if question and choices and answer:
        parsed_data.append({
            "Question": question,
            "Choices": ", ".join(choices),  # Convert list to a comma-separated string
            "Answer": answer
        })

# Convert extracted data to a DataFrame
parsed_df = pd.DataFrame(parsed_data)

# Check if any data was extracted
if parsed_df.empty:
    print("No valid data was extracted. Please check the formatting of the 'generated_response' column.")
else:
    # Save the results to an Excel file
    output_file = "formatted_generated_test_results_few_shot_LLaMA.xlsx"
    parsed_df.to_excel(output_file, index=False)
    print(f"Extracted data saved to: {output_file}")

# For debugging: display the first few extracted rows
print(parsed_df.head())


Error extracting row: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error extracting row: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error extracting row: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error extracting row: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error extracting row: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error extracting row: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error extracting row: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error extracting row: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error extracting row: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error extracting row: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Error extracting row

In [20]:
import json
import pandas as pd

# Define the input and output file paths
input_file = "generated_test_results_few_shot_LLaMA.jsonl"  # Replace with your actual JSONL file name
output_file = "formmatted_generated_test_results_few_shot_LLaMA.xlsx"

# Initialize an empty list to store extracted data
data = []

# Read and parse the JSONL file
with open(input_file, "r", encoding="utf-8") as file:
    for line_num, line in enumerate(file, start=1):
        try:
            record = json.loads(line.strip())  # Load each JSONL record
            if "generated_response" in record:
                response_text = record["generated_response"]
                
                # Ensure the response is valid JSON
                try:
                    response_data = json.loads(response_text)
                    question = response_data.get("question", "")
                    choices = response_data.get("choices", [])
                    answer = response_data.get("answer", "")

                    data.append({
                        "Question": question,
                        "Choices": "\n".join(choices),  # Store choices as newline-separated text
                        "Answer": answer
                    })
                except json.JSONDecodeError:
                    print(f"Skipping entry at line {line_num} due to JSON decoding error in 'generated_response' field.")
        except json.JSONDecodeError:
            print(f"Skipping entry at line {line_num} due to malformed JSON.")

# Convert the extracted data into a DataFrame
df = pd.DataFrame(data)

# Save to an Excel file
df.to_excel(output_file, index=False)

print(f"Successfully saved parsed questions to {output_file}")

Skipping entry at line 1 due to JSON decoding error in 'generated_response' field.
Skipping entry at line 2 due to JSON decoding error in 'generated_response' field.
Skipping entry at line 3 due to JSON decoding error in 'generated_response' field.
Skipping entry at line 4 due to JSON decoding error in 'generated_response' field.
Skipping entry at line 5 due to JSON decoding error in 'generated_response' field.
Skipping entry at line 6 due to JSON decoding error in 'generated_response' field.
Skipping entry at line 7 due to JSON decoding error in 'generated_response' field.
Skipping entry at line 8 due to JSON decoding error in 'generated_response' field.
Skipping entry at line 9 due to JSON decoding error in 'generated_response' field.
Skipping entry at line 10 due to JSON decoding error in 'generated_response' field.
Skipping entry at line 11 due to JSON decoding error in 'generated_response' field.
Skipping entry at line 12 due to JSON decoding error in 'generated_response' field.
S

In [22]:
with open("generated_test_results_few_shot_LLaMA.jsonl", "r", encoding="utf-8") as file:
    for i in range(5):  # Print first 5 lines
        print(file.readline().strip())


{"instruction": "{'role': 'You are an expert science assessment specialist that generates science questions based on specified metadata provided by the user. Your role is to ensure that the generated questions are of high quality, align with the intended learning objectives, and adhere to scientifically rigorous standards.'}", "input": "{\n  \"task_description\": \"Improve low-scoring questions using high-performing questions as a guide. Given metadata about a low-scoring question, regenerate a new question that aligns with the high-scoring examples.\",\n  \"high_performing_examples\": [\n    {\n      \"question\": \"Which of these changes is a physical change?\",\n      \"choices\": [\n        \"a banana turning brown\",\n        \"a candle melting\",\n        \"a piece of metal rusting\"\n      ],\n      \"answer\": \"a candle melting\",\n      \"explanation\": \"The question aligns well with learning outcomes related to distinguishing physical and chemical changes, which is a common

In [23]:
import json
import pandas as pd

# Define the input and output file paths
input_file = "generated_test_results_few_shot_LLaMA.jsonl"  # Replace with your actual JSONL file name
output_file = "formmatted_generated_test_results_few_shot_LLaMA.xlsx"

data = []

with open(input_file, "r", encoding="utf-8") as file:
    for line_num, line in enumerate(file, start=1):
        try:
            record = json.loads(line.strip())
            response_text = record.get("generated_response", "")

            # Try double-decoding if response is a JSON string
            if isinstance(response_text, str) and response_text.startswith("{"):
                try:
                    response_data = json.loads(response_text)  # Double decode
                except json.JSONDecodeError:
                    response_data = response_text  # Keep as string if it fails

            else:
                response_data = response_text

            data.append({
                "Line Number": line_num,
                "Generated Response": response_data
            })
        
        except json.JSONDecodeError as e:
            print(f"Skipping entry at line {line_num} due to JSON decoding error: {e}")

# Save extracted responses for inspection
df = pd.DataFrame(data)
df.to_excel(output_file, index=False)

print(f"Extracted responses saved to {output_file}")


Extracted responses saved to formmatted_generated_test_results_few_shot_LLaMA.xlsx


In [24]:
import pandas as pd
import json

# Load the Excel file
file_path = "formmatted_generated_test_results_few_shot_LLaMA.xlsx"
df = pd.read_excel(file_path)

# Function to parse JSON safely
def parse_json(response):
    if isinstance(response, str) and response.startswith("{"):
        try:
            return json.loads(response)  # Attempt to parse JSON
        except json.JSONDecodeError as e:
            print(f"Skipping entry due to JSON decoding error: {e}")
            return response  # Keep as is if it fails
    return response  # Return original if not JSON

# Apply JSON parsing to 'Generated Response' column
df["Parsed Response"] = df["Generated Response"].apply(parse_json)

# Save the cleaned data back to Excel for inspection
cleaned_file_path = "formmatted_generated_test_results_few_shot_LLaMA.xlsx"
df.to_excel(cleaned_file_path, index=False)

print(f"Cleaned responses saved to {cleaned_file_path}")


Cleaned responses saved to formmatted_generated_test_results_few_shot_LLaMA.xlsx


In [27]:
import pandas as pd
import json
import re

# Load the Excel file
file_path = "formmatted_generated_test_results_few_shot_LLaMA.xlsx"  # Change to your actual file path
df = pd.read_excel(file_path)

# Function to extract JSON from response text
def extract_json(response):
    try:
        # Use regex to find content after '### Response:'
        match = re.search(r'### Response:\s*({.*})', response, re.DOTALL)
        if match:
            json_text = match.group(1)  # Extract JSON part
            
            # Fix common JSON issues before loading
            json_text = json_text.strip()
            json_text = json_text.replace("’", "'")  # Fix any incorrect quotes
            json_text = json_text.replace("\n", " ")  # Remove line breaks

            return json.loads(json_text)  # Parse JSON
    except json.JSONDecodeError as e:
        print(f"JSON Decoding Error: {e}\nProblematic text:\n{json_text[:500]}")
    return None  # Return None if parsing fails

# Apply JSON extraction function to the 'Generated Response' column
df["Parsed JSON"] = df["Generated Response"].apply(lambda x: extract_json(str(x)))

# Drop rows where parsing failed
df = df.dropna(subset=["Parsed JSON"])

# Expand the extracted JSON into separate columns
df_extracted = df["Parsed JSON"].apply(pd.Series)

# Select only relevant columns
df_final = df[["Generated Response"]].copy()
df_final["question"] = df_extracted.get("question", None)
df_final["choices"] = df_extracted.get("choices", None)
df_final["answer"] = df_extracted.get("answer", None)

# Save the cleaned data to a new Excel file
cleaned_file_path = "extracted_questions.xlsx"
df_final.to_excel(cleaned_file_path, index=False)

print(f"Extracted questions, choices, and answers saved to {cleaned_file_path}")


JSON Decoding Error: Extra data: line 1 column 2325 (char 2324)
Problematic text:
{   "question": "What is the speed of a car driving at 10 m/s?",   "choices": [     "10 m/s",     "20 m/s",     "30 m/s"   ],   "answer": "10 m/s",   "explanation": "The question aligns well with learning outcomes related to understanding speed and comparisons, a fundamental concept in physics for many grade levels. This question fits well within the typical curriculum goals for middle school science or math, where speed and comparisons of different objects are often discussed. The question is a
JSON Decoding Error: Extra data: line 1 column 2198 (char 2197)
Problematic text:
{   "question": "Why do you think the rabbit has white fur?",   "choices": [     "The rabbit has a dominant allele for white fur.",     "The rabbit has a recessive allele for white fur.",     "The rabbit has a mixed inheritance of both dominant and recessive alleles."   ],   "answer": "The rabbit has a dominant allele for white fur."

In [29]:
import pandas as pd
import json
import re

# Load the Excel file
file_path = "formmatted_generated_test_results_few_shot_LLaMA.xlsx"  # Change this to your actual file path
df = pd.read_excel(file_path)

# Function to extract JSON safely
def extract_json(response):
    try:
        # Use regex to extract content after '### Response:'
        match = re.search(r'### Response:\s*({.*})', response, re.DOTALL)
        if match:
            json_text = match.group(1).strip()  # Extract JSON block

            # Fix common JSON issues before loading
            json_text = json_text.replace("’", "'")  # Fix any incorrect quotes
            json_text = json_text.replace("\n", " ")  # Remove line breaks

            # Handle single quotes by replacing them with double quotes
            json_text = re.sub(r"(?<!\\)'", '"', json_text)

            # Remove extra text after a valid JSON structure
            json_text = re.split(r'}\s*[^{}]*$', json_text)[0] + "}"

            # Parse JSON
            return json.loads(json_text)  
    except json.JSONDecodeError as e:
        print(f"❌ JSON Decoding Error: {e}\nProblematic text:\n{json_text[:500]}")
    return None  # Return None if parsing fails

# Apply JSON extraction function to the 'generated_response' column
df["Parsed JSON"] = df["Generated Response"].apply(lambda x: extract_json(str(x)))

# Drop rows where parsing failed
df = df.dropna(subset=["Parsed JSON"])

# Expand the extracted JSON into separate columns
df_extracted = df["Parsed JSON"].apply(pd.Series)

# Select only relevant columns
df_final = df[["Generated Response"]].copy()
df_final["question"] = df_extracted.get("question", None)
df_final["choices"] = df_extracted.get("choices", None)
df_final["answer"] = df_extracted.get("answer", None)

# Save the cleaned data to a new Excel file
cleaned_file_path = file_path = "formmatted_generated_test_results_few_shot_LLaMA.xlsx"
df_final.to_excel(cleaned_file_path, index=False)

print(f"✅ Extracted questions, choices, and answers saved to {cleaned_file_path}")


❌ JSON Decoding Error: Expecting ',' delimiter: line 1 column 1582 (char 1581)
Problematic text:
{   "question": "Which of these changes is a physical change?",   "choices": [     "A candle is lit.",     "A piece of iron is heated in a fire.",     "A banana is cut into pieces."   ],   "answer": "A candle is lit.",   "explanation": "The question aligns well with learning outcomes related to understanding physical and chemical changes, a fundamental concept in chemistry education. The question is appropriate for middle school science curricula, where students learn to distinguish between phys
❌ JSON Decoding Error: Expecting ',' delimiter: line 1 column 1761 (char 1760)
Problematic text:
{   "question": "What is the difference between physical and chemical changes?",   "choices": [     "A physical change occurs when the size or shape of an object changes.",     "A chemical change occurs when the type of matter changes.",     "A physical change occurs when the temperature of an object cha

In [30]:
# Load the CSV file
file_path = "generated_test_results_few_shot_LLaMA.csv"
df = pd.read_csv(file_path)

# Function to extract relevant parts from the generated response
import json
import re

def extract_question_data(response):
    try:
        # Extract JSON content from the response text
        match = re.search(r'### Response:\s*({.*})', response, re.DOTALL)
        if match:
            json_text = match.group(1)
            json_text = json_text.strip()

            # Attempt to load JSON
            data = json.loads(json_text)
            question = data.get("question", "")
            choices = data.get("choices", [])
            answer = data.get("answer", "")

            return question, choices, answer

    except json.JSONDecodeError:
        return "", [], ""

    return "", [], ""

# Apply extraction to the generated response column
df_extracted = df[['generated_response']].copy()
df_extracted[['question', 'choices', 'answer']] = df_extracted['generated_response'].apply(
    lambda x: pd.Series(extract_question_data(str(x)))
)

# Save the extracted data to an Excel file
extracted_file_path = "formatted_generated_test_results_few_shot_LLaMA.xlsx"
df_extracted.to_excel(extracted_file_path, index=False)
