In [24]:
# !pip install pandas

import pandas as pd
import json

# Load the annotated dataset and discard the Id_Number column in one
# expression, so `df` is always rebuilt from the CSV when this cell runs.
df = pd.read_csv("data/Annotated_data.csv").drop(columns=["Id_Number"])

In [25]:
def build_prompt_completion(row):
    """
    Turn one annotated row into a two-turn training example.

    The result is a dict with a single "contents" key holding two turns in
    the role/parts layout (described by the notebook as Gemini's dataset
    format): a "user" turn with the journal entry followed by the
    instruction to identify cognitive distortions, and a "model" turn with
    the annotated answer.

    Args:
        row: Object indexable by column name (e.g. a DataFrame row) that
            provides 'Patient Question', 'Distorted part',
            'Dominant Distortion' and 'Secondary Distortion (Optional)'.

    Returns:
        dict: {"contents": [user_turn, model_turn]}, each turn shaped as
        {"role": <str>, "parts": [{"text": <str>}]}.
    """

    def annotation(column, fallback):
        # Missing cells (NaN/None) are replaced by the supplied fallback.
        value = row[column]
        return fallback if pd.isna(value) else value

    def turn(role, text):
        # One conversation turn: a role plus a single text part.
        return {"role": role, "parts": [{"text": text}]}

    # User side: the journal entry, then the fixed instruction line.
    question = row['Patient Question']
    user_text = f"Journal Entry: {question}\nIdentify any cognitive distortions in the text."

    # Model side: the two mandatory annotation lines default to "None".
    distorted = annotation('Distorted part', "None")
    dominant = annotation('Dominant Distortion', "None")
    answer_lines = [
        f"Distorted part: {distorted}",
        f"Dominant Distortion: {dominant}",
    ]

    # The secondary distortion line is emitted only when a non-empty value exists.
    secondary = annotation('Secondary Distortion (Optional)', "")
    if secondary:
        answer_lines.append(f"Secondary Distortion: {secondary}")

    return {
        "contents": [
            turn("user", user_text),
            turn("model", "\n".join(answer_lines)),
        ]
    }

In [26]:
# Convert every row of df into a training example; `records` is a list of
# {"contents": [...]} dicts, one per row.
records = df.apply(build_prompt_completion, axis=1).tolist()

# Serialize as JSON Lines: one JSON object per line, with non-ASCII
# characters written as-is rather than escaped.
output_file = "data/fine_tuning_data.jsonl"
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in records
    )

print(f"JSONL file saved to {output_file}")

JSONL file saved to data/fine_tuning_data.jsonl
