### Read CSV data and split it into train, validation, and test sets

In [9]:
# !pip install pandas

import pandas as pd
import json

# Fixed seed so the shuffle (and therefore the exported train/val/test files)
# is the same on every run.
RANDOM_SEED = 42

# Load the annotated data, shuffle all rows, and remove the Id_Number column.
# Chained rather than `inplace=True` so re-running this cell is idempotent.
df = (
    pd.read_csv("data/Annotated_data.csv")
    .sample(frac=1, random_state=RANDOM_SEED)
    .drop(columns=["Id_Number"])
)

In [10]:
# Train / validation / test split: 80% / 10% / remainder of the shuffled rows.
train_size = int(0.8 * len(df))
val_size = int(0.1 * len(df))
val_end = train_size + val_size

train_df = df.iloc[:train_size]
val_df = df.iloc[train_size:val_end]
# The test set starts where validation ends. Slicing from `val_size` instead
# would re-include most training rows and every validation row (data leakage).
test_df = df.iloc[val_end:]

# The three splits must partition the data exactly once.
assert len(train_df) + len(val_df) + len(test_df) == len(df)

### Convert the data to JSONL files for fine-tuning

In [11]:
def normalize_str(s, preserve_newlines=False):
    """Lowercase text and collapse each run of whitespace into one space.

    Args:
        s: The text to normalize.
        preserve_newlines: When False (the default), newline characters are
            treated like any other whitespace, so the result is a single
            line with no leading or trailing whitespace. When True, every
            line is normalized on its own and the line breaks are kept.

    Returns:
        The lowercased, whitespace-normalized string.
    """
    lowered = s.lower()

    if not preserve_newlines:
        # str.split() with no argument splits on any whitespace, newlines
        # included, so the text is flattened onto one line.
        return ' '.join(lowered.split())

    # Normalize spacing inside each line, then restore the line breaks.
    return '\n'.join(' '.join(line.split()) for line in lowered.split('\n'))

In [14]:
def build_prompt_completion(row):
    """
    Turn one annotated row into a two-turn chat example.

    The "user" turn holds the journal entry (the 'Patient Question' column)
    followed by the instruction to identify cognitive distortions. The
    "model" turn holds the annotated distorted part and distortion label(s).
    Both texts are passed through normalize_str before being stored.

    Returns a dict of the form {"contents": [user_turn, model_turn]}.
    """
    def cell_or(column, fallback):
        # Use `fallback` when the annotation is missing (per pd.notna).
        value = row[column]
        return value if pd.notna(value) else fallback

    def turn(role, text):
        # One chat turn: a role plus a single normalized text part.
        return {"role": role, "parts": [{"text": normalize_str(text)}]}

    user_text = (
        f"Journal Entry: {row['Patient Question']}\n"
        "Identify any cognitive distortions in the text."
    )

    distorted = cell_or('Distorted part', "None")
    dominant = cell_or('Dominant Distortion', "None")
    secondary = cell_or('Secondary Distortion (Optional)', "")

    answer_lines = [
        f"Distorted part: {distorted}",
        f"Dominant Distortion: {dominant}",
    ]
    # The secondary label is optional; leave the line out when it is absent.
    if secondary:
        answer_lines.append(f"Secondary Distortion: {secondary}")
    model_text = "\n".join(answer_lines)

    return {"contents": [turn("user", user_text), turn("model", model_text)]}

In [15]:
# Convert every row of a split into a {"contents": [...]} record and write the
# records to a JSONL file (one JSON object per line).
def create_jsonl(df, jsonl_name: str, output_dir: str = "data"):
    """Write one fine-tuning example per row of `df` to a JSONL file.

    Args:
        df: DataFrame whose rows are converted with build_prompt_completion.
        jsonl_name: File name without extension.
        output_dir: Directory the file is written to. Defaults to "data",
            the location that was previously hard-coded.

    The file is written to "<output_dir>/<jsonl_name>.jsonl" and a
    confirmation line is printed. Returns None.
    """
    # Iterate rows explicitly rather than df.apply(..., axis=1).tolist():
    # for an empty frame, apply hands back a DataFrame, which has no
    # .tolist(), so an empty split would crash instead of producing an
    # empty file.
    records = [build_prompt_completion(row) for _, row in df.iterrows()]

    output_file = f"{output_dir}/{jsonl_name}.jsonl"
    with open(output_file, "w", encoding="utf-8") as f:
        for record in records:
            # ensure_ascii=False keeps non-ASCII characters readable in the file.
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
    print(f"JSONL file saved to {output_file}")

# Export each split to its own JSONL file, in train / validation / test order.
for split_df, split_name in (
    (train_df, "training_data"),
    (val_df, "validation_data"),
    (test_df, "testing_data"),
):
    create_jsonl(split_df, split_name)

JSONL file saved to data/training_data.jsonl
JSONL file saved to data/validation_data.jsonl
JSONL file saved to data/testing_data.jsonl


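### Upload the training and validation data to a Google Cloud Storage bucket

1. Open the Google Cloud console.
2. Go to the storage bucket (`gs://distortions-bucket1`).
3. Manually upload `data/training_data.jsonl` and `data/validation_data.jsonl`; the tuning job below reads both from the bucket.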

### Set Google Cloud project information and initialize Vertex AI and Gen AI SDK

In [1]:
from google import genai
import os
from dotenv import load_dotenv
import vertexai

# Load variables from a .env file (if any) before reading the project ID.
load_dotenv()

PROJECT_ID = os.environ.get('FIREBASE_PROJECT_ID')  # None when the variable is unset
LOCATION = "us-central1"
BUCKET_URI = "gs://distortions-bucket1"

# Point the Vertex AI SDK and the Gen AI client at the same project and region.
vertexai.init(location=LOCATION, project=PROJECT_ID)
client = genai.Client(project=PROJECT_ID, location=LOCATION, vertexai=True)


In [7]:
import IPython.core.display
from IPython.display import display as ipy_display

# Re-expose `display` as an attribute of IPython.core.display.
# NOTE(review): presumably a compatibility shim for code that still looks the
# function up at that location - confirm which dependency needs it.
IPython.core.display.display = ipy_display

In [16]:
from vertexai.preview.tuning import sft

# Base model to tune and the display name given to the tuned model.
base_model = 'gemini-1.5-flash-002'
tuned_model_display_name = "cognitive-distortion-classifier-v01"

# Start the supervised tuning job using the JSONL files in the bucket.
sft_tuning_job = sft.train(
    source_model=base_model,
    train_dataset=f"{BUCKET_URI}/training_data.jsonl",
    validation_dataset=f"{BUCKET_URI}/validation_data.jsonl",
    tuned_model_display_name=tuned_model_display_name,
)

Creating SupervisedTuningJob
SupervisedTuningJob created. Resource name: projects/777495406665/locations/us-central1/tuningJobs/1817495486672666624
To use this SupervisedTuningJob in another session:
tuning_job = sft.SupervisedTuningJob('projects/777495406665/locations/us-central1/tuningJobs/1817495486672666624')
View Tuning Job:
https://console.cloud.google.com/vertex-ai/generative/language/locations/us-central1/tuning/tuningJob/1817495486672666624?project=777495406665
