In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

#Author: Victor Dantas

# Generate a fine-tuning dataset for code-bison model from 100 examples
This notebook helps automate and simplify the task of generating a code fine-tuning dataset, starting from a Google Sheet containing 100 examples of prompts and the corresponding generated code (no special formatting required, just copied and pasted from e.g. an IDE).

In this example, the following dataset is used:

[Code generation fine-tuning dataset (Python)](https://docs.google.com/spreadsheets/d/1CU5SSf6tVLqXtYUE8F5rFmJyzoZ6gx7YCuSyyZpHRMg/edit?usp=sharing&resourcekey=0-SjhKOlrgOOMvGqPvEAZ0aA)

This is a synthetic dataset of Python code tasks where the code output always begins with the disclaimer comment "The following code was generated by AI". The code tasks span data structure manipulation, file manipulation, statistics, pandas, and use of Google Cloud Python client libraries. Essentially, we're trying to fine-tune the model to always add a comment to generated code saying that the code was AI-generated.

This Notebook will:

1. Augment the dataset by using a PaLM LLM to generate 4 additional prompt variants for each prompt in the dataset. This helps capture more of the ways people could ask the same question.
2. Convert the augmented dataset to a JSONL file with code markdown formatting applied.
3. Upload the JSONL file to a GCS bucket for use in a fine-tuning pipeline.


### Install required libraries

In [None]:
! pip install google-cloud-aiplatform langchain  --upgrade --user

**Colab only:** Run the following cell to restart the kernel, or use the restart button instead. For Vertex AI Workbench, you can restart the kernel using the button at the top.

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages.
# Run this cell on its own: anything defined in the kernel before the restart is lost.
import IPython

app = IPython.Application.instance()
# The True argument requests a restart rather than a plain shutdown.
app.kernel.do_shutdown(True)

### Authenticating your notebook environment
**Colab only:** If you are using Colab to run this notebook, uncomment the cell below and continue.

In [None]:
#from google.colab import auth as google_auth
#google_auth.authenticate_user()

### Set project configuration and import libraries

In [None]:
# Project ID and region. Later cells use them for the bucket location and for the
# tuning pipeline job. Replace the PROJECT_ID placeholder with your own project ID.
PROJECT_ID = "PROJECT_ID"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

**Colab only:** Uncomment the following cell to initialize the Vertex AI SDK. For Vertex AI Workbench, you don't need to run this.

In [None]:
## Vertex AI
# import vertexai

#vertexai.init(project=PROJECT_ID, location=LOCATION)

In [None]:
# Utils
import time
import os

# Langchain
import langchain
from langchain.llms import VertexAI
from langchain.prompts import PromptTemplate


# Show which LangChain release is in use, since the install cell does not pin a version.
print(f"LangChain version: {langchain.__version__}")

### Using Langchain and designing a prompt for dataset augmentation

In [None]:
# LLM model
# Text model used to paraphrase prompts; the temperature is kept low (0.1).
llm = VertexAI(
    model_name="text-bison@latest", #"text-bison-32@latest",
    max_output_tokens=2048,
    temperature=0.1,
    top_p=0.8,
    top_k=40,
    verbose=True,
)


# Prompt asking the model for 4 rephrasings of {question}, one answer per line.
# augment_dataset() splits the response on newlines, so it relies on that layout.
# NOTE(review): the template text contains the misspellings "rephrashing" and
# "preferrably"; they are part of the prompt sent to the model - confirm whether to fix.
prompt_template = PromptTemplate(
    input_variables=["question"],
    template='''Given a prompt, generate 4 additional ways of asking the exact same thing by rephrashing the prompt slightly each time.
    These are intended as prompts to a code generation language model and should preferrably be short and concise, not formal, and
    should simulate the way developers may prompt a code model to help them with a code task.
    Have one of the prompts not include any instruction word (such as write, generate, implement, etc.) but simply a short description
    of the task at hand (for example: Python function that reverses a string).
    Generate the response as one answer per line.
    \n Prompt: {question}''',
)

# Smoke test: render the template for one sample question and show the raw model response.
prompt = prompt_template.format(question="Write a Python function that takes a list of numbers and returns the average")
llm(prompt)


### Augmenting the base fine-tuning dataset

In [None]:
import random
import string


def generate_uuid(length: int = 8) -> str:
    """Return a random identifier made of lowercase letters and digits.

    Args:
        length: Number of characters to generate (default 8).

    Returns:
        A string of `length` characters drawn with replacement from a-z and 0-9.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)


# Random suffix reused for the bucket name and the tuned model's display name.
UUID = generate_uuid()

In [None]:
!echo $REGION

In [None]:
BUCKET_NAME = "vertex-" + UUID
BUCKET_URI = f"gs://{BUCKET_NAME}"
REGION = LOCATION

# Create a bucket
! gsutil mb -l $REGION $BUCKET_URI

# Relative path to base dataset file (export from Sheet)
DATASET = 'finetuning_dataset.csv'  # @param {type:"string"}
! gsutil cp $DATASET $BUCKET_NAME

DATASET_GCS_URI = f"{BUCKET_URI}/{DATASET}"


In [None]:
import csv
import json
import os
import re
from datetime import date

from google.cloud import storage
from langchain.text_splitter import RecursiveCharacterTextSplitter


# Matches one leading list marker such as "1. ", "2) ", "- " or "* ".
_LIST_MARKER = re.compile(r'^\s*(?:\d+[.)]|[-*])\s+')


def _parse_alternative_prompts(llm_output, expected=4):
    """Split raw LLM text into exactly `expected` cleaned, non-empty prompt variants."""
    variants = []
    for line in llm_output.splitlines():
        # Strip only a marker at the start of the line, so text such as
        # "version 1. " inside a prompt is left untouched.
        cleaned = _LIST_MARKER.sub('', line, count=1).strip()
        if cleaned:
            variants.append(cleaned)
    if len(variants) != expected:
        raise ValueError(f"Expected {expected} alternative prompts, but got {len(variants)}")
    return variants


# Helper function for repeated experimentation
def augment_dataset(csv_filename, num_of_rows=None, generate=None):
    """Augment a prompt/code CSV with LLM-generated prompt variants.

    The first CSV row is treated as a header. For every following row, column 0
    is the prompt and column 1 the code. Each successfully processed row yields
    five records: the original prompt plus four alternative prompts, all paired
    with the same code.

    Args:
      csv_filename: Path to the input CSV file.
      num_of_rows: Maximum number of data rows (header excluded) to read, or
        None to read the whole file. Skipped rows count towards the limit.
      generate: Optional callable taking the original prompt and returning the
        raw LLM text (one alternative per line). Defaults to calling the
        notebook-level `llm` with `prompt_template`.

    Returns:
      A list of {'prompt': ..., 'code_output': ...} dicts. Rows that fail (LLM
      error, wrong number of alternatives, missing column) are reported with a
      warning and skipped. If reading the file itself fails part-way, the
      records collected so far are returned.
    """
    if generate is None:
        def generate(question):
            return llm(prompt_template.format(question=question))

    rows_read = 0
    skip_count = 0
    results = []
    # newline='' lets the csv module handle newlines embedded in quoted code cells.
    with open(csv_filename, newline='') as csv_file:
        try:
            csv_reader = csv.reader(csv_file, delimiter=',')
            next(csv_reader, None)  # skip header row
            for row in csv_reader:
                # Check the limit before doing any work so that a skipped row
                # can never bypass it.
                if num_of_rows is not None and rows_read >= num_of_rows:
                    break
                if not row:  # blank line in the CSV
                    continue
                rows_read += 1

                try:
                    if len(row) < 2:
                        raise ValueError(f"Expected 2 columns, but got {len(row)}")
                    original_prompt = row[0].strip()
                    code_output = row[1]
                    alternative_prompts = _parse_alternative_prompts(generate(original_prompt))
                except Exception as e:
                    print(">>> WARNING:", e)
                    skip_count += 1
                    continue

                results.append({'prompt': original_prompt, 'code_output': code_output})
                for prompt in alternative_prompts:
                    results.append({'prompt': prompt, 'code_output': code_output})
        except Exception as e:
            print("ERROR: Something went wrong:", e)
        finally:
            # Only report here. A `return` inside `finally` would silently discard
            # any in-flight exception, including KeyboardInterrupt.
            print(f"\nProcessed {rows_read} prompts. Skipped {skip_count}")

    return results


def write_results_to_csv(results, filename='results.csv', fieldnames=('prompt', 'code_output')):
    """Write a list of record dicts to a CSV file with a header row.

    Args:
      results: List of dicts sharing the same keys.
      filename: Path of the CSV file to create or overwrite.
      fieldnames: Header used only when `results` is empty, in which case a
        header-only file is written.
    """
    # The header normally comes from the first record. An empty list has no
    # first record, and indexing it would raise IndexError.
    header = list(results[0].keys()) if results else list(fieldnames)

    with open(filename, 'w', newline='') as output_file:
        dict_writer = csv.DictWriter(output_file, header)
        dict_writer.writeheader()
        dict_writer.writerows(results)


def format_and_convert_to_jsonl(input_csv_filename, output_jsonl_filename):
    """Convert a prompt/code CSV into a JSONL file of tuning records.

    The first CSV row is treated as a header. For each following row, column 0
    becomes "input_text" and column 1, wrapped in a ```python fenced block,
    becomes "output_text". Rows with fewer than two columns (for example blank
    lines) are ignored.

    Args:
      input_csv_filename: Path to the CSV file to read.
      output_jsonl_filename: Path of the JSONL file to create. An existing file
        is overwritten, so re-running the conversion does not duplicate records.
    """
    records = []
    # newline='' lets the csv module handle newlines embedded in quoted code cells.
    with open(input_csv_filename, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)  # skip header row
        for row in csv_reader:
            if len(row) < 2:
                continue
            prompt, code = row[0], row[1]
            records.append({
                'input_text': prompt,
                'output_text': f'```python\n{code}\n```',
            })

    with open(output_jsonl_filename, 'w') as jsonl_file:
        for record in records:
            jsonl_file.write(json.dumps(record))
            jsonl_file.write('\n')


def upload_file_to_gcs(filename, bucket_name = BUCKET_NAME):
    """Upload a local file to a GCS bucket under a date-prefixed object name.

    Args:
      filename: Path of the local file to upload.
      bucket_name: Name of the destination bucket (without the gs:// prefix).
        The default is the value BUCKET_NAME had when this function was defined.

    Returns:
      The gs:// URI of the uploaded object, so callers do not have to rebuild
      the date-prefixed name themselves.
    """
    today_date = date.today().strftime('%Y%m%d')
    # Name the object after the file itself, so that a local directory prefix in
    # `filename` does not end up after the date in the object name.
    blob_name = f"{today_date}_{os.path.basename(filename)}"
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(blob_name)
    blob.upload_from_filename(filename)
    return f"gs://{bucket_name}/{blob_name}"


In [None]:
# Set num_of_rows to test on a small subset of rows. If set to None, it will read the entire file.
# Each processed row triggers one LLM call inside augment_dataset.
results = augment_dataset(DATASET, num_of_rows = 3)
# results = augment_dataset(DATASET, num_of_rows = None)



### Exporting results

In [None]:
# Persist the augmented prompt/code pairs to a local CSV file.
write_results_to_csv(results, filename = 'augmented_fine_tuning_dataset.csv')

In [None]:
# Convert the augmented CSV into JSONL records with input_text / output_text keys.
format_and_convert_to_jsonl('augmented_fine_tuning_dataset.csv', 'augmented_fine_tuning_dataset.jsonl')

In [None]:
# Upload the JSONL file to the bucket; the object name is prefixed with today's date (YYYYMMDD).
upload_file_to_gcs('augmented_fine_tuning_dataset.jsonl')

### Model tuning

✅ Here are some recommended configurations for tuning a code foundation model. You can find more in the [documentation](https://cloud.google.com/vertex-ai/docs/generative-ai/models/tune-code-models#recommended-configurations).

In [None]:
from google.cloud import aiplatform

MODEL_NAME = f"genai-workshop-tuned-model-{UUID}"

TRAINING_STEPS = 200

# The tuning pipeline must read the augmented JSONL file produced by this notebook,
# not the base CSV that DATASET_GCS_URI points to. upload_file_to_gcs() stores the
# file under a date-prefixed object name, so rebuild that name here.
# NOTE: assumes the upload cell ran today; adjust the prefix if it ran on another day.
TUNING_DATASET_GCS_URI = (
    f"{BUCKET_URI}/{date.today().strftime('%Y%m%d')}_augmented_fine_tuning_dataset.jsonl"
)

# Runtime parameters passed to the tuning pipeline template.
pipeline_arguments = {
    "model_display_name": MODEL_NAME,
    "location": REGION,
    "large_model_reference": "code-bison@latest",
    "project": PROJECT_ID,
    "train_steps": TRAINING_STEPS,
    "dataset_uri": TUNING_DATASET_GCS_URI,
}

# GCS folder passed to the pipeline job as its root.
pipeline_root = f'{BUCKET_URI}/{MODEL_NAME}'
template_path = 'https://us-kfp.pkg.dev/ml-pipeline/large-language-model-pipelines/tune-large-model/v2.0.0'

# Function that builds the tuning pipeline job (call .submit() on the result to start it)
def tuned_model(
    location: str,
    template_path: str,
    model_display_name: str,
    pipeline_arguments: dict,
):
    """Build a Vertex AI pipeline job that tunes a model on prompt-response data.

    The job is created but not submitted. The training data is not passed to
    this function directly: it is referenced by the entries of
    `pipeline_arguments` (in this notebook, "dataset_uri"), which is expected to
    be the GCS URI of a JSONL file where each record has two keys, for example:
      {
        "input_text": <input prompt>,
        "output_text": <associated output>
      }

    The project comes from the notebook-level PROJECT_ID and the pipeline root
    from the notebook-level `pipeline_root`.

    Args:
      location: GCP region, used to initialize aiplatform and to run the job.
      template_path: Path or URL of the pipeline template.
      model_display_name: Display name given to the pipeline job.
      pipeline_arguments: Dict of parameter values used at pipeline runtime.

    Returns:
      The aiplatform.PipelineJob that was created.
    """

    aiplatform.init(project=PROJECT_ID, location=location)

    job = aiplatform.PipelineJob(
        template_path=template_path,
        display_name=model_display_name,
        parameter_values=pipeline_arguments,
        # Use the caller-supplied region rather than the global REGION, so that
        # the job runs where aiplatform was initialized.
        location=location,
        pipeline_root=pipeline_root,
        enable_caching=True,
    )

    return job

#### Start tuning job

In [None]:
# Build the pipeline job object; nothing is submitted until the next cell.
job = tuned_model(REGION, template_path, MODEL_NAME, pipeline_arguments)

In [None]:
# Submit the pipeline job built in the previous cell.
job.submit()