# SageMaker Code Generation with Code-Llama: Customizing CodeLLaMa with Retrieval Augmented Generation with your data


### Installing some dependencies and libraries

In [5]:
pip install --upgrade pip

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting pip
  Obtaining dependency information for pip from https://files.pythonhosted.org/packages/e0/63/b428aaca15fcd98c39b07ca7149e24bc14205ad0f1c80ba2b01835aedde1/pip-23.3-py3-none-any.whl.metadata
  Downloading pip-23.3-py3-none-any.whl.metadata (3.5 kB)
Downloading pip-23.3-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m74.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.2.1
    Uninstalling pip-23.2.1:
      Successfully uninstalled pip-23.2.1
Successfully installed pip-23.3
[0mNote: you may need to restart the kernel to use updated packages.


In [16]:
 import sys
!{sys.executable} -m pip install langchain
!{sys.executable} -m pip install chromadb
!{sys.executable} -m pip install --upgrade boto3

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mLooking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0mLooking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
[0m

### Now, we will query the endpoint of the pre trained model that we deployed

In [17]:
 import argparse
import os
from langchain.document_loaders import DirectoryLoader
import chromadb
import json
import boto3
import time
import glob
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    Language,
)
import ast
import sys

In [18]:
endpoint_name = 'meta-textgeneration-llama-codellama-7b-2023-10-19-01-05-43-652'

def query_endpoint(payload):
    client = boto3.client('runtime.sagemaker')
    response = client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/json',
        Body=json.dumps(payload).encode('utf-8'),
        CustomAttributes="accept_eula=true",
    )
    response = response["Body"].read().decode("utf8")
    response = json.loads(response)
    return response

## Setting variables for our vectordb, context source, embeddings model (Titan) and so on"

In [21]:
txtdir = "finetuningCodeLLama"
chrdir = "chroma"
embedding_model = 'amazon.titan-embed-text-v1'
chroma_client = chromadb.PersistentClient(chrdir)

In [22]:
from typing import Dict, List

def class_list(filename:str):
    with open(filename,"r") as f:
        file_raw = f.read()
   
   # Convert the loaded file into an Abstract Syntax Tree
    file_ast = ast.parse(file_raw)
    cnames = []

   # Walk every node in the tree
    for node in ast.walk(file_ast):
        if isinstance(node,ast.ClassDef):
            cnames.append(node.name)
            
    return cnames

def get_embedding(text, modelId, client):
    accept = 'application/json'
    contentType = 'application/json'
    inp = json.dumps({"inputText": text})
    response = client.invoke_model(body=inp, modelId=modelId, accept=accept, contentType=contentType)
    response_body = json.loads(response.get('body').read())
    embedding = response_body.get('embedding')
    return embedding

def get_context(prompt, q_filter=None):
    print(f"Creating embedding for question")

    bedrock = boto3.client(
        service_name='bedrock',
        region_name='us-east-1'
    )
    
    collection = chroma_client.get_collection(name="pyrag")
    embedding = get_embedding(prompt, embedding_model, bedrock_runtime)
    if q_filter is None:
        q_embed = collection.query(query_embeddings = embedding, n_results=3)
    else:
        q_embed = collection.query(query_embeddings = embedding, n_results=3, where=q_filter)
    context_docs = q_embed['documents'][0]
    print(f"Found {len(context_docs)} context docs")
    context = "\n".join(context_docs)
    
    return context


def format_instructions(instructions: List[Dict[str, str]]) -> List[str]:
    """Format instructions for CodeLlama.
    
    The model only supports 'system', 'user' and 'assistant' roles, starting with 'system', then 'user' and 
    alternating (u/a/u/a/u...). The last message must be from 'user'.
    """
    prompt: List[str] = []

    if instructions[0]["role"] == "system":
        content = "".join(["<<SYS>>\n", instructions[0]["content"], "\n<</SYS>>\n\n", instructions[1]["content"]])
        instructions = [{"role": instructions[1]["role"], "content": content}] + instructions[2:]

    for user, answer in zip(instructions[::2], instructions[1::2]):
        prompt.extend(["<s>", "[INST] ", (user["content"]).strip(), " [/INST] ", (answer["content"]).strip(), "</s>"])

    prompt.extend(["<s>", "[INST] ", (instructions[-1]["content"]).strip(), " [/INST] "])

    return "".join(prompt)


def print_instructions(prompt: str, response: str) -> None:
    bold, unbold = '\033[1m', '\033[0m'
    print(f"{bold}> Output{unbold}\n{response['generated_text']}\n")
    # print(f"{bold}> Input{unbold}\n{prompt}\n\n{bold}> Output{unbold}\n{response['generated_text']}\n")

## Now, we will create embeddings of the code in our context to be able to use RAG on that data, chunk it, and then be able to give the most accurate and relevant prompt completions

In [23]:
print(f"Splitting python files in {txtdir}")
python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=5000, chunk_overlap=0
)
texts = []
metadatas = []
txtdir_len = len(txtdir) + 1
for filename in glob.iglob(os.path.join(txtdir, '**/*.py'), recursive=True):
    sub_dir, sub_file = os.path.split(filename[txtdir_len:])
    mname = sub_file[:-3]
    parent_module = sub_dir.split("/")[-1]
    with open(filename, 'r') as IF:
        doc_lines = IF.readlines()
        doc_text = "".join(doc_lines)
    texts.append(doc_text)
    cnames = class_list(filename)
    if len(cnames) > 0:
        metadatas.append({'module': mname, 'module': parent_module, 'class': cnames[0]})
    else:
        metadatas.append({'module': mname, 'module': parent_module})

python_docs = python_splitter.create_documents(texts, metadatas)
print(f"Creating embeddings")
os.makedirs(chrdir, exist_ok=True)
bedrock_runtime = boto3.client(
    service_name='bedrock-runtime',
    region_name='us-east-1'
)

collection = chroma_client.get_or_create_collection(name="pyrag")
cnt = 0
for t in python_docs:
    embedding = get_embedding(t.page_content, embedding_model, bedrock_runtime)
    collection.add(
        embeddings=embedding,
        documents=t.page_content,
        ids=f"id{cnt}",
        metadatas=t.metadata
    )
    cnt = cnt + 1
    time.sleep(1)
print(f"Embeddings created")

Splitting python files in finetuningCodeLLama
Creating embeddings


Insert of existing embedding ID: id0
Add of existing embedding ID: id0


Embeddings created


### Now we will use instruction prompting to leverage RAG and get a best response

In [24]:
prompt="""Write a python code that displays how a llama model can be trained"""

context = get_context(prompt)


prompt_data = f"""Use the following pieces of related code to respond to the request.

{context}

Request: {prompt}
"""

instructions = [
    {
        "role": "user",
        "content": prompt_data,
    }
]

prompt = format_instructions(instructions)
payload = {
    "inputs": prompt,
    "parameters": {"max_new_tokens": 1000, "temperature": 0.2, "top_p": 0.9}
}
response = query_endpoint(payload)
print_instructions(prompt, response)

Creating embedding for question
Found 3 context docs
[1m> Output[0m


        "\"\"\"\n",
        "\n",
        "    return tokenize(full_prompt)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "5f7K7Q7p1CSK"
      },
      "source": [
        "### 5. Dataset\n",
        "Now that I have a tokenize function, I can create a dataset that I can use to train the model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "61-605r_-1CSK"
      },
      "outputs": [],
      "source": [
        "dataset = Dataset.from_pandas(train_df)\n",
        "dataset = dataset.map(generate_and_tokenize_prompt, batched=True)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "2q284RQf1CSK"
      },
      "source": [
        "### 6. Model\n",
        "Now that I have a dataset, I can create a model:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count"