# SageMaker Code Generation with Code Llama: Deploying Pre-trained Code Llama

#### Importing sys and installing the required libraries: LangChain, ChromaDB as our vector database to store indexes, and boto3 for our AWS environment

In [None]:
# Install the notebook's dependencies with the interpreter of the running
# kernel: going through sys.executable makes pip target this environment.
import sys
!{sys.executable} -m pip install langchain
!{sys.executable} -m pip install chromadb
!{sys.executable} -m pip install --upgrade boto3

#### Import the remaining libraries and document loaders, along with utilities such as the recursive character text splitter, so that we can efficiently generate code with our model

In [None]:
import argparse
import os
from langchain.document_loaders import DirectoryLoader
import chromadb
import json
import boto3
import time
import glob
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    Language,
)
import ast
import sys

### Deploy the Code Llama 7B model


In [None]:
# JumpStart identifier of the model to deploy.
model_id = "meta-textgeneration-llama-codellama-7b"

from sagemaker.jumpstart.model import JumpStartModel

# No deployment options are passed, so whatever defaults JumpStartModel
# applies for this model_id are used. `predictor` is the handle the
# following cells use to call and later delete the endpoint.
# NOTE(review): newer SDK releases may require accept_eula=True at deploy
# time for Llama models — confirm against the installed sagemaker version.
model = JumpStartModel(model_id=model_id)
predictor = model.deploy()

In [None]:
# Get the name of the endpoint.
# `Predictor.endpoint` is the deprecated SDK v1 spelling; `endpoint_name` is
# the supported attribute in SageMaker Python SDK v2, which this notebook
# already requires for JumpStartModel.
endpoint_name = str(predictor.endpoint_name)

print(endpoint_name)

In [None]:
def query_endpoint(payload):
    """Send ``payload`` to the deployed endpoint and return the decoded reply.

    The payload is serialized to UTF-8 JSON and posted to the endpoint named
    by the module-level ``endpoint_name``, with ``accept_eula=true`` passed as
    a custom attribute. The reply body is read in full, decoded as UTF-8 and
    parsed as JSON.

    Args:
        payload: JSON-serializable request object.

    Returns:
        The parsed JSON response.
    """
    runtime = boto3.client('runtime.sagemaker')
    request_body = json.dumps(payload).encode('utf-8')
    raw = runtime.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType='application/json',
        Body=request_body,
        CustomAttributes="accept_eula=true",
    )
    # "Body" is a stream; it has to be read and decoded before parsing.
    return json.loads(raw["Body"].read().decode("utf8"))

### Supported parameters

***
This model supports many parameters while performing inference. They include:

* **max_length:** Model generates text until the output length (which includes the input context length) reaches `max_length`. If specified, it must be a positive integer.
* **max_new_tokens:** Model generates text until the output length (excluding the input context length) reaches `max_new_tokens`. If specified, it must be a positive integer.
* **num_beams:** Number of beams used in the beam search. If specified, it must be an integer greater than or equal to `num_return_sequences`.
* **no_repeat_ngram_size:** Model ensures that a sequence of words of `no_repeat_ngram_size` is not repeated in the output sequence. If specified, it must be a positive integer greater than 1.
* **temperature:** Controls the randomness in the output. Higher temperature results in output sequence with low-probability words and lower temperature results in output sequence with high-probability words. If `temperature` -> 0, it results in greedy decoding. If specified, it must be a positive float.
* **early_stopping:** If True, text generation is finished when all beam hypotheses reach the end of sentence token. If specified, it must be boolean.
* **do_sample:** If True, sample the next word as per the likelihood. If specified, it must be boolean.
* **top_k:** In each step of text generation, sample from only the `top_k` most likely words. If specified, it must be a positive integer.
* **top_p:** In each step of text generation, sample from the smallest possible set of words with cumulative probability `top_p`. If specified, it must be a float between 0 and 1.
* **return_full_text:** If True, input text will be part of the output generated text. If specified, it must be boolean. The default value for it is False.
* **stop**: If specified, it must be a list of strings. Text generation stops if any one of the specified strings is generated.

We may specify any subset of the parameters mentioned above while invoking an endpoint. Next, we show an example of how to invoke the endpoint with these arguments.
***

## Code completion without context
***
This section demonstrates how to perform code generation where the expected endpoint response is the natural continuation of the prompt. No context is provided to the model. As seen below, the LLM hallucinates when continuing the code because it has not been trained on the library used in the test.
***

In [None]:
def print_completion(prompt: str, response: dict) -> None:
    """Print a prompt and the model's completion under bold section headers.

    Args:
        prompt: Text that was sent to the endpoint.
        response: Decoded endpoint reply; a mapping that contains a
            ``'generated_text'`` entry.

    Raises:
        KeyError: If ``response`` has no ``'generated_text'`` key.
    """
    # ANSI escape sequences that switch bold rendering on and off.
    bold, unbold = '\033[1m', '\033[0m'
    print(f"{bold}> Input{unbold}\n{prompt}{bold}\n> Output{unbold}\n{response['generated_text']}\n")

In [None]:
%%time

# Prompt: a Python snippet that builds an HTML page in `html_content` and
# ends on the bare expression `html_content`; the model continues from there.
prompt = """\
import sagemaker

# Create an HTML page about Amazon SageMaker
html_content = f'''
<!DOCTYPE html>
<html>
<head>
    <title>Amazon SageMaker</title>
</head>
<body>
    <h1>Welcome to Amazon SageMaker</h1>
    <p>Amazon SageMaker is a fully managed service for building, training, and deploying machine learning models.</p>
    <h2>Key Features</h2>
    <ul>
        <li>Easy to use</li>
        <li>Scalable</li>
        <li>End-to-end machine learning workflow</li>
    </ul>
    <p>Get started with SageMaker today and unlock the power of machine learning!</p>
</body>
</html>
'''

html_content
"""

# Cap generation at 256 new tokens; sample with temperature 0.2 and top_p 0.9.
payload = {"inputs": prompt, "parameters": {"max_new_tokens": 256, "temperature": 0.2, "top_p": 0.9}}
# This cell calls the endpoint through the boto3-based query_endpoint helper.
response = query_endpoint(payload)
print_completion(prompt, response)

# Code completion
***
The examples in this section demonstrate how to perform code generation where the expected endpoint response is the natural continuation of the prompt.
***

In [None]:
def print_completion(prompt: str, response: dict) -> None:
    """Print a prompt and the model's completion under bold section headers.

    Args:
        prompt: Text that was sent to the endpoint.
        response: Decoded endpoint reply; a mapping that contains a
            ``'generated_text'`` entry.

    Raises:
        KeyError: If ``response`` has no ``'generated_text'`` key.
    """
    # ANSI escape sequences that switch bold rendering on and off.
    bold, unbold = '\033[1m', '\033[0m'
    print(f"{bold}> Input{unbold}\n{prompt}{bold}\n> Output{unbold}\n{response['generated_text']}\n")

In [None]:
%%time

# The backslash after the colon escapes the final newline, so the prompt ends
# right at the function signature and the model is left to write the body.
prompt = """\
import socket

def ping_exponential_backoff(host: str):\
"""

# Cap generation at 256 new tokens; sample with temperature 0.2 and top_p 0.9.
payload = {"inputs": prompt, "parameters": {"max_new_tokens": 256, "temperature": 0.2, "top_p": 0.9}}
# accept_eula=true is sent as a custom attribute with the request.
response = predictor.predict(payload, custom_attributes='accept_eula=true')
print_completion(prompt, response)

In [None]:
%%time

# The prompt stops right after the `if __name__ == "__main__":` line (the
# trailing backslash drops the final newline); the model completes the block.
prompt = """\
import argparse

def main(string: str):
    print(string)
    print(string[::-1])

if __name__ == "__main__":\
"""

# Cap generation at 256 new tokens; sample with temperature 0.2 and top_p 0.9.
payload = {"inputs": prompt, "parameters": {"max_new_tokens": 256, "temperature": 0.2, "top_p": 0.9}}
# accept_eula=true is sent as a custom attribute with the request.
response = predictor.predict(payload, custom_attributes='accept_eula=true')
print_completion(prompt, response)

## Code infilling
***
The examples in this section demonstrate how to perform code generation where the expected endpoint response infills text between a prefix and a suffix. Only 7B, 7B-Instruct, 13B, and 13B-Instruct models have this capability, while the non-instruct models have been observed to obtain the best anecdotal performance.
***

In [None]:
def format_infilling(prompt: str) -> str:
    """Turn a ``<FILL>`` template into an infilling prompt for the model.

    The text before the marker becomes the prefix and the text after it the
    suffix, laid out as ``<PRE> {prefix} <SUF>{suffix} <MID>``.

    Args:
        prompt: Template containing exactly one ``<FILL>`` marker.

    Returns:
        The formatted infilling prompt.

    Raises:
        ValueError: If ``prompt`` does not contain exactly one ``<FILL>``.
    """
    parts = prompt.split("<FILL>")
    # Fail with a message that names the marker instead of the opaque
    # tuple-unpacking error a bare `prefix, suffix = ...` would produce.
    if len(parts) != 2:
        raise ValueError(
            f"prompt must contain exactly one '<FILL>' marker, found {len(parts) - 1}"
        )
    prefix, suffix = parts
    return f"<PRE> {prefix} <SUF>{suffix} <MID>"


def print_infilling(prompt: str, response: dict) -> None:
    """Print the template with the generated text spliced in at ``<FILL>``.

    The generated middle section is rendered in green so it stands out from
    the surrounding prefix and suffix.

    Args:
        prompt: Original template containing exactly one ``<FILL>`` marker.
        response: Decoded endpoint reply; a mapping that contains a
            ``'generated_text'`` entry.

    Raises:
        ValueError: If ``prompt`` does not contain exactly one ``<FILL>``.
        KeyError: If ``response`` has no ``'generated_text'`` key.
    """
    # ANSI escape sequences: 256-colour green foreground, then reset.
    green, font_reset = "\x1b[38;5;2m", "\x1b[0m"
    prefix, suffix = prompt.split("<FILL>")
    print(f"{prefix}{green}{response['generated_text']}{font_reset}{suffix}")

In [None]:
%%time

# <FILL> marks the gap the model must fill: everything between the opening
# docstring quotes and `return result`.
prompt = '''\
def remove_non_ascii(s: str) -> str:
    """<FILL>
    return result
'''
# Convert the <FILL> template into the <PRE>/<SUF>/<MID> form sent to the model.
prompt_formatted = format_infilling(prompt)
payload = {
    "inputs": prompt_formatted,
    "parameters": {"max_new_tokens": 256, "temperature": 0.05, "top_p": 0.9}
}
response = predictor.predict(payload, custom_attributes='accept_eula=true')
# Show the original template with the generated middle spliced in.
print_infilling(prompt, response)

In [None]:
%%time

# Infilling on non-Python text: the model must supply the shell commands
# that belong inside the fenced bash block.
prompt = """\
# Installation instructions:
    ```bash
<FILL>
    ```
This downloads the LLaMA inference code and installs the repository as a local pip package.
"""
# Convert the <FILL> template into the <PRE>/<SUF>/<MID> form sent to the model.
prompt_formatted = format_infilling(prompt)
payload = {
    "inputs": prompt_formatted,
    "parameters": {"max_new_tokens": 256, "temperature": 0.05, "top_p": 0.9}
}
response = predictor.predict(payload, custom_attributes='accept_eula=true')
# Show the original template with the generated middle spliced in.
print_infilling(prompt, response)

In [None]:
%%time

# The gap starts inside the __init__ parameter list; the suffix shows how
# the class is used (a `start=` argument and a `build(id=...)` call).
prompt = """\
class InterfaceManagerFactory(AbstractManagerFactory):
    def __init__(<FILL>
def main():
    factory = InterfaceManagerFactory(start=datetime.now())
    managers = []
    for i in range(10):
        managers.append(factory.build(id=i))
"""
# Convert the <FILL> template into the <PRE>/<SUF>/<MID> form sent to the model.
prompt_formatted = format_infilling(prompt)
payload = {
    "inputs": prompt_formatted,
    "parameters": {"max_new_tokens": 256, "temperature": 0.05, "top_p": 0.9}
}
response = predictor.predict(payload, custom_attributes='accept_eula=true')
# Show the original template with the generated middle spliced in.
print_infilling(prompt, response)

In [None]:
%%time

# Infilling on a theorem-prover style snippet (appears to be Lean — confirm):
# the gap is the left-hand side of the second equation in the statement.
prompt = """\
/-- A quasi-prefunctoid is 1-connected iff all its etalisations are 1-connected. -/
theorem connected_iff_etalisation [C D : precategoroid] (P : quasi_prefunctoid C D) :
  π₁ P = 0 ↔ <FILL> = 0 :=
begin
  split,
  { intros h f,
    rw pi_1_etalisation at h,
    simp [h],
    refl
  },
  { intro h,
    have := @quasi_adjoint C D P,
    simp [←pi_1_etalisation, this, h],
    refl
  }
end
"""
# Convert the <FILL> template into the <PRE>/<SUF>/<MID> form sent to the model.
prompt_formatted = format_infilling(prompt)
payload = {
    "inputs": prompt_formatted,
    "parameters": {"max_new_tokens": 256, "temperature": 0.05, "top_p": 0.9}
}
response = predictor.predict(payload, custom_attributes='accept_eula=true')
# Show the original template with the generated middle spliced in.
print_infilling(prompt, response)

## Clean up the endpoint
If you are running the next lab on customizing the Code Llama model, do not delete the endpoint. Otherwise, go ahead and delete the endpoint by running the next cell.

In [None]:
# Tear down the endpoint that model.deploy() created.
# NOTE(review): the SageMaker model resource is not removed here; consider
# calling predictor.delete_model() as well — confirm against the SDK docs.
predictor.delete_endpoint()