In [1]:
#Setup installation of packages and what to import/consume later:
%pip install scipy
%pip install tokenizers
%pip install openai

import regex
import IPython.display
import os
import shutil
import subprocess
import openai
import re
from tokenizers import Tokenizer
from tokenizers import CharBPETokenizer as CBPET
from tokenizers import ByteLevelBPETokenizer as BBPET
from tokenizers import SentencePieceBPETokenizer as SPBPET
from tokenizers import BertWordPieceTokenizer as BWPT
from tokenizers.trainers import BpeTrainer
import ast  # for converting embeddings saved as strings back to arrays
import pandas as pd  # for storing text and embeddings data
import tiktoken  # for counting tokens
from scipy import spatial  # for calculating vector similarities for search


Note: you may need to restart the kernel to use updated packages.



In [2]:
#Set the opeanAI API key:
openai.api_key_path = 'openai-key.txt'

In [3]:
"""Setup a function that cleans up the Java Code slightly 
Removes 
    single-line comments
    multi-line comments
    empty new lines
"""

def remove_comments(java_code):
    # Remove single-line comments
    java_code = re.sub(r"//.*", "", java_code)

    # Remove multi-line comments
    java_code = re.sub(r"/\*(.|[\r\n])*?\*/", "", java_code)

    # Remove empty white lines:
    java_code = re.sub(r"\n\s*\n", "\n", java_code)

    return java_code

In [10]:
"""This cell just checks the occurrences of "test" in dirs
"""

repo_dir = "fineract" #Comment this out later maybe?

line_count = 0
for root, dirs, files in os.walk(repo_dir):
    for file_name in files:
        file_path = os.path.join(root, file_name)
        if file_name.endswith(".java"):
            with open(file_path, 'r') as file:
                for line_number, line in enumerate(file, start=1):
                    if 'test' in line:
                        if line_count == 0:
                            print(f"{file_path}:")
                        line_count += 1
                        print(f"Line {line_number}: {line.strip()}")

print(f"Total lines with 'test': {line_count}\n")

new-fineract/custom/acme/loan/job/src/test/java/AcmeNoopJobTaskletTest.java:
Line 47: public void testJobExecution() throws Exception {
Line 31: import org.springframework.boot.test.context.runner.ApplicationContextRunner;
Line 23: import org.springframework.test.context.TestPropertySource;
Line 24: import org.springframework.test.context.junit.jupiter.SpringExtension;
Line 25: import org.springframework.test.context.web.WebAppConfiguration;
Line 25: import org.springframework.boot.test.context.runner.ApplicationContextRunner;
Line 23: import org.springframework.test.context.TestPropertySource;
Line 24: import org.springframework.test.context.junit.jupiter.SpringExtension;
Line 25: import org.springframework.test.context.web.WebAppConfiguration;
Line 70: * code which would like to handle non-success HTTP responses, notably tests asserting non-200 results.
Line 472: * This is intended for https://localhost:8443/ testing of development servers with self-signed certificates,
Line 54: // p

In [4]:
"""Set up the codebase chunks and embed them
    - Clone the repo down (commented out right now so I don't have to wait for a clone each time it fails)
    - Extract & Clean (using above remove_comments() function) the java files
    - Chunk the java files into 1600 token chunks
    - Store the chunks in memory
    - Send the chunks off to openAI's embedding API endpoint (using text-embedding-ada-002)
    - Store the embedded text as a Pandas Dataframe
    - Print the dataframe (just to be sure)    
"""

repo_dir = "fineract"  # Use the cleaned repo
# Clone the GitHub repository -- should really only need to do this once
# repo_url = "https://github.com/apache/fineract"
# subprocess.run(["git", "clone", "-b", "1.8.4", repo_url, repo_dir]) # Grab the 1.8.4 fineract git branch just because it's stable

# Set up the tokenizer
# tokenizer = BBPET()  # Byte
tokenizer = SPBPET()   # Sentence

# Train the tokenizer on the Java files
java_files = []
for root, dirs, files in os.walk(repo_dir):
    if dirs == "test":
        os.rmdir(dirs)
    for file in files:
        if file.endswith(".java"):
            java_files.append(os.path.join(root, file))

# Process the Java files and break them into 1600 token chunks
# output_dir = "java_files_chunks_byte"
new_dir = "fineract-java"
output_dir = "java_files_chunks_sentence"
os.makedirs(output_dir, exist_ok=True)
os.makedirs(new_dir, exist_ok=True)

for file in java_files:
    with open(file, "r") as f:
        java_code = f.read()

    # Use the remove_comments function to redefine java_code as sans-comments java code:
    java_code = remove_comments(java_code)

    new_filename = f"cleaned-{os.path.basename(file)}"
    new_filepath = os.path.join(new_dir, new_filename)

    with open(new_filepath, "w") as f:
        f.write(java_code)

new_java_files = []
for root, dirs, files in os.walk(new_dir):
    if dirs == "test":
        os.rmdir(dirs)
    for file in files:
        if file.endswith(".java"):
            new_java_files.append(os.path.join(root, file))
# tokenizer_trainer = BpeTrainer(vocab_size=1600, min_frequency=2)
tokenizer.train(new_java_files)

output_content = []
for file in new_java_files:
    with open(file, "r") as f:
        java_code = f.read()

    encoding = tokenizer.encode(java_code)
    tokens = encoding.tokens
    ids = encoding.ids
    chunk_size = 1600
    num_chunks = (len(tokens) + chunk_size - 1) // chunk_size

    for i in range(num_chunks):
        start = i * chunk_size
        end = (i + 1) * chunk_size
        chunk_tokens = tokens[start:end]
        chunk_ids = ids[start:end]
        chunk_code = tokenizer.decode(chunk_ids)

        chunk_filename = os.path.basename(file) + f".chunk{i+1}.java"
        chunk_filepath = os.path.join(output_dir, chunk_filename)

        output_content.append(chunk_code)
        # If you want to visualize the tokenized java chunks, uncomment the following two lines:
        # with open(chunk_filepath, "w") as f:
        #     f.write(chunk_code)

EMBEDDING_MODEL = "text-embedding-ada-002"

# Number of token chunks to send at a time
# OpenAI's example specifies 1000 but I've had greater success with 100
BATCH_SIZE = 100

embeddings = []
for batch_start in range(0, len(output_content), BATCH_SIZE):
    batch_end = batch_start + BATCH_SIZE
    batch = output_content[batch_start:batch_end]
    print(f"Batch {batch_start} to {batch_end-1}")

    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)
    for i, be in enumerate(response["data"]):
        assert i == be["index"]  # double check embeddings are in same order as input
    batch_embeddings = [e["embedding"] for e in response["data"]]
    embeddings.extend(batch_embeddings)

df = pd.DataFrame({"text": output_content, "embedding": embeddings})

# Print out the Dataframe, just to be sure:
df

Batch 0 to 99
Batch 100 to 199
Batch 200 to 299
Batch 300 to 399
Batch 400 to 499
Batch 500 to 599
Batch 600 to 699
Batch 700 to 799
Batch 800 to 899
Batch 900 to 999
Batch 1000 to 1099
Batch 1100 to 1199
Batch 1200 to 1299
Batch 1300 to 1399
Batch 1400 to 1499
Batch 1500 to 1599
Batch 1600 to 1699
Batch 1700 to 1799
Batch 1800 to 1899
Batch 1900 to 1999
Batch 2000 to 2099
Batch 2100 to 2199
Batch 2200 to 2299
Batch 2300 to 2399
Batch 2400 to 2499
Batch 2500 to 2599
Batch 2600 to 2699
Batch 2700 to 2799
Batch 2800 to 2899
Batch 2900 to 2999
Batch 3000 to 3099
Batch 3100 to 3199
Batch 3200 to 3299
Batch 3300 to 3399
Batch 3400 to 3499
Batch 3500 to 3599
Batch 3600 to 3699
Batch 3700 to 3799
Batch 3800 to 3899
Batch 3900 to 3999
Batch 4000 to 4099
Batch 4100 to 4199
Batch 4200 to 4299
Batch 4300 to 4399
Batch 4400 to 4499
Batch 4500 to 4599
Batch 4600 to 4699
Batch 4700 to 4799
Batch 4800 to 4899
Batch 4900 to 4999
Batch 5000 to 5099
Batch 5100 to 5199
Batch 5200 to 5299
Batch 5300 to 53

Unnamed: 0,text,embedding
0,\npackage org.apache.fineract.infrastructure.c...,"[-0.004885378293693066, -0.0031812922097742558..."
1,\npackage org.apache.fineract.infrastructure.c...,"[0.0057373046875, 0.01892096921801567, 0.00506..."
2,\npackage org.apache.fineract.infrastructure.c...,"[-0.0013821504544466734, 0.0016122340457513928..."
3,\npackage org.apache.fineract.infrastructure.c...,"[-0.005346867721527815, -0.0030837294179946184..."
4,\npackage org.apache.fineract.portfolio.busine...,"[-0.01613900251686573, -0.019924694672226906, ..."
...,...,...
5322,\npackage org.apache.fineract.integrationtests...,"[0.00895959697663784, 0.009982196614146233, -0..."
5323,\npackage org.apache.fineract.mix.exception;\n...,"[-0.008838596753776073, -0.021555285900831223,..."
5324,\npackage org.apache.fineract.mix.service;\nim...,"[-0.017134567722678185, 0.013209346681833267, ..."
5325,\npackage org.apache.fineract.mix.service;\nim...,"[-0.016589995473623276, 0.02383294515311718, -..."


In [18]:
# Actually store the chunks into memory after making the local repo
"""
There's definitely a better way to do this
I'm first writing the entire repo into chunked files and then reading them
    into memory to then pass it off to be embedded by AI
"""
output_content = []
for output_file in os.listdir(output_dir):
    output_file_path = os.path.join(output_dir, output_file)
    if os.path.isfile(output_file_path):
        with open(output_file_path, "r") as f:
            output_content.append(f.read())

In [19]:
"""Create a Pandas Dataframe against the embedded text from OpenAI's API endpoint
"""

EMBEDDING_MODEL = "text-embedding-ada-002"

# Number of token chunks to send at a time
# OpenAI's example specifies 1000 but I've had greater success with 100
BATCH_SIZE = 100

embeddings = []
for batch_start in range(0, len(output_content), BATCH_SIZE):
    batch_end = batch_start + BATCH_SIZE
    batch = output_content[batch_start:batch_end]
    print(f"Batch {batch_start} to {batch_end-1}")

    response = openai.Embedding.create(model=EMBEDDING_MODEL, input=batch)
    for i, be in enumerate(response["data"]):
        assert i == be["index"]  # double check embeddings are in same order as input
    batch_embeddings = [e["embedding"] for e in response["data"]]
    embeddings.extend(batch_embeddings)

df = pd.DataFrame({"text": output_content, "embedding": embeddings})

Batch 0 to 99
Batch 100 to 199
Batch 200 to 299
Batch 300 to 399
Batch 400 to 499
Batch 500 to 599
Batch 600 to 699
Batch 700 to 799
Batch 800 to 899
Batch 900 to 999
Batch 1000 to 1099
Batch 1100 to 1199
Batch 1200 to 1299
Batch 1300 to 1399
Batch 1400 to 1499
Batch 1500 to 1599
Batch 1600 to 1699
Batch 1700 to 1799
Batch 1800 to 1899
Batch 1900 to 1999
Batch 2000 to 2099
Batch 2100 to 2199
Batch 2200 to 2299
Batch 2300 to 2399
Batch 2400 to 2499
Batch 2500 to 2599
Batch 2600 to 2699
Batch 2700 to 2799
Batch 2800 to 2899
Batch 2900 to 2999
Batch 3000 to 3099
Batch 3100 to 3199
Batch 3200 to 3299
Batch 3300 to 3399
Batch 3400 to 3499
Batch 3500 to 3599
Batch 3600 to 3699
Batch 3700 to 3799
Batch 3800 to 3899
Batch 3900 to 3999
Batch 4000 to 4099
Batch 4100 to 4199
Batch 4200 to 4299
Batch 4300 to 4399
Batch 4400 to 4499
Batch 4500 to 4599
Batch 4600 to 4699
Batch 4700 to 4799


In [20]:
# Create the directory so that py is happy
os.makedirs("data", exist_ok=True)

# save document chunks and embeddings
SAVE_PATH = "data/fineract.csv"

df.to_csv(SAVE_PATH, index=False)

In [21]:
embeddings_path = SAVE_PATH

dataframe_consumed = pd.read_csv(embeddings_path)

In [22]:
# df['embedding'] = df['embedding'].apply(ast.literal_eval)
dataframe_consumed['embedding'] = dataframe_consumed['embedding'].apply(ast.literal_eval)

In [23]:
# df
dataframe_consumed

Unnamed: 0,text,embedding
0,\npackage org.apache.fineract.infrastructure.c...,"[-0.004885378293693066, -0.0031812922097742558..."
1,\npackage org.apache.fineract.infrastructure.c...,"[0.0057373046875, 0.01892096921801567, 0.00506..."
2,\npackage org.apache.fineract.infrastructure.c...,"[-0.0013821504544466734, 0.0016122340457513928..."
3,\npackage org.apache.fineract.infrastructure.c...,"[-0.005346867721527815, -0.0030837294179946184..."
4,\npackage org.apache.fineract.portfolio.busine...,"[-0.01613900251686573, -0.019924694672226906, ..."
...,...,...
4776,\npackage org.apache.fineract.mix.data;\nimpor...,"[-0.011889135465025902, 0.015257378108799458, ..."
4777,\npackage org.apache.fineract.mix.exception;\n...,"[-0.008838596753776073, -0.021555285900831223,..."
4778,\npackage org.apache.fineract.mix.service;\nim...,"[-0.017134567722678185, 0.013209346681833267, ..."
4779,\npackage org.apache.fineract.mix.service;\nim...,"[-0.016249122098088264, 0.015869341790676117, ..."


In [24]:
"""
Setup the vector search function that scrapes through your dataframe and/or CSV file to find related code based on your prompt
"""

# search function
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100
) -> tuple[list[str], list[float]]:
    """Returns a list of strings and relatednesses, sorted from most related to least."""
    query_embedding_response = openai.Embedding.create(
        model=EMBEDDING_MODEL,
        input=query,
    )
    query_embedding = query_embedding_response["data"][0]["embedding"]
    strings_and_relatednesses = [
        (row["text"], relatedness_fn(query_embedding, row["embedding"]))
        for i, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n], relatednesses[:top_n]

In [27]:
"""
Examples -- this just shows the underlying relatedness scoring
    Don't actually need to run this, but it's good to understand
    what it's doing under the hood
"""

strings, relatednesses = strings_ranked_by_relatedness("Create a new unit test for public class CodesApiResource", dataframe_consumed, top_n=5)
# strings, relatednesses = strings_ranked_by_relatedness("Provide the code for a new unit test for the /v1/accounts/{type} path", dataframe_consumed, top_n=5)

# strings, relatednesses = strings_ranked_by_relatedness("Write a new unit test for the /v1/accounts/{type} path", dataframe_consumed, top_n=5)
for string, relatedness in zip(strings, relatednesses):
    print(f"{relatedness=:.3f}")
    print(string)

relatedness=0.796

package org.apache.fineract.infrastructure.codes.api;
import io.swagger.v3.oas.annotations.media.Schema;
final class CodesApiResourceSwagger {
    private CodesApiResourceSwagger() {
    }
    @Schema(description = "GetCodesResponse")
    public static final class GetCodesResponse {
        private GetCodesResponse() {
        }
        @Schema(example = "1")
        public Long id;
        @Schema(example = "Education")
        public String name;
        @Schema(example = "true")
        public boolean systemDefined;
    }
    @Schema(description = "PostCodesRequest")
    public static final class PostCodesRequest {
        private PostCodesRequest() {
        }
        @Schema(example = "MyNewCode")
        public String name;
    }
    @Schema(description = "PostCodesResponse")
    public static final class PostCodesResponse {
        private PostCodesResponse() {
        }
        @Schema(example = "4")
        public Long resourceId;
    }
    @Schema(descripti

In [35]:
"""
Define the functions for how to reach out to ChatGPT
    use by calling ask(<prompt>)
    Can optionally get the hidden context printed by typing

        ask(<prompt>, print_message=True)
"""

GPT_MODEL = "gpt-3.5-turbo"

def num_tokens(text: str, model: str = GPT_MODEL) -> int:
    """Return the number of tokens in a string."""
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(text))


def query_message(
    query: str,
    df: pd.DataFrame,
    model: str,
    token_budget: int
) -> str:
    """Return a message for GPT, with relevant source texts pulled from a dataframe."""
    strings, relatednesses = strings_ranked_by_relatedness(query, df)
    introduction = 'Use the below codeset from the fineract java web application to answer the subsequent question. If the answer cannot be found from the code sample, write "I could not find an answer."'
    question = f"\n\nQuestion: {query}"
    message = introduction
    for string in strings:
        next_article = f'\n\nFineract Application codebase selection:\n"""\n{string}\n"""'
        if (
            num_tokens(message + next_article + question, model=model)
            > token_budget
        ):
            break
        else:
            message += next_article
    return message + question


def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = GPT_MODEL,
    token_budget: int = 4096 - 500,
    print_message: bool = False,
) -> str:
    """Answers a query using GPT and a dataframe of relevant texts and embeddings."""
    message = query_message(query, df, model=model, token_budget=token_budget)
    if print_message:
        print(message)
    messages = [
        {"role": "system", "content": "You answer questions about the Fineract Web Application."},
        {"role": "user", "content": message},
    ]
    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0
    )
    response_message = response["choices"][0]["message"]["content"]
    return response_message

In [36]:
"""
Call out to ChatGPT

Syntax:
    
    ask("<prompt>", print_message=<True/False>)

print_message defaults to false; if set to True it displays the hidden context along with the answer provided by ChatGPT
"""

# Ask a question while hiding the context:
ask("Create a new unit test for public class CodesApiResource",print_message=True)

# Ask a question while showing the context:
# ask('Write a unit test for the /v1/accounts/{type} path', print_message=True)

Use the below codeset from the fineract java web application to answer the subsequent question. If the answer cannot be found from the code sample, write "I could not find an answer."

Fineract Application codebase selection:
"""

package org.apache.fineract.infrastructure.codes.api;
import io.swagger.v3.oas.annotations.media.Schema;
final class CodesApiResourceSwagger {
    private CodesApiResourceSwagger() {
    }
    @Schema(description = "GetCodesResponse")
    public static final class GetCodesResponse {
        private GetCodesResponse() {
        }
        @Schema(example = "1")
        public Long id;
        @Schema(example = "Education")
        public String name;
        @Schema(example = "true")
        public boolean systemDefined;
    }
    @Schema(description = "PostCodesRequest")
    public static final class PostCodesRequest {
        private PostCodesRequest() {
        }
        @Schema(example = "MyNewCode")
        public String name;
    }
    @Schema(description

InvalidRequestError: The model: `code-davinci-002` does not exist