In [1]:
# Import libraries
import os
from glob import glob
import pandas as pd
import openai
from openai.embeddings_utils import get_embedding
import tiktoken

# Set OpenAI API key
# The key is read from a local text file instead of being written into the notebook itself.
with open('openai-key.txt', 'r') as f:
    key = f.read().strip()  # strip() drops surrounding whitespace such as a trailing newline

# The key is exposed twice: as an environment variable and as a key-file path on the openai module.
os.environ['OPENAI_API_KEY'] = key
openai.api_key_path = 'openai-key.txt'

# Set location of code repo
# Exactly one repoFolder assignment should be active; the commented lines are alternative repos.
#repoFolder = "fineract/fineract-provider/src/main/java/org/apache/fineract/portfolio/account"
#repoFolder = "openai-cookbook"
repoFolder = "BF_Planning_Poker/modules"

# Set code file extension type
# Written without the leading dot. "py" switches the dataframe cell to per-function rows;
# any other value makes it store one whole file per row.
extType = "js"

In [2]:
# Functions to calculate tokens used in openai models

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` occupies under a named tiktoken encoding.

    Args:
        string: text to tokenize.
        encoding_name: name passed to `tiktoken.get_encoding` (e.g. "cl100k_base").

    Returns:
        The length of the token sequence produced by encoding `string`.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

def num_tokens_from_messages(messages, model="gpt-3.5-turbo-0301"):
    """Estimate the number of prompt tokens used by a list of chat messages.

    Args:
        messages: iterable of dicts whose values are strings (e.g. "role",
            "content" and optionally "name").
        model: model name. "gpt-3.5-turbo" and "gpt-4" are redirected, with a
            printed warning, to the dated snapshots "gpt-3.5-turbo-0301" and
            "gpt-4-0314".

    Returns:
        The encoded length of every value in every message, plus a fixed
        per-message overhead, a per-"name" adjustment and 3 tokens for the
        reply priming.

    Raises:
        NotImplementedError: if `model` is neither a supported snapshot nor
            one of the two redirected aliases.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    # Floating aliases: (snapshot to count against, warning printed before redirecting).
    aliases = {
        "gpt-3.5-turbo": (
            "gpt-3.5-turbo-0301",
            "Warning: gpt-3.5-turbo may change over time. Returning num tokens assuming gpt-3.5-turbo-0301.",
        ),
        "gpt-4": (
            "gpt-4-0314",
            "Warning: gpt-4 may change over time. Returning num tokens assuming gpt-4-0314.",
        ),
    }
    if model in aliases:
        snapshot, warning = aliases[model]
        print(warning)
        return num_tokens_from_messages(messages, model=snapshot)

    # Per-snapshot constants: (tokens_per_message, tokens_per_name).
    overheads = {
        # every message follows <|start|>{role/name}\n{content}<|end|>\n;
        # if there's a name, the role is omitted (hence -1)
        "gpt-3.5-turbo-0301": (4, -1),
        "gpt-4-0314": (3, 1),
    }
    if model not in overheads:
        raise NotImplementedError(f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")
    tokens_per_message, tokens_per_name = overheads[model]

    total = 0
    for message in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(value)) for value in message.values())
        if "name" in message:
            total += tokens_per_name
    # every reply is primed with <|start|>assistant<|message|>
    return total + 3

In [3]:
# Functions to create dataframe for Python repos broken down by individual functions

def get_function_name(code):
    """
    Extract function name from a line beginning with "def "

    Args:
        code: source text whose first characters are "def "; only the part up
            to the first "(" is inspected.

    Returns:
        The text between "def " and the first "(", with surrounding whitespace
        removed so that `def foo (x)` yields "foo" rather than "foo ".

    Raises:
        AssertionError: if `code` does not start with "def ".
        ValueError: if `code` contains no "(".
    """
    assert code.startswith("def "), 'code must start with "def "'
    return code[len("def "): code.index("(")].strip()

def get_until_no_space(all_lines, i) -> str:
    """
    Get all lines until a line outside the function definition is found.

    Starting from line `i` (the "def" line), following lines are collected for
    as long as they are empty or begin with a space, a tab or ")". The first
    line that starts with any other character ends the collection. Empty lines
    directly after the body are therefore included.

    Args:
        all_lines: the file's lines, without line terminators.
        i: index of the line on which the definition starts.

    Returns:
        The collected lines joined with "\n".
    """
    ret = [all_lines[i]]
    # Scan to the end of the file rather than a fixed 10000-line window, so long
    # bodies are not truncated and no iterations are wasted past the last line.
    for j in range(i + 1, len(all_lines)):
        line = all_lines[j]
        if line and line[0] not in (" ", "\t", ")"):
            break
        ret.append(line)
    return "\n".join(ret)

def get_functions(filepath):
    """
    Get all functions in a Python file.

    Only lines that start with "def " at column 0 are treated as function
    starts, so indented definitions (methods, nested functions) are not
    yielded as separate entries.

    Args:
        filepath: path of the Python source file to scan.

    Yields:
        One dict per function with the keys "code" (the function's source
        text), "function_name" and "filepath" (the argument, unchanged).
    """
    # The context manager closes the handle promptly. Text mode already converts
    # "\r\n" and "\r" to "\n", so no manual newline normalisation is needed.
    # UTF-8 is requested explicitly so the result does not depend on the platform's
    # default encoding; undecodable bytes are replaced rather than aborting the scan.
    with open(filepath, encoding="utf-8", errors="replace") as source_file:
        whole_code = source_file.read()
    all_lines = whole_code.split("\n")
    for i, line in enumerate(all_lines):
        if line.startswith("def "):
            code = get_until_no_space(all_lines, i)
            function_name = get_function_name(code)
            yield {"code": code, "function_name": function_name, "filepath": filepath}

In [4]:
# Create dataframe for code embedding

code_root = repoFolder

# Walk code_root recursively and keep every file whose name matches "*.<extType>".
code_files = [y for x in os.walk(code_root) for y in glob(os.path.join(x[0], f"*.{extType}"))]
print(f"Total number of {extType} files:", len(code_files))

all_funcs = []
if extType == "py":
    # Python sources: one row per function found by get_functions().
    for code_file in code_files:
        all_funcs.extend(get_functions(code_file))

    print("Total number of functions extracted:", len(all_funcs))

else:
    # Any other extension: one row per file, holding the whole file's text.
    for code_file in code_files:
        # The context manager closes each handle promptly. Text mode already converts
        # "\r\n" and "\r" to "\n". UTF-8 is requested explicitly so the result does not
        # depend on the platform default; undecodable bytes are replaced, not fatal.
        with open(code_file, encoding="utf-8", errors="replace") as source_file:
            whole_code = source_file.read()

        # Optional: to embed only the text from the first "*/" onwards, use
        #     whole_code = whole_code[whole_code.find("*/"):]

        all_funcs.append({"code": whole_code, "filepath": code_file})

df = pd.DataFrame(all_funcs)
df

Total number of js files: 12


Unnamed: 0,code,filepath
0,"(function(window, angular, undefined) {\n ""us...",BF_Planning_Poker/modules/App.js
1,"(function(window, angular, undefined) {\n ""us...",BF_Planning_Poker/modules/landing/Landing.js
2,"(function(window, angular, undefined) {\n ""us...",BF_Planning_Poker/modules/landing/LandingContr...
3,"(function(window, angular, undefined) {\n ""us...",BF_Planning_Poker/modules/room/ResultsService.js
4,"(function(window, angular, undefined) {\n ""us...",BF_Planning_Poker/modules/room/DeckFactory.js
5,"(function(window, angular, undefined) {\n ""us...",BF_Planning_Poker/modules/room/DeckController.js
6,"(function(window, angular, undefined) {\n ""us...",BF_Planning_Poker/modules/room/Room.js
7,"(function(window, angular, undefined) {\n ""us...",BF_Planning_Poker/modules/room/RoomController.js
8,"(function(window, angular, undefined) {\n ""...",BF_Planning_Poker/modules/siteRequest/SiteRequ...
9,"(function(window, angular, undefined) {\n ""...",BF_Planning_Poker/modules/siteRequest/SiteRequ...


In [5]:
# Create code embeddings for each row in the dataframe by using the text-embedding-ada-002 model

from openai.embeddings_utils import get_embedding
import warnings
warnings.filterwarnings("ignore")

# Embed each row's source text; get_embedding is called once per row.
df['code_embedding'] = df['code'].apply(
    lambda source: get_embedding(source, engine='text-embedding-ada-002')
)

# For Python repos, remove every occurrence of code_root from the stored file paths.
if extType == "py":
    df['filepath'] = df['filepath'].apply(lambda path: path.replace(code_root, ""))

# Calculate tokens used: the summed cl100k_base token counts of all embedded texts.
totalCount = sum(num_tokens_from_string(source, "cl100k_base") for source in df['code'])

print(f"Tokens used for embeddings: {totalCount}.")

# Persist the frame, embeddings included, so later sessions can skip the API calls.
df.to_csv("code_search_openai-python.csv", index=False)
df.head()

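# Instead of running the above code, the below code can read an existing file that already contains code embeddings.
# Note: the CSV stores each embedding as text, so it has to be parsed back into a list of floats before it can be searched.

# import ast
# df = pd.read_csv("code_search_openai-python.csv")
# df["code_embedding"] = df["code_embedding"].apply(ast.literal_eval)
# df.head()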

Tokens used for embeddings: 6566.


Unnamed: 0,code,filepath,code_embedding
0,"(function(window, angular, undefined) {\n ""us...",BF_Planning_Poker/modules/App.js,"[-0.0037332025822252035, 0.017689839005470276,..."
1,"(function(window, angular, undefined) {\n ""us...",BF_Planning_Poker/modules/landing/Landing.js,"[-0.005229437258094549, 0.01890694722533226, -..."
2,"(function(window, angular, undefined) {\n ""us...",BF_Planning_Poker/modules/landing/LandingContr...,"[-0.012401297688484192, 0.012381413020193577, ..."
3,"(function(window, angular, undefined) {\n ""us...",BF_Planning_Poker/modules/room/ResultsService.js,"[-0.010964068584144115, 0.02558058872818947, -..."
4,"(function(window, angular, undefined) {\n ""us...",BF_Planning_Poker/modules/room/DeckFactory.js,"[-0.006666587665677071, 0.004226575139909983, ..."


In [6]:
# Function to rank the dataframe's code rows by cosine similarity between their embeddings and the embedding of a query string (e.g. the user prompt for openai models)

from openai.embeddings_utils import cosine_similarity

def search_functions(df, code_query, n=3, pprint=False, n_lines=7):
    """Return the `n` rows of `df` whose code embedding is most similar to a query.

    The query is embedded with text-embedding-ada-002 and compared with every
    value in the `code_embedding` column by cosine similarity.

    Side effects:
        * `df` is modified in place: a `similarities` column is added or overwritten.
        * The query's cl100k_base token count is printed.

    Args:
        df: frame with `code_embedding`, `code` and `filepath` columns; a
            `function_name` column is optional.
        code_query: text to search for.
        n: number of best-matching rows to return.
        pprint: when True, print a summary of each returned row.
        n_lines: number of leading code lines shown per row when `pprint` is True.

    Returns:
        The top `n` rows, sorted by descending similarity.
    """
    embedding = get_embedding(code_query, engine='text-embedding-ada-002')
    df['similarities'] = df.code_embedding.apply(lambda x: cosine_similarity(x, embedding))

    # Calculate tokens used
    tokenCount = num_tokens_from_string(code_query, "cl100k_base")
    print(f"Tokens used for embeddings: {tokenCount}.")

    res = df.sort_values('similarities', ascending=False).head(n)
    if pprint:
        for _, row in res.iterrows():
            label = row.filepath
            # Frames built from whole files (non-Python repos) have no function_name
            # column; show the file path alone instead of raising AttributeError.
            function_name = row.get("function_name")
            if isinstance(function_name, str):
                label = label + ":" + function_name
            print(label + "  score=" + str(round(row.similarities, 3)))
            print("\n".join(row.code.split("\n")[:n_lines]))
            print('-'*70)
    return res

In [7]:
# Functions to print out response from gpt models. Used when stream = True in the openai.ChatCompletion.create function

# Maps a chat role to the ANSI escape sequence printed in front of that role's text.
color_prefix_by_role = {
    "system": "\033[0m",  # ANSI reset: terminal default color
    "user": "\033[0m",  # ANSI reset: terminal default color
    "assistant": "\033[92m",  # bright green
}


def print_messages(messages, color_prefix_by_role=color_prefix_by_role) -> None:
    """Prints messages sent to or from GPT.

    Each message is printed as the role's color prefix, a blank line, the role
    in square brackets and then the content.

    Args:
        messages: iterable of mappings with "role" and "content" keys.
        color_prefix_by_role: mapping from role name to the string printed
            before that role's output.
    """
    for message in messages:
        role = message["role"]
        # A role with no configured prefix is printed without one rather than
        # raising KeyError.
        color_prefix = color_prefix_by_role.get(role, "")
        content = message["content"]
        print(f"{color_prefix}\n[{role}]\n{content}")


def print_message_delta(delta, color_prefix_by_role=color_prefix_by_role) -> None:
    """Prints a chunk of messages streamed back from GPT.

    A delta carrying a "role" prints the role header; a delta carrying
    "content" prints that text without a trailing newline. Both parts are
    handled independently, so a delta that carries a role and content at the
    same time prints the header followed by the content.

    Args:
        delta: mapping that may contain "role" and/or "content".
        color_prefix_by_role: mapping from role name to the string printed
            before that role's output.
    """
    if "role" in delta:
        role = delta["role"]
        # A role with no configured prefix is printed without one rather than
        # raising KeyError.
        color_prefix = color_prefix_by_role.get(role, "")
        print(f"{color_prefix}\n[{role}]\n", end="")

    content = delta["content"] if "content" in delta else None
    # Empty or missing content prints nothing (a None value would otherwise
    # show up as the literal text "None").
    if content:
        print(content, end="")


In [9]:
# Function to query GPT based on a prompt

def ask(
    query: str,
    df: pd.DataFrame = df,
    model: str = "gpt-3.5-turbo",
    print_message: bool = True,
) -> str:
    """Answers a query using GPT and a dataframe of relevant texts and embeddings.

    The three rows of `df` most similar to `query` (as ranked by
    search_functions) are appended to the prompt as context, the prompt is
    sent to `model`, and the reply text is returned. Token counts for the
    prompt and for the reply are printed.

    Args:
        query: the question or instruction to send.
        df: frame of code rows with embeddings. The default is the `df` object
            that existed when this cell was run; re-run the cell, or pass `df`
            explicitly, after rebuilding the frame.
        model: chat model name, used for the request and for the prompt token estimate.
        print_message: only referenced by the commented-out streaming variant
            below; it has no effect on the non-streaming path.

    Returns:
        The content of the first choice in the chat completion response.
    """
    res = search_functions(df, query, n=3)

    # Concatenate the location and source text of every matching row.
    context = "".join(
        f'''\n\nCode File Location: {row["filepath"]}\nCode: {row["code"]}'''
        for _, row in res.iterrows()
    )

    message = f'''
    Use the below code to respond to this prompt: {query}
    {context}
    '''

    messages = [
        {"role": "system", "content": "You are a world-class developer with an eagle eye for unintended bugs and edge cases. You carefully explain code with great detail and accuracy. You organize your explanations in markdown-formatted, bulleted lists."},
        {"role": "user", "content": message},
    ]

    # Estimate the prompt size for the model that is actually being called.
    # Models the counter does not implement skip the estimate instead of
    # aborting the request.
    try:
        numTokens = num_tokens_from_messages(messages, model=model)
    except NotImplementedError:
        print(f"Prompt token estimate is not implemented for {model}; skipping.")
    else:
        print(f"Tokens used in prompt for {model}: {numTokens}.")

    response = openai.ChatCompletion.create(
        model=model,
        messages=messages,
        temperature=0.5,
        #stream=True
    )

    # Run the following code if stream = True in the above openai.ChatCompletion.create params
    #     execution = ""
    #     tokenCount = 0
    #     for chunk in response:
    #         delta = chunk["choices"][0]["delta"]
    #         if print_message:
    #             print_message_delta(delta)
    #             tokenCount += 1
    #         if "content" in delta:
    #             execution += delta["content"]
    #
    #     print(f"Tokens used in OpenAI response: {tokenCount}")

    print(f'''Tokens used in OpenAI response: {response["usage"]["completion_tokens"]}.''')
    response_message = response["choices"][0]["message"]["content"]

    return response_message

In [10]:
#response = ask('Using the gradle framework, write the code for one unit test to test transferring money from one savings account to another.')
# Only the query is passed, so ask() uses its default dataframe and model; the reply text is then printed.
response = ask('Write the code for only 1 very basic unit test for the RoomController function.')
print(response)

Tokens used for embeddings: 17.
Tokens used in prompt for gpt-3.5-turbo: 1438.
Tokens used in OpenAI response: 278.
Basic Unit Test for RoomController function:

1. Test that the function 'resetVotes' resets all user votes to null and sets the reveal and results properties of the room object to their default values.

Test Code:

```
describe('RoomController', function() {
  beforeEach(module('ATS.Room'));

  var $controller;

  beforeEach(inject(function(_$controller_){
    $controller = _$controller_;
  }));

  describe('$scope.reset', function() {
    it('should reset all user votes to null and set reveal and results properties to default values', function() {
      var $scope = {};
      var controller = $controller('RoomController', { $scope: $scope });

      $scope.room = {
        users: {
          user1: {
            vote: {
              val: 5,
              text: '5'
            }
          },
          user2: {
            vote: {
              val: 3,
              text: