## CODE BLOCK 1

### Description
Installs the OpenAI Python library with pip.

In [None]:
!pip install openai



## CODE BLOCK 2

### Description
Imports the required Python libraries and sets up the configuration:
- OpenAI API key setup
- Code repository location

In [None]:
# Import libraries
import os
from glob import glob
import pandas as pd
import openai

# Set OpenAI API key
# The key is read from a local text file; surrounding whitespace/newlines are stripped.
with open('openai-key.txt', 'r') as f:
    key = f.read().strip()

# NOTE(review): this assignment happens after `import openai`; presumably the
# `openai.api_key_path` assignment at the end of this cell is what the client
# actually relies on — confirm before removing either one.
os.environ['OPENAI_API_KEY'] = key

# Set location of code repo
# The CODE_REPO_FOLDER environment variable, when set, overrides the
# machine-specific default below so the notebook can run on another machine
# without editing this cell.
repoFolder = os.environ.get(
    "CODE_REPO_FOLDER",
    "/Users/blakemeisenheimer/Documents/breakfree/bench2023/AI_ML/fine-tuning/dragon-bank",
)
#repoFolder = "openai-cookbook"

# Set code file extension type
# "py" makes the later cells split files into individual functions; any other
# value makes them treat each file as a single record.
extType = "py"

# Set the OpenAI key path
openai.api_key_path = 'openai-key.txt'

## CODE BLOCK 3

### Description
Defines the functions for creating the dataframe for Python repositories broken down by individual functions


In [None]:
# Functions to create dataframe for Python repos broken down by individual functions

def get_function_name(code):
    """
    Extract the function name from source text beginning with "def ".

    Args:
        code: Source text of a function definition; it must start with "def "
            and contain an opening parenthesis.

    Returns:
        The text between the leading "def " and the first "(", with
        surrounding whitespace removed.

    Raises:
        AssertionError: If ``code`` does not start with "def ".
        ValueError: If ``code`` contains no "(" (raised by ``str.index``).
    """
    assert code.startswith("def "), 'expected source text beginning with "def "'
    # Strip so that "def foo (x):" yields "foo" rather than "foo ".
    return code[len("def "): code.index("(")].strip()

def get_until_no_space(all_lines, i) -> str:
    """
    Collect the source of the definition that starts at line ``i``.

    Starting from the line after ``i``, lines are accumulated for as long as
    they are empty or begin with a space, a tab, or ")" (the closing line of a
    multi-line signature). The first line that matches none of these ends the
    definition. Empty lines that directly follow the body are therefore
    included in the result.

    Args:
        all_lines: The file's lines, without trailing newline characters.
        i: Index of the first line of the definition.

    Returns:
        The collected lines joined with "\n".
    """
    ret = [all_lines[i]]
    # Bound the scan by the real number of lines rather than a fixed window,
    # so definitions of any length are captured in full.
    for j in range(i + 1, len(all_lines)):
        line = all_lines[j]
        if len(line) == 0 or line[0] in (" ", "\t", ")"):
            ret.append(line)
        else:
            break
    return "\n".join(ret)

def get_functions(filepath):
    """
    Yield one record for each function defined at column 0 of a Python file.

    Only lines that start with "def " are recognised, so indented definitions
    (methods, nested functions) and ``async def`` functions are not yielded
    on their own.

    Args:
        filepath: Path of the Python source file to scan.

    Yields:
        dict with the keys "function_name", "code" (the function's source
        text) and "filepath" (the path exactly as passed in).
    """
    # The context manager closes the file as soon as it has been read instead
    # of leaving the handle open until garbage collection.
    with open(filepath) as source_file:
        # Text mode already normalises line endings; the replace is kept as a
        # harmless safeguard for stray carriage returns.
        whole_code = source_file.read().replace("\r", "\n")
    all_lines = whole_code.split("\n")
    for i, l in enumerate(all_lines):
        if l.startswith("def "):
            code = get_until_no_space(all_lines, i)
            function_name = get_function_name(code)
            yield {"function_name": function_name, "code": code, "filepath": filepath}

## CODE BLOCK 4

### Description
Creates the pandas dataframe to perform code embedding

In [None]:
# Create dataframe for code embedding

code_root = repoFolder

# Walk the whole tree under code_root and collect every file with the
# configured extension.
code_files = [y for x in os.walk(code_root) for y in glob(os.path.join(x[0], f"*.{extType}"))]
print(f"Total number of {extType} files:", len(code_files))

all_funcs = []
if extType == "py":
    # Python files are broken down into one record per extracted function.
    for code_file in code_files:
        all_funcs.extend(get_functions(code_file))

    print("Total number of functions extracted:", len(all_funcs))

else:
    # Any other file type is kept whole: one record per file.
    # NOTE(review): these records carry no "function_name" key; cells that read
    # that column presumably assume extType == "py" — confirm.
    for code_file in code_files:
        # The context manager closes each file once it has been read.
        with open(code_file) as source_file:
            whole_code = source_file.read().replace("\r", "\n")
        all_funcs.append({"code": whole_code, "filepath": code_file})


df = pd.DataFrame(all_funcs)
df

# test = pd.DataFrame(all_funcs)
# df = test.head()

# df

Total number of py files: 1
Total number of functions extracted: 23


Unnamed: 0,function_name,code,filepath
0,get_data,"def get_data():\n """"""\n Reads the bank i...",/Users/blakemeisenheimer/Documents/breakfree/b...
1,set_data,"def set_data(data):\n """"""\n Writes the b...",/Users/blakemeisenheimer/Documents/breakfree/b...
2,get_users_as_list,"def get_users_as_list():\n """"""\n This fu...",/Users/blakemeisenheimer/Documents/breakfree/b...
3,list_to_linked_list,"def list_to_linked_list(arr):\n """"""\n Co...",/Users/blakemeisenheimer/Documents/breakfree/b...
4,heap_sort,"def heap_sort(input_list, field):\n """"""\n ...",/Users/blakemeisenheimer/Documents/breakfree/b...
5,swap,"def swap(input_list, a, b):\n """"""\n Swap...",/Users/blakemeisenheimer/Documents/breakfree/b...
6,sift_down,"def sift_down(input_list, field, start_index, ...",/Users/blakemeisenheimer/Documents/breakfree/b...
7,text_binary_search,"def text_binary_search(input_list, field, quer...",/Users/blakemeisenheimer/Documents/breakfree/b...
8,make_text_searchable,"def make_text_searchable(text):\n """"""\n ...",/Users/blakemeisenheimer/Documents/breakfree/b...
9,generate_account_number,"def generate_account_number():\n """"""\n G...",/Users/blakemeisenheimer/Documents/breakfree/b...


## CODE BLOCK 5

### Description
Prepares the DataFrame for code embedding. The cell is meant to compute an embedding for each code snippet in the 'code' column using the 'text-embedding-ada-002' engine and then save the DataFrame to a CSV file, but both of those steps are currently commented out. As written, it only strips the repository root from each 'filepath' value (for Python repositories) and displays the initial rows of the DataFrame.

In [None]:
from openai.embeddings_utils import get_embedding
import warnings
warnings.filterwarnings("ignore")

import json

# df['code_embedding'] = df['code'].apply(lambda x: get_embedding(x, engine='text-embedding-ada-002'))
if extType == "py":
    # Make each path relative to the repo root by dropping only the leading
    # code_root prefix; a plain str.replace would also delete any later
    # occurrence of the same text inside the path.
    df['filepath'] = df['filepath'].apply(
        lambda x: x[len(code_root):] if x.startswith(code_root) else x
    )
# df.to_csv("code_search_openai-python.csv", index=False)

# Display the first rows as the cell output.
df.head()


####
## Testing dictionary values cleanup after converting dictionary to json
####
# for i in df:
#     print(i)
#     print(df[i])

# def dumper(df):
#     try:
#         return df.toJSON()
#     except:
#         return df.__dict__
# df = df.to_json()

# res = json.dumps(df)
# print(res)
# with open('result.json', 'w') as fp:
#     json.dump(df, fp)

Unnamed: 0,function_name,code,filepath
0,get_data,"def get_data():\n """"""\n Reads the bank i...",/bank.py
1,set_data,"def set_data(data):\n """"""\n Writes the b...",/bank.py
2,get_users_as_list,"def get_users_as_list():\n """"""\n This fu...",/bank.py
3,list_to_linked_list,"def list_to_linked_list(arr):\n """"""\n Co...",/bank.py
4,heap_sort,"def heap_sort(input_list, field):\n """"""\n ...",/bank.py


## Code Block 6
### Description
Iterate over the rows of the DataFrame and have the GPT API describe each function.

In [None]:
from time import time,sleep

def gpt3_completion(prompt, engine='gpt-3.5-turbo', temp=1.0, top_p=1.0, tokens=1000, freq_pen=0.25, pres_pen=0.25, stop=['<<END>>']):
    """
    Send chat messages to the OpenAI ChatCompletion API and return the reply text.

    Args:
        prompt: Chat messages, passed to the API as ``messages``.
        engine: Model name, passed to the API as ``model``.
        temp: Sampling temperature, passed to the API as ``temperature``.
        top_p, tokens, freq_pen, pres_pen, stop: Accepted so existing callers
            keep working, but NOT forwarded to the API; changing them has no
            effect. ``stop`` is never mutated, so its list default is safe.

    Returns:
        The message content of the first choice. If all 5 attempts fail, the
        string "GPT3 error: <exception>" is returned instead of raising, so
        callers that need to detect failure must check for that prefix.
    """
    max_retry = 5
    for attempt in range(1, max_retry + 1):
        try:
            response = openai.ChatCompletion.create(
                model=engine,
                messages=prompt,
                temperature=temp)
            return response["choices"][0]["message"]["content"]
        except Exception as oops:
            # Deliberately broad: any failure (network, rate limit, malformed
            # response) is retried and finally reported as a string.
            if attempt >= max_retry:
                return "GPT3 error: %s" % oops
            print('Error communicating with OpenAI:', oops)
            # Exponential backoff (1, 2, 4, 8 seconds) gives rate limits time
            # to clear instead of retrying at a fixed 1-second interval.
            sleep(2 ** (attempt - 1))

In [None]:



count = 0
df['summary'] = None  # placeholder column, filled in one row at a time below

# Ask the model for a plain-English description of every extracted function
# and store the reply next to the function it describes.
for index, function_name, code in df[['function_name', 'code']].itertuples(name=None):
    prompt = f'''Summarize in english what the python function {function_name}
        does and list the required input parameters, output parameters 
        and any external functions called through the following code.
        {code}'''
    # The whole request is a single user message.
    df.at[index, 'summary'] = gpt3_completion([{"role": "user", "content": prompt}])





In [None]:

# df["summary"].to_csv("summary.csv", index=False)
# df

# for index, row in df.iterrows():
#     function_name = row['function_name']
#     code = row['code']
#     summary = row['summary']
#     prompt = f'''{"prompt": "Function_name: {function_name}\n\n Description: {summary}\n\n completion: {code} }'''

# {"prompt": "Function_name: {function_name}\n\n Description: {summary}\n\n completion: {code} }

import json

# Build one prompt/completion record per function: the prompt carries the
# function name and its generated description, the completion is the source.
listOfDicts = [
    {
        'prompt': f"Function_name: {function_name}\n\n Description: {summary}\n\n",
        'completion': code,
    }
    for function_name, code, summary in zip(df['function_name'], df['code'], df['summary'])
]
jsonObj = json.dumps(listOfDicts)

# The file is written as JSON Lines: one JSON object per line.
with open("DragonBank.json", "w") as out_file:
    out_file.writelines(json.dumps(record) + '\n' for record in listOfDicts)
