### Code Indexer

Takes a GitHub directory URL and returns a dictionary where key = file path and value = the file's base64-encoded content.

In [None]:
import requests
import base64
import urllib
from urllib.parse import urlparse
import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the
# os.getenv() lookups in this notebook can find them.
load_dotenv()

# Basic-auth credentials for the GitHub API, read from the environment.
# If either variable is missing, fall back to None (unauthenticated requests)
# instead of a (None, None) tuple, which requests would send as bogus
# credentials and GitHub would reject.
_github_user = os.getenv("GITHUB_USER")
_github_pat = os.getenv("GITHUB_PAT")
MY_AUTH = (_github_user, _github_pat) if _github_user and _github_pat else None

def parse_url(repo_url):
    """Extract the repository owner and name from a GitHub URL.

    Only the first two path segments are used, so deeper links such as
    ``https://github.com/<owner>/<repo>/tree/<branch>/<dir>`` work as well as
    plain repository URLs. A trailing ``.git`` (as found in clone URLs) is
    dropped so the result can be placed directly into GitHub API paths.

    Args:
        repo_url: URL whose path starts with ``/<owner>/<repo>``.

    Returns:
        An ``(owner, repo)`` tuple of strings.

    Raises:
        IndexError: if the URL path has fewer than two segments.
    """
    parts = urlparse(repo_url).path.split("/")
    # parts[0] is the empty string before the leading "/".
    owner = parts[1]
    repo = parts[2]
    if repo.endswith(".git"):
        repo = repo[:-len(".git")]
    return owner, repo

def get_tree(owner, repo, branch='master', path_prefix='app/', timeout=30):
    """List the file (blob) entries of a repository branch under a path prefix.

    Calls the GitHub Git Trees API with ``recursive=1`` and keeps only entries
    of type ``blob`` whose path starts with ``path_prefix``.

    Args:
        owner: Repository owner (user or organisation).
        repo: Repository name.
        branch: Branch (or other tree-ish) to list. Defaults to ``'master'``.
        path_prefix: Only paths starting with this string are returned.
            Defaults to ``'app/'``; pass ``''`` to keep every file.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of tree-entry dicts as returned by the API, or an empty list
        when the response status is not 200.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the request exceeds ``timeout``.
    """
    response = requests.get(
        f'https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1',
        auth=MY_AUTH,
        # Without a timeout requests can block indefinitely on a stalled connection.
        timeout=timeout,
    )
    if response.status_code != 200:
        return []

    repo_tree = response.json()
    # NOTE(review): the API may truncate very large trees; that case is not
    # detected here — confirm whether it matters for the target repositories.
    return [
        item
        for item in repo_tree['tree']
        if item['path'].startswith(path_prefix) and item['type'] == 'blob'
    ]

def get_content(owner, repo, file_sha, timeout=30):
    """Fetch one blob from the GitHub Git Blobs API and return its ``content`` field.

    The value is returned exactly as the API delivers it, without decoding.
    To obtain text, a caller can apply
    ``base64.b64decode(content).decode('utf-8')``.

    Args:
        owner: Repository owner (user or organisation).
        repo: Repository name.
        file_sha: SHA of the blob, as found in a tree entry's ``'sha'`` key.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The ``content`` string from the API response, or ``''`` when the
        response status is not 200.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the request exceeds ``timeout``.
    """
    response = requests.get(
        f'https://api.github.com/repos/{owner}/{repo}/git/blobs/{file_sha}',
        auth=MY_AUTH,
        # Without a timeout requests can block indefinitely on a stalled connection.
        timeout=timeout,
    )
    if response.status_code != 200:
        return ''

    file_content = response.json()
    return file_content["content"]

def get_app_files(owner, repo, branch='master'):
    """Build a ``{path: content}`` mapping for the files listed by ``get_tree``.

    One ``get_content`` request is issued per tree entry, so the number of
    HTTP calls grows with the number of files.

    Args:
        owner: Repository owner.
        repo: Repository name.
        branch: Branch to read from. Defaults to ``'master'``.

    Returns:
        Dict keyed by file path; each value is whatever ``get_content``
        returned for that entry's SHA.
    """
    return {
        entry['path']: get_content(owner, repo, entry['sha'])
        for entry in get_tree(owner, repo, branch)
    }

# Example run against the public cat-facts repository.
# Only the owner and repo are taken from this URL; the branch and the "app"
# directory come from the defaults inside get_tree(), not from the URL.
dir_url = "https://github.com/alexwohlbruck/cat-facts/tree/master/app"
owner,repo = parse_url(dir_url)
# NOTE(review): `tree` is not used again in the visible cells, and
# get_app_files() fetches the tree itself, so this call looks like a redundant
# extra API request — confirm before removing.
tree = get_tree(owner, repo)
app_files = get_app_files(owner, repo)

# Show which file paths were indexed.
print(app_files.keys())


# Code Pruning

In [68]:
# NOTE(review): `os` is used here but only imported in the first code cell;
# that cell must have been run before this one.
from code_indexer import get_code_index
import openai
# Read the OpenAI key from the environment rather than hard-coding it.
openai.api_key = os.getenv("OPENAI_API_KEY")


# presumably a dict of {file path: file content}; later cells call .keys()
# on it and index it by file path — verify against code_indexer.
code_index = get_code_index()

In [5]:
def _read_text_files(directory, suffix):
    """Read every regular file in ``directory`` into a dict.

    Args:
        directory: Folder to scan (non-recursive).
        suffix: Text removed from each file name to form its key,
            e.g. ``".txt"``.

    Returns:
        Dict mapping the file name with ``suffix`` removed to the file's text.
    """
    contents = {}
    for filename in os.listdir(directory):
        fpath = os.path.join(directory, filename)
        # Skip sub-directories (e.g. .ipynb_checkpoints), which open() cannot read.
        if not os.path.isfile(fpath):
            continue
        with open(fpath) as f:
            contents[filename.replace(suffix, "")] = f.read()
    return contents


# read all prompts: prompts/<name>.txt -> prompts["<name>"]
prompts = _read_text_files("prompts", ".txt")

# read all templates: doc_templates/<name>.json -> templates["<name>"]
# (template files are opened from doc_templates/, the folder they are listed from)
templates = _read_text_files("doc_templates", ".json")

In [72]:
# Split a 2048-token budget evenly across the indexed files, rounding down.
max_tokens_per_file = 2048 // len(code_index.keys())

In [46]:
# Maps each file path to the model-written summary of that file.
reduced_code_index = {}

# One completion request per indexed file; temperature 0 requests the least
# random output, and max_tokens caps each summary at the per-file budget.
for filepath in code_index.keys():
    summary_prompt = f"""
    Reduce the following code file to a brief summary that includes its essential components and functionality. The summary should provide a high-level understanding of the code's purpose. 
    This should be no more than {max_tokens_per_file} tokens in length.

    {code_index[filepath]}"""

    completion = openai.Completion.create(
        model="text-davinci-003",
        prompt=summary_prompt,
        temperature=0,
        max_tokens=max_tokens_per_file,
    )
    reduced_code_index[filepath] = completion.choices[0].text

In [47]:
# Print every file path followed by its generated summary.
for path, summary in reduced_code_index.items():
    print(path, summary)

app/models/api-log.js 

This code file is a Mongoose schema for an API log model. It includes fields for host, body, client IP, and original URL, as well as timestamps. It also includes plugins for soft delete and random selection. It exports the model for use in other files.
app/models/fact.js 

This code file is a Mongoose model for a Fact object. It includes fields for user, text, sendDate, type, and status. It also includes plugins for soft delete and random selection, as well as a static method for retrieving a fact.
app/models/message.js 

This code file creates a Mongoose schema for a Message model, which includes fields for text, number, and type. It also sets an index to expire after two weeks and exports the Message model.
app/models/recipient.js 

This code file is a Mongoose schema for a Recipient model. It includes fields for name, notes, number, addedBy, and subscriptions. It also includes methods for adding recipients, validating phone numbers, and sending messages. It a

In [48]:
# Inspect the raw summary string for a single file; shown as a repr, so the
# leading "\n\n" in the completion text is visible.
reduced_code_index["app/routes/catbot.routes.js"]

"\n\nThis code file is a router for an Express application that sends out daily facts about animals to recipients, processes incoming messages from recipients, and tweets out cat facts. It uses the apiai-promise, bluebird, and twitter.service modules, as well as the Fact, Message, and Recipient models. It also uses the strings.js and keys.js config files. It has two routes: '/daily' and '/message'. '/daily' gets all recipients and a fact to be sent out each day, and '/message' processes incoming messages from recipients and responds."

In [65]:
# Load the project's README so it can be embedded in the planning prompt.
with open("catfact_readme.md", "r") as readme_file:
    read_me = readme_file.read()

# Planning prompt: asks for a codebase summary, the target audience, and a
# proposed list of .md documentation files.
base_prompt = f"""

I will provide you a codebase that I want you to understand. here is the readme: {read_me}

I want you to plan out documentation for this codebase. do the following:

1. Write a summary of the codebase 
2. Identify the end user for the documentation
3. Write out a list of .md files, and describe what they would each document with respect the the end user

"""



In [66]:
# Append the per-file summaries to the planning prompt so the model receives
# the codebase itself, not just the README.
sample_prompt = f"{base_prompt} here is the dictionary: {reduced_code_index}"

# Send the combined prompt; sending base_prompt alone would leave
# sample_prompt unused and the summaries out of the request.
# NOTE(review): prompt length plus max_tokens must fit the model's context
# window — confirm the README and summaries leave room for 2048 output tokens.
response = openai.Completion.create(
    model="text-davinci-003",
    prompt=sample_prompt,
    temperature=0,
    max_tokens=2048,
)

generated_md = response.choices[0].text

In [67]:
# Persist the generated documentation plan, overwriting any previous run.
with open("generated_docs.md", "w") as docs_file:
    docs_file.write(generated_md)

In [84]:
# Inspect the prompt text stored under the key 'base_prompt' in the `prompts`
# dict (distinct from the `base_prompt` f-string variable defined in another cell).
prompts['base_prompt']

'I will provide you a codebase that I want you to understand. \n\nThe codebase directory will be represented as a dictionary, where each key is a file path in a repository, and each value is the file content.\n\nKey: file path to file name\nValue: file content\n\nI will feed you each file one by one. When you are ready, only reply with the phrase "ready". I do not want you to say anything else. \nIf you do not understand, please reply only with the phrase "stop"\n'

In [3]:
# NOTE(review): `res` is never assigned in the visible cells, so this fails on
# a fresh kernel; the output suggests it held a path -> content dict like the
# one the indexer produces — confirm and define it explicitly.
res.keys()

dict_keys(['app/models/api-log.js', 'app/models/fact.js', 'app/models/message.js', 'app/models/recipient.js', 'app/models/unsubscribe-date.js', 'app/models/user.js', 'app/models/verification-code.js', 'app/routes/auth.routes.js', 'app/routes/catbot.routes.js', 'app/routes/console.routes.js', 'app/routes/contact.routes.js', 'app/routes/fact.routes.js', 'app/routes/index.js', 'app/routes/recipient.routes.js', 'app/routes/user.routes.js', 'app/routes/webhook.routes.js'])

In [4]:
# Rough size check: count the \w+ word matches in the string form of `res`.
# NOTE(review): this is a word count, not the OpenAI model's token count —
# confirm it is only meant as an approximation.
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(str(res))
print(len(tokens))
# Leftover snippet showing how to decode one base64 file (`catbot_file` is not
# defined in the visible cells):
# base64.b64decode(catbot_file).decode('utf-8')


5230
