### Code Indexer

This takes in the dir url, and return a dictionary where key=filepath, value=encoded file

In [27]:
import requests
import base64
import urllib
from urllib.parse import urlparse
import os
from dotenv import load_dotenv

load_dotenv()

MY_AUTH = (os.getenv("GITHUB_USER"), os.getenv("GITHUB_PAT"))

def parse_url(repo_url):
    parts = urlparse(repo_url).path.split("/")
    owner = parts[1]
    repo = parts[2]
    return owner, repo

def get_tree(owner, repo, branch='master'):
    response = requests.get(f'https://api.github.com/repos/{owner}/{repo}/git/trees/{branch}?recursive=1', auth=MY_AUTH)
    if response.status_code == 200:
        repo_tree = response.json()
        return [item for item in repo_tree['tree'] if item['path'].startswith('app/') and item['type'] == 'blob']
    else:
        return []

def get_content(owner, repo, file_sha):
    response = requests.get(f'https://api.github.com/repos/{owner}/{repo}/git/blobs/{file_sha}', auth=MY_AUTH)
    if response.status_code == 200:
        file_content = response.json()
        # base64.b64decode(file_content["content"]).decode('utf-8')
        return file_content["content"]
    else:
        return ''

def get_app_files(owner, repo, branch='master'):
    app_files = {}
    app_tree = get_tree(owner, repo, branch)
    for item in app_tree:
        file_content = get_content(owner, repo, item['sha'])
        app_files[item['path']] = file_content
    return app_files

dir_url = "https://github.com/alexwohlbruck/cat-facts/tree/master/app"
owner,repo = parse_url(dir_url)
tree = get_tree(owner, repo)
app_files = get_app_files(owner, repo)

print(app_files.keys())


dict_keys(['app/config/functions.js', 'app/config/google.js', 'app/config/keys.js', 'app/config/passport.js', 'app/config/strings.js', 'app/middleware.js', 'app/models/api-log.js', 'app/models/fact.js', 'app/models/message.js', 'app/models/recipient.js', 'app/models/unsubscribe-date.js', 'app/models/user.js', 'app/models/verification-code.js', 'app/routes/auth.routes.js', 'app/routes/catbot.routes.js', 'app/routes/console.routes.js', 'app/routes/contact.routes.js', 'app/routes/fact.routes.js', 'app/routes/index.js', 'app/routes/recipient.routes.js', 'app/routes/user.routes.js', 'app/routes/webhook.routes.js', 'app/services/ifttt.service.js', 'app/services/twitter.service.js'])


# Code Pruning

In [31]:
from code_indexer import get_code_index
import openai
import pandas as pd
openai.api_key = os.getenv("OPENAI_API_KEY")


# code_index = get_code_index()
code_index = get_code_index()

In [32]:
code_df = pd.DataFrame({
        "file_path":code_index.keys(),
        "file_content":code_index.values()
    })

code_df

Unnamed: 0,file_path,file_content
0,app/models/api-log.js,const mongoose = require('mongoose');\nconst S...
1,app/models/fact.js,const mongoose = require('mongoose');\nconst S...
2,app/models/message.js,var mongoose = require('mongoose');\nvar Schem...
3,app/models/recipient.js,const mongoose = require('mongoose');\nconst S...
4,app/models/unsubscribe-date.js,const mongoose = require('mongoose');\nconst S...
5,app/models/user.js,const mongoose = require('mongoose');\nconst S...
6,app/models/verification-code.js,const mongoose = require('mongoose');\nconst S...
7,app/routes/auth.routes.js,const express = require('express');\nconst rou...
8,app/routes/catbot.routes.js,const express = require('express');\nconst rou...
9,app/routes/console.routes.js,const express = require('express');\nconst rou...


In [7]:
# read all prompts

prompts = {}
templates = {}

for filename in os.listdir("prompts"):
    fpath = os.path.join("prompts", filename)

    with open(fpath) as f:
        fcontent = f.read()

    file_key = filename.replace(".txt","")
    prompts[file_key] = fcontent

# read all templates

for filename in os.listdir("doc_templates"):
    fpath = os.path.join("doc_templates", filename)

    with open(fpath) as f:
        fcontent = f.read()

    file_key = filename.replace(".json","")
    templates[file_key] = fcontent

In [18]:
# max_tokens_per_file = int(2048 / len(code_index.keys()))

max_tokens_per_file = 20

In [30]:
reduced_code_index = {}

for filepath in code_index.keys():

    code_prompt = f"""
    Reduce the following code file to a brief summary that includes its essential components and functionality. 
    This should be no more than {max_tokens_per_file} tokens in length.

    {code_index[filepath]}"""

    reduced_code = openai.Completion.create(
                model="text-davinci-003",
                prompt=code_prompt,
                temperature=0,
                max_tokens=max_tokens_per_file
    )

    reduced_code_index[filepath] = reduced_code.choices[0].text

InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 6861 tokens (6841 in your prompt; 20 for the completion). Please reduce your prompt; or completion length.

In [20]:
for k in reduced_code_index:
    print(k,reduced_code_index[k])

app/models/api-log.js 

Mongoose model for API logs with timestamps, mongoose-delete,
app/models/fact.js 

Mongoose model for facts with soft delete, random plugin, and getFact static method
app/models/message.js 

Mongoose model for messages with text, number, type, and expiration.
app/models/recipient.js 

Add/update recipients to subscriptions with SMS messages using Mongoose, IFTTT, and
app/models/unsubscribe-date.js 

Mongoose module to check if current date is between two dates in a collection.
app/models/user.js 

Mongoose model for user with unique email/phone, encryption, and soft delete.
app/models/verification-code.js 

Mongoose model for VerificationCode with code, user, type, data, and
app/routes/auth.routes.js 

Router for Google OAuth authentication with Passport, using Express.js, to allow
app/routes/catbot.routes.js 

Express router for daily fact sending, text processing, and response.
app/routes/console.routes.js 

Router for authenticated admin to get data from Reci

In [6]:
reduced_code_index["app/routes/catbot.routes.js"]

'\n\nThis code file is a router for an Express application that sends out daily facts about animals to subscribed recipients. It also processes incoming messages from recipients and responds with a fact or a welcome message. It uses the apiai-promise, bluebird, and twitter.service modules, as well as the Fact, Message, and Recipient models.'

In [25]:
with open("catfact_readme.md", "r") as file:
    # Read the contents of the file
    read_me = file.read()

base_prompt = f"""

codebase readme: {read_me}

I will give you two things:

1. json containing the intended documentation structure.
2. dict, key=filepath, value=file summary, of the codebase

return the json file, but do this

1. replace placeholder markdown files with appropriate names. You can add more if necessary.
2. Under the field 'code_files' add the code filepaths that are relevant to creating that documentation. The md file will be generated from only those code files.

return only json, nothing else

"""



In [26]:
api_template = templates["api_template"]
sample_prompt = f"{base_prompt} Ok here is the json {api_template} here is the dictionary: {reduced_code_index}"

response = openai.Completion.create(
                model="text-davinci-003",
                prompt=sample_prompt,
                temperature=0,
                max_tokens=2048
    )

documentation_json = response.choices[0].text

generated_md = ""

documentation_json

'\n\nHere is the updated json:\n\n{\n    "docs": {\n        "overview.md": {\n            "description": "An introduction to the API and its features."\n        },\n        "getting-started.md": {\n            "description": "A guide on how to set up and start using the API."\n        },\n        "endpoints": {\n            "auth.routes.md": {\n                "description": "Documentation for the authentication endpoint",\n                "code_files": ["app/routes/auth.routes.js"]\n            },\n            "catbot.routes.md": {\n                "description": "Documentation for the catbot endpoint",\n                "code_files": ["app/routes/catbot.routes.js"]\n            },\n            "console.routes.md": {\n                "description": "Documentation for the console endpoint",\n                "code_files": ["app/routes/console.routes.js"]\n            },\n            "contact.routes.md": {\n                "description": "Documentation for the contact endpoint",\n        

In [67]:
with open("generated_docs.md", "w") as file:
    file.write(generated_md)

In [84]:
prompts['base_prompt']

'I will provide you a codebase that I want you to understand. \n\nThe codebase directory will be represented as a dictionary, where each key is a file path in a repository, and each value is the file content.\n\nKey: file path to file name\nValue: file content\n\nI will feed you each file one by one. When you are ready, only reply with the phrase "ready". I do not want you to say anything else. \nIf you do not understand, please reply only with the phrase "stop"\n'

In [3]:
res.keys()

dict_keys(['app/models/api-log.js', 'app/models/fact.js', 'app/models/message.js', 'app/models/recipient.js', 'app/models/unsubscribe-date.js', 'app/models/user.js', 'app/models/verification-code.js', 'app/routes/auth.routes.js', 'app/routes/catbot.routes.js', 'app/routes/console.routes.js', 'app/routes/contact.routes.js', 'app/routes/fact.routes.js', 'app/routes/index.js', 'app/routes/recipient.routes.js', 'app/routes/user.routes.js', 'app/routes/webhook.routes.js'])

In [4]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(str(res))
print(len(tokens))
# base64.b64decode(catbot_file).decode('utf-8')


5230


In [3]:
import json

cache = {}

with open("../cache/code_cache.json", "r") as file:
        cache = json.load(file)

cache

{'cached_url': '', 'cached_index': {}}