In [5]:
# clone the repo
# flatten the repo
# go through each file and generate a summary for each file at the function level
# store the summary, code
import dotenv
dotenv.load_dotenv()

True

In [2]:
!git clone https://github.com/python/mypy.git

Cloning into 'mypy'...
remote: Enumerating objects: 96389, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 96389 (delta 0), reused 0 (delta 0), pack-reused 96378[K
Receiving objects: 100% (96389/96389), 64.87 MiB | 21.09 MiB/s, done.
Resolving deltas: 100% (74700/74700), done.


In [3]:
import glob
import os

# Define the path to the folder
folder_path = 'flat'

# Get all the files in the folder and its subfolders
files = glob.glob(os.path.join(folder_path, '**'), recursive=True)

# Filter out directories and keep only files
files = [file for file in files if os.path.isfile(file)]

# Move all the files to the flat folder
flat_folder_path = 'flat'
for file in files:
    file_name = os.path.basename(file)
    new_file_path = os.path.join(flat_folder_path, file_name)
    os.rename(file, new_file_path)

In [6]:
import os

flat_folder_path = 'flat'
file_tuples = []

for file_name in os.listdir(flat_folder_path):
    file_path = os.path.join(flat_folder_path, file_name)
    if os.path.isfile(file_path):
        with open(file_path, 'r') as file:
            file_text = file.read()
            file_type = os.path.splitext(file_name)[1]
            file_tuples.append((file_text, file_name, file_type))

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter 
splitter = RecursiveCharacterTextSplitter.from_language("python",
    chunk_size= 10000,
    chunk_overlap= 1000,
  )

In [8]:
file_text_list = [file_tuple[0] for file_tuple in file_tuples]
file_info_list = [{'file_name': file_tuple[1], 'file_type': file_tuple[2]} for file_tuple in file_tuples]

In [9]:
pythonOutput = splitter.create_documents(file_text_list, file_info_list)

In [10]:
pythonOutput[0]

Document(page_content='import sys\nfrom collections.abc import Callable, Iterator, Mapping\nfrom typing import Any, ClassVar, Generic, TypeVar, final, overload\nfrom typing_extensions import ParamSpec\n\nif sys.version_info >= (3, 9):\n    from types import GenericAlias\n\n__all__ = ("Context", "ContextVar", "Token", "copy_context")\n\n_T = TypeVar("_T")\n_D = TypeVar("_D")\n_P = ParamSpec("_P")\n\n@final\nclass ContextVar(Generic[_T]):\n    @overload\n    def __init__(self, name: str) -> None: ...\n    @overload\n    def __init__(self, name: str, *, default: _T) -> None: ...\n    def __hash__(self) -> int: ...\n    @property\n    def name(self) -> str: ...\n    @overload\n    def get(self) -> _T: ...\n    @overload\n    def get(self, default: _T, /) -> _T: ...\n    @overload\n    def get(self, default: _D, /) -> _D | _T: ...\n    def set(self, value: _T, /) -> Token[_T]: ...\n    def reset(self, token: Token[_T], /) -> None: ...\n    if sys.version_info >= (3, 9):\n        def __class

In [34]:
from typing import List
from octoai.text_gen import ChatMessage, ChatCompletionResponseFormat
from octoai.client import AsyncOctoAI, OctoAI
from pydantic import BaseModel, Field
import json 
import os


class Summary(BaseModel):
    summaries: List[str]



client = OctoAI(
    api_key=os.getenv("OCTOAI_API_KEY"),
)        
asnyc_client = AsyncOctoAI(
    api_key=os.getenv("OCTOAI_API_KEY"),
    
)        

In [35]:
import asyncio
import time
import aiohttp

sem = asyncio.Semaphore(4)

async def generate_summaries(document):
	sem.acquire()
	response = await client.text_gen.create_chat_completion(
		max_tokens=512,
		messages=[
			ChatMessage(
				content="You are an expert coder that creates summaries of code files.",
				role="system"
			),
			ChatMessage(
				content=f"Take this code file and create a list of summaries for each class and function in the block of code. Here is the code: {document}",
				role="user"
			)
		],
		model="mistral-7b-instruct-v0.3",
		presence_penalty=0,
		temperature=0,
		top_p=1,
		response_format=ChatCompletionResponseFormat(type='json_object', schema=Summary.model_json_schema()))
	time.sleep(1)
	sem.release()
	return json.loads(response.choices[0].message.content)['summaries']

async def get_all_resps(pythonOutput):
    async with aiohttp.ClientSession() as session:
        resps = await asyncio.gather(*[generate_summaries(doc) for doc in pythonOutput])
        return resps

In [104]:
def gen_resp(document):
	response = client.text_gen.create_chat_completion(
		max_tokens=4000,
		messages=[
			ChatMessage(
				content="You are an expert coder that creates summaries of code files. Respond only following the provided json schema.",
				role="system"
			),
			ChatMessage(
				content=f"Take this code file and create a list of summaries for what this file does. Here is the code: {document.page_content}",
				role="user"
			)
		],
		model="meta-llama-3-8b-instruct",
		presence_penalty=0,
		temperature=0,
		top_p=1,
		response_format=ChatCompletionResponseFormat(type='json_object', schema=Summary.model_json_schema())
  	)	
	# print(response.choices[0].message.content)
	# summaries = json.loads(response.choices[0].message.content)['summaries']
 
	return response.choices[0].message.content

In [98]:
from ast import literal_eval
from langchain_pinecone import PineconeVectorStore
index_name = 'hackathon'

from langchain_community.embeddings import HuggingFaceEmbeddings

model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)


all_summaries = []
all_j_summ = []
all_docs = []


def add_docs_to_pinecone(docs, start):
    all_summaries = [gen_resp(doc) for doc in docs]
    all_j_summ = []
    for i, sum in enumerate(all_summaries):
        try:
            all_j_summ.append(json.loads(sum)['summaries'])
        except:
            try:
                all_j_summ.append(literal_eval('[' + sum.split('[')[1].split(']')[0] + ']'))
            except:
                all_j_summ.append([])
                print(f'error for element {start + i}')
    all_docs = []
    for i, summ in enumerate(all_j_summ):
        for s in summ:
            doc = pythonOutput[i].copy()
            doc.metadata['code'] = doc.page_content
            doc.page_content = s
            all_docs.append(doc)
            
    PineconeVectorStore.from_documents(
        all_docs,
        index_name=index_name,
        embedding=embeddings,
        namespace='codebase'
    )
    print(f"Success upload: {start} to {start + len(docs)}")



In [105]:
start = 30
batch_size = 10
while start < len(pythonOutput):
    add_docs_to_pinecone(pythonOutput[start:start + batch_size], start)
    start += batch_size

Success upload: 30 to 40
error for element 46
Success upload: 40 to 50
error for element 52
error for element 58
Success upload: 50 to 60
error for element 60
error for element 61
error for element 64
error for element 65
Success upload: 60 to 70
error for element 72
error for element 73
error for element 74
error for element 75
error for element 79
Success upload: 70 to 80
Success upload: 80 to 90
Success upload: 90 to 100
Success upload: 100 to 110
error for element 119
Success upload: 110 to 120
Success upload: 120 to 130
Success upload: 130 to 140
error for element 143
error for element 144
Success upload: 140 to 150
Success upload: 150 to 160
error for element 166
error for element 167
error for element 168
Success upload: 160 to 170
error for element 170
error for element 173
error for element 177
error for element 178
Success upload: 170 to 180
error for element 180
error for element 181
error for element 182
error for element 188
Success upload: 180 to 190
error for element 190

KeyboardInterrupt: 

In [None]:
# embeded = embeddings.embed_documents(pythonOutput)

vectorstore_from_docs = PineconeVectorStore.from_documents(
        all_docs,
        index_name=index_name,
        embedding=embeddings,
        namespace='codebase'
    )

In [79]:
all_j_summ = []
for sum in all_summaries:
    try:
        all_j_summ.append(json.loads(sum)['summaries'])
    except:
        try:
            all_j_summ.append(literal_eval('[' + sum.split('[')[1].split(']')[0] + ']'))
        except:
            all_j_summ.append([])

In [15]:
from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY")
pci = pc.Index('hackathon')

  from tqdm.autonotebook import tqdm


In [32]:
type(pythonOutput[0])

langchain_core.documents.base.Document

In [87]:
len(all_j_summ)

10

In [94]:
all_docs = []
for i, summ in enumerate(all_j_summ):
    
    for s in summ:
        doc = pythonOutput[i].copy()
        doc.metadata['code'] = doc.page_content
        doc.page_content = s
        all_docs.append(doc)

In [95]:
all_docs

[Document(page_content='Defines a ContextVar class for managing context variables in a program.', metadata={'file_name': 'contextvars.pyi', 'file_type': '.pyi', 'text': 'import sys\nfrom collections.abc import Callable, Iterator, Mapping\nfrom typing import Any, ClassVar, Generic, TypeVar, final, overload\nfrom typing_extensions import ParamSpec\n\nif sys.version_info >= (3, 9):\n    from types import GenericAlias\n\n__all__ = ("Context", "ContextVar", "Token", "copy_context")\n\n_T = TypeVar("_T")\n_D = TypeVar("_D")\n_P = ParamSpec("_P")\n\n@final\nclass ContextVar(Generic[_T]):\n    @overload\n    def __init__(self, name: str) -> None: ...\n    @overload\n    def __init__(self, name: str, *, default: _T) -> None: ...\n    def __hash__(self) -> int: ...\n    @property\n    def name(self) -> str: ...\n    @overload\n    def get(self) -> _T: ...\n    @overload\n    def get(self, default: _T, /) -> _T: ...\n    @overload\n    def get(self, default: _D, /) -> _D | _T: ...\n    def set(se

In [81]:
for i, out in enumerate(pythonOutput[:10]):
    out.metadata['code'] = out.page_content
    out.metadata['summaries']
    out.page_content(all_summaries[i])

{'file_name': 'contextvars.pyi',
 'file_type': '.pyi',
 'text': 'import sys\nfrom collections.abc import Callable, Iterator, Mapping\nfrom typing import Any, ClassVar, Generic, TypeVar, final, overload\nfrom typing_extensions import ParamSpec\n\nif sys.version_info >= (3, 9):\n    from types import GenericAlias\n\n__all__ = ("Context", "ContextVar", "Token", "copy_context")\n\n_T = TypeVar("_T")\n_D = TypeVar("_D")\n_P = ParamSpec("_P")\n\n@final\nclass ContextVar(Generic[_T]):\n    @overload\n    def __init__(self, name: str) -> None: ...\n    @overload\n    def __init__(self, name: str, *, default: _T) -> None: ...\n    def __hash__(self) -> int: ...\n    @property\n    def name(self) -> str: ...\n    @overload\n    def get(self) -> _T: ...\n    @overload\n    def get(self, default: _T, /) -> _T: ...\n    @overload\n    def get(self, default: _D, /) -> _D | _T: ...\n    def set(self, value: _T, /) -> Token[_T]: ...\n    def reset(self, token: Token[_T], /) -> None: ...\n    if sys.ve

In [83]:
type(pythonOutput[0])

langchain_core.documents.base.Document



In [21]:
from llama_index.vector_stores.pinecone import PineconeVectorStore
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext


vector_store = PineconeVectorStore(pinecone_index=pci)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(
    pythonOutput, storage_context=storage_context
)

AttributeError: 'Document' object has no attribute 'get_doc_id'

In [76]:
import tqdm

all_summaries = []


for doc in tqdm.tqdm(pythonOutput):
    resp = generate_summaries(doc)
    all_summaries.append(resp)

  0%|          | 2/2622 [00:18<6:48:23,  9.35s/it]


KeyboardInterrupt: 