In [44]:
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import LanguageParser
from langchain_text_splitters import Language

In [45]:
loader = GenericLoader.from_filesystem(
    "source/A_GIS",
    glob="**/*",
    suffixes=[".py"],
    exclude=["**/non-utf8-encoding.py"],
    parser=LanguageParser(language=Language.PYTHON, parser_threshold=500),
)

In [46]:
documents = loader.load()
len(documents)

159

In [125]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

python_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.PYTHON, chunk_size=2000, chunk_overlap=200
)
texts = documents #python_splitter.split_documents(documents)
len(texts)

159

In [126]:
persist_directory='./chromadb'
from tqdm import tqdm
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import OllamaEmbeddings
embeddings = OllamaEmbeddings(model='mixtral')
batch_size = int(len(texts) / 10)
batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
for batch in tqdm(batches, desc="Processing batches"):
    db = Chroma.from_documents(batch, embeddings, persist_directory=persist_directory)
retriever = db.as_retriever()

Processing batches: 100%|█████████████████████████████████████████████████████████████████████████| 11/11 [06:32<00:00, 35.71s/it]


In [191]:
import ollama
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

def ollama_llm(question,context):
    formatted_prompt = f"Question: {question}\n\nContext: {context}"
    response = ollama.chat(model='mixtral',messages=[{'role': 'system', 'content': '''
    You are a Python code cataloger expert in the contents of the A_GIS code database. 
    You respond with the top 5 fully-qualified names of the 
    A_GIS functions that fit the query.
    '''},{'role': 'user', 'content': formatted_prompt}],
                            options=ollama.Options(temperature=0.7,mirostat=2)
                          )
    return response['message']['content']

def rag_chain(question):
    retrieved_docs = retriever.invoke(question)
    formatted_context = format_docs(retrieved_docs)
    return ollama_llm(question,formatted_context)

In [192]:
x=rag_chain("An existing function to format code to A_GIS standards")
print(x)

 Based on the provided context, here are the top 5 fully-qualified names of functions in the A\_GIS code database that could be related to formatting code to A\_GIS standards:

1. A\_GIS.Code.\_distill\_imports
2. A\_GIS.File.read
3. A\_GIS.Code.Tree.get
4. A\_GIS.Code.Tree.update\_path\_to\_package
5. A\_GIS.Code.format (note: this function is not explicitly shown in the provided context, but it's mentioned in the documentation string of A\_GIS.Code.\_distill\_imports, so I included it as a potential match)


In [202]:
def invoke_with_scores(retriever, query):
    # Generate a query vector for your query
    query_vector = retriever.vectorstore.embeddings.embed_query(query)
    # Perform the search; this method and its parameters might differ
    search_results = retriever.vectorstore.search(query_vector, search_type='similarity')
    # Inspect search_results to understand its structure
    print(search_results)  # Temporarily print to debug
    # Correctly unpack search_results based on its actual structure
    return [(retriever.document_store.get_document(doc_id), score) for doc_id, score, *_ in search_results]

retrieved_docs_with_scores = invoke_with_scores(retriever, "format code")
for doc, score in retrieved_docs_with_scores:
    print(f"Document: {doc}, Similarity Score: {score}")


[Document(page_content='def from_samples(*, realizations):\n    """Calculate correlation coefficient from samples."""\n    import numpy\n    import A_GIS.Math.CorrelationMatrix.init_from_numpy\n\n    matrix = numpy.corrcoef(realizations, rowvar=False)\n    return A_GIS.Math.CorrelationMatrix.init_from_numpy(matrix=matrix)', metadata={'language': 'python', 'source': 'source/A_GIS/Math/CorrelationMatrix/from_samples/__init__.py'}), Document(page_content='import typing\n\ndef init(*, size: int = 1, values: typing.Optional[typing.List[float]] = []):\n    """\n    Return a correlation matrix data class from the upper triangular values.\n\n    This function initializes a `_CorrMatrix` data class with a specified size and set of values.\n    The values are expected to represent the upper triangular part of the matrix.\n\n    Args:\n        size (int): The size of the correlation matrix. Defaults to 1.\n        values (Optional[List[float]]): A list of float values to initialize the matrix. Mu

AttributeError: 'VectorStoreRetriever' object has no attribute 'document_store'

In [206]:
def invoke_with_scores(retriever, query):
    # Assuming this function correctly generates a query vector
    query_vector = retriever.vectorstore.embeddings.embed_query(query)
    # Perform the search; adjust parameters as needed
    search_results = retriever.vectorstore.search(query_vector, search_type='similarity')
    # Directly use search results if they include document content or references
    return search_results  # Adjust this based on the actual format of search_results

retrieved_docs = invoke_with_scores(retriever, "format code")

In [211]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def calculate_similarity_scores(retriever, query, documents):
    # Embed the query
    query_vector = np.asarray(retriever.vectorstore.embeddings.embed_query(query)).reshape(1, -1)  # Reshape for cosine_similarity
    
    # Embed each document
    doc_vectors = np.array([retriever.vectorstore.embeddings.embed_documents(doc.page_content) for doc in documents])
    
    # Calculate cosine similarity
    scores = cosine_similarity(query_vector, doc_vectors).flatten()
    
    # Return documents paired with their similarity scores
    return list(zip(documents, scores))

# Assume 'retrieved_docs' is the list of documents returned by your retrieval function
retrieved_docs_with_scores = calculate_similarity_scores(retriever, "format code", retrieved_docs)

for doc, score in retrieved_docs_with_scores:
    print(f"Document: {doc.page_content[:100]}... Similarity Score: {score}")


KeyboardInterrupt: 

In [214]:
x='\ndef example_function(param1, param2):\n    """This is a docstring."""\n    return (param1, param2)\n'

In [219]:
y=x.encode('utf-8').decode('unicode_escape')

In [220]:
print(y)


def example_function(param1, param2):
    """This is a docstring."""
    return (param1, param2)



In [221]:
print(x)


def example_function(param1, param2):
    """This is a docstring."""
    return (param1, param2)



In [222]:
import ast

class StringTransformer(ast.NodeTransformer):
    def visit_Str(self, node):
        # This method will visit every string in the code.
        if "\n" in node.s:
            # Prepare the string by escaping triple quotes and backslashes
            escaped_s = node.s.replace('\\', '\\\\').replace('"""', '\\"""')
            # Interpret escape sequences
            interpreted_s = escaped_s.encode('utf-8').decode('unicode_escape')
            # Enclose in triple quotes
            new_s = f'"""{interpreted_s}"""'
        else:
            # Escape double quotes and backslashes for single-line strings
            escaped_s = node.s.replace('\\', '\\\\').replace('"', '\\"')
            new_s = f'"{escaped_s}"'

        return ast.Str(s=new_s)

In [270]:
import astor
# Sample code
code = "\ndef example_function(param1, param2):\n    '''This is a docstring.'''\n    return (param1, param2)\n"""

import io

# Create a StringIO object
output = io.StringIO()

# Use the file argument in print to direct output to the StringIO object
print(code, file=output)

# Retrieve the content of the StringIO object
captured_output = output.getvalue()

# Don't forget to close the StringIO object when you're done
output.close()

# captured_output now contains the string 'Hello, world!\n'
print(captured_output)



def example_function(param1, param2):
    '''This is a docstring.'''
    return (param1, param2)




In [266]:
print(code)


def example_function(param1, param2):
    """This is a docstring."""
    return (param1, param2)



In [267]:
test_code = """
def example_function(param1, param2):
    '''This is a docstring.'''
    return (param1, param2)
"""

In [268]:
print(test_code)


def example_function(param1, param2):
    '''This is a docstring.'''
    return (param1, param2)



In [271]:
x='\ndef example_function(param1, param2):\n    """This is a docstring."""\n    return (param1, param2)\n'

In [272]:
x.replace('"""',"'''")

"\ndef example_function(param1, param2):\n    '''This is a docstring.'''\n    return (param1, param2)\n"

In [273]:
x='\ndef example_function(param1, param2):\n    return (param1, param2)\n'

In [274]:
print(x)


def example_function(param1, param2):
    return (param1, param2)



In [287]:
y=    """Formats Python code using the autopep8 and black libraries.

    This function takes a string of Python code as input, applies formatting with autopep8 and black, and returns the resulting formatted code.

    Args:
        code (str): The Python code to be formatted.

    Raises:
        None

    Returns:
        str: The formatted Python code.

    """


In [300]:
import textwrap
textwrap.fill(y)

'Formats Python code using the autopep8 and black libraries.      This\nfunction takes a string of Python code as input, applies formatting\nwith autopep8 and black, and returns the resulting formatted code.\nArgs:         code (str): The Python code to be formatted.\nRaises:         None      Returns:         str: The formatted Python\ncode.'

In [295]:
textwrap.fill(,width=70)

'Formats Python code using the autopep8 and black libraries.      This\nfunction takes a string of Python code as input, applies formatting\nwith autopep8 and black, and returns the resulting formatted code.\nArgs:         code (str): The Python code to be formatted.\nRaises:         None      Returns:         str: The formatted Python\ncode.'

In [283]:
print(x)


def example_function(param1, param2):
    return (param1, param2)



In [284]:
print(y)


    def example_function(param1, param2):
        return (param1, param2)



In [286]:
import libcst as cst
import textwrap

def convert_multiline_libcst(*, code: str) -> str:
    """Convert multiline strings to a canonical form using LibCST.

    Args:
        code (str): The Python source code to convert.
    """

    class MultilineStringTransformer(cst.CSTTransformer):
        def leave_SimpleString(self, original_node, updated_node):
            # Check if string is multiline
            if '\n' in updated_node.value:
                # Handle triple quotes in strings
                if '"""' in updated_node.value or "'''" in updated_node.value:
                    # Use the other form of triple quotes if one is found in the string
                    quote_type = "'''" if '"""' in updated_node.value else '"""'
                else:
                    # Default to triple double quotes
                    quote_type = '"""'
                
                # Create a new multiline string node
                new_value = textwrap.dedent(updated_node.value).strip()
                new_value = f"{quote_type}{new_value}{quote_type}"
                return updated_node.with_changes(value=new_value)
            return updated_node

    # Parse the source code into a LibCST Module
    tree = cst.parse_module(code)

    # Transform the CST
    wrapper = cst.MetadataWrapper(tree)
    transformed_tree = wrapper.visit(MultilineStringTransformer())

    # Convert the CST back into a code string
    return transformed_tree.code

# Example usage
code = "\ndef example_function(param1, param2):\n    '''This is a docstring.'''\n    return (param1, param2)"

converted_code = convert_multiline_libcst(code=code)
print(converted_code)



def example_function(param1, param2):
    '''This is a docstring.'''
    return (param1, param2)


In [305]:
import textwrap

def reformat_docstring(docstring: str, max_width: int = 72) -> str:
    """
    Reformat a docstring to adhere to PEP 257 conventions.

    This function ensures that the docstring has a one-line summary separated
    by a blank line from any further explanatory text. It wraps the text so
    that no line exceeds the specified maximum width.

    Args:
        docstring (str): The original docstring to reformat.
        max_width (int): The maximum width for any line in the docstring.

    Returns:
        str: The reformatted docstring.
    """
    lines = docstring.strip().split('\n')
    if not lines:
        return ''

    # Process the first line separately as the summary.
    summary_line = lines[0].strip()
    wrapped_summary = textwrap.fill(summary_line, max_width)

    # Process the rest of the docstring.
    rest = '\n'.join(lines[1:]).strip()
    wrapped_rest = textwrap.fill(rest, max_width)

    # Combine the summary and the rest with a blank line in between if there is more detail to add.
    if wrapped_rest:
        return f'{wrapped_summary}\n\n{wrapped_rest}'
    else:
        return wrapped_summary

# Example usage
original_docstring = """\
Formats Python code using the autopep8 and black libraries. This function takes a string of Python code as input, applies formatting with autopep8 and black, and returns the resulting formatted code.

Args:
    code (str): The Python code to be formatted.

Raises:
    None

Returns:
    str: The formatted Python code.
"""

reformatted_docstring = reformat_docstring(original_docstring, 72)
print(reformatted_docstring)


Formats Python code using the autopep8 and black libraries. This
function takes a string of Python code as input, applies formatting with
autopep8 and black, and returns the resulting formatted code.

Args:     code (str): The Python code to be formatted.  Raises:     None
Returns:     str: The formatted Python code.
