<a href="https://colab.research.google.com/github/aditya161205/codoc/blob/main/CODOC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [55]:
!pip install GitPython

import os
import ast
from git import Repo



## **First, let's clone the repo to be documented**

In [56]:
import os
from git import Repo
import shutil

repo_url = "https://github.com/aditya161205/mock"
local_dir = "dir"

# Start from a clean checkout: remove any directory left over from a previous run.
# os.path.isdir() is already False for a missing path, so no separate exists() check is needed.
if os.path.isdir(local_dir):
    shutil.rmtree(local_dir)

repo = Repo.clone_from(repo_url, local_dir)

In [57]:
# Collects one parsing() result dict per .py file; filled by the repo walk further down.
parsed_data = []

## **Parsing the files in the repo**

In [58]:
import ast

def _describe_function(source, node):
    """Build the summary dict for one function definition node (sync or async)."""
    return {
        "name": node.name,
        "doc": ast.get_docstring(node) or "",
        # Only plain positional-or-keyword parameters are listed.
        "args": [arg.arg for arg in node.args.args],
        "code": ast.get_source_segment(source, node) or "",
    }


def parsing(file_path):
    """Extract imports, functions and classes from one Python source file.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).

    Returns:
        A dict with the keys:
          - "file": the path that was passed in,
          - "imports": imported module names (``import x`` -> "x",
            ``from x import y`` -> "x"; ``from . import y`` is skipped),
          - "functions": one dict per ``def`` / ``async def`` with
            "name", "doc", "args" and "code",
          - "classes": one dict per class with "name", "doc" and "methods"
            (each method described like a function).

        Any error while reading or parsing is printed and whatever has been
        collected so far (possibly nothing) is returned; nothing is raised.

    Note:
        ``ast.walk`` visits every node in the tree, so methods and nested
        functions are also listed under "functions", in addition to appearing
        under their class's "methods".
    """
    file_info = {
        "file": file_path,
        "imports": [],
        "functions": [],
        "classes": []
    }
    # Treat coroutines like regular functions; previously `async def` was skipped.
    function_nodes = (ast.FunctionDef, ast.AsyncFunctionDef)

    try:
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
        tree = ast.parse(content)

        for node in ast.walk(tree):
            # --- Functions ---
            if isinstance(node, function_nodes):
                file_info["functions"].append(_describe_function(content, node))

            # --- Classes ---
            elif isinstance(node, ast.ClassDef):
                # Only definitions directly in the class body count as methods.
                methods_list = [
                    _describe_function(content, member)
                    for member in node.body
                    if isinstance(member, function_nodes)
                ]
                file_info["classes"].append({
                    "name": node.name,
                    "doc": ast.get_docstring(node) or "",
                    "methods": methods_list
                })

            # --- Imports ---
            elif isinstance(node, ast.Import):
                file_info["imports"].extend(alias.name for alias in node.names)
            elif isinstance(node, ast.ImportFrom):
                # node.module is None for `from . import name`.
                if node.module:
                    file_info["imports"].append(node.module)

    except SyntaxError as e:
        print(f"SyntaxError parsing {file_path}: {e}")
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole repo walk.
        print(f"Error parsing {file_path}: {e}")

    return file_info

In [59]:
# === Walk through repo ===
# Start from an empty list so that re-running this cell does not append
# every file a second time.
parsed_data = []
for root, dirs, files in os.walk(local_dir):
    for file in files:
        if file.endswith(".py"):
            file_path = os.path.join(root, file)
            file_data = parsing(file_path)
            parsed_data.append(file_data)


In [60]:
# === Output preview ===
import json

# Serialise first, then print, so the preview text is available as a value too.
preview_json = json.dumps(parsed_data, indent=2)
print(preview_json)

[
  {
    "file": "dir/ex.py",
    "imports": [],
    "functions": [
      {
        "name": "add_numbers",
        "doc": "Return the sum of two numbers.",
        "args": [
          "a",
          "b"
        ],
        "code": "def add_numbers(a, b):\n    \"\"\"Return the sum of two numbers.\"\"\"\n    return a + b"
      },
      {
        "name": "multiply_numbers",
        "doc": "Return the product of two numbers.",
        "args": [
          "a",
          "b"
        ],
        "code": "def multiply_numbers(a, b):\n    \"\"\"Return the product of two numbers.\"\"\"\n    return a * b"
      },
      {
        "name": "is_even",
        "doc": "Check if a number is even.",
        "args": [
          "number"
        ],
        "code": "def is_even(number):\n    \"\"\"Check if a number is even.\"\"\"\n    return number % 2 == 0"
      },
      {
        "name": "greet",
        "doc": "Return a greeting message.",
        "args": [
          "name"
        ],
        "code": "

## **Adding semantic meaning to functions and classes**

In [61]:
pip install faiss-cpu sentence-transformers




In [62]:
from sentence_transformers import SentenceTransformer

# Checkpoint used to embed both the code items and the user queries.
# Lightweight, fast, and good for semantic search.
embed_model_name = "all-MiniLM-L6-v2"
embed_model = SentenceTransformer(embed_model_name)


In [63]:
import numpy as np

# texts_to_embed[i] is the text whose vector ends up in embeddings[i];
# metadata[i] holds the raw information for that same item.
texts_to_embed = []
metadata = []

for file_item in parsed_data:
    # Functions
    for func in file_item["functions"]:
        args_str = ", ".join(func.get("args", []))
        # parsing() always stores "doc" (as "" when missing), so a .get()
        # default would never apply; fall back on emptiness instead.
        raw_doc = func.get("doc") or ""
        doc_for_embedding = raw_doc or "This function has no docstring."

        texts_to_embed.append(
            f"Function Name: {func['name']}\n"
            f"Signature: def {func['name']}({args_str}):\n"
            f"Docstring: {doc_for_embedding}"
        )

        metadata.append({
            "file": file_item["file"],
            "type": "function",
            "name": func["name"],
            "args": func.get("args", []),
            "doc": raw_doc,  # the docstring exactly as found ("" if absent)
            "code": func.get("code", "")
        })

    # Classes
    for cls in file_item["classes"]:
        methods = cls.get("methods", [])
        method_names = [m.get("name", "") for m in methods]
        raw_doc = cls.get("doc") or ""
        doc_for_embedding = raw_doc or "This class has no docstring."

        texts_to_embed.append(
            f"Class Name: {cls['name']}\n"
            f"Methods: {', '.join(method_names)}\n"
            f"Docstring: {doc_for_embedding}"
        )

        metadata.append({
            "file": file_item["file"],
            "type": "class",
            "name": cls["name"],
            "doc": raw_doc,
            "methods": methods,
            # A class's "code" is the source of its methods joined together.
            "code": "\n".join(m.get("code", "") for m in methods)
        })

# Fail here with a clear message instead of a confusing shape error later on.
if not texts_to_embed:
    raise ValueError("No functions or classes were found to embed; check that the repo was parsed.")

# One batched call instead of one model call per item.
embeddings = np.asarray(embed_model.encode(texts_to_embed), dtype="float32")

In [64]:
import faiss

# Build a flat L2 index over all item embeddings; row i of `embeddings`
# becomes entry i of the index.
dimension = embeddings.shape[1]  # width of one embedding vector
index = faiss.IndexFlatL2(dimension)  # L2 distance
index.add(embeddings)


In [65]:

from transformers import T5ForConditionalGeneration, T5Tokenizer

print("Loading Q&A model (Flan-T5)...")
# Tokenizer and model are both loaded from the same checkpoint name.
qa_model_name = "google/flan-t5-base"
qa_tokenizer = T5Tokenizer.from_pretrained(qa_model_name)
qa_model = T5ForConditionalGeneration.from_pretrained(qa_model_name)

Loading Q&A model (Flan-T5)...


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [66]:
# rag_query relies on these notebook-level objects being defined beforehand:
# embed_model, index, metadata, qa_tokenizer and qa_model.

def _format_context_piece(item):
    """Render one retrieved metadata dict as a short text block for the prompt."""
    lines = [
        f"File Path: {item['file']}",
        f"Type: {item['type'].title()}",
        f"Name: {item['name']}",
    ]
    if item["type"] == "function":
        args_str = ", ".join(item.get("args", []))
        lines.append(f"Signature: def {item['name']}({args_str}):")
    else:
        # Class entries carry no "args"; describe them by their methods rather
        # than presenting them as a zero-argument function.
        method_names = ", ".join(m.get("name", "") for m in item.get("methods", []))
        lines.append(f"Signature: class {item['name']}:")
        lines.append(f"Methods: {method_names}")
    lines.append(f"Docstring: {item['doc']}")
    return "\n".join(lines)


def rag_query(query, top_k=5, max_context_length=1024):
    """Answer a question about the indexed code using retrieval plus generation.

    Args:
        query: The question, as plain text.
        top_k: Number of nearest items to request from the index.
        max_context_length: Maximum number of characters of context placed in
            the prompt; longer context is cut off at this length.

    Returns:
        A tuple ``(answer, retrieved_items)`` where ``answer`` is the decoded
        model output and ``retrieved_items`` is the list of metadata dicts that
        were retrieved (it may hold fewer than ``top_k`` entries).
    """
    # --- 1. Embed the query and retrieve the nearest items ---
    query_vector = embed_model.encode([query]).astype("float32")

    distances, indices = index.search(query_vector, top_k)
    # FAISS fills missing neighbours with -1 when the index holds fewer than
    # top_k vectors; metadata[-1] would silently return the last item.
    retrieved_items = [metadata[idx] for idx in indices[0] if idx >= 0]

    # --- 2. Build a structured context from the retrieved items ---
    context = "\n\n---\n\n".join(_format_context_piece(item) for item in retrieved_items)

    # Truncate if the combined context is too long
    context = context[:max_context_length]

    # --- 3. Prompt that restricts the model to the given context ---
    prompt_template = """
Use the following context from a Python codebase to answer the question.
If the context does not contain the information needed to answer the question, state that you cannot find the answer in the provided documentation.

Context:
{context}

Question: {question}

Answer:
"""
    prompt = prompt_template.format(context=context, question=query)

    # --- 4. Generate the answer with the Q&A model ---
    inputs = qa_tokenizer(prompt, return_tensors="pt", truncation=True, max_length=1024)

    # Pass the whole encoding so the attention mask travels with the input ids.
    output_ids = qa_model.generate(
        **inputs,
        max_length=256,
        num_beams=5,
        early_stopping=True
    )

    answer = qa_tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return answer, retrieved_items

In [67]:

query = "how to give a greet message?"
answer, retrieved = rag_query(query, top_k=3)

divider = "-" * 50

print("===== RAG Answer =====")
print(answer)
print(divider)

print("===== Retrieved Items =====")
for item in retrieved:
    # Each retrieved entry is a metadata dict; show its kind, name, file and docstring.
    summary = f"Type: {item['type'].title()}\nName: {item['name']}\nFile: {item['file']}\nDocstring: {item['doc']}"
    print(summary)
    print(divider)

===== RAG Answer =====
return a greeting message.
--------------------------------------------------
===== Retrieved Items =====
Type: Function
Name: greet
File: dir/ex.py
Docstring: Return a greeting message.
--------------------------------------------------
Type: Class
Name: Person
File: dir/ex.py
Docstring: Represents a person with a name and age.
--------------------------------------------------
Type: Function
Name: __init__
File: dir/ex.py
Docstring: 
--------------------------------------------------
