# GitRAG - CodeBase Retrieval Augmented Generation

## 1. Importing Dependencies

In [82]:
! pip install pygithub langchain langchain-community pinecone-client langchain-pinecone sentence-transformers gitpython



In [83]:
import os
import tempfile
from git import Repo
from google import genai
from pathlib import Path
from pinecone import Pinecone
from google.colab import userdata
from langchain.schema import Document
from langchain_pinecone import PineconeVectorStore
from langchain_community.embeddings import HuggingFaceEmbeddings

## 2. Retrieving Github Repo. Contents

In [84]:
def clone_repository(repo_url):
    """Clones a GitHub repository to a temporary directory."""
    repo_name = repo_url.split("/")[-1]
    repo_path = f"/content/{repo_name}"
    Repo.clone_from(repo_url, str(repo_path))
    return str(repo_path)

In [85]:
SUPPORTED_EXTENSIONS = {'.py', '.js', '.tsx', '.jsx', '.ipynb', '.java',
                       '.cpp', '.ts', '.go', '.rs', '.vue', '.swift', '.c', '.h'}

IGNORED_DIRS = {'node_modules', 'venv', 'env', 'dist', 'build', '.git',
                '__pycache__', '.next', '.vscode', 'vendor'}

In [86]:
def get_file_content(file_path, repo_path):
    """Get content of a single file."""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()
        rel_path = os.path.relpath(file_path, repo_path)
        return {
            "name": rel_path,
            "content": content
        }
    except UnicodeDecodeError:
        print(f"Skipping file {file_path} due to encoding issues.")
        return None
    except Exception as e:
        print(f"Error processing file {file_path}: {str(e)}")
        return None

def get_main_files_content(repo_path: str):
    """Get content of supported code files from the local repository."""
    files_content = []
    try:
        for root, _, files in os.walk(repo_path):
            if os.path.basename(root) in IGNORED_DIRS:
                print(f"Skipping directory: {root}")
                continue
            for file in files:
                file_path = os.path.join(root, file)
                file_ext = os.path.splitext(file)[1]
                if file_ext in SUPPORTED_EXTENSIONS:
                    print(f"Processing file: {file_path}")
                    file_content = get_file_content(file_path, repo_path)
                    if file_content:
                        files_content.append(file_content)
                else:
                    print(f"Skipping unsupported file: {file_path}")
    except Exception as e:
        print(f"Error reading repository: {str(e)}")
    return files_content

In [87]:
# Clone and process repository
gitrepo = "https://github.com/bilalsavagexd/Frequency-Analysis.git"
path = clone_repository(gitrepo)
file_content = get_main_files_content(path)

Skipping unsupported file: /content/Frequency-Analysis.git/README.md
Processing file: /content/Frequency-Analysis.git/frequency-analysis.py
Skipping directory: /content/Frequency-Analysis.git/.git
Skipping unsupported file: /content/Frequency-Analysis.git/.git/refs/heads/main
Skipping unsupported file: /content/Frequency-Analysis.git/.git/refs/remotes/origin/HEAD
Skipping unsupported file: /content/Frequency-Analysis.git/.git/hooks/pre-applypatch.sample
Skipping unsupported file: /content/Frequency-Analysis.git/.git/hooks/pre-receive.sample
Skipping unsupported file: /content/Frequency-Analysis.git/.git/hooks/commit-msg.sample
Skipping unsupported file: /content/Frequency-Analysis.git/.git/hooks/pre-push.sample
Skipping unsupported file: /content/Frequency-Analysis.git/.git/hooks/applypatch-msg.sample
Skipping unsupported file: /content/Frequency-Analysis.git/.git/hooks/pre-merge-commit.sample
Skipping unsupported file: /content/Frequency-Analysis.git/.git/hooks/prepare-commit-msg.samp

In [88]:
# Create documents
documents = []
for file in file_content:
    doc = Document(
        page_content=f"{file['name']}\n{file['content']}",
        metadata={"source": file['name']}
    )
    documents.append(doc)

In [115]:
documents

[Document(metadata={'source': 'frequency-analysis.py', 'text': 'frequency-analysis.py\nfrom collections import Counter \nimport re\n\nTOP_K = 26 \nN_GRAM = 3\n\n# Generate all the n-grams for value n \n\ndef ngrams(n, text): \n    \n    for i in range(len(text) -n + 1):\n\n        # Ignore n-grams containing white space \n        if not re.search(r\'\\s\', text[i:i+n]): \n            yield text[i:i+n]\n\n# Read the data from the ciphertext \n\nwith open(\'ciphertext.txt\') as f: \n    text = f.read()\n\n#Count, sort, and print out the n-grams \n\nfor N in range(N_GRAM): \n    print("------------------------------------")\n\n    print("{}-gram (top {}):".format(N+1, TOP_K))\n\n    counts = Counter(ngrams (N+1, text)) # Count\n\n    sorted_counts = counts.most_common (TOP_K) # Sort \n    \n    for ngram, count in sorted_counts: \n        print("{}: {}".format(ngram, count)) # Print'}, page_content='frequency-analysis.py\nfrom collections import Counter \nimport re\n\nTOP_K = 26 \nN_GRAM 

## 3. Initializing Embedding Model

In [89]:
# Initialize embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

## 4. Initialize Pinecone to Store Vector Embeddings

In [90]:
# Initialize Pinecone
pinecone_api_key = userdata.get("PINECONE_API_KEY")
os.environ['PINECONE_API_KEY'] = pinecone_api_key
pc = Pinecone(api_key=pinecone_api_key)

In [91]:
# Create vector store
vectorstore = PineconeVectorStore.from_documents(
    documents=documents,
    embedding=embeddings,  # Pass the embeddings object, not the encoded documents
    index_name="codebase-rag",
    namespace="https://github.com/bilalsavagexd/Frequency-Analysis.git"
)

## 5. Performing Retrieval Augmented Generation

In [117]:
# Initialize LLM Service Provider
gemini_api_key = userdata.get("GEMINI_API_KEY")
os.environ['GEMINI_API_KEY'] = gemini_api_key

client = genai.Client(api_key = gemini_api_key)

In [118]:
def perform_rag(query):

    # Get embeddings for the User Query
    raw_query_embedding = embeddings.embed_query(query)

    # Query Pinecone to retrieve top matches through cosing similarity search
    top_matches = pinecone_index.query(
        vector=raw_query_embedding,
        top_k=5,
        include_metadata=True,
        namespace="https://github.com/bilalsavagexd/Frequency-Analysis.git"
    )

    # Get the list of retrieved texts
    contexts = [item['metadata']['text'] for item in top_matches['matches']]

    augmented_query = "<CONTEXT>\n" + "\n\n-------\n\n".join(contexts[:10]) + "\n-------\n</CONTEXT>\n\n\n\nMY QUESTION:\n" + query

    # Create the prompt
    prompt = f"""You are a Senior Software Engineer.

    Answer the following question about the codebase, based on the code provided. Always consider all of the context provided when forming a response.

    {augmented_query}
    """

    try:
        # Generate streaming response using Gemini
        response = client.models.generate_content_stream(
            model="gemini-2.0-flash",
            contents=[prompt]  # Note: contents expects a list
        )

        # Stream the response
        for chunk in response:
            print(chunk.text, end="")

    except Exception as e:
        print(f"An error occurred during generation: {str(e)}")

In [119]:
perform_rag("What is happening in the code?")

The python script `frequency-analysis.py` performs frequency analysis on a ciphertext stored in `ciphertext.txt`. It calculates and displays the most frequent n-grams (sequences of n characters) within the ciphertext.

Here's a breakdown:

1.  **Imports:** It imports the `Counter` class from the `collections` module, used for counting the occurrences of n-grams, and the `re` module for regular expressions, used to ignore ngrams containing whitespace.
2.  **Constants:** Defines `TOP_K` as 26 (to display the top 26 most frequent n-grams) and `N_GRAM` as 3 (to analyze n-grams of length 1, 2, and 3).
3.  **`ngrams(n, text)` Function:**
    *   Takes the n-gram length `n` and the text `text` as input.
    *   Iterates through the text to generate n-grams.
    *   Uses a regular expression `re.search(r'\s', text[i:i+n])` to check if an n-gram contains whitespace. If it does, the n-gram is skipped.
    *   `yield` returns the n-gram if it doesn't contain whitespace. `yield` makes this functio