## Git Functionality

In [1]:
import os
import pygit2
import asyncio

## Clone repo 

In [2]:
# Base directory (a relative path) under which clone_repo creates one sub-directory per repository.
REPO_DIRS="./repos"

In [3]:

def clone_repository_sync(repo_url: str, destination: str):
    """Blocking clone of ``repo_url`` into ``destination`` using pygit2.

    This is a plain synchronous callable so that it can be handed to an
    executor. The value returned by ``pygit2.clone_repository`` is passed
    through unchanged.
    """
    cloned = pygit2.clone_repository(repo_url, destination)
    return cloned

async def clone_repo(repo_url: str):
    """Clone a Git repository into ``REPO_DIRS/<repo name>`` without blocking the event loop.

    The repository name is the last path segment of ``repo_url`` (trailing
    slashes ignored) with a single trailing ``.git`` suffix removed.

    Args:
        repo_url: URL of the repository to clone.

    Returns:
        A dict with ``message`` and ``repository_path`` on success, or ``None``
        when the clone raised (the error is printed, not re-raised).
    """
    repo_name = os.path.basename(repo_url.rstrip('/'))
    # Strip only a trailing ".git"; str.replace would also mangle names that
    # merely contain it, e.g. "user.github.io.git" -> "userhub.io".
    if repo_name.endswith('.git'):
        repo_name = repo_name[:-len('.git')]
    destination = os.path.join(REPO_DIRS, repo_name)
    os.makedirs(REPO_DIRS, exist_ok=True)

    try:
        # Run the blocking clone operation in a separate thread
        await asyncio.get_running_loop().run_in_executor(
            None, clone_repository_sync, repo_url, destination
        )
    except Exception as e:
        # Best-effort by design: report the failure and signal it with None.
        print(f"Error cloning repository: {e}")
        return None

    print(f"Repository cloned successfully to {destination}")
    return {
        "message": "Repository cloned successfully.",
        "repository_path": destination,
    }


In [4]:
# Clone the example repository; the bare `await` relies on the notebook kernel's top-level await support.
await clone_repo("https://github.com/bashiryounis/delivery-ocr.git")

Repository cloned successfully to ./repos/delivery-ocr


{'message': 'Repository cloned successfully.',
 'repository_path': './repos/delivery-ocr'}

## Git Repo Tree

In [6]:
import os
import pygit2

# Open the checkout that the clone cell reported at ./repos/delivery-ocr.
# traverse_tree below reads both of these names as module-level globals.
repo_path = "./repos/delivery-ocr"
repo = pygit2.Repository(repo_path)

def traverse_tree(tree, parent_full_path):
    """
    Walk a pygit2 tree depth-first and describe every entry as a dictionary.

    Every dictionary has 'type', 'name', 'path' and 'parent_path'; file entries
    additionally have 'extension' (without the leading dot). Both paths are
    expressed relative to the module-level ``repo_path``, and sub-trees are
    looked up through the module-level ``repo``.
    """
    collected = []
    for item in tree:
        # Absolute-ish location of this entry, built from the parent's location.
        item_full_path = os.path.join(parent_full_path, item.name)
        rel_path = os.path.relpath(item_full_path, repo_path)
        rel_parent = os.path.relpath(parent_full_path, repo_path)

        if item.filemode != pygit2.GIT_FILEMODE_TREE:
            # Anything that is not a tree is recorded as a file.
            extension = os.path.splitext(item.name)[1].lstrip('.')
            collected.append(dict(
                type="file",
                name=item.name,
                path=rel_path,
                extension=extension,
                parent_path=rel_parent,
            ))
            continue

        # A tree entry: record the folder, then descend into it.
        collected.append(dict(
            type="folder",
            name=item.name,
            path=rel_path,
            parent_path=rel_parent,
        ))
        collected += traverse_tree(repo[item.id], item_full_path)
    return collected

# Resolve the commit HEAD points at and traverse its tree, passing repo_path as the
# starting parent path so that top-level entries get parent_path '.'.
head_commit = repo[repo.head.target]
repo_nodes = traverse_tree(head_commit.tree, repo_path)

# Print one node dictionary per line.
print("Collected nodes from repository:")
for n in repo_nodes:
    print(n)


Collected nodes from repository:
{'type': 'file', 'name': '.gitignore', 'path': '.gitignore', 'extension': '', 'parent_path': '.'}
{'type': 'file', 'name': 'Makefile', 'path': 'Makefile', 'extension': '', 'parent_path': '.'}
{'type': 'file', 'name': 'README.md', 'path': 'README.md', 'extension': 'md', 'parent_path': '.'}
{'type': 'folder', 'name': 'api', 'path': 'api', 'parent_path': '.'}
{'type': 'file', 'name': 'Dockerfile', 'path': 'api/Dockerfile', 'extension': '', 'parent_path': 'api'}
{'type': 'file', 'name': 'dev_start.sh', 'path': 'api/dev_start.sh', 'extension': 'sh', 'parent_path': 'api'}
{'type': 'file', 'name': 'poetry.lock', 'path': 'api/poetry.lock', 'extension': 'lock', 'parent_path': 'api'}
{'type': 'file', 'name': 'pyproject.toml', 'path': 'api/pyproject.toml', 'extension': 'toml', 'parent_path': 'api'}
{'type': 'folder', 'name': 'src', 'path': 'api/src', 'parent_path': 'api'}
{'type': 'folder', 'name': 'core', 'path': 'api/src/core', 'parent_path': 'api/src'}
{'type':

## Extract branches 

In [12]:
def extract_branches(repo: "pygit2.Repository"):
    """
    Extract all local branches from the repo with their metadata.

    Only references under ``refs/heads/`` are considered; remote-tracking
    branches and tags are skipped. The branch name is everything after that
    prefix, so names containing slashes (e.g. ``feature/login``) are kept whole.

    Args:
        repo: An open pygit2 repository.

    Returns:
        List of branch dicts, each with:
        - name
        - is_head (bool)
        - target_commit_hash
        - latest_commit_summary
        - latest_commit_time
        - commit_count
    """
    heads_prefix = "refs/heads/"
    branches = []

    for ref_name in repo.references:
        if not ref_name.startswith(heads_prefix):
            continue  # skip remote or tag references

        # Slice off the prefix instead of taking the last "/" segment, which
        # would turn "feature/login" into "login" and break the HEAD check.
        branch_name = ref_name[len(heads_prefix):]
        target = repo.lookup_reference(ref_name).target
        commit = repo[target]

        # Count number of commits reachable from the branch tip
        commit_count = sum(1 for _ in repo.walk(target, pygit2.GIT_SORT_TIME))

        branches.append({
            "name": branch_name,
            "is_head": repo.head.shorthand == branch_name,
            "target_commit_hash": str(commit.id),
            # First line of the commit message
            "latest_commit_summary": commit.message.strip().split('\n')[0],
            "latest_commit_time": commit.commit_time,
            "commit_count": commit_count
        })

    return branches


In [13]:
# Re-open the delivery-ocr checkout and list its local branches.
repo_path = "./repos/delivery-ocr"
repo = pygit2.Repository(repo_path)

branches = extract_branches(repo)

# Print every field of each branch dict; 'latest_commit_time' is printed as the raw
# integer stored on the commit (see the saved output), not a formatted date.
for b in branches:
    print(f"Branch: {b['name']}")
    print(f"  Is HEAD?         {b['is_head']}")
    print(f"  Target Commit:   {b['target_commit_hash']}")
    print(f"  Summary:         {b['latest_commit_summary']}")
    print(f"  Commit Count:    {b['commit_count']}")
    print(f"  Last Updated At: {b['latest_commit_time']}")


Branch: main
  Is HEAD?         True
  Target Commit:   61b6f0a6af549f4b24b25dfd0ecb522cefa8d739
  Summary:         - update port
  Commit Count:    4
  Last Updated At: 1744541659


## Extract Commit History 

In [10]:
import pygit2
from typing import List, Dict

def map_commits_to_branches(repo: "pygit2.Repository") -> Dict[str, List[str]]:
    """
    Map each commit hash to the list of local branches it is reachable from.

    Only references under ``refs/heads/`` are walked. Branch names are the full
    text after that prefix, so ``feature/x`` and ``bugfix/x`` stay distinct
    instead of both collapsing to ``x``.

    Args:
        repo: An open pygit2 repository.

    Returns:
        Dict of commit hash (str) -> branch names, in the order the branches
        were encountered in ``repo.references``.
    """
    heads_prefix = "refs/heads/"
    commit_branch_map: Dict[str, List[str]] = {}

    for ref_name in repo.references:
        if not ref_name.startswith(heads_prefix):
            continue

        branch_name = ref_name[len(heads_prefix):]
        target = repo.lookup_reference(ref_name).target

        for commit in repo.walk(target, pygit2.GIT_SORT_TIME):
            owners = commit_branch_map.setdefault(str(commit.id), [])
            if branch_name not in owners:
                owners.append(branch_name)

    return commit_branch_map


def _render_patch(patch) -> str:
    """Render one patch as text: each hunk header followed by its origin-prefixed lines."""
    rendered = []
    for hunk in patch.hunks:
        # Drop trailing newlines on the header as well as on the content lines,
        # so joining with "\n" never produces blank lines between them.
        rendered.append(hunk.header.rstrip('\n'))
        for line in hunk.lines:
            # line.origin is the diff marker ('+', '-', ' ')
            rendered.append(f"{line.origin}{line.content.rstrip(chr(10))}")
    return "\n".join(rendered)


def extract_commits_with_diffs(repo: "pygit2.Repository", max_count: int = 100):
    """
    Extract commits from all local branches with per-file diffs and associated branch names.

    Each commit is diffed against its first parent only. Root commits (no
    parents) are skipped and therefore produce neither a commit node nor
    relationships.

    Args:
        repo: An open pygit2 repository.
        max_count: Maximum number of commit nodes to return; values <= 0
            return empty lists.

    Returns:
        commits: List of commit node dictionaries
        relationships: List of file-modification relationship dictionaries
    """
    commits = []
    relationships = []

    if max_count <= 0:
        return commits, relationships

    # The map is keyed by full commit hash, so each commit is visited exactly once.
    for commit_hash, branches in map_commits_to_branches(repo).items():
        commit = repo.revparse_single(commit_hash)

        parents = commit.parents
        if not parents:
            continue  # skip root commits

        diff = repo.diff(parents[0], commit)

        modified_files = []
        for patch in diff:
            file_path = patch.delta.new_file.path
            modified_files.append(file_path)
            relationships.append({
                "commit_hash": str(commit.id),
                "file_path": file_path,
                "diff": _render_patch(patch)
            })

        message = commit.message.strip()
        commits.append({
            "hash": str(commit.id),
            "summary": message.split('\n')[0],
            "message": message,
            "author": {
                "name": commit.author.name,
                "email": commit.author.email,
                "time": commit.author.time
            },
            "committer": {
                "name": commit.committer.name,
                "email": commit.committer.email,
                "time": commit.committer.time
            },
            "parents": [str(p.id) for p in parents],
            "is_merge": len(parents) > 1,
            "branches": branches,
            "summary_diff": f"Files changed: {', '.join(modified_files)}"
        })

        if len(commits) >= max_count:
            break

    return commits, relationships


In [11]:
# Re-open the delivery-ocr checkout and collect at most 50 commit nodes plus
# their per-file modification relationships.
repo_path = "./repos/delivery-ocr"
repo = pygit2.Repository(repo_path)

commits, modified_rels = extract_commits_with_diffs(repo, max_count=50)

# One line per commit: hash, branches, and the list of changed files.
for c in commits:
    print(f"{c['hash']} | branches: {c['branches']} | {c['summary_diff']}")

# One line per (commit, file) pair; the diff text itself is not printed.
for r in modified_rels:
    print(f"{r['commit_hash']} modified {r['file_path']}")


61b6f0a6af549f4b24b25dfd0ecb522cefa8d739 | branches: ['main'] | Files changed: docker-compose.yaml
0b99e1cb10c567aee329cc3833e7babaa45f17b0 | branches: ['main'] | Files changed: README.md
cbd1055d6186163c39152e98dfccf2ee19495e9a | branches: ['main'] | Files changed: Makefile, README.md, api/Dockerfile, api/dev_start.sh, api/poetry.lock, api/pyproject.toml, api/src/core/config.py, api/src/core/llm.py, api/src/core/logger_config.py, api/src/core/prompt.py, api/src/core/schema.py, api/src/main.py, api/src/service/ocr_delivery.py, api/src/service/utils.py, docker-compose.yaml, docs/delivery_ocr.ipynb, docs/sample1.jpeg, docs/sample2.jpeg, main_dir/sample2.jpeg
61b6f0a6af549f4b24b25dfd0ecb522cefa8d739 modified docker-compose.yaml
0b99e1cb10c567aee329cc3833e7babaa45f17b0 modified README.md
cbd1055d6186163c39152e98dfccf2ee19495e9a modified Makefile
cbd1055d6186163c39152e98dfccf2ee19495e9a modified README.md
cbd1055d6186163c39152e98dfccf2ee19495e9a modified api/Dockerfile
cbd1055d6186163c39152

## Main Class 

In [None]:
# Relies on `repo_path` left in the kernel by an earlier cell.
repo = pygit2.Repository(repo_path)


# Look up the remote named "origin" and print its name and URL.
# NOTE(review): the saved output for this cell also shows "Fetch" and "Push" lines
# that this code does not print, so that output is presumably from an older version.
remote = repo.remotes["origin"]
print(f"[Remote]")
print(f"  Name : {remote.name}")
print(f"  URL  : {remote.url}")

[Remote]
  Name : origin
  URL  : https://github.com/bashiryounis/delivery-ocr.git
  Fetch: ['+refs/heads/*:refs/remotes/origin/*']
  Push : []


In [41]:
class GitRepoParser:
    """Collect repository metadata and branch information from a local Git checkout.

    Results are accumulated in ``self.nodes`` (metadata, folders, files,
    branches, commits) and ``self.relationship`` (parent, modified). The methods
    defined here populate only ``nodes["metadata"]`` and ``nodes["branches"]``.
    """

    def __init__(self, repo_path: str):
        """Open the repository at ``repo_path`` and initialise empty result containers."""
        self.repo_path = repo_path
        self.repo = pygit2.Repository(repo_path)
        self.nodes = {
            "metadata": {},
            "folders": [],
            "files": [],
            "branches": [],
            "commits": [],
        }

        self.relationship = {
            "parent": [],
            "modified": [],
        }

    def get_repo_metadata(self):
        """
        Extract and return repository metadata, including name, remote URL,
        default branch, and description. Populates self.nodes["metadata"].

        The name comes from the "origin" remote URL (last path segment with a
        trailing ".git" removed), falling back to the checkout directory name.
        The description is read from ``<repo_path>/.git/description``; when that
        file is missing or still holds Git's placeholder text, it is reported as
        "No description available".

        Returns:
            dict with keys ``name``, ``url``, ``description``, ``default_branch``.
        """
        remote_url = None
        repo_name = None

        try:
            remote = self.repo.remotes["origin"]
            if remote:
                remote_url = remote.url
                repo_name = os.path.basename(remote_url.rstrip("/"))
                # Remove only a ".git" suffix; os.path.splitext would also chop
                # the tail off names such as "my.project".
                if repo_name.endswith(".git"):
                    repo_name = repo_name[: -len(".git")]
                print(f"Repository remote URL: {remote_url}")
        except Exception as e:
            print(f"Failed to read remote origin: {e}")

        if not repo_name:
            repo_name = os.path.basename(os.path.abspath(self.repo_path))
            print(f"Fallback repository name from path: {repo_name}")

        try:
            default_branch = self.repo.head.shorthand
            print(f"Default branch: {default_branch}")
        except pygit2.GitError:
            default_branch = None
            print("No default branch found.")

        # Start from the fallback so `description` is always bound, even when
        # the description file does not exist.
        description = "No description available"
        description_file = os.path.join(self.repo_path, ".git", "description")
        if os.path.exists(description_file):
            with open(description_file, "r", encoding="utf-8") as f:
                file_text = f.read().strip()
            if not file_text.startswith("Unnamed repository; edit this file"):
                description = file_text

        metadata = {
            "name": repo_name,
            "url": remote_url,
            "description": description,
            "default_branch": default_branch,
        }

        self.nodes["metadata"] = metadata
        return metadata

    def get_branches(self):
        """
        Return unified branch objects with optional remote tracking info.
        This avoids duplicating local/remote branches unless needed.

        Only local branches are listed. ``repo_name`` on each branch is taken
        from ``self.nodes["metadata"]["name"]`` and is "unknown" when
        ``get_repo_metadata`` has not been called yet. A branch that fails to
        process is reported with ``print`` and skipped. Populates
        self.nodes["branches"].

        Returns:
            list of branch dicts.
        """
        branch_dicts = []
        repo_name = self.nodes.get("metadata", {}).get("name", "unknown")

        for branch_name in self.repo.branches.local:
            try:
                branch_ref = self.repo.branches.local.get(branch_name)
                if not branch_ref:
                    continue

                commit = self.repo.get(branch_ref.target)

                # Tracking info
                is_tracking = False
                upstream_name = None
                remote_name = None

                try:
                    upstream = branch_ref.upstream
                    if upstream:
                        is_tracking = True
                        upstream_name = upstream.name
                        remote_name = upstream.remote_name
                except Exception:
                    # Deliberately best-effort: a branch whose upstream cannot
                    # be resolved is reported as non-tracking.
                    pass

                # Latest commit ID
                commit_id = str(commit.id)

                # Count commits reachable from the branch tip
                commit_count = sum(1 for _ in self.repo.walk(commit.id, pygit2.GIT_SORT_TOPOLOGICAL))

                is_head = branch_ref.is_head()

                # Build branch object
                branch_dicts.append({
                    "name": branch_name,
                    "is_head": is_head,
                    "is_default": is_head,  # Treat HEAD as default in this context
                    "is_remote_tracking": is_tracking,
                    "upstream_name": upstream_name,
                    "remote_name": remote_name,
                    "latest_commit_id": commit_id,
                    "commit_count": commit_count,
                    "repo_name": repo_name,
                })

            except Exception as e:
                print(f"Failed to process branch '{branch_name}': {e}")

        self.nodes["branches"] = branch_dicts
        return branch_dicts




In [42]:
# get_repo_metadata runs first so that get_branches can read the repository
# name from service.nodes["metadata"]; the branch list is the cell's displayed value.
service = GitRepoParser(repo_path)
service.get_repo_metadata()
service.get_branches()

Repository remote URL: https://github.com/bashiryounis/delivery-ocr.git
Default branch: main


[{'name': 'main',
  'is_head': True,
  'is_default': True,
  'is_remote_tracking': True,
  'upstream_name': 'refs/remotes/origin/main',
  'remote_name': 'origin',
  'latest_commit_id': '61b6f0a6af549f4b24b25dfd0ecb522cefa8d739',
  'commit_count': 4,
  'repo_name': 'delivery-ocr'}]

## Build a Basic Knowledge Graph (KG) in Neo4j

In [1]:
from neo4j import GraphDatabase, AsyncGraphDatabase
import asyncio

import os

# Neo4j connection settings. Each value can be overridden through an environment
# variable of the same name; the fallbacks are the literals this notebook was
# originally written with, so existing runs behave the same.
NEO4J_URI = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
NEO4J_USERNAME = os.environ.get("NEO4J_USERNAME", "neo4j")
# NOTE(review): avoid committing real credentials; set NEO4J_PASSWORD in the
# environment for anything other than a throwaway local database.
NEO4J_PASSWORD = os.environ.get("NEO4J_PASSWORD", "passowrd")
# Define constants for node labels
REPO_LABEL = "Repository"
FOLDER_LABEL = "Folder"
FILE_LABEL = "File"


async def neo4j():
    """Create an async Neo4j driver from the module-level settings and close it again."""
    credentials = (NEO4J_USERNAME, NEO4J_PASSWORD)
    driver = AsyncGraphDatabase.driver(NEO4J_URI, auth=credentials)

    # Nothing is done with the driver here; it is released straight away.
    await driver.close()

In [21]:
import os
import pygit2

# Point the module-level `repo_path` / `repo` (both read by traverse_tree) at a different checkout.
# NOTE(review): no cell shown in this notebook clones "sec-insights"; the directory
# presumably has to exist already — confirm.
repo_path = "./repos/sec-insights"
repo = pygit2.Repository(repo_path)

def traverse_tree(tree, parent_full_path):
    """
    Walk a pygit2 tree depth-first and describe every entry as a dictionary.

    Every dictionary has 'type', 'name', 'path' and 'parent_path'; file entries
    additionally have 'extension' (without the leading dot). Both paths are
    expressed relative to the module-level ``repo_path``, and sub-trees are
    looked up through the module-level ``repo``.
    """
    collected = []
    for item in tree:
        # Absolute-ish location of this entry, built from the parent's location.
        item_full_path = os.path.join(parent_full_path, item.name)
        rel_path = os.path.relpath(item_full_path, repo_path)
        rel_parent = os.path.relpath(parent_full_path, repo_path)

        if item.filemode != pygit2.GIT_FILEMODE_TREE:
            # Anything that is not a tree is recorded as a file.
            extension = os.path.splitext(item.name)[1].lstrip('.')
            collected.append(dict(
                type="file",
                name=item.name,
                path=rel_path,
                extension=extension,
                parent_path=rel_parent,
            ))
            continue

        # A tree entry: record the folder, then descend into it.
        collected.append(dict(
            type="folder",
            name=item.name,
            path=rel_path,
            parent_path=rel_parent,
        ))
        collected += traverse_tree(repo[item.id], item_full_path)
    return collected

# Resolve the commit HEAD points at and traverse its tree, passing repo_path as the
# starting parent path so that top-level entries get parent_path '.'.
# `repo_nodes` is consumed later by the cell that writes nodes to Neo4j.
head_commit = repo[repo.head.target]
repo_nodes = traverse_tree(head_commit.tree, repo_path)

# Print one node dictionary per line.
print("Collected nodes from repository:")
for n in repo_nodes:
    print(n)


Collected nodes from repository:
{'type': 'folder', 'name': '.devcontainer', 'path': '.devcontainer', 'parent_path': '.'}
{'type': 'file', 'name': 'Dockerfile', 'path': '.devcontainer/Dockerfile', 'extension': '', 'parent_path': '.devcontainer'}
{'type': 'file', 'name': 'README.md', 'path': '.devcontainer/README.md', 'extension': 'md', 'parent_path': '.devcontainer'}
{'type': 'file', 'name': 'devcontainer.json', 'path': '.devcontainer/devcontainer.json', 'extension': 'json', 'parent_path': '.devcontainer'}
{'type': 'file', 'name': 'post_create_command.sh', 'path': '.devcontainer/post_create_command.sh', 'extension': 'sh', 'parent_path': '.devcontainer'}
{'type': 'file', 'name': 'FAQ.md', 'path': 'FAQ.md', 'extension': 'md', 'parent_path': '.'}
{'type': 'file', 'name': 'LICENSE', 'path': 'LICENSE', 'extension': '', 'parent_path': '.'}
{'type': 'file', 'name': 'README.md', 'path': 'README.md', 'extension': 'md', 'parent_path': '.'}
{'type': 'folder', 'name': 'backend', 'path': 'backend',

In [33]:
async def create_repository_node(session, name):
    """MERGE a repository node identified by ``name`` and return the node from the result row.

    ``session`` must provide an awaitable ``run(query, **params)`` whose result
    has an awaitable ``single()``.
    """
    cypher = f"MERGE (r:{REPO_LABEL} {{ name: $name }}) RETURN r"
    cursor = await session.run(cypher, name=name)
    row = await cursor.single()
    return row["r"]

In [44]:
async def _merge_contained_node(session, label, set_clause, params, parent_path, repo_name):
    """MERGE a node with ``label`` keyed by path, attach it to its parent, and return it."""
    if parent_path == ".":
        # Top-level entries hang directly off the repository node.
        parent_match = f"MATCH (p:{REPO_LABEL} {{ name: $repo_name }}) "
        query_params = {**params, "repo_name": repo_name}
    else:
        parent_match = f"MATCH (p:{FOLDER_LABEL} {{ path: $parent_path }}) "
        query_params = {**params, "parent_path": parent_path}

    query = (
        f"MERGE (f:{label} {{ path: $path }}) "
        f"{set_clause} "
        "WITH f "
        f"{parent_match}"
        "MERGE (p)-[:CONTAINS]->(f) "
        "RETURN f"
    )
    result = await session.run(query, **query_params)
    record = await result.single()
    if record is None:
        # The MATCH on the parent found nothing, so the query returned no row.
        raise LookupError(
            f"No parent node found for {label} '{params['path']}' "
            f"(parent_path={parent_path!r}, repo_name={repo_name!r})"
        )
    return record["f"]


async def create_folder_node(session, name, path, parent_path, repo_name="sec-insights"):
    """MERGE a folder node and a CONTAINS relationship from its parent.

    Args:
        session: Async Neo4j session (anything with an awaitable ``run``).
        name: Folder name.
        path: Folder path; used as the MERGE key.
        parent_path: Path of the parent folder, or "." when the parent is the
            repository node.
        repo_name: Name of the repository node used when ``parent_path`` is ".".
            Defaults to the value this notebook previously hard-coded.

    Returns:
        The folder node from the result row.

    Raises:
        LookupError: if the parent node does not exist.
    """
    return await _merge_contained_node(
        session,
        FOLDER_LABEL,
        "SET f.name = $name",
        {"name": name, "path": path},
        parent_path,
        repo_name,
    )


async def create_file_node(session, name, path, extension, parent_path, repo_name="sec-insights"):
    """MERGE a file node and a CONTAINS relationship from its parent.

    Args:
        session: Async Neo4j session (anything with an awaitable ``run``).
        name: File name.
        path: File path; used as the MERGE key.
        extension: File extension stored on the node.
        parent_path: Path of the parent folder, or "." when the parent is the
            repository node.
        repo_name: Name of the repository node used when ``parent_path`` is ".".
            Defaults to the value this notebook previously hard-coded.

    Returns:
        The file node from the result row.

    Raises:
        LookupError: if the parent node does not exist.
    """
    return await _merge_contained_node(
        session,
        FILE_LABEL,
        "SET f.name = $name, f.extension = $extension",
        {"name": name, "path": path, "extension": extension},
        parent_path,
        repo_name,
    )


In [45]:
driver = AsyncGraphDatabase.driver(
    NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)
)
driver = AsyncGraphDatabase.driver(
        NEO4J_URI, auth=(NEO4J_USERNAME, NEO4J_PASSWORD)
    )

async with driver.session() as session:
    # Create the repository node with just a name
    repo_node = await create_repository_node(session, "sec-insights")
    print("Created Repository Node:", repo_node)
    
    # Create folder and file nodes using the collected repository tree data
    for node in repo_nodes:
        if node["type"] == "folder":
            folder_node = await create_folder_node(
                session, node["name"], node["path"], node["parent_path"]
            )
            print("Created Folder Node:", folder_node)
        elif node["type"] == "file":
            file_node = await create_file_node(
                session, node["name"], node["path"], node["extension"], node["parent_path"]
            )
            print("Created File Node:", file_node)
await driver.close()



Created Repository Node: <Node element_id='4:80166c63-cba2-410b-85d0-b8a3f32d747a:200' labels=frozenset({'Repository'}) properties={'name': 'sec-insights'}>
Created Folder Node: <Node element_id='4:80166c63-cba2-410b-85d0-b8a3f32d747a:0' labels=frozenset({'Folder'}) properties={'path': '.devcontainer', 'name': '.devcontainer'}>
Created File Node: <Node element_id='4:80166c63-cba2-410b-85d0-b8a3f32d747a:1' labels=frozenset({'File'}) properties={'path': '.devcontainer/Dockerfile', 'extension': '', 'name': 'Dockerfile'}>
Created File Node: <Node element_id='4:80166c63-cba2-410b-85d0-b8a3f32d747a:2' labels=frozenset({'File'}) properties={'path': '.devcontainer/README.md', 'extension': 'md', 'name': 'README.md'}>
Created File Node: <Node element_id='4:80166c63-cba2-410b-85d0-b8a3f32d747a:3' labels=frozenset({'File'}) properties={'path': '.devcontainer/devcontainer.json', 'extension': 'json', 'name': 'devcontainer.json'}>
Created File Node: <Node element_id='4:80166c63-cba2-410b-85d0-b8a3f32