<div style="background-color: #ADD8E6; border: 1px solid gray; padding: 3px">
    <h3>Agentic Module</h3>
    This notebook builds out the following multi-agent system using CrewAI:
    <ul>
        <li><b>Code Retriever</b>: Retrieves the legacy codebase (JSP) from a specified list of GitHub repositories.</li>
        <li><b>Code Parser</b>: Uses a simple lexical parser to transform the codebase into a hierarchical graph of file dependencies, then clusters the files and ranks their processing order based on those dependencies.</li>
        <li><b>Code Analyzer</b>: Generates a mapping of code-to-text pairs for the low-level components of the code.</li>
        <li><b>Code Scribe</b>: Generates software specification documents for the code using the output from the Code Analyzer.</li>
        <li><b>Code Translator</b>: Using spec-driven development, generates a translated version of the code into the target language/framework (NodeJS/React) based on the specs from the Code Scribe.</li>
        <li><b>Code Aggregator</b>: Uses a bottom-up approach to aggregate the translated code and generated specs into one codebase.</li>
        <li><b>Code Evaluator</b>: Evaluates the generated code and specs using both custom and classic metrics.</li>
    </ul>
</div>

In [1]:
##############################################################################
# Imports
##############################################################################
import importlib
from agentic.crew import CodeToSummary, SummaryToSpec, SpecToCode
from crewai import Agent, Task, Crew, Process, LLM
from crewai.project import CrewBase, agent, crew, task
from crewai.agents.agent_builder.base_agent import BaseAgent
from typing import List, Optional
from pydantic import Field, BaseModel
from crewai_tools import SerperDevTool
from crewai.tools import tool
from crewai.flow.flow import Flow, listen, start
import os
from urllib.parse import urlparse
from dotenv import load_dotenv
from custom.dependency_parser import JspDependencyParser, GithubTools
load_dotenv()
import asyncio
import nest_asyncio
nest_asyncio.apply()

In [2]:
##############################################################################
# Structured Output
##############################################################################

class FileCluster(BaseModel):
    # A cluster of files, identified by a numeric id.

    # Identifier of the cluster; 0 when the caller does not supply one.
    cluster_id: int = Field(default=0, description="Cluster identifier")

    # Required: the files that belong to this cluster.
    files: List[str] = Field(description="List of files associated with this cluster")

class GitRepo(BaseModel):
    # Record describing one git repository plus the summary and file clusters
    # associated with it.

    # Identifier of the repo; "0" when the caller does not supply one.
    repo_id: str = Field(default="0", description="Git repo identifier")

    # Required: URI of the repository.
    repo_url: str = Field(description="Git repo URI")

    # Required: SHA of the branch.
    repo_branch_sha: str = Field(description="Git branch SHA")

    # Optional summary text; defaults to the empty string.
    summary: Optional[str] = Field(
        default="",
        description="Summary of the aggregated file content",
    )

    # Optional ranked list of clusters; defaults to an empty list.
    clusters: Optional[List[FileCluster]] = Field(
        default=[],
        description="Ranked list of file clusters ordered by number of dependencies",
    )

In [3]:
##############################################################################
# Functions
##############################################################################

def rank_relevant_files(github_repo: str, tree_sha: str):
    """Group the repository's ranked files into clusters of equal out-degree.

    Args:
        github_repo: Repository reference handed to ``JspDependencyParser``.
        tree_sha: Tree/branch reference handed to ``JspDependencyParser``.

    Returns:
        A list of clusters in ranked order. Each cluster is a list holding the
        file entries of one run of consecutive, equal-degree items.
    """

    def get_clusters(files, start_index=0):
        """Yield one list of files per run of consecutive equal-degree entries.

        ``files`` is a sequence of ``(degree, file)`` pairs. Runs are found by
        walking forward from the current position, so each entry is visited
        once and a cluster can never contain entries of different degrees.
        """

        total = len(files)

        idx = start_index

        while idx < total:

            current_degree = files[idx][0]

            # Extend the run for as long as the next entry has the same degree.
            run_end = idx + 1

            while run_end < total and files[run_end][0] == current_degree:

                run_end += 1

            yield [f for _, f in files[idx:run_end]]

            idx = run_end

    parser = JspDependencyParser(github_repo, tree_sha)

    # Presumably already ordered by out-degree (per the method name) - TODO confirm.
    ranked_files = parser.get_ranked_files_by_outdegree()

    return list(get_clusters(ranked_files))

def get_github_files(github_repo: str, tree_sha: str):
    """Return what ``GithubTools.get_relevant_files`` reports for the given tree.

    Args:
        github_repo: Repository reference used to obtain the repo API handle.
        tree_sha: Tree/branch reference whose files are requested.
    """
    api_handle = GithubTools.get_git_repo_api(github_repo)
    relevant_files = GithubTools.get_relevant_files(tree_sha, api_handle)
    return relevant_files

def get_aggregated_github_file_content(github_repo: str, repo_files: List[str]):
    """Concatenate the content of several repository files into one string.

    Each file contributes a header line (``=============`` followed by the
    entry exactly as given) and then its content; sections are joined with a
    newline. The entry is stripped of surrounding whitespace only for the
    content lookup, not in the header.

    Args:
        github_repo: Repository reference used to obtain the repo API handle.
        repo_files: File entries to fetch, in output order.

    Returns:
        The joined text; an empty string when ``repo_files`` is empty.
    """
    api_handle = GithubTools.get_git_repo_api(github_repo)

    sections = []

    for entry in repo_files:

        content = GithubTools.get_github_file_content(entry.strip(), api_handle)

        sections.append(f"============={entry}\n{content}")

    return "\n".join(sections)


##############################################################################
# Tools
##############################################################################
@tool
def rank_relevant_files_tool(github_repo: str, tree_sha: str):
    """Generates a list of relevant files from the git repo ranked by their number of dependencies, or their out-degree (smaller number of dependencies equates to lower rank)."""
    # NOTE(review): CrewAI's @tool decorator is documented to use the docstring
    # as the tool description, so its wording is deliberately left untouched.

    # Thin tool-facing wrapper; all of the work happens in rank_relevant_files.
    ranked_clusters = rank_relevant_files(github_repo=github_repo, tree_sha=tree_sha)

    return ranked_clusters

@tool
def get_github_files_tool(github_repo: str, tree_sha: str):
    """Gets the list of files associated with the provided git repo and branch."""
    # NOTE(review): CrewAI's @tool decorator is documented to use the docstring
    # as the tool description, so its wording is deliberately left untouched.

    # Thin tool-facing wrapper; all of the work happens in get_github_files.
    repo_file_list = get_github_files(github_repo=github_repo, tree_sha=tree_sha)

    return repo_file_list

@tool
def get_aggregated_github_file_content_tool(repo_files: List[str], repo_api: str):
    """Returns a blob representing the aggregation of the file content for each file in repo_files."""
    # NOTE(review): CrewAI's @tool decorator is documented to use the docstring
    # as the tool description, so its wording is deliberately left untouched.

    # The helper's signature is (github_repo, repo_files), the reverse of this
    # tool's parameter order. Keyword arguments keep the file list and the repo
    # reference from being swapped; `repo_api` is what the helper calls
    # `github_repo`.
    return get_aggregated_github_file_content(github_repo=repo_api, repo_files=repo_files)
    
    

In [4]:
##############################################################################
# Flows
##############################################################################


class CodeTranslationFlow(Flow):
    """Flow for CodeTranslation.

    Steps, chained through ``@start`` / ``@listen``:

    1. ``retrieve_code`` ranks the repository's files into clusters and stores
       them in the flow state.
    2. ``analyze_code`` runs the ``CodeToSummary`` crew on each cluster, then
       runs the ``SummaryToSpec`` crew on the accumulated summary.
    3. ``generate_code`` runs the ``SpecToCode`` crew on the resulting spec.

    State is accessed like a dict. ``repo_url``, ``repo_branch_sha`` and
    ``repo_id`` are read and never set here, so they must be supplied by the
    caller. ``id`` is also read; presumably it is provided by the ``Flow`` base
    class - TODO confirm. ``clusters``, ``summary`` and ``cluster_id`` are
    written by the flow itself.
    """

    @start()
    def retrieve_code(self):
        """Rank the repository's files into clusters and seed the flow state."""

        # Keys inside the replacement fields use single quotes: reusing the
        # outer double quote there is a SyntaxError before Python 3.12.
        print(f"Starting flow {self.state['id']} for {self.state['repo_url']}#{self.state['repo_branch_sha']}...")

        github_repo, tree_sha = self.state["repo_url"], self.state["repo_branch_sha"]

        clusters = rank_relevant_files(github_repo, tree_sha)

        self.state["clusters"] = [{"files": cluster} for cluster in clusters]

        self.state["summary"] = ""

    @listen(retrieve_code)
    def analyze_code(self):
        """Summarize every cluster in order, then derive a spec from the result.

        Returns:
            The ``raw`` attribute of the ``SummaryToSpec`` crew's kickoff result.
        """

        def analyze_code_cluster(cluster, cluster_id, running_summary):
            """Run the ``CodeToSummary`` crew on one cluster's aggregated content."""

            repo_files = cluster["files"]

            repo_url = self.state["repo_url"]

            repo_id = self.state["repo_id"]

            inputs = get_aggregated_github_file_content(repo_url, repo_files)

            # The summary built so far is passed along with this cluster's content.
            output = CodeToSummary().crew().kickoff(inputs={"inputs": inputs,

                                                            "summary": running_summary,

                                                            "output_base_path": f"{repo_id}/{cluster_id}"})

            return output

        # Get aggregate summary of code

        running_summary = self.state.get("summary", "")

        for cluster_id, cluster in enumerate(self.state["clusters"]):

            self.state["cluster_id"] = cluster_id

            output_section = analyze_code_cluster(cluster, cluster_id, running_summary)

            files_section = "\n  -".join(cluster["files"])

            # The header is added once, just before the first section is appended.
            if not running_summary:

                running_summary = "Context:\n=======\n"

            running_summary += f"\n -{files_section}\n=======\n{output_section}"

        # Get spec from aggregate summary
        spec = SummaryToSpec().crew().kickoff(inputs={"inputs": running_summary,

                                                      "output_base_path": f"{self.state['repo_id']}"})

        return spec.raw

    @listen(analyze_code)
    def generate_code(self, spec):
        """Run the ``SpecToCode`` crew on the spec returned by ``analyze_code``.

        Args:
            spec: Value returned by ``analyze_code``.
        """

        print("Generating code!...")

        repo_id = self.state["repo_id"]

        SpecToCode().crew().kickoff(inputs={"spec": spec,

                                            "output_base_path": f"{repo_id}",

                                            "code_base_path": f"{repo_id}/code"})

### Execute Code Translation Flow
Run the code translation flow once for each `(repository URL, branch)` pair listed in `git_repos`.

In [None]:
##############################################################################
# Execute the Flow
##############################################################################
# (repository URL, branch) pairs to translate.
git_repos = [
    ("https://github.com/dmorrison/jsp-guestbook","master")
]

# Plot the flow's structure once, before any run.
flow = CodeTranslationFlow()

flow.plot("CodeTranslationFlowPlot")

for git_repo, git_branch_sha in git_repos:

    # Join the non-empty path segments ("/owner/name" -> "owner_name").
    # Skipping empty segments keeps a trailing slash in the URL from adding a
    # trailing "_" to the id.
    repo_id = "_".join(segment for segment in urlparse(git_repo).path.split("/") if segment)

    # Use a fresh flow per repository so nothing written to the state during
    # one repository's run can carry over into the next.
    flow = CodeTranslationFlow()

    result = flow.kickoff(inputs={"repo_url":git_repo,

                                  "repo_branch_sha":git_branch_sha,

                                  "repo_id": repo_id})

    print("Code Translation flow complete.")