In [0]:
import os
import tempfile
import shutil
from git import Repo
from tqdm import tqdm
import re

from pyspark.sql.functions import lit

from databricks_langchain import ChatDatabricks


# URL of the Git repository that gets cloned and reviewed (placeholder value).
GITHUB_REPO_URL = "https://<github>.git"
# Name of the Databricks model endpoint passed to ChatDatabricks below and
# recorded in the `llm_used` column of the output table. The commented-out
# line is an alternative endpoint.
# DATABRICKS_MODEL_ENDPOINT = "databricks-meta-llama-3-1-405b-instruct" 
DATABRICKS_MODEL_ENDPOINT = "databricks-claude-3-7-sonnet" 



# Module-level chat client shared by every review request; temperature is 0.
# NOTE(review): `host` and `token` are hard-coded literals. A real access token
# should not live in source; consider reading both from a secret scope or
# environment variables instead. TODO confirm the preferred mechanism.
llm = ChatDatabricks(model=DATABRICKS_MODEL_ENDPOINT, temperature=0, host="https://<host>", token="dapiXXX")

def clone_repo(github_url):
    """Clone ``github_url`` into a new temporary directory and return its path.

    Args:
        github_url: URL of the Git repository to clone.

    Returns:
        Path of the temporary directory (prefix ``repo_``) holding the clone.
        The caller owns this directory and is responsible for deleting it.

    Raises:
        Whatever ``Repo.clone_from`` raises. In that case the temporary
        directory is removed before the exception propagates.
    """
    print(f"\n📥 Cloning repo: {github_url} ...")
    temp_dir = tempfile.mkdtemp(prefix="repo_")
    try:
        Repo.clone_from(github_url, temp_dir)
    except BaseException:
        # The caller never receives the path on failure, so nobody else could
        # clean up; remove the (possibly partial) clone here, then re-raise.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise
    return temp_dir

def get_python_files(base_path):
    """Collect the paths of reviewable ``.py`` files found under ``base_path``.

    The tree is walked with ``os.walk``. A file is kept only when its name
    ends in ``.py``, is not one of ``setup.py`` / ``__init__.py`` /
    ``_README.py`` (exact, case-sensitive match), does not start with
    ``test`` (case-insensitive), and the path of its containing directory
    does not include ``__pycache__``.

    Args:
        base_path: Root directory to search.

    Returns:
        List of file paths, in the order ``os.walk`` yields them.
    """
    skip_names = {"setup.py", "__init__.py", "_README.py"}

    def _wanted(directory, name):
        # Mirrors the filter rules listed in the docstring, cheapest first.
        if not name.endswith(".py"):
            return False
        if name in skip_names:
            return False
        if name.lower().startswith("test"):
            return False
        return "__pycache__" not in directory

    collected = []
    for directory, _subdirs, names in os.walk(base_path):
        collected.extend(
            os.path.join(directory, name)
            for name in names
            if _wanted(directory, name)
        )
    return collected

def _build_review_messages(file_content, file_name):
    """Build the system/user chat messages that ask the model to review one file."""
    return [
        {
            "role": "system",
            "content": (
                "You are a senior performance engineer with deep expertise in Apache Spark, Python, Scala, and SQL. Your role is to analyze code for inefficiencies and provide precise, actionable optimization recommendations"
            ),
        },
        {
            "role": "user",
            "content": (
                f"Review the Python file `{file_name}`:\n"
                f"---BEGIN FILE CONTENT---\n{file_content}\n---END FILE CONTENT---\n"
                "Identify inefficiencies and suggest improvements, with a primary focus on Spark-related code. If the code is not Spark-related, conclude with one or two concise sentences. If the code is simple and needs no changes, respond with: 'Code is simple. No change needed'. Be clear and concise"
            ),
        },
    ]


def _request_review(file_content, file_name):
    """Send the review prompt for one file to the shared ``llm`` and return its raw response."""
    return llm.invoke(_build_review_messages(file_content, file_name))


def _response_text(response):
    """Return the text carried by an LLM response, whatever container it arrives in."""
    if isinstance(response, dict):
        content = response.get("content")
    else:
        # Chat models hand back a message object exposing `.content`; a plain
        # string response has no such attribute and falls through to str().
        content = getattr(response, "content", None)

    if content is None:
        return str(response)
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        # Presumably a list of content blocks (plain strings and/or dicts with
        # a "text" entry); keep only the textual parts. TODO confirm shape.
        parts = []
        for part in content:
            if isinstance(part, str):
                parts.append(part)
            elif isinstance(part, dict) and isinstance(part.get("text"), str):
                parts.append(part["text"])
        return "".join(parts)
    return str(content)


def analyze_file_with_langchain(file_content, file_name):
    """Ask the LLM to review one source file and return its feedback text.

    Args:
        file_content: Full text of the file to review.
        file_name: Name or path of the file, quoted in the prompt.

    Returns:
        The model's reply as a string. The reply's ``content`` is read
        directly from the response object instead of relying on the
        response's string representation.

    Raises:
        Whatever ``llm.invoke`` raises.
    """
    return _response_text(_request_review(file_content, file_name))


def main(max_files=20):
    """Clone the repository, review its Python files and append the results to Delta.

    Steps: clone ``GITHUB_REPO_URL``, list candidate files with
    ``get_python_files``, send each of the first ``max_files`` files to the
    LLM, and append one row per file (``content``, ``file``, ``raw_data``,
    ``llm_used``) to the Delta table ``bircatalog.birschema.llm_op``. The
    cloned directory is removed afterwards, whether or not the run succeeded.

    Args:
        max_files: Maximum number of files to review. ``None`` removes the
            limit. Defaults to 20.
    """
    # Bound before the try block so the finally clause can tell whether a
    # clone exists; otherwise a failed clone would surface as an
    # UnboundLocalError that hides the real error.
    repo_path = None
    responses = []
    rows = []

    try:
        repo_path = clone_repo(GITHUB_REPO_URL)

        python_files = get_python_files(repo_path)
        print(f"\n🔍 Found {len(python_files)} Python files to analyze.\n")

        if max_files is not None:
            python_files = python_files[:max_files]
        print(f"python_files izz {python_files}")

        for py_file in tqdm(python_files, desc="🔎 Analyzing Python files"):
            try:
                with open(py_file, "r", encoding="utf-8", errors="ignore") as f:
                    file_content = f.read()

                response = _request_review(file_content, py_file)
                # Keep the full response repr for raw_data, but take the
                # review text from the response itself rather than parsing
                # that repr with a regex.
                item = {"file": py_file, "feedback": str(response)}
                content = _response_text(response)
            except Exception as e:
                # Best effort: one failing file (read error or model-call
                # error) must not abort the whole run. Such rows get no content.
                item = {"file": py_file, "feedback": f"[Error reading file]: {e}"}
                content = None

            responses.append(item)
            rows.append((content, py_file, str(item)))

        print(f"type of responses is {type(responses)}")
        print(f"responses is {responses}")

        if not rows:
            print("No files were analyzed; nothing to write.")
            return

        # An explicit schema avoids inference failures when every `content`
        # value is None (for example, when every file errored).
        df = spark.createDataFrame(
            rows, schema="content string, file string, raw_data string"
        )

        delta_table = "bircatalog.birschema.llm_op"
        df = df.withColumn("llm_used", lit(DATABRICKS_MODEL_ENDPOINT))

        # Alternative that replaces the table and its schema instead of appending:
        # df.write.format("delta").mode("overwrite").option("overwriteSchema", "true").saveAsTable(delta_table)
        df.write.format("delta").mode("append").saveAsTable(delta_table)

    finally:
        if repo_path is not None and os.path.exists(repo_path):
            shutil.rmtree(repo_path)

# Run the clone -> review -> Delta-write pipeline when this file is executed
# directly (not when it is imported as a module).
if __name__ == "__main__":
    main()
