In [None]:
import os
import ast
import re
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder




In [27]:
print(os.path.exists("files"))
print(os.path.isdir("files"))

True
True


In [100]:
def get_python_files(folder_path):
    """Retrieve all Python files in the given folder."""
    return [os.path.join(folder_path, f) for f in os.listdir(folder_path) if f.endswith(".py")]

get_python_files("files")

['files\\a.py', 'files\\b.py', 'files\\c.py', 'files\\d.py', 'files\\main.py']

In [101]:
processes = []
files = get_python_files("files")
for file in files:
    with open(file, "r", encoding="utf-8") as f:
        source_code = f.read()
        tree = ast.parse(source_code, filename=file, mode='exec')
        data = ast.dump(tree, indent=2).replace("Module(", f"Module(filename='{file}', ")
        processes.append(data)
processes[0]

"Module(filename='files\\a.py', \n  body=[\n    FunctionDef(\n      name='a',\n      args=arguments(\n        posonlyargs=[],\n        args=[],\n        kwonlyargs=[],\n        kw_defaults=[],\n        defaults=[]),\n      body=[\n        Assign(\n          targets=[\n            Name(id='a', ctx=Store())],\n          value=Constant(value=1)),\n        Return(\n          value=Name(id='a', ctx=Load()))],\n      decorator_list=[],\n      type_params=[])],\n  type_ignores=[])"

In [98]:
def extract_variable_lineage(ast_string):
    """
    Parses an AST string to extract variable assignments and their sources.
    Returns a list of (filename, function, variable, source) tuples.
    """
    tree = ast.parse(ast_string)
    filename = None
    lineage = []

    for node in ast.walk(tree):
        if isinstance(node, ast.Module):
            filename = getattr(node, "filename", "unknown")

        elif isinstance(node, ast.FunctionDef):
            func_name = node.name

            for stmt in node.body:
                if isinstance(stmt, ast.Assign):
                    target = stmt.targets[0].id if isinstance(stmt.targets[0], ast.Name) else "unknown"
                    if isinstance(stmt.value, ast.Constant):
                        source = str(stmt.value.value)
                    elif isinstance(stmt.value, ast.BinOp):
                        left = stmt.value.left.id if isinstance(stmt.value.left, ast.Name) else "?"
                        right = stmt.value.right.id if isinstance(stmt.value.right, ast.Name) else "?"
                        source = f"{left} + {right}"
                    elif isinstance(stmt.value, ast.Call):
                        func_call = stmt.value.func
                        if isinstance(func_call, ast.Attribute):
                            source = f"{func_call.value.id}.{func_call.attr}()"
                        else:
                            source = "function call"
                    else:
                        source = "unknown"
                    
                    lineage.append((filename, func_name, target, source))

    return lineage

# Process all AST representations
data = []
for ast_str in processes:
    data.extend(extract_variable_lineage(ast_str))

# Create DataFrame
df = pd.DataFrame(data, columns=["File", "Function", "Variable", "Source of Value"])




In [99]:
df

Unnamed: 0,File,Function,Variable,Source of Value
