In [32]:
import pandas as pd
import re

# Load the CSV export of database object definitions into a DataFrame.
# Later cells read its "ObjectType" column and the SQL text of each object.
df = pd.read_csv("object_definitions.csv")


def robust_clean_sql(sql_query):
    """Strip comments and normalise whitespace in a SQL definition.

    Parameters
    ----------
    sql_query : object
        The SQL text. It is coerced with ``str()``. ``None`` and float NaN
        (what pandas yields for an empty CSV cell) are treated as empty text.

    Returns
    -------
    str
        The SQL with ``--`` line comments and ``/* ... */`` block comments
        removed, blank lines dropped, runs of spaces/tabs collapsed to a
        single space, and leading/trailing whitespace stripped. Note that the
        whitespace collapsing is applied to the whole text, including the
        inside of string literals.
    """
    # A missing definition would otherwise become the literal text "None"/"nan".
    if sql_query is None or (isinstance(sql_query, float) and sql_query != sql_query):
        return ""

    sql_text = str(sql_query)

    # Turn escaped sequences (a literal backslash followed by n / t) into
    # real newline / tab characters.
    sql_text = sql_text.replace('\\n', '\n').replace('\\t', '\t')

    # Remove comments in ONE left-to-right pass. Doing line comments and block
    # comments as two separate substitutions is wrong in both orders: a "--"
    # inside "/* ... */" would swallow the closing "*/" (and the code after
    # it), and comment markers inside a quoted string would be treated as
    # comments. The first alternative matches a single-quoted literal ('' is
    # an escaped quote) and is put back unchanged; the other two alternatives
    # are the comments, which are replaced with nothing.
    token_pattern = r"('(?:[^']|'')*')|--[^\n]*|/\*.*?\*/"
    sql_text = re.sub(
        token_pattern,
        lambda m: m.group(1) or '',
        sql_text,
        flags=re.DOTALL,
    )

    # Replace runs of blank (whitespace-only) lines with a single newline.
    sql_text = re.sub(r'\n\s*\n', '\n', sql_text)
    # Collapse horizontal whitespace (tabs/spaces) into one space.
    sql_text = re.sub(r'[ \t]+', ' ', sql_text)

    return sql_text.strip()


# Restrict the frame to stored procedures; every other object type is dropped.
df = df.loc[lambda frame: frame["ObjectType"].eq("SQL_STORED_PROCEDURE")]
# Debugging aid: uncomment to narrow the frame to one procedure.
# object_schema = "tmp"
# object_name = "usp_Cobachi_Consumption_Volumes_Actual_Email"#"usp_Fleet_snapshot"
# df = df.query(f"Schema == '{object_schema}' and Object == '{object_name}'")
df

Unnamed: 0,DatabaseName,Schema,Object,ObjectType,definition
0,shell-01-eun-sqdb-cgljxfpzifaquvuwplpe,MITAO_TDM,usp_SNV_sc_task_sla,SQL_STORED_PROCEDURE,CREATE PROCEDURE [MITAO_TDM].[usp_SNV_sc_task_...
1,shell-01-eun-sqdb-cgljxfpzifaquvuwplpe,CX,Load_MI_data_r5,SQL_STORED_PROCEDURE,CREATE PROCEDURE [CX].[Load_MI_data_r5]\nAS\nB...
3,shell-01-eun-sqdb-cgljxfpzifaquvuwplpe,SAM,usp_stg_ws1_software_version,SQL_STORED_PROCEDURE,--select top 100 * from [SAM].[WS1_usage_detai...
5,shell-01-eun-sqdb-cgljxfpzifaquvuwplpe,PR,usp_DevSecOps_Project_Adoption,SQL_STORED_PROCEDURE,\n \nCREATE proc [PR].[usp_DevSecOps_Project_...
7,shell-01-eun-sqdb-cgljxfpzifaquvuwplpe,tmp,usp_SNV_asmt_metric_STG,SQL_STORED_PROCEDURE,\n-- =========================================...
...,...,...,...,...,...
2176,shell-01-eun-sqdb-cgljxfpzifaquvuwplpe,tmp,usp_deployment_attributes_prd,SQL_STORED_PROCEDURE,CREATE Procedure [tmp].[usp_deployment_attribu...
2177,shell-01-eun-sqdb-cgljxfpzifaquvuwplpe,SAM,usp_stg_mecm_software_version,SQL_STORED_PROCEDURE,--select top 100 * from [SAM].[MECM_Add_Remove...
2182,shell-01-eun-sqdb-cgljxfpzifaquvuwplpe,tmp,usp_SNV_u_business_service_locations_STG,SQL_STORED_PROCEDURE,CREATE PROCEDURE [tmp].[usp_SNV_u_business_ser...
2184,shell-01-eun-sqdb-cgljxfpzifaquvuwplpe,tmp,usp_ppm_SNV_pm_project_task,SQL_STORED_PROCEDURE,\n\n\tCREATE PROCEDURE [tmp].[usp_ppm_SNV_pm_p...


In [7]:
# Clean the first procedure's SQL. The column is selected by name rather than
# by position so the cell keeps working if the CSV's column order changes or
# an extra index column is present.
sql_text = robust_clean_sql(df["definition"].iloc[0])
print(sql_text)

CREATE PROCEDURE [MITAO_TDM].[usp_SNV_sc_task_sla] 
AS 
SET ROWCOUNT 0 
SET NOCOUNT ON 
BEGIN 
TRUNCATE TABLE [MITAO_TDM].[SNV_sc_task_sla] 
INSERT INTO [MITAO_TDM].[SNV_sc_task_sla] 
( 
[Number] 
,[Short Description] 
,[Business Elapsed Time] 
,[Due Date] 
,[Business Pause Duration] 
,[Has Breached] 
,[Duration] 
,[Catalog Item]) 
SELECT 
 [Number] 
,[Short Description] 
,[Business Elapsed Time] 
,[Due Date] 
,[Business Pause Duration] 
,[Has Breached] 
,[Duration] 
,[Catalog Item] 
 FROM [pst].[vw_SNV_sc_task_sla_MI_TAO_TDM] 
 END


In [3]:
# Prompt template for the lineage-extraction model. It is a plain string that
# is filled in with str.format(), so literal JSON braces are written doubled
# ("{{" / "}}") and "{sql_text}" is the only placeholder.
#
# NOTE(review): rule 7 allows '"source": null' entries, while RULES ENFORCEMENT
# requires exactly one source and one target per pair - confirm which is
# intended. Rule "EXEC INSERT" under PERSISTENT OBJECTS presumably means
# "INSERT ... EXEC" as spelled out in rule 8 - confirm.
LINEAGE_PROMPT_TEMPLATE = """
You are a SQL data lineage extractor specializing in T-SQL stored procedures.

OBJECTIVE:
Extract DIRECT source-to-target mappings between PERSISTENT database objects ONLY.
Trace data flow through ALL intermediate steps (temp tables, CTEs, subqueries) but report only the FINAL persistent objects.

OBJECT CLASSIFICATION:

PERSISTENT OBJECTS (Report these):
- Tables: schema.table, [schema].[table], database.schema.table
- Views: schema.view, [schema].[view]
- Stored procedures (when used as data sources via EXEC INSERT)

INTERMEDIATE OBJECTS (Trace through, but DO NOT report):
- Temp tables: #temp, ##global_temp
- Table variables: @table
- CTEs: WITH cte_name AS (...)
- Subqueries and derived tables
- Variables: @variable

EXTRACTION RULES:

1. TRACE THROUGH INTERMEDIATES:
   - If temp table #T is populated from table A, then #T is inserted into table B
   - Report: A → B (not A → #T or #T → B)

2. HANDLE MULTI-STEP FLOWS:
   - Step 1: A → #temp1
   - Step 2: #temp1 → #temp2  
   - Step 3: #temp2 → B
   - Report: A → B

3. MULTIPLE SOURCES TO ONE TARGET:
   - Create separate lineage entries for each source
   - Example: A → C, B → C (two separate JSON objects)

4. ONE SOURCE TO MULTIPLE TARGETS:
   - Create separate lineage entries for each target
   - Example: A → X, A → Y (two separate JSON objects)

5. COMPLEX QUERIES:
   - Trace through all JOINs, subqueries, CTEs
   - Extract base tables from nested SELECT statements
   - Follow data flow through UNION, EXCEPT, INTERSECT operations

6. IGNORE:
   - Table hints: (NOLOCK), WITH (NOLOCK), (INDEX=...), etc.
   - System tables/views unless explicitly part of business logic
   - The stored procedure name itself as a source

7. DELETE/TRUNCATE OPERATIONS:
   - These affect targets but have no sources
   - Omit from lineage (or include with "source": null if you need to track modifications)

8. EXEC STORED PROCEDURES:
   - If "INSERT INTO table EXEC stored_proc", treat stored_proc as a source
   - Otherwise, you may need to trace into that procedure separately

OUTPUT FORMAT:

{{
  "lineage": [
    {{
      "source": "schema.table_name",
      "target": "schema.table_name"
    }}
  ]
}}

RULES ENFORCEMENT:
✓ Output ONLY valid JSON
✓ No explanations, comments, or markdown
✓ No temp tables (#temp) in final output
✓ No CTEs or table variables in final output
✓ Each lineage pair must have exactly one source and one target
✓ Use fully qualified names when available (schema.table)
✓ Remove all table hints from object names

EXAMPLE:

Given SQL:
```sql
-- Step 1: Read from A, B into temp
SELECT * INTO #temp FROM A JOIN B ON A.id = B.id

-- Step 2: Read from #temp and C into final table
INSERT INTO Z 
SELECT * FROM #temp JOIN C ON #temp.id = C.id
```

Correct output:
{{
  "lineage": [
    {{"source": "A", "target": "Z"}},
    {{"source": "B", "target": "Z"}},
    {{"source": "C", "target": "Z"}}
  ]
}}

Incorrect output (DO NOT DO THIS):
{{
  "lineage": [
    {{"source": "A", "target": "#temp"}},
    {{"source": "B", "target": "#temp"}},
    {{"source": "#temp", "target": "Z"}},
    {{"source": "C", "target": "Z"}}
  ]
}}

SQL TO ANALYZE:
{sql_text}
"""


def build_lineage_prompt(sql_text):
    """Return the lineage-extraction prompt for one SQL definition.

    Parameters
    ----------
    sql_text : str
        The (cleaned) SQL to append under "SQL TO ANALYZE:".

    Returns
    -------
    str
        ``LINEAGE_PROMPT_TEMPLATE`` with ``{sql_text}`` substituted and the
        doubled braces reduced to single braces.
    """
    return LINEAGE_PROMPT_TEMPLATE.format(sql_text=sql_text)


# Build the prompt for the SQL currently held in `sql_text`. Call
# build_lineage_prompt() again whenever `sql_text` changes; the string below
# does not update on its own.
prompt = build_lineage_prompt(sql_text)

In [4]:
import ollama
import json

MODEL_NAME = "qwen2.5-coder:14b"

# Ask the local Ollama model for the lineage of the SQL embedded in `prompt`.
# format="json" requests a JSON response; temperature 0 turns off sampling
# randomness so that re-running the cell on the same prompt is repeatable.
result = ollama.generate(
    model=MODEL_NAME,
    prompt=prompt,
    format="json",
    options={"temperature": 0},
)

# The generated text is in result["response"]; parse it and pretty-print it.
lineage_json = json.loads(result["response"])
print(json.dumps(lineage_json, indent=4))

{
    "lineage": [
        {
            "source": "[PR].[vw_SNV_incident]",
            "target": "[PR].[Fleet_incident_snapshot]"
        },
        {
            "source": "[PR].[vw_SNV_incident_sla]",
            "target": "[PR].[Fleet_incident_snapshot]"
        },
        {
            "source": "[PR].[Fleet_Portfolio_CI]",
            "target": "[PR].[Fleet_incident_snapshot]"
        }
    ]
}


In [9]:

import os
import pandas as pd
import re
# Paths used by this cell. NOTE: despite its name, EXCEL_FILE points at a CSV
# file and is read with pd.read_csv.
EXCEL_FILE = "object_definitions.csv"
OUTPUT_FOLDER = "lineage_outputs"

# Load the object definitions and keep only the stored-procedure rows.
df = pd.read_csv(EXCEL_FILE).loc[
    lambda frame: frame["ObjectType"].eq("SQL_STORED_PROCEDURE")
]
import hashlib

def get_file_identifier(schema, object_name):
    """Return a 12-character hex identifier for a schema/object pair.

    The two values are joined as ``"<schema>|<object_name>"``, encoded as
    UTF-8 and hashed with MD5; the first 12 hex digits of the digest are
    returned. The same pair always yields the same identifier.
    """
    key_bytes = "|".join((f"{schema}", f"{object_name}")).encode('utf-8')
    digest = hashlib.md5(key_bytes).hexdigest()
    return digest[:12]

def get_processed_identifiers(output_folder=None):
    """Collect the identifiers encoded in existing JSON filenames.

    Parameters
    ----------
    output_folder : str or path-like, optional
        Folder to scan. Defaults to the module-level ``OUTPUT_FOLDER``.

    Returns
    -------
    set of str
        The 12-character lowercase-hex prefixes of every ``*.json`` file whose
        name starts with ``"<12 hex chars>--"``. Empty when the folder does
        not exist or is not a directory.
    """
    folder = OUTPUT_FOLDER if output_folder is None else output_folder

    # isdir (not exists): os.listdir would raise if the path were a plain file.
    if not os.path.isdir(folder):
        return set()

    # Filename pattern: "identifier--schema--object.json"
    id_prefix = re.compile(r'^([a-f0-9]{12})--')

    processed = set()
    for filename in os.listdir(folder):
        if not filename.endswith('.json'):
            continue
        match = id_prefix.match(filename)
        if match:
            processed.add(match.group(1))
    return processed

# Identifiers parsed from the JSON filenames already present in OUTPUT_FOLDER
# (an empty set when the folder is missing or holds no matching files).
processed_identifiers = get_processed_identifiers()

print(processed_identifiers)

set()
