In [66]:
import pandas as pd
import re

# Load the exported object definitions (one row per database object).
df = pd.read_csv(filepath_or_buffer="lineage_analysis/object_definitions.csv")


def robust_clean_sql(sql_query):
    """Normalize a SQL definition into compact, comment-free text.

    Processing steps, in order:

    1. Missing input (``None`` or a float NaN) yields an empty string instead
       of the text ``"None"`` / ``"nan"``.
    2. The input is coerced to ``str`` and literal two-character backslash-n /
       backslash-t sequences are turned into real newline / tab characters.
    3. ``-- ...`` line comments and ``/* ... */`` block comments are removed in
       a single left-to-right scan that steps over single-quoted string
       literals, so comment markers inside a literal are preserved and a
       ``--`` inside a block comment cannot swallow the closing ``*/``.
    4. Blank lines are collapsed to one newline and runs of spaces/tabs to a
       single space (this also applies inside string literals).

    Parameters
    ----------
    sql_query : object
        The SQL text; any non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned SQL with leading/trailing whitespace stripped.
    """
    # NaN is the only float that is not equal to itself.
    if sql_query is None or (isinstance(sql_query, float) and sql_query != sql_query):
        return ""

    sql_text = str(sql_query)

    sql_text = sql_text.replace('\\n', '\n').replace('\\t', '\t')

    # Alternation order matters: at each position the scanner first tries a
    # string literal ('' is the escaped quote), then a line comment, then a
    # block comment. Whichever construct starts first consumes its whole span.
    token_pattern = r"('(?:[^']|'')*')|--[^\n]*|/\*.*?\*/"

    def _keep_literal_drop_comment(match):
        # Group 1 only participates for string literals; comments yield None.
        return match.group(1) or ''

    sql_text = re.sub(token_pattern, _keep_literal_drop_comment, sql_text, flags=re.DOTALL)

    # Replace multiple newlines with a single newline
    sql_text = re.sub(r'\n\s*\n', '\n', sql_text)
    # Collapse horizontal spaces (tabs/spaces) into one space
    sql_text = re.sub(r'[ \t]+', ' ', sql_text)

    return sql_text.strip()

# Target object whose definition is extracted below.
object_schema = "PR"
object_name = "usp_Fleet_snapshot"

# Reference the Python variables with `@` instead of splicing their values
# into the expression text, so a name containing a quote character cannot
# break or alter the filter expression.
df = df.query("Schema == @object_schema and Object == @object_name")
df

Unnamed: 0,DatabaseName,Schema,Object,ObjectType,definition
169,shell-01-eun-sqdb-cgljxfpzifaquvuwplpe,PR,usp_Fleet_snapshot,SQL_STORED_PROCEDURE,\n\n\nCREATE PROCEDURE [PR].[usp_Fleet_snapsh...


In [84]:
# Fail with a clear message when the filter matched nothing, rather than an
# opaque positional IndexError.
if df.empty:
    raise ValueError("No object definition matched the requested schema/object")

# Look the SQL up by column name instead of by position, so the result does
# not depend on column order or on an extra index column in the CSV.
sql_text = robust_clean_sql(df["definition"].iloc[0])
print(sql_text)

CREATE PROCEDURE [PR].[usp_Fleet_snapshot]
AS
Set Rowcount 0
Set Nocount On
DECLARE @RM date 
SELECT @RM = '2021-07-01' 
BEGIN
SELECT INC.number 
,INC.cmdb_ci 
,INC.cmdb_ci_tier 
,INC.application_service_id 
,INC.business_application_id 
,INC.cmdb_ci_business_app
,INC.Business_Application_Status
,INC.short_description 
,INC.assigned_to 
,INC.caller_id 
,INC.caller_id_u_employee_type 
,INC.priority 
,INC.sys_created_on 
,INC.closed_at 
,INC.resolved_at 
,INC.contact_type 
,INC.category 
,INC.close_code 
,INC.reopen_count 
,INC.reassignment_count 
,INC.assignment_group 
,INC.major_incident_state 
,INC.incident_state 
,INC.portfolio
,INC.sys_tags
,INCSLA.inc_subcategory 
,INCSLA.taskslatable_business_duration 
,INCSLA.taskslatable_duration 
,INCSLA.taskslatable_business_pause_duration 
,INCSLA.taskslatable_pause_duration 
,INCSLA.taskslatable_sla 
,INCSLA.taskslatable_stage 
,INCSLA.taskslatable_start_time 
,INCSLA.taskslatable_end_time 
,INCSLA.taskslatable_has_breached 
,INCSLA.taskslat

In [85]:
# Instruction prompt for the lineage-extraction model.
# This is an f-string: literal braces in the JSON examples are doubled ({{ }})
# and only {sql_text} (the cleaned procedure definition) is interpolated.
# Rule 7 tells the model to omit DELETE/TRUNCATE entries outright; allowing a
# null source there contradicted the "exactly one source and one target" rule.
prompt = f"""
You are a SQL data lineage extractor specializing in T-SQL stored procedures.

OBJECTIVE:
Extract DIRECT source-to-target mappings between PERSISTENT database objects ONLY.
Trace data flow through ALL intermediate steps (temp tables, CTEs, subqueries) but report only the FINAL persistent objects.

OBJECT CLASSIFICATION:

PERSISTENT OBJECTS (Report these):
- Tables: schema.table, [schema].[table], database.schema.table
- Views: schema.view, [schema].[view]
- Stored procedures (when used as data sources via INSERT ... EXEC)

INTERMEDIATE OBJECTS (Trace through, but DO NOT report):
- Temp tables: #temp, ##global_temp
- Table variables: @table
- CTEs: WITH cte_name AS (...)
- Subqueries and derived tables
- Variables: @variable

EXTRACTION RULES:

1. TRACE THROUGH INTERMEDIATES:
   - If temp table #T is populated from table A, then #T is inserted into table B
   - Report: A → B (not A → #T or #T → B)

2. HANDLE MULTI-STEP FLOWS:
   - Step 1: A → #temp1
   - Step 2: #temp1 → #temp2  
   - Step 3: #temp2 → B
   - Report: A → B

3. MULTIPLE SOURCES TO ONE TARGET:
   - Create separate lineage entries for each source
   - Example: A → C, B → C (two separate JSON objects)

4. ONE SOURCE TO MULTIPLE TARGETS:
   - Create separate lineage entries for each target
   - Example: A → X, A → Y (two separate JSON objects)

5. COMPLEX QUERIES:
   - Trace through all JOINs, subqueries, CTEs
   - Extract base tables from nested SELECT statements
   - Follow data flow through UNION, EXCEPT, INTERSECT operations

6. IGNORE:
   - Table hints: (NOLOCK), WITH (NOLOCK), (INDEX=...), etc.
   - System tables/views unless explicitly part of business logic
   - The stored procedure name itself as a source

7. DELETE/TRUNCATE OPERATIONS:
   - These affect targets but have no sources
   - Omit them from the lineage entirely; never output an entry with a null source or a null target

8. EXEC STORED PROCEDURES:
   - If "INSERT INTO table EXEC stored_proc", treat stored_proc as a source
   - Otherwise, you may need to trace into that procedure separately

OUTPUT FORMAT:

{{
  "lineage": [
    {{
      "source": "schema.table_name",
      "target": "schema.table_name"
    }}
  ]
}}

RULES ENFORCEMENT:
✓ Output ONLY valid JSON
✓ No explanations, comments, or markdown
✓ No temp tables (#temp) in final output
✓ No CTEs or table variables in final output
✓ Each lineage pair must have exactly one source and one target
✓ Use fully qualified names when available (schema.table)
✓ Remove all table hints from object names

EXAMPLE:

Given SQL:
```sql
-- Step 1: Read from A, B into temp
SELECT * INTO #temp FROM A JOIN B ON A.id = B.id

-- Step 2: Read from #temp and C into final table
INSERT INTO Z 
SELECT * FROM #temp JOIN C ON #temp.id = C.id
```

Correct output:
{{
  "lineage": [
    {{"source": "A", "target": "Z"}},
    {{"source": "B", "target": "Z"}},
    {{"source": "C", "target": "Z"}}
  ]
}}

Incorrect output (DO NOT DO THIS):
{{
  "lineage": [
    {{"source": "A", "target": "#temp"}},
    {{"source": "B", "target": "#temp"}},
    {{"source": "#temp", "target": "Z"}},
    {{"source": "C", "target": "Z"}}
  ]
}}

SQL TO ANALYZE:
{sql_text}
"""

In [88]:
import ollama
import json

MODEL_NAME = "qwen2.5-coder:7b"

# Pin the sampling options: without them, repeated runs on the same prompt
# produced different lineage entries. Temperature 0 plus a fixed seed should
# make the generation reproducible.
result = ollama.generate(
    model=MODEL_NAME,
    prompt=prompt,
    format="json",  # ask the server for a JSON-formatted response
    options={"temperature": 0, "seed": 0},
)

# The generated text is in the "response" field; parse it and pretty-print.
lineage_json = json.loads(result["response"])
print(json.dumps(lineage_json, indent=4))

{
    "lineage": [
        {
            "source": "[PR].[vw_SNV_incident]",
            "target": "#temp3a"
        },
        {
            "source": "[PR].[vw_SNV_incident_sla]",
            "target": "#temp3a"
        },
        {
            "source": "#temp3a",
            "target": "#temp3b"
        },
        {
            "source": "#temp3b",
            "target": "#temp3"
        },
        {
            "source": "[PR].[Fleet_incident_snapshot]",
            "target": null
        }
    ]
}


In [87]:
# Pretty-print the parsed lineage (expects `lineage_json` to be defined already).
lineage_pretty = json.dumps(lineage_json, indent=4)
print(lineage_pretty)

{
    "lineage": [
        {
            "source": "[PR].[vw_SNV_incident]",
            "target": "#temp3a"
        },
        {
            "source": "[PR].[vw_SNV_incident_sla]",
            "target": "#temp3a"
        },
        {
            "source": "#temp3a",
            "target": "#temp3b"
        },
        {
            "source": "#temp3b",
            "target": "#temp3"
        },
        {
            "source": null,
            "target": "[PR].[Fleet_incident_snapshot]"
        }
    ]
}
