In [None]:
import pandas as pd
import json
import os
import re


# 1. Setup the dedicated folder
# exist_ok=True creates the folder only when it is missing, so no separate
# os.path.exists() check is needed (that check could race with another
# process creating the folder in between). If the path already exists as a
# regular file, this raises FileExistsError here instead of failing later
# when the first JSON file is opened.
folder_name = "view_dependency_jsons"
os.makedirs(folder_name, exist_ok=True)

# 2. Read the lineage extract into a DataFrame
df = pd.read_csv('database_lineage_extracted_2.csv')

# Report the row count on either side of the exact-duplicate removal,
# so the number of dropped rows is visible in the cell output.
print(f"Rows before: {df.shape[0]}")
df = df.drop_duplicates()
print(f"Rows after: {df.shape[0]}")

# 3. Filter for VIEW objects
# .copy() gives an independent frame, so the columns added below are not
# written into a slice of df.
df_views = df[df['Dependent_Object_Type'] == 'VIEW'].copy()

# 4. Create concatenated full names for JSON content only (Schema.ObjectName)
df_views['Target_Full'] = df_views['Dependent_Schema'] + "." + df_views['Dependent_Object_Name']

# String concatenation with a missing value yields NaN for the whole result,
# and json.dump later writes NaN as the bare token NaN, which is not valid JSON.
# Where the referenced schema is missing, fall back to the bare object name
# so the dependency is still recorded under a usable name.
# NOTE(review): assumes an empty Depends_On_Schema means an unqualified
# reference rather than bad data -- confirm against the extract.
df_views['Source_Full'] = (
    df_views['Depends_On_Schema'] + "." + df_views['Depends_On_Object_Name']
).where(
    df_views['Depends_On_Schema'].notna(),
    df_views['Depends_On_Object_Name'],
)

# 5. Group and Export
# Writes one JSON file per view (one group per distinct Target_Full) into
# folder_name. Each file holds a list of {"source", "target"} records, one per
# dependency row of that view.

# Loop-invariant pieces, built once instead of on every iteration.
separator = "--"
# Characters treated as illegal in file names; each one is replaced by "_".
illegal_filename_chars = re.compile(r'[<>:"/\\|?*]')

# Enumerate starts at 1 for the index; idx becomes the filename prefix.
for idx, (target_full, group) in enumerate(df_views.groupby('Target_Full'), 1):

    # Get schema and object name from the first row of the group for naming.
    # We grab these directly to avoid parsing the '.' out of Target_Full later.
    schema_name = group['Dependent_Schema'].iloc[0]
    object_name = group['Dependent_Object_Name'].iloc[0]

    # Build the records column-wise; this avoids the per-row Series objects
    # that iterrows() creates and yields the same values in the same order.
    dependencies = [
        {"source": source, "target": target}
        for source, target in zip(group['Source_Full'], group['Target_Full'])
    ]

    # SANITIZATION:
    # Remove illegal characters from schema and object name individually
    safe_schema = illegal_filename_chars.sub('_', str(schema_name))
    safe_object = illegal_filename_chars.sub('_', str(object_name))

    filename = f"{idx}{separator}{safe_schema}{separator}{safe_object}.json"
    file_path = os.path.join(folder_name, filename)

    # Explicit encoding keeps the file independent of the platform default.
    # json.dump escapes non-ASCII characters by default, so the bytes written
    # are the same as before.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(dependencies, f, indent=4)

    print(f"Generated: {file_path}")

Rows before: 4248
Rows after: 3974
Generated: view_dependency_jsons/shell-01-eun-sqdb-kjkwcaevfwsmqnpbidvj\1--AIM--vw_AIM_SNV_Report.json
Generated: view_dependency_jsons/shell-01-eun-sqdb-kjkwcaevfwsmqnpbidvj\2--AIM--vw_AIM_SNV_cmdb_ci_netgear.json
Generated: view_dependency_jsons/shell-01-eun-sqdb-kjkwcaevfwsmqnpbidvj\3--AIO--vw_Demand.json
Generated: view_dependency_jsons/shell-01-eun-sqdb-kjkwcaevfwsmqnpbidvj\4--AIO--vw_Issues.json
Generated: view_dependency_jsons/shell-01-eun-sqdb-kjkwcaevfwsmqnpbidvj\5--AIO--vw_Open_Resourcing.json
Generated: view_dependency_jsons/shell-01-eun-sqdb-kjkwcaevfwsmqnpbidvj\6--AIO--vw_PCR.json
Generated: view_dependency_jsons/shell-01-eun-sqdb-kjkwcaevfwsmqnpbidvj\7--AIO--vw_POAP_Project_Deligence.json
Generated: view_dependency_jsons/shell-01-eun-sqdb-kjkwcaevfwsmqnpbidvj\8--AIO--vw_Project_Health.json
Generated: view_dependency_jsons/shell-01-eun-sqdb-kjkwcaevfwsmqnpbidvj\9--AIO--vw_Resource_Allocation.json
Generated: view_dependency_jsons/shell-01-

In [None]:
# # 2. Load your data
# import pandas as pd

# df1 = pd.read_csv('databases_lineage_extracted.csv')
# df2 = pd.read_csv('database_lineage_extracted_2.csv')
# df.drop_duplicates()

# # Union both of these dataframes; they have the same columns
# # ignore_index=True is useful to reset the index 0..N in the combined dataframe
# df = pd.concat([df1, df2], ignore_index=True)

# print(f"Union complete. Combined shape: {df.shape}")

Union complete. Combined shape: (10618, 10)
