### Create the set of all objects present in the JSON files inside the dedicated folder.

In [None]:
import os
import json

# pd.set_option("display.max_rows", None)

# Build, for every JSON file in `folder`, the set of lower-cased "source" and
# "target" names found in its "lineage" list, and collect the enriched dicts
# in `json_list`.
folder = "../lineage_outputs/"
json_list = []
set_creation_error = dict()

for file in os.listdir(folder):
    with open(os.path.join(folder, file), "r") as f:
        json_file = json.load(f)

    # The 2nd and 3rd "--"-separated parts of the file name (extension
    # removed) identify the object; presumably schema and object name.
    name_parts = file.replace(".json", "").split("--")
    json_file["file_name"] = name_parts[1] + "." + name_parts[2]

    src_trg_set = set()
    try:
        for src_trg_pairs in json_file["lineage"]:
            src_trg_set.add(src_trg_pairs["source"])
            src_trg_set.add(src_trg_pairs["target"])
    except (KeyError, TypeError) as e:
        # Missing or malformed lineage data: keep whatever was collected so
        # far and remember the failure under the object's name.
        set_creation_error[json_file["file_name"]] = e
    else:
        # An object is not part of its own lineage. `discard` is a no-op when
        # the name is absent, so no exception has to be inspected for that case.
        src_trg_set.discard(json_file["file_name"])

    src_trg_set = {element.lower() if element is not None else element for element in src_trg_set}

    json_file["content_set"] = src_trg_set
    json_list.append(json_file)

print("Errors encountered while creating set for each json file: ", len(set_creation_error))

print("Key template of each dictionary in list: ",json_list[0].keys(), "\n")
print(json_list)

### Create the set of all lineage objects present in the CSV files for each dependent object.

In [None]:
import pandas as pd
import re
# Read the two lineage extracts and stack them into a single frame.
df_dev_linage = pd.read_csv("databases_lineage_extracted.csv")
df_uat_linage = pd.read_csv("database_lineage_extracted_2.csv")
df_unioned = pd.concat([df_dev_linage, df_uat_linage], ignore_index=True)

# Rows that agree on all six of these columns are kept only once.
subset_cols = [
    'Dependent_Schema', 'Dependent_Object_Name', 'Dependent_Object_Type',
    'Depends_On_Schema', 'Depends_On_Object_Name', 'Depends_On_Object_Type',
]
df_lineage_merged = df_unioned.drop_duplicates(subset=subset_cols)

# Build {"<schema>.<cleaned object name>": set of lower-cased lineage names}
# from the CSV data, one entry per (Dependent_Schema, Dependent_Object_Name).
lineage_list = {}
grouped_by_dependent = df_lineage_merged.groupby(
    ["Dependent_Schema", "Dependent_Object_Name"]
)
for (dependent_schema, dependent_object_name), group in grouped_by_dependent:
    # Drop every character that is not a word character, whitespace or a
    # hyphen, then turn spaces into underscores.
    dependent_object_name = re.sub(r"[^\w\s-]", "", dependent_object_name)
    dependent_object_name = dependent_object_name.replace(" ", "_")
    lineage_key = dependent_schema + "." + dependent_object_name

    # NOTE(review): row[7] / row[8] are positional (row[0] is the index), i.e.
    # the 7th and 8th CSV columns; presumably Depends_On_Schema and
    # Depends_On_Object_Name. Confirm against the CSV header.
    # NOTE(review): two raw names that clean to the same key overwrite each
    # other here; the later group wins.
    lineage_list[lineage_key] = {
        (row[7] + "." + row[8]).lower() for row in group.itertuples()
    }

In [None]:
# # OBJECTS that are present in both the databases but with different lineage mappings

# # 1. Define your group
# grouped = df_lineage_merged.groupby(['Dependent_Schema', 'Dependent_Object_Name'])
# # 2. Filter for groups that appear in more than one database
# # We use nunique() in case a database is listed twice for some reason
# df_multi_db = grouped.filter(lambda x: x['Database'].nunique() > 1)
# # 3. Sort to see the duplicates side-by-side
# df_multi_db = df_multi_db.sort_values(by=subset_cols)
# list_different_lineage_mapping = df_multi_db[['Dependent_Schema', 'Dependent_Object_Name']].drop_duplicates().values.tolist()
# # print("Total objects that have different lineage mappings in both lineage sheets:\n", len(list_different_lineage_mapping), "\n")
# # for element in list_different_lineage_mapping:
# #     print(".".join(element))
# # for row in df_multi_db.values.tolist():
# #     print(row)
# subset_cols_multi_db = subset_cols.copy()
# subset_cols_multi_db.insert(0,"Database")
# df_multi_db = df_multi_db[subset_cols_multi_db].sort_values(by=["Dependent_Schema", "Dependent_Object_Name", "Database"])


### Compare both data structures and find the objects whose lineage sets are not equal

In [None]:
# Compare, per JSON file, the lineage set read from the file with the set
# derived from the CSV data, and bucket the object names accordingly.
correct = []
incorrect = []
errors = []
for json_item in json_list:
    try:
        json_content = json_item["content_set"]
        csv_content = lineage_list[json_item["file_name"]]
    except KeyError as e:
        # Typically the object has no entry in `lineage_list`. Only KeyError is
        # expected here; anything else should surface instead of being counted.
        errors.append(e)
        continue

    if json_content == csv_content:
        correct.append(json_item["file_name"])
    else:
        incorrect.append(json_item["file_name"])

incorrect.sort()
correct.sort()

#---------------------------------Create dict using above code variables and print in json format-----------------

stats_dict = {
    "correct": len(correct),
    "incorrect": len(incorrect),
    "total_jsons_processed": len(correct) + len(incorrect),
    "errors": len(errors),
    "names": {
        "correct_file_names": correct,
        "incorrect_file_names": incorrect,
        # str(KeyError) is the repr of the key (wrapped in quotes); report the
        # bare missing key so the entry matches the other name lists.
        "errors_file_names": [str(e.args[0]) for e in errors],
    },
}

print(json.dumps(stats_dict,indent=4))

## Search content from csv files

In [None]:
import yaml

# Read the schema and object name to inspect from the YAML config file.
with open("target_object.yaml", "r") as yaml_file:
    target_config = yaml.safe_load(yaml_file)

target_tables = target_config['target_tables']
depend_schema = target_tables['depend_schema']
dependent_object_name = target_tables['dependent_object_name']

# Echo the fully qualified name that the following cells look up.
print(".".join((depend_schema, dependent_object_name)))

In [None]:
# Report whether the selected object ended up in the `correct` or `incorrect`
# list. One line is printed per matching entry.
target_object = depend_schema + '.' + dependent_object_name

# The loop variable is deliberately not called `object`: at notebook top level
# that would rebind the builtin for every cell executed afterwards.
for list_label, object_names in (("CORRECT", correct), ("INCORRECT", incorrect)):
    for object_name in object_names:
        if object_name == target_object:
            print(f"Object present in `{list_label}` list")

In [None]:
# Print the lineage set that the JSON files hold for the selected object.
for json_entry in json_list:
    if json_entry['file_name'] != depend_schema + '.' + dependent_object_name:
        continue
    print(f"file_name: {json_entry['file_name']}\n")
    print("-------CONTENT PRESENT IN json_list-------\n")
    ordered_content = sorted(json_entry['content_set'])
    print("\n".join(ordered_content))

In [None]:
# Print the lineage set that the CSV data holds for the selected object.
selected_object = depend_schema + '.' + dependent_object_name
if selected_object in lineage_list:
    print('Dependent_object_name: ', selected_object)
    print("\n-------CONTENT PRESENT IN lineage_list-------\n")
    ordered_lineage = sorted(lineage_list[selected_object])
    print("\n".join(ordered_lineage))

In [None]:
# Show the de-duplicated CSV lineage rows that belong to the selected object.
df_dev_linage = pd.read_csv("databases_lineage_extracted.csv")
df_uat_linage = pd.read_csv("database_lineage_extracted_2.csv")
df_unioned = pd.concat([df_dev_linage, df_uat_linage], ignore_index=True)

subset_cols = ['Dependent_Schema', 'Dependent_Object_Name', 'Dependent_Object_Type',
               'Depends_On_Schema', 'Depends_On_Object_Name', 'Depends_On_Object_Type']
df = df_unioned.drop_duplicates(subset=subset_cols)

# Filter with a boolean mask rather than an interpolated query string: a name
# containing a quote character would otherwise produce an invalid (or
# different) query expression.
is_selected_object = (
    (df["Dependent_Schema"] == depend_schema)
    & (df["Dependent_Object_Name"] == dependent_object_name)
)
df = df[is_selected_object]
df

In [None]:
import pandas as pd
import re
# Matches, in one left-to-right scan, either a single-quoted string literal
# (group 1, with '' as the escaped quote), a /* ... */ block comment, or a
# -- line comment. Whichever construct starts first wins, so comment markers
# inside string literals or inside the other comment kind are not misread.
_SQL_STRING_OR_COMMENT = re.compile(
    r"('(?:[^']|'')*')|/\*.*?\*/|--[^\n]*",
    re.DOTALL,
)


def robust_clean_sql(sql_query):
    """Return *sql_query* as text with comments removed and whitespace tidied.

    Steps, in order:
      1. The input is converted with ``str()``.
      2. Literal backslash-n / backslash-t sequences become real newlines/tabs.
      3. ``--`` line comments and ``/* */`` block comments are removed in a
         single pass; single-quoted string literals are left intact.
      4. Runs of blank lines collapse to one newline.
      5. Runs of spaces/tabs collapse to one space.
      6. Leading and trailing whitespace is stripped.

    Removing both comment kinds in one pass (instead of line comments first)
    keeps the code that follows ``/* ... -- ... */`` on the same line and does
    not truncate string literals that contain ``--``.

    Args:
        sql_query: The SQL text (any object; ``str()`` is applied).

    Returns:
        The cleaned SQL string.
    """
    sql_text = str(sql_query)

    sql_text = sql_text.replace('\\n', '\n').replace('\\t', '\t')

    # Keep string literals (group 1), drop comments (group 1 is None).
    sql_text = _SQL_STRING_OR_COMMENT.sub(
        lambda match: match.group(1) or '', sql_text
    )

    # Replace multiple newlines with a single newline
    sql_text = re.sub(r'\n\s*\n', '\n', sql_text)
    # Collapse horizontal spaces (tabs/spaces) into one space
    sql_text = re.sub(r'[ \t]+', ' ', sql_text)

    return sql_text.strip()

# Load the object definitions from both CSV extracts into one frame.
definition_files = ["object_definitions.csv", "UAT_object_definitions.csv"]
df_definitions = pd.concat(
    [pd.read_csv(csv_path) for csv_path in definition_files],
    ignore_index=True,
)

# Keep stored procedures only. Sorting DatabaseName in descending order before
# drop_duplicates(keep='first') means that, for a (Schema, Object) pair found
# in several databases, the row whose DatabaseName sorts last is retained.
df_definitions = (
    df_definitions
    .query("ObjectType == 'SQL_STORED_PROCEDURE'")
    .sort_values(by="DatabaseName", ascending=False)
    .drop_duplicates(subset=['Schema', 'Object'], keep='first')
)
# # #get all objects present in multiple databases
# # df_duplicates = df_definitions.groupby(['Schema', 'Object']).filter(lambda x: len(x) >= 2)
# counts = df_definitions['DatabaseName'].value_counts()
# print(counts)

# Number of stored procedures per schema, as a Series.
stored_procedures = df_definitions.query("ObjectType == 'SQL_STORED_PROCEDURE'")
schema_counts = stored_procedures['Schema'].value_counts()

# df_definitions = df_definitions.query(f"Schema == '{depend_schema}' and Object == '{dependent_object_name}'")
# df_definitions

# The same counts as a DataFrame with columns Schema / Count.
df_schema_counts = stored_procedures.groupby('Schema').size()
df_schema_counts = df_schema_counts.reset_index(name='Count')

# Largest schemas first. reset_index() is called without drop=True, so the
# pre-sort row labels are kept as an extra leading "index" column.
df_schema_counts = df_schema_counts.sort_values(by='Count', ascending=False)
df_schema_counts = df_schema_counts.reset_index()
df_schema_counts
# small_schemas_list = []
# for row in df_schema_counts.query("Count<4").itertuples():
#     small_schemas_list.append(row[2])

In [None]:
# `small_schemas_list` is otherwise only assigned in commented-out code, so on
# a fresh kernel the print below raised NameError. Build it here with the rule
# from that commented-out code: schemas whose Count is below 4.
small_schemas_list = df_schema_counts.loc[df_schema_counts["Count"] < 4, "Schema"].tolist()
print(" ".join(small_schemas_list))

In [None]:
# print(df_definitions.iloc[0,4])
# Show the cleaned text of the first row's 5th column (positional index 4);
# presumably the object's SQL definition; TODO confirm against the CSV header.
first_row_definition = df_definitions.iloc[0, 4]
cleaned_definition = robust_clean_sql(first_row_definition)
print(cleaned_definition)

<!-- # schema_counts = df_definitions.query("ObjectType == 'SQL_STORED_PROCEDURE'")['Schema'].value_counts()
# schema_counts -->

In [None]:
# Print every object name that is produced by more than one file in the
# lineage output directory, and collect the distinct names in `dir_files`.
# NOTE(review): this lists "lineage_outputs" while the first cell reads
# "../lineage_outputs/"; confirm both point at the same directory.
dir_files = set()
for file in os.listdir("lineage_outputs"):
    # Build the name exactly like the `file_name` key of the first cell:
    # extension removed, 2nd and 3rd "--" parts joined by ".". Without the
    # separator, parts such as ("a", "bc") and ("ab", "c") would be reported
    # as duplicates of each other.
    name_parts = file.replace(".json", "").split("--")
    obj_name = name_parts[1] + "." + name_parts[2]
    if obj_name in dir_files:
        print(obj_name)
    dir_files.add(obj_name)

In [None]:
# Number of distinct object names collected in `dir_files`.
len(dir_files)