In [None]:
from datetime import datetime, timezone
import json
import re

from notebookutils import mssparkutils
import pandas as pd

In [None]:
# Root URIs of the storage containers used by this notebook: the raw
# extraction is read from "landing", the parquet output is written to "curated".
container_landing = "abfs://landing@storageXtarget.dfs.core.windows.net"
container_curated = "abfs://curated@storageXtarget.dfs.core.windows.net"

# Folder of this flow; it is appended to both container roots.
flow_folder_path = "Flux_flow_&_2"

# Get the file path of the last EXTRACTION_DATE folder

In [None]:
# Root folder of this flow in the landing container; the entries listed
# directly underneath it are treated as one folder per extraction date.
extraction_path = f"{container_landing}/{flow_folder_path}/"

list_dates = mssparkutils.fs.ls(extraction_path)
if not list_dates:
    # Fail with an explicit message instead of a bare IndexError.
    raise FileNotFoundError(f"No extraction folder found under {extraction_path}")

# The entry whose name sorts last is taken as the most recent extraction
# (assumes the folder names sort chronologically, e.g. ISO dates - TODO confirm).
last_date_path = max(list_dates, key=lambda entry: entry.name).path

extraction_files = mssparkutils.fs.ls(last_date_path)
if not extraction_files:
    raise FileNotFoundError(f"No file found in extraction folder {last_date_path}")

# Only the first listed entry is used; any other entry in the folder is ignored.
file_path = extraction_files[0].path

print("landing file path:")
print(file_path)

# Mounting the landing container

In [None]:
# Mount the landing container on the "/mnt" mount point, authenticating with
# the linked service named below, so its files can be opened with plain
# Python file APIs later in the notebook.
landing_mount_source = "abfss://landing@storageXtarget.dfs.core.windows.net"
landing_mount_options = {"LinkedService": "ls_asa_ws_X_WorkspaceDefaultStorage"}

mssparkutils.fs.mount(landing_mount_source, "/mnt", landing_mount_options)

In [None]:
# The job id is needed to build the local path of the mounted container
# (it is interpolated into the "/synfs/{job_id}/mnt" prefix further down).
job_id = mssparkutils.env.getJobId()

# Building the mounted path of the last extraction date JSON file

In [None]:
# Local root under which the "/mnt" mount point is exposed for this job.
path_prefix = f"/synfs/{job_id}/mnt"

# Drop the first three "/"-separated pieces of the storage URI (scheme,
# empty segment, container host) to keep only the path inside the container.
uri_segments = file_path.split("/")
relative_path = "/".join(uri_segments[3:])

# Same file, addressed through the mount instead of the storage URI.
new_file_path = f"{path_prefix}/{relative_path}"
print("The updated file path is :", new_file_path)

# Reading the JSON file 

In [None]:
# Read the file by using a mount path.
# The file is opened in binary mode so that json.load detects the encoding
# itself (UTF-8 with or without BOM, UTF-16 or UTF-32) instead of depending
# on the platform's default text encoding.
with open(new_file_path, "rb") as f:
    data = json.load(f)

# Applying transformations to the JSON file

In [None]:
# Expand the nested Contents -> reply -> endpoints array into rows, one per
# array element; every resulting column gets the flattened path as a prefix.
endpoints_path = ["Contents", "reply", "endpoints"]
df_array_normalized = pd.json_normalize(
    data,
    record_path=endpoints_path,
    record_prefix="Contents_reply_endpoints_",
    sep="_",
)

In [None]:
# Flatten the document-level structure: nested objects become "_"-joined
# columns. No record_path is given, so arrays are not expanded into rows here.
# NOTE(review): the endpoints array is not removed by this call, so it
# presumably remains in this frame as a single list-valued column - confirm
# that keeping it in the output is intended.
df_keys = pd.json_normalize(data, sep="_")

In [None]:
# Cross join: each row of df_keys is paired with each endpoint row, so the
# document-level columns are repeated on every endpoint row.
df_flattened = df_keys.merge(df_array_normalized, how="cross")

# Mapping the data

# Getting the current snapshot date

In [None]:
# Get the current UTC timestamp as a timezone-aware datetime
# (datetime.utcnow() is deprecated and returns a naive value).
now = datetime.now(timezone.utc)

# Snapshot label (YYYY-MM-DD, UTC) used as the SNAPSHOT= folder name
snapshot = now.strftime("%Y-%m-%d")

# Saving the file as parquet

In [None]:
# Output file stem: the landing file name with spaces replaced by underscores
# and its ".json" extension removed.
file_name = file_path.split("/")[-1].replace(" ", "_")
# Strip only a trailing ".json": str.replace would also delete a ".json"
# occurring in the middle of the name.
if file_name.endswith(".json"):
    file_name = file_name[: -len(".json")]

# Create the full file path (one SNAPSHOT=<date> folder per run date)
flow_2_curated_path = f"{container_curated}/{flow_folder_path}/flow_&_2.parquet/SNAPSHOT={snapshot}/{file_name}.snappy.parquet"

print("Curated file path :")
print(flow_2_curated_path)

# Saving the file in a parquet format, without writing the DataFrame index.
# No compression argument is passed, so the pandas default applies.
df_flattened.to_parquet(flow_2_curated_path, index=False)

# Unmounting the container

In [None]:
# Release the "/mnt" mount point that was created earlier in the notebook.
mssparkutils.fs.unmount("/mnt")