<a href="https://colab.research.google.com/github/brendanlooker/colab-examples/blob/main/bq/bq_2_json_xml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install xmltodict

Collecting xmltodict
  Downloading xmltodict-0.13.0-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.13.0


In [None]:
from google.cloud import bigquery,storage
from datetime import datetime
import json
import xmltodict
import gzip
import io


from google.colab import auth
# Authenticate the Colab user before any Google Cloud client is created by the
# functions defined below.
auth.authenticate_user()


# Capture a single timestamp when this cell runs (datetime.now() without a
# timezone, i.e. naive local time). export_table_to_gcs_as_json() reads
# formatted_datetime to build the exported file name.
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d:%H:%M:%S")


# Export Table from BQ to GCS
def export_table_to_gcs_as_json(file_type, bucket_name):
    """Export a BigQuery table to GCS as NDJSON and save a filtered copy.

    The table ``brendanlooker.puma.<file_type>`` is extracted to
    ``gs://<bucket_name>/AUS/bqExport/json/<file_type>-<timestamp>.json``,
    where ``<timestamp>`` is the module-level ``formatted_datetime``. The
    export is then read back from GCS, passed through ``filter_json_data``
    and written next to it as ``<file_type>-<timestamp>_filtered.json``
    (newline-delimited JSON, one object per line).

    Args:
        file_type: Name of the BigQuery table; also used as file-name prefix.
        bucket_name: GCS bucket that receives the export.

    Returns:
        The base name of the export (no folder and no ``.json`` extension).
    """
    project_id = 'brendanlooker'
    dataset_id = 'puma'
    json_folder = 'AUS/bqExport/json'
    file_name = f'{file_type}-{formatted_datetime}'
    export_path = f"{json_folder}/{file_name}.json"
    destination_uri = f"gs://{bucket_name}/{export_path}"

    # Initialize a BigQuery client
    client = bigquery.Client(project=project_id)

    # Define the source table. DatasetReference replaces the deprecated
    # Client.dataset() helper and yields the same table reference.
    table_ref = bigquery.DatasetReference(project_id, dataset_id).table(file_type)

    # Create a job configuration
    job_config = bigquery.job.ExtractJobConfig()
    job_config.destination_format = bigquery.job.DestinationFormat.NEWLINE_DELIMITED_JSON

    # Define the export job
    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        job_config=job_config
    )

    # Start the job and wait for it to complete
    extract_job.result()

    # The export lives in GCS, not on the local disk, so read it back through
    # the storage client. It is newline-delimited JSON: parse it line by line
    # (json.load on the whole file would fail on the second record).
    bucket = storage.Client(project=project_id).bucket(bucket_name)
    exported_text = bucket.blob(export_path).download_as_string().decode('utf-8')
    # Split on '\n' only: str.splitlines() would also break on characters such
    # as U+2028 that may legally appear inside a JSON string.
    rows = [json.loads(line) for line in exported_text.split('\n') if line.strip()]

    # Filter the JSON data
    filtered_rows = filter_json_data(rows)

    # Write the filtered data back to GCS in the same NDJSON layout
    filtered_blob = bucket.blob(f"{json_folder}/{file_name}_filtered.json")
    filtered_blob.upload_from_string(
        '\n'.join(json.dumps(row) for row in filtered_rows),
        content_type='application/json',
    )

    return file_name

# Convert json file to XML
def convert_json_to_xml(gcs_file_name, bucket_name):
    """Convert an exported NDJSON file in GCS into a catalog XML file in GCS.

    Reads ``AUS/bqExport/json/<gcs_file_name>.json`` from ``bucket_name``,
    turns each line into XML with ``xmltodict.unparse``, applies the tag
    replacements for the catalog type and writes the result, wrapped in a
    hard-coded ``<catalog>`` header and footer, to
    ``AUS/bqExport/xml/<gcs_file_name>.xml`` in the same bucket.

    Names starting with ``site-catalog`` get the site-catalog header; every
    other name gets the master-catalog header and replacements. Custom
    attribute elements are renamed only for names starting with
    ``master-catalog``. Lines that are not valid JSON are reported with
    ``print`` and skipped; blank lines are skipped silently.

    Args:
        gcs_file_name: Base name of the exported JSON file (no extension).
        bucket_name: GCS bucket holding the JSON file and receiving the XML.

    Returns:
        The XML file name, ``<gcs_file_name>.xml`` (without folder).

    Raises:
        FileNotFoundError: If the JSON object does not exist in the bucket.
    """
    # Initialise Client
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    blob_path = f'AUS/bqExport/json/{gcs_file_name}.json'
    blob = bucket.get_blob(blob_path)
    if blob is None:
        # get_blob() returns None for a missing object; fail with a clear
        # message instead of an AttributeError on the download call below.
        raise FileNotFoundError(f'gs://{bucket_name}/{blob_path} does not exist')

    # Decode the entire file content
    file_content = blob.download_as_string().decode('utf-8')

    # Define GCS destination object
    write_xml = bucket.blob(f'AUS/bqExport/xml/{gcs_file_name}.xml')

    # Close Root Tag in XML file (<catalog> tag introduced in header string and therefore needs to be subsequently closed)
    # Header is specific to the file type and is processed as appropriate below
    footer = "</catalog>"

    # Only the master catalog branch fills this list
    custom_attributes = []

    # Process Site Catalog
    if gcs_file_name.startswith('site-catalog'):

        # Hard-coded XML file header for Site Catalog
        header = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
        <catalog xmlns="http://www.demandware.com/xml/impex/catalog/2006-10-31" catalog-id="puma-catalog-autocat-au">
        """

        # Updates are required to the XML to align with Salesforce input requirements
        replacements = [
            ('<?xml version="1.0" encoding="utf-8"?>', ''),  # Remove XML declaration
            ]

    else: # Process Master Catalog

        # Hard-coded XML file header for Master Catalog
        header = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
        <catalog xmlns="http://www.demandware.com/xml/impex/catalog/2006-10-31" catalog-id="puma-master-catalog">
            <header>
                <image-settings>
                    <external-location>
                        <http-url>https://images.puma.net/images</http-url>
                        <https-url>https://images.puma.net/images</https-url>
                    </external-location>
                    <view-types>
                        <view-type>extralarge-AUS</view-type>
                        <view-type>large-AUS</view-type>
                        <view-type>medium-AUS</view-type>
                        <view-type>small-AUS</view-type>
                        <view-type>swatch-AUS</view-type>
                    </view-types>
                    <variation-attribute-id>color</variation-attribute-id>
                    <alt-pattern>${productname}, ${variationvalue}, ${viewtype}</alt-pattern>
                    <title-pattern>${productname}, ${variationvalue}</title-pattern>
                </image-settings>
            </header>
        """

        # Updates are required to the XML to align with Salesforce input requirements.
        # The order matters: wrapper tags are stripped before other tags are
        # renamed to the same names.
        replacements = [
            ('<?xml version="1.0" encoding="utf-8"?>', ''),  # Remove XML declaration
            ('<product>', ''),                              # Remove <product> tag
            ('</product>', ''),                             # Remove </product> tag
            ('<product_style>', '<product>'),               # Replace <product_style> with <product>
            ('<product_style ', '<product '),
            ('</product_style>', '</product>'),             # Replace </product_style> with </product>
            ('<product_size>', '<product>'),                # Replace <product_size> with <product>
            ('</product_size>', '</product>'),               # Replace </product_size> with </product>
            ('<image>',''),
            ('</image>',''),
            ('<path>','<image path="'),
            ('</path>','"/>'),
            ('<image-group-master>',''),
            ('</image-group-master>',''),
            ('<product-id-variations>','<variant product-id="'),
            ('</product-id-variations>','"/>'),
            ('<variant>',''),
            ('</variant>',''),
            ('<custom-attribute>',''),
            ('</custom-attribute>',''),
            ]

        # List of Custom Attribute fields which require XML formatting changes
        custom_attributes = ['activityGroup', 'ageCode', 'ageCodeGlobal', 'ageGlobal', 'ageGroup', 'Cushioning', 'articleType', 'articleTypeCode', 'apparelLength', 'bodyStyle1', 'bodyStyle2', 'collection', 'cupType', 'leatherType', 'mainMaterial', 'mainMaterialOfShell', 'deptCode', 'deptName', 'dimensions', 'gender', 'genderCode', 'hsCode1', 'hsCode2', 'productDivision', 'productdivCode', 'productgroupKey', 'productlineKey', 'refinementHTD', 'refinementLevel', 'refinementShoeType', 'refinementSilo', 'refinementSpecialFeature', 'refinementSurface', 'refinementSurfaceRunning', 'refinementWeight', 'sizetableNumber', 'sportCode', 'subcatID', 'taxClassCode', 'Neck', 'upper', 'classCode', 'className', 'sportName', 'subcatName', 'styleLogo', 'pocketType', 'productType', 'sleeves', 'supportLevelBra', 'team', 'trouserRise', 'waterResistant', 'weatherConditions', 'careInstructions', 'careSymbol', 'RunnerType', 'sportType', 'articleGroup', 'avgNoOfMilesPerKm', 'fabricsType', 'fastener', 'fit', 'franchise', 'heelToToeDrop', 'heelType', 'hood', 'lineName', 'pumaTechnology', 'refinementCushioningLevel', 'removableSole', 'shoePronation', 'surface', 'technologyPurpose', 'toeType', 'vegan', 'volume', 'sizeSpecTable', 'sizeSpecTableImp', 'mainColorID', 'season', 'taxClassCode', 'animalParts', 'color', 'colorName', 'countryOfOrigin', 'mainColorAUS', 'mainColorNZ', 'productHeight', 'productLength', 'productWeight', 'productWidth', 'refinementColor', 'secondColor', 'size', 'styleNumber', 'pattern', 'colorDescription', 'new', 'materialComposition']

    # The file name does not change while looping, so decide this once
    rename_custom_attributes = gcs_file_name.startswith('master-catalog')

    # Open XML file for writing; the encoding is explicit so the bytes match
    # the UTF-8 declaration in the header regardless of the locale.
    with write_xml.open("wt", encoding="utf-8") as file_obj:

        # Add XML header
        file_obj.write(header + "\n")

        # Split on '\n' only: str.splitlines() also breaks on characters such
        # as U+2028 that may appear unescaped inside a JSON string, which
        # would tear a record apart.
        for line in file_content.split('\n'):
            if not line.strip():
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print("Error parsing JSON:", e)
                continue

            # Use xmltodict to generate an XML string from the JSON dict
            xml_string = xmltodict.unparse(data, cdata_key='_VALUE',attr_prefix='_', pretty=True)

            # Apply the replacements
            for old, new in replacements:
                xml_string = xml_string.replace(old, new)

            # Format updates for Custom Attributes
            if rename_custom_attributes:
                for item in custom_attributes:
                    xml_string = xml_string.replace('<'+item+' ', '<custom-attribute ')
                    xml_string = xml_string.replace('</'+item+'>', '</custom-attribute>')

            # Skip records that ended up empty
            if xml_string.strip():
                # Write data to GCS blob
                file_obj.write(xml_string + "\n")

        # Write the footer to finalise the file
        file_obj.write(footer)

    return f"{gcs_file_name}.xml"

def gzip_gcs_file(bucket_name, source_blob_name, destination_blob_name):
    """Gzip one GCS object and store the result as another object.

    The source object is downloaded into memory, compressed with gzip and
    uploaded to ``destination_blob_name`` in the same bucket. A confirmation
    line is printed afterwards. Nothing is returned.
    """
    gcs_bucket = storage.Client().bucket(bucket_name)

    # Pull the raw bytes of the source object
    raw_bytes = gcs_bucket.blob(source_blob_name).download_as_string()

    # Compress in memory; the gzip stream must be closed before the buffer
    # is read so that the gzip trailer has been written.
    with io.BytesIO() as buffer:
        with gzip.GzipFile(fileobj=buffer, mode='wb') as gz_stream:
            gz_stream.write(raw_bytes)
        gzipped_bytes = buffer.getvalue()

    # Store the compressed bytes under the destination name
    gcs_bucket.blob(destination_blob_name).upload_from_string(gzipped_bytes)

    print(f'File {source_blob_name} compressed and saved as {destination_blob_name} in {bucket_name}')




def filter_json_data(data):
    """Remove empty fields from every row, in place.

    A field counts as empty when its value is ``None`` or a zero-length
    string, bytes, list, tuple, dict or set. Values such as ``0``, ``0.0``
    and ``False`` carry information and are kept (a plain truthiness test
    would silently drop them). Nested values are not inspected.

    Args:
        data: List of dicts (one per row). The dicts are modified in place.

    Returns:
        The same ``data`` list, for convenience.
    """
    def _is_empty(value):
        if value is None:
            return True
        return isinstance(value, (str, bytes, list, tuple, dict, set)) and len(value) == 0

    for row in data:
        # Collect the keys first: a dict must not change size while iterated
        for key in [k for k, v in row.items() if _is_empty(v)]:
            del row[key]

    return data








def main(request):
    """Export the site and master catalog tables to GCS.

    ``request`` is accepted but never read. Only the BigQuery export runs;
    the XML conversion, gzip and clean-up steps are kept below as disabled
    code, so the file list in the returned message stays empty.

    Returns:
        A ``(payload, status)`` tuple: a dict with a ``"Success"`` message
        and the integer 200.
    """
    target_bucket = 'puma-pim-datapipeline'
    created_files = ''

    for table_name in ('site-catalog', 'master-catalog'):
        gcs_file_name = export_table_to_gcs_as_json(table_name, target_bucket)

        # Disabled steps (kept for reference):
        # xml_filename = convert_json_to_xml(gcs_file_name, bucket_name)
        #
        # Compress the XML file
        # xml_filename_zip = gzip_gcs_file(bucket_name,f"AUS/bqExport/xml/{xml_filename}", f"AUS/bqExport/xml/{xml_filename}.gz")
        # Remove the XML file
        # os.remove(xml_filename)
        # msg += xml_filename + ' / '

    return {"Success": f"XML export files created: {created_files}"}, 200

# Run the export as soon as this cell executes. 'run' is only a placeholder:
# main() never reads its request argument.
main('run')


In [None]:
from google.cloud import bigquery
from google.cloud import storage
import json

def read_json_from_gcs(bucket_name, file_name):
    """Download a newline-delimited JSON object from GCS and parse it.

    Args:
        bucket_name: Name of the GCS bucket.
        file_name: Object path inside the bucket.

    Returns:
        A list with one parsed JSON value per non-blank line. An empty file
        yields an empty list.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    client = storage.Client()
    blob = client.bucket(bucket_name).blob(file_name)
    json_data = blob.download_as_string().decode('utf-8')

    # Skip blank lines so an empty file or stray empty line does not make
    # json.loads('') raise. Split on '\n' only, because other line-boundary
    # characters may appear inside JSON strings.
    return [json.loads(line) for line in json_data.split('\n') if line.strip()]

def write_json_to_gcs(bucket_name, file_name, data):
    """Upload ``data`` to GCS as newline-delimited JSON.

    Each element of ``data`` is serialised with ``json.dumps`` and the
    results are joined with a newline (no trailing newline). The object is
    uploaded with content type ``application/json``. Nothing is returned.
    """
    target_blob = storage.Client().bucket(bucket_name).blob(file_name)

    # One JSON document per line
    serialized_rows = [json.dumps(record) for record in data]
    payload = '\n'.join(serialized_rows)

    target_blob.upload_from_string(payload, content_type='application/json')

def export_table_to_gcs_as_json(file_type, bucket_name, formatted_datetime):
    """Export a BigQuery table to GCS as NDJSON and store a filtered copy.

    The table ``brendanlooker.puma.<file_type>`` is extracted to
    ``gs://<bucket_name>/AUS/bqExport/json/<file_type>-<formatted_datetime>.json``.
    The export is read back, passed through ``filter_json_data`` and written
    to the same folder with a ``_filtered.json`` suffix.

    Args:
        file_type: Name of the BigQuery table; also the file-name prefix.
        bucket_name: GCS bucket that receives both files.
        formatted_datetime: Label appended to the file name.

    Returns:
        The filtered file's name including its ``.json`` extension
        (``<file_type>-<formatted_datetime>_filtered.json``), without folder.
    """
    project_id = 'brendanlooker'
    dataset_id = 'puma'
    json_folder = 'AUS/bqExport/json'
    file_name = f'{file_type}-{formatted_datetime}'
    destination_uri = f"gs://{bucket_name}/{json_folder}/{file_name}.json"

    # Initialize a BigQuery client
    client = bigquery.Client(project=project_id)

    # Define the source table. DatasetReference replaces the deprecated
    # Client.dataset() helper and yields the same table reference.
    table_ref = bigquery.DatasetReference(project_id, dataset_id).table(file_type)

    # Create a job configuration
    job_config = bigquery.job.ExtractJobConfig()
    job_config.destination_format = bigquery.job.DestinationFormat.NEWLINE_DELIMITED_JSON

    # Define the export job
    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        job_config=job_config
    )

    # Start the job and wait for it to complete
    extract_job.result()

    # Read the exported JSON data from GCS
    data = read_json_from_gcs(bucket_name, f"{json_folder}/{file_name}.json")

    # Filter the JSON data
    filtered_data = filter_json_data(data)

    # Write the filtered JSON data back to GCS
    filtered_file_name = f"{file_name}_filtered.json"
    write_json_to_gcs(bucket_name, f"{json_folder}/{filtered_file_name}", filtered_data)

    return filtered_file_name

def filter_json_data(data):
    """Remove empty fields from every row, in place.

    A field counts as empty when its value is ``None`` or a zero-length
    string, bytes, list, tuple, dict or set. Values such as ``0``, ``0.0``
    and ``False`` carry information and are kept (a plain truthiness test
    would silently drop them). Nested values are not inspected.

    Args:
        data: List of dicts (one per row). The dicts are modified in place.

    Returns:
        The same ``data`` list, for convenience.
    """
    def _is_empty(value):
        if value is None:
            return True
        return isinstance(value, (str, bytes, list, tuple, dict, set)) and len(value) == 0

    for row in data:
        # Collect the keys first: a dict must not change size while iterated
        for key in [k for k, v in row.items() if _is_empty(v)]:
            del row[key]

    return data

# Example usage: runs as soon as this cell executes. Exports the
# 'master-catalog' table using a fixed date label instead of a live timestamp.
file_type = 'master-catalog'
bucket_name = 'puma-pim-datapipeline'
formatted_datetime = '2021-01-01'

# The returned name is that of the filtered copy and already ends in ".json".
filtered_file_name = export_table_to_gcs_as_json(file_type, bucket_name, formatted_datetime)


In [None]:
from google.cloud import bigquery,storage
from datetime import datetime
import json
import xmltodict

from google.colab import auth
# Authenticate the Colab user before any Google Cloud client is created by the
# functions defined below.
auth.authenticate_user()

# Capture a single timestamp when this cell runs (datetime.now() without a
# timezone, i.e. naive local time). export_table_to_gcs_as_json() reads
# formatted_datetime to build the exported file name.
current_datetime = datetime.now()
formatted_datetime = current_datetime.strftime("%Y-%m-%d:%H:%M:%S")


# Export Table from BQ to GCS
def export_table_to_gcs_as_json(file_type, bucket_name='puma-pim-datapipeline'):
    """Export a BigQuery table to GCS as newline-delimited JSON.

    The table ``brendanlooker.puma.<file_type>`` is extracted to
    ``gs://<bucket_name>/AUS/bqExport/json/<file_type>-<timestamp>.json``,
    where ``<timestamp>`` is the module-level ``formatted_datetime``. The
    call blocks until the extract job has finished.

    Args:
        file_type: Name of the BigQuery table; also the file-name prefix.
        bucket_name: GCS bucket that receives the export. Defaults to the
            bucket that used to be hard-coded here.

    Returns:
        The base name of the export (no folder and no ``.json`` extension).
    """
    project_id = 'brendanlooker'
    dataset_id = 'puma'
    file_name = f'{file_type}-{formatted_datetime}'
    destination_uri = f"gs://{bucket_name}/AUS/bqExport/json/{file_name}.json"

    # Initialize a BigQuery client
    client = bigquery.Client(project=project_id)

    # Define the source table. DatasetReference replaces the deprecated
    # Client.dataset() helper and yields the same table reference.
    table_ref = bigquery.DatasetReference(project_id, dataset_id).table(file_type)

    # Create a job configuration
    job_config = bigquery.job.ExtractJobConfig()
    job_config.destination_format = bigquery.job.DestinationFormat.NEWLINE_DELIMITED_JSON

    # Define the export job
    extract_job = client.extract_table(
        table_ref,
        destination_uri,
        job_config=job_config
    )

    # Start the job and wait for it to complete
    extract_job.result()

    return file_name

# Convert json file to XML
def convert_json_to_xml(gcs_file_name, bucket_name='puma-pim-datapipeline'):
    """Convert an exported NDJSON file in GCS into a catalog XML file in GCS.

    Reads ``AUS/bqExport/json/<gcs_file_name>.json`` from ``bucket_name``,
    turns each line into XML with ``xmltodict.unparse``, applies the tag
    replacements for the catalog type and writes the result, wrapped in a
    hard-coded ``<catalog>`` header and footer, to
    ``AUS/bqExport/xml/<gcs_file_name>.xml`` in the same bucket.

    Names starting with ``site-catalog`` get the site-catalog header; every
    other name gets the master-catalog header and replacements. Custom
    attribute elements are renamed only for names starting with
    ``master-catalog``. Lines that are not valid JSON are reported with
    ``print`` and skipped; blank lines are skipped silently.

    Args:
        gcs_file_name: Base name of the exported JSON file (no extension).
        bucket_name: GCS bucket holding the JSON file and receiving the XML.
            Defaults to the bucket that used to be hard-coded here.

    Returns:
        The XML file name, ``<gcs_file_name>.xml`` (without folder).

    Raises:
        FileNotFoundError: If the JSON object does not exist in the bucket.
    """
    # Initialise Client
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    blob_path = f'AUS/bqExport/json/{gcs_file_name}.json'
    blob = bucket.get_blob(blob_path)
    if blob is None:
        # get_blob() returns None for a missing object; fail with a clear
        # message instead of an AttributeError on the download call below.
        raise FileNotFoundError(f'gs://{bucket_name}/{blob_path} does not exist')

    # Decode the entire file content
    file_content = blob.download_as_string().decode('utf-8')

    # Define GCS destination object
    write_xml = bucket.blob(f'AUS/bqExport/xml/{gcs_file_name}.xml')

    # Close Root Tag in XML file (<catalog> tag introduced in header string and therefore needs to be subsequently closed)
    # Header is specific to the file type and is processed as appropriate below
    footer = "</catalog>"

    # Only the master catalog branch fills this list
    custom_attributes = []

    # Process Site Catalog
    if gcs_file_name.startswith('site-catalog'):

        # Hard-coded XML file header for Site Catalog
        header = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
        <catalog xmlns="http://www.demandware.com/xml/impex/catalog/2006-10-31" catalog-id="puma-catalog-autocat-au">
        """

        # Updates are required to the XML to align with Salesforce input requirements
        replacements = [
            ('<?xml version="1.0" encoding="utf-8"?>', ''),  # Remove XML declaration
            ]

    else: # Process Master Catalog

        # Hard-coded XML file header for Master Catalog
        header = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
        <catalog xmlns="http://www.demandware.com/xml/impex/catalog/2006-10-31" catalog-id="puma-master-catalog">
            <header>
                <image-settings>
                    <external-location>
                        <http-url>https://images.puma.net/images</http-url>
                        <https-url>https://images.puma.net/images</https-url>
                    </external-location>
                    <view-types>
                        <view-type>extralarge-AUS</view-type>
                        <view-type>large-AUS</view-type>
                        <view-type>medium-AUS</view-type>
                        <view-type>small-AUS</view-type>
                        <view-type>swatch-AUS</view-type>
                    </view-types>
                    <variation-attribute-id>color</variation-attribute-id>
                    <alt-pattern>${productname}, ${variationvalue}, ${viewtype}</alt-pattern>
                    <title-pattern>${productname}, ${variationvalue}</title-pattern>
                </image-settings>
            </header>
        """

        # Updates are required to the XML to align with Salesforce input requirements.
        # The order matters: wrapper tags are stripped before other tags are
        # renamed to the same names.
        replacements = [
            ('<?xml version="1.0" encoding="utf-8"?>', ''),  # Remove XML declaration
            ('<product>', ''),                              # Remove <product> tag
            ('</product>', ''),                             # Remove </product> tag
            ('<product_style>', '<product>'),               # Replace <product_style> with <product>
            ('<product_style ', '<product '),
            ('</product_style>', '</product>'),             # Replace </product_style> with </product>
            ('<product_size>', '<product>'),                # Replace <product_size> with <product>
            ('</product_size>', '</product>'),               # Replace </product_size> with </product>
            ('<image>',''),
            ('</image>',''),
            ('<path>','<image path="'),
            ('</path>','"/>'),
            ('<image-group-master>',''),
            ('</image-group-master>',''),
            ('<product-id-variations>','<variant product-id="'),
            ('</product-id-variations>','"/>'),
            ('<variant>',''),
            ('</variant>',''),
            ('<custom-attribute>',''),
            ('</custom-attribute>',''),
            ]

        # List of Custom Attribute fields which require XML formatting changes
        custom_attributes = ['activityGroup', 'ageCode', 'ageCodeGlobal', 'ageGlobal', 'ageGroup', 'Cushioning', 'articleType', 'articleTypeCode', 'apparelLength', 'bodyStyle1', 'bodyStyle2', 'collection', 'cupType', 'leatherType', 'mainMaterial', 'mainMaterialOfShell', 'deptCode', 'deptName', 'dimensions', 'gender', 'genderCode', 'hsCode1', 'hsCode2', 'productDivision', 'productdivCode', 'productgroupKey', 'productlineKey', 'refinementHTD', 'refinementLevel', 'refinementShoeType', 'refinementSilo', 'refinementSpecialFeature', 'refinementSurface', 'refinementSurfaceRunning', 'refinementWeight', 'sizetableNumber', 'sportCode', 'subcatID', 'taxClassCode', 'Neck', 'upper', 'classCode', 'className', 'sportName', 'subcatName', 'styleLogo', 'pocketType', 'productType', 'sleeves', 'supportLevelBra', 'team', 'trouserRise', 'waterResistant', 'weatherConditions', 'careInstructions', 'careSymbol', 'RunnerType', 'sportType', 'articleGroup', 'avgNoOfMilesPerKm', 'fabricsType', 'fastener', 'fit', 'franchise', 'heelToToeDrop', 'heelType', 'hood', 'lineName', 'pumaTechnology', 'refinementCushioningLevel', 'removableSole', 'shoePronation', 'surface', 'technologyPurpose', 'toeType', 'vegan', 'volume', 'sizeSpecTable', 'sizeSpecTableImp', 'mainColorID', 'season', 'taxClassCode', 'animalParts', 'color', 'colorName', 'countryOfOrigin', 'mainColorAUS', 'mainColorNZ', 'productHeight', 'productLength', 'productWeight', 'productWidth', 'refinementColor', 'secondColor', 'size', 'styleNumber', 'pattern', 'colorDescription', 'new', 'materialComposition']

    # The file name does not change while looping, so decide this once
    rename_custom_attributes = gcs_file_name.startswith('master-catalog')

    # Open XML file for writing; the encoding is explicit so the bytes match
    # the UTF-8 declaration in the header regardless of the locale.
    with write_xml.open("wt", encoding="utf-8") as file_obj:

        # Add XML header
        file_obj.write(header + "\n")

        # Split on '\n' only: str.splitlines() also breaks on characters such
        # as U+2028 that may appear unescaped inside a JSON string, which
        # would tear a record apart.
        for line in file_content.split('\n'):
            if not line.strip():
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print("Error parsing JSON:", e)
                continue

            # Use xmltodict to generate an XML string from the JSON dict
            xml_string = xmltodict.unparse(data, cdata_key='_VALUE',attr_prefix='_', pretty=True)

            # Apply the replacements
            for old, new in replacements:
                xml_string = xml_string.replace(old, new)

            # Format updates for Custom Attributes
            if rename_custom_attributes:
                for item in custom_attributes:
                    xml_string = xml_string.replace('<'+item+' ', '<custom-attribute ')
                    xml_string = xml_string.replace('</'+item+'>', '</custom-attribute>')

            # Skip records that ended up empty
            if xml_string.strip():
                # Write data to GCS blob
                file_obj.write(xml_string + "\n")

        # Write the footer to finalise the file
        file_obj.write(footer)

    return f"{gcs_file_name}.xml"



def main(request):
    """Export both catalog tables and convert each export to XML.

    ``request`` is accepted but never read. For the site and the master
    catalog, the table is exported to GCS and the export is converted to
    XML; the XML file names are listed in the returned message, each one
    followed by ``' / '``.

    Returns:
        A ``(payload, status)`` tuple: a dict with a ``"Success"`` message
        and the integer 200.
    """
    xml_names = []
    for table_name in ('site-catalog', 'master-catalog'):
        exported_name = export_table_to_gcs_as_json(table_name)
        xml_names.append(convert_json_to_xml(exported_name))

    # Every name carries the separator, including the last one
    summary = ''.join(name + ' / ' for name in xml_names)
    return {"Success": f"XML export files created: {summary}"}, 200

# Run the pipeline as soon as this cell executes. 'run' is only a placeholder:
# main() never reads its request argument.
main('run')


({'Success': 'XML export files created: site-catalog-2024-02-14:14:32:10.xml / master-catalog-2024-02-14:14:32:10.xml / '},
 200)