In [67]:
#import Libriaries
import xml.etree.ElementTree as ET
import pandas as pd
import re
import pyodbc #oldwayto connect to sql server
import pydash #to format table and field names
import pandasgui as pg #to visualize dataframes
import sqlalchemy  #new way to connect to sql server
import os 
import numpy as np

In [68]:
#functions

def get_root(file_path):
    """Parse an XML file and return its root element."""
    xml_tree = ET.parse(file_path)
    xml_root = xml_tree.getroot()
    return xml_root


def get_child_elements(xml_root):
    # Create a list to store all child elements
    elements = []
    # Get all child elements of a given xml element
    for child in xml_root:
        elements.append(child)
        elements.extend(get_child_elements(child))
    #append xml root to elements list
    #elements.append(xml_root)
    return elements

def get_attribute_cfdi(elements, tag_contains, attribute):
    attribute_values = []
    for element in elements:
        cleaned_tag = re.sub(r"{.*?}", "", element.tag)
        # Check if the element's tag contains the required text and then extract the attribute
        if tag_contains in cleaned_tag:
            attribute_value = element.attrib.get(attribute)
            if attribute_value is not None:
                attribute_values.append(attribute_value)
    values = ', '.join(attribute_values)      
    return values
#print(get_attribute_cfdi(elements, 'TimbreFiscalDigital', 'UUID'))
#print(get_attribute_cfdi(elements, 'Receptor', 'Rfc'))


def pascal_case(text):
    return ''.join(word.capitalize() for word in text.split())


def get_xml_metadata(elements):

    # Define a list to store row data
    data = []

    # Define a dictionary for the attribute field counters
    field_counters = {}

    # Get UUID from 'TimbreFiscalDigital' element
    uuid = get_attribute_cfdi(elements, 'TimbreFiscalDigital', 'UUID')
    
    # Iterate over elements
    for element in elements:
        if element.attrib:
            cleaned_tag = pascal_case(re.sub(r"{.*?}", "", element.tag)).lower()
            file_name = os.path.basename(xml_file_path)
            file_path = xml_file_path

            # If this tag has not been seen before, initialize its counter
            if element.tag not in field_counters:
                field_counters[element.tag] = 1

            # Iterate over attributes of the element
            for key, value in element.attrib.items():
                cleaned_key = pascal_case(re.sub(r"{.*?}", "", key)).lower()
                # Add the data to the list, including the current attribute field number
                data.append({'field_number': field_counters[element.tag], 'file_path': file_path, 'file_name': file_name, 'cleaned_tag': cleaned_tag, 'tag': element.tag, 'key': key, 'cleaned_key': cleaned_key,'value': value, 'UUID': uuid})

            # Increment the counter for this attribute field
            field_counters[element.tag] += 1

    # Convert the list of dictionaries to a DataFrame
    df = pd.DataFrame(data)

    # Return the DataFrame
    return df


def get_unique_tags(elements):
    # Define a list to store tag names
    tag_list = []

    for element in elements:
        # If the attribute dictionary is not empty, add the tag to the list
        if element.attrib:
            tag_list.append(element.tag)

    # Convert the list to a set to get unique tags, then convert it back to a list
    tag_list = list(set(tag_list))
    
    return tag_list


def create_dataframes(elements, tag_list, metadata_df):
    # Create an empty dictionary to hold the dataframes
    cfdi_df = {}

    # key fields
    uuid = get_attribute_cfdi(elements, 'TimbreFiscalDigital', 'UUID')
    emisor_rfc = get_attribute_cfdi(elements, 'Emisor', 'Rfc')
    receptor_rfc = get_attribute_cfdi(elements, 'Receptor', 'Rfc')
    comprobante_tipo = get_attribute_cfdi(elements, 'Comprobante', 'TipoDeComprobante')

    for tag in tag_list:
        # Filter the DataFrame
        filtered_df = metadata_df[metadata_df['tag'] == tag]

        # Custom aggregation function to concatenate values into a list
        aggfunc = lambda x: list(x) if len(x) > 1 else np.max(x)

        # Create a pivot table
        pivot_table = pd.pivot_table(filtered_df, values='value', index=['field_number'], columns=['cleaned_key'], aggfunc=aggfunc)

        # Reset index and change the column names
        df = pivot_table.reset_index().rename_axis(None, axis=1)

        # Add the UUID as a new column to the DataFrame
        df['uuid'] = uuid
        df['emisor_rfc'] = emisor_rfc
        df['receptor_rfc'] = receptor_rfc
        df['comprobante_tipo'] = comprobante_tipo

        cfdi_df[tag] = df
        
    return cfdi_df

In [69]:
# Define the connection details
server = 'jacobo-dev.database.windows.net'
port = '1433'
database = 'jacobo-dev-sqlserver-azure-001'
username = 'azure-admin'
password = 'ja-2023-un0ypzjo'
driver = '{ODBC Driver 18 for SQL Server}'

# Define the connection string
conn_str = f"DRIVER={driver};SERVER={server},{port};DATABASE={database};UID={username};PWD={password};Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30"

In [70]:
# Test SQL DB Connection
try:
    conn = pyodbc.connect(conn_str)
    print("Connection successful!")
    conn.close()
except pyodbc.Error as e:
    print("Error connecting to database:", e)

Connection successful!


In [71]:
xml_file_path = r"C:\Users\Roberto\OneDrive\cargoIabono\Proyectos y Desarrollos\P001_V001_CFDI-Reader\01_Inputs\Facturas\0F23FE0D-8324-4BCE-AFAC-68CB67E89714.xml"
#xml_file_path = r"C:\Users\Roberto\OneDrive\cargoIabono\Proyectos y Desarrollos\P001_V001_CFDI-Reader\01_Inputs\Nomina\EMS2103108P3_Pago de nómina_20220815_N_AOAR951019842.xml"
xml_root = get_root(xml_file_path)
elements = get_child_elements(xml_root)
elements.append(xml_root)
metadata_df = get_xml_metadata(elements)
metadata_df.head()
#pg.show(metadata_df.head())

Unnamed: 0,field_number,file_path,file_name,cleaned_tag,tag,key,cleaned_key,value,UUID
0,1,C:\Users\Roberto\OneDrive\cargoIabono\Proyecto...,0F23FE0D-8324-4BCE-AFAC-68CB67E89714.xml,emisor,{http://www.sat.gob.mx/cfd/3}Emisor,Rfc,rfc,FIS780810KQ9,0F23FE0D-8324-4BCE-AFAC-68CB67E89714
1,1,C:\Users\Roberto\OneDrive\cargoIabono\Proyecto...,0F23FE0D-8324-4BCE-AFAC-68CB67E89714.xml,emisor,{http://www.sat.gob.mx/cfd/3}Emisor,Nombre,nombre,"FISACERO, S.A.P.I. DE C.V.",0F23FE0D-8324-4BCE-AFAC-68CB67E89714
2,1,C:\Users\Roberto\OneDrive\cargoIabono\Proyecto...,0F23FE0D-8324-4BCE-AFAC-68CB67E89714.xml,emisor,{http://www.sat.gob.mx/cfd/3}Emisor,RegimenFiscal,regimenfiscal,601,0F23FE0D-8324-4BCE-AFAC-68CB67E89714
3,1,C:\Users\Roberto\OneDrive\cargoIabono\Proyecto...,0F23FE0D-8324-4BCE-AFAC-68CB67E89714.xml,receptor,{http://www.sat.gob.mx/cfd/3}Receptor,Rfc,rfc,FAN540305I15,0F23FE0D-8324-4BCE-AFAC-68CB67E89714
4,1,C:\Users\Roberto\OneDrive\cargoIabono\Proyecto...,0F23FE0D-8324-4BCE-AFAC-68CB67E89714.xml,receptor,{http://www.sat.gob.mx/cfd/3}Receptor,Nombre,nombre,"GENVAMEX, S.A. DE C.V. AV. NOGALAR SUR No. 301",0F23FE0D-8324-4BCE-AFAC-68CB67E89714


In [72]:
# Define your tag
tag = 'doctorelacionado'

# Filter the DataFrame and get the UUID
filtered_df = metadata_df[metadata_df['cleaned_tag'] == tag]

#key fields
uuid = get_attribute_cfdi(elements, 'TimbreFiscalDigital', 'UUID')
emisor_rfc = get_attribute_cfdi(elements, 'Emisor', 'Rfc')
receptor_rfc = get_attribute_cfdi(elements, 'Receptor', 'Rfc')
comprobante_tipo = get_attribute_cfdi(elements, 'Comprobante', 'TipoDeComprobante')

# Custom aggregation function to concatenate values into a list
aggfunc = lambda x: list(x) if len(x) > 1 else np.max(x)

# Create a pivot table
pivot_table = pd.pivot_table(filtered_df, values='value', index=['field_number'], columns=['cleaned_key'], aggfunc=aggfunc)

# Reset index and change the column names
table = pivot_table.reset_index().rename_axis(None, axis=1)

# Add the UUID as a new column to the DataFrame
table['uuid'] = uuid
table['emisor_rfc'] = emisor_rfc
table['receptor_rfc'] = receptor_rfc
table['comprobante_tipo'] = comprobante_tipo

# Display the DataFrame
table


Unnamed: 0,field_number,folio,iddocumento,imppagado,impsaldoant,impsaldoinsoluto,metododepagodr,monedadr,numparcialidad,serie,uuid,emisor_rfc,receptor_rfc,comprobante_tipo
0,1,481,9FBCCDBB-2B12-4D3D-B872-D01048B95B61,1017.02,1017.02,0.0,PPD,USD,1,IA,0F23FE0D-8324-4BCE-AFAC-68CB67E89714,FIS780810KQ9,FAN540305I15,P
1,2,483,D8CD5D20-6BC0-41A5-913D-AF93F2C173A4,8244.28,8244.28,0.0,PPD,USD,1,IA,0F23FE0D-8324-4BCE-AFAC-68CB67E89714,FIS780810KQ9,FAN540305I15,P
2,3,431,F37A03D5-58C1-4819-BF52-88C1E7056C1E,12094.67,12094.67,0.0,PPD,USD,1,IA,0F23FE0D-8324-4BCE-AFAC-68CB67E89714,FIS780810KQ9,FAN540305I15,P


In [73]:



tag_list = get_unique_tags(elements)
print(tag_list)


cfdi_df = create_dataframes(elements, tag_list, metadata_df)


['{http://www.sat.gob.mx/cfd/3}Concepto', '{http://www.sat.gob.mx/Pagos}Pago', '{http://www.sat.gob.mx/TimbreFiscalDigital}TimbreFiscalDigital', '{http://www.sat.gob.mx/cfd/3}Receptor', '{http://www.sat.gob.mx/cfd/3}Emisor', '{http://www.sat.gob.mx/cfd/3}Comprobante', '{http://www.sat.gob.mx/Pagos}DoctoRelacionado', '{http://www.sat.gob.mx/Pagos}Pagos']


In [75]:
cfdi_df['{http://www.sat.gob.mx/cfd/3}Receptor']

Unnamed: 0,field_number,nombre,rfc,usocfdi,uuid,emisor_rfc,receptor_rfc,comprobante_tipo
0,1,"GENVAMEX, S.A. DE C.V. AV. NOGALAR SUR No. 301",FAN540305I15,P01,0F23FE0D-8324-4BCE-AFAC-68CB67E89714,FIS780810KQ9,FAN540305I15,P


pendiente crear def function para el diccionario de dataframes de cfdi

pendiente  agregar a los data frames el uuid y otros datos necesarios para el futuro modelado de datos de la base de datos de cfdis

pendiente crear la funcionalidad para subir los dataframes a la base de datos a nivel de xml

pendiente crear el programa de procesamiento multiple de xmls

pendiente crear los primeros drafts de reportes o vistas segun el mvp en alteryxs