In [33]:
import pandas as pd
import xml.etree.ElementTree as ET
from sqlalchemy import text
from connection import connect

In [2]:
# Open the database handles through the project-local `connection.connect` helper.
# Within this notebook, co_oltp is the handle the source tables are read from,
# and etl_conn is used both to read dim_geography and to write dim_reseller.
# etl_conn_or is unpacked here but not referenced by any cell shown below.
co_oltp, etl_conn, etl_conn_or = connect()

# Extract

In [19]:
# Extraction query: one row per store in sales.store, enriched with
#   * order statistics aggregated per store (ResellerCustomerInfo),
#   * a single address per store, preferring the 'Main Office' type (RankedAddresses),
#   * a single contact phone per store (RankedContacts),
#   * the account number of the store-level customer row (person_id IS NULL).
# Both ROW_NUMBER() windows carry explicit tie-breakers: without them the row
# ranked first among equally ranked addresses/contacts is unspecified and may
# change from one execution to the next.
# Note: ResellerCustomerInfo also computes reseller_alternate_key, but the
# final SELECT takes that column from sales.customer (alias cus) instead.
query_reseller = text("""
-- CTE para pre-agregar información de clientes y órdenes
WITH ResellerCustomerInfo AS (
    SELECT
        c.store_id,
        MIN(c.account_number) AS reseller_alternate_key,
        EXTRACT(YEAR FROM MIN(soh.order_date)) AS first_order_year,
        EXTRACT(YEAR FROM MAX(soh.order_date)) AS last_order_year,
        EXTRACT(MONTH FROM MAX(soh.order_date)) AS order_month,
        '0' as order_frequency
    FROM sales.customer AS c
    LEFT JOIN sales.sales_order_header AS soh ON c.customer_id = soh.customer_id
    WHERE c.store_id IS NOT NULL
    GROUP BY c.store_id
),
-- CTE para seleccionar una única dirección por tienda (Revendedor)
RankedAddresses AS (
    SELECT
        bea.business_entity_id,
        a.address_line_1,
        a.address_line_2,
        a.city,
        a.postal_code,
        st.name AS state_province_name,
        cr.name AS country_region_name,
        -- Rankeamos las direcciones por tipo, priorizando 'Main Office'
        ROW_NUMBER() OVER(
            PARTITION BY bea.business_entity_id
            ORDER BY
                CASE
                    WHEN at.name = 'Main Office' THEN 1
                    ELSE 99
                END,
                bea.address_id -- deterministic tie-breaker
        ) as rn_addr
    FROM person.business_entity_address AS bea
    JOIN person.address AS a ON bea.address_id = a.address_id
    JOIN person.address_type AS at ON bea.address_type_id = at.address_type_id
    JOIN person.state_province AS st ON a.state_province_id = st.state_province_id
    JOIN person.country_region AS cr ON st.country_region_code = cr.country_region_code
),
-- CTE para seleccionar un unico teléfono por tienda, a través del contacto
RankedContacts AS (
    SELECT
        bec.business_entity_id, -- El ID de la tienda (Store)
        pp.phone_number,
        -- Rankeamos los contactos
        ROW_NUMBER() OVER(
            PARTITION BY bec.business_entity_id
            ORDER BY bec.contact_type_id, bec.person_id, pp.phone_number -- deterministic tie-breakers
        ) as rn_contact
    FROM person.business_entity_contact AS bec
    JOIN person.person AS p ON bec.person_id = p.business_entity_id
    JOIN person.person_phone AS pp ON p.business_entity_id = pp.business_entity_id
)
SELECT
    s.business_entity_id,
    cus.account_number AS reseller_alternate_key,
    s.name AS reseller_name,
    s.demographics,

    -- Datos de la dirección desde la CTE de direcciones
    ra.address_line_1 AS address_line1,
    ra.address_line_2 AS address_line2,
    ra.city,
    ra.postal_code,
    ra.state_province_name,
    ra.country_region_name,

    -- Datos del cliente pre-agregados desde la primera CTE
    rci.first_order_year,
    rci.last_order_year,
    rci.order_month,
    rci.order_frequency,

    -- El teléfono desde la CTE de contactos
    rc.phone_number AS phone

FROM sales.store AS s

LEFT JOIN ResellerCustomerInfo AS rci ON s.business_entity_id = rci.store_id

LEFT JOIN RankedAddresses AS ra
    ON s.business_entity_id = ra.business_entity_id
    AND ra.rn_addr = 1

LEFT JOIN RankedContacts AS rc
    ON s.business_entity_id = rc.business_entity_id
    AND rc.rn_contact = 1

LEFT JOIN sales.customer as cus
    ON cus.store_id = s.business_entity_id
    AND cus.person_id IS NULL;
""")

# Run the extraction against the source connection and report the row count.
df_reseller = pd.read_sql(query_reseller, co_oltp)
print(f"Registros extraidos: {len(df_reseller)}")
df_reseller

Registros extraidos: 701


Unnamed: 0,business_entity_id,reseller_alternate_key,reseller_name,demographics,address_line1,address_line2,city,postal_code,state_province_name,country_region_name,first_order_year,last_order_year,order_month,order_frequency,phone
0,934,AW00000001,A Bike Store,"<StoreSurvey xmlns=""http://schemas.microsoft.c...",2251 Elliot Avenue,,Seattle,98104,Washington,United States,2011.0,2012.0,3.0,0,245-555-0173
1,1028,AW00000002,Progressive Sports,"<StoreSurvey xmlns=""http://schemas.microsoft.c...",3207 S Grady Way,,Renton,98055,Washington,United States,2012.0,2014.0,3.0,0,230-555-0100
2,642,AW00000003,Advanced Bike Components,"<StoreSurvey xmlns=""http://schemas.microsoft.c...",12345 Sterling Avenue,,Irving,75061,Texas,United States,2011.0,2014.0,5.0,0,279-555-0130
3,932,AW00000004,Modular Cycle Systems,"<StoreSurvey xmlns=""http://schemas.microsoft.c...",800 Interchange Blvd.,Suite 2501,Austin,78701,Texas,United States,2012.0,2014.0,3.0,0,710-555-0173
4,1026,AW00000005,Metropolitan Sports Supply,"<StoreSurvey xmlns=""http://schemas.microsoft.c...",482505 Warm Springs Blvd.,,Fremont,94536,California,United States,2012.0,2014.0,5.0,0,755-555-0100
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696,1032,AW00000697,Brakes and Gears,"<StoreSurvey xmlns=""http://schemas.microsoft.c...",9927 N. Main St.,,Tooele,84074,Utah,United States,2011.0,2014.0,3.0,0,774-555-0133
697,640,AW00000698,Western Bike Supplies,"<StoreSurvey xmlns=""http://schemas.microsoft.c...",566 S. Main,,Cedar City,84720,Utah,United States,2012.0,2013.0,2.0,0,944-555-0148
698,842,AW00000699,Sensational Discount Store,"<StoreSurvey xmlns=""http://schemas.microsoft.c...",6333 Cloverleaf Parkway,,Kannapolis,28081,North Carolina,United States,2014.0,2014.0,1.0,0,716-555-0123
699,1030,AW00000700,Underglaze and Finish Company,"<StoreSurvey xmlns=""http://schemas.microsoft.c...",8520 University City Blvd,,Charlotte,28202,North Carolina,United States,2013.0,2014.0,3.0,0,703-555-0158


In [38]:
# Inspect the column labels of the extracted frame.
df_reseller.columns

Index(['business_entity_id', 'reseller_name', 'demographics', 'address_line1',
       'address_line2', 'city', 'postal_code', 'state_province_name',
       'country_region_name', 'reseller_alternate_key', 'first_order_year',
       'last_order_year', 'order_month', 'order_frequency', 'phone'],
      dtype='object')

# Transform

In [20]:
def parse_store_demographics(xml_string):
    """Extract the store-survey fields from a ``demographics`` XML document.

    Parameters
    ----------
    xml_string : str or missing value
        XML text whose root element directly contains the survey elements
        ``AnnualSales``, ``AnnualRevenue``, ``BankName``, ``BusinessType``,
        ``YearOpened``, ``Specialty`` and ``NumberEmployees``. The root may
        or may not declare an XML namespace.

    Returns
    -------
    dict
        ``{}`` when the input is missing (``pd.isna``) or cannot be parsed.
        Otherwise a dict with the keys ``annual_sales``, ``annual_revenue``,
        ``bank_name``, ``business_type``, ``year_opened``, ``product_line``,
        ``number_employees``, ``min_payment_type`` and ``min_payment_amount``.
        Each value is the element's text (a string) or ``None`` when the
        element is absent; the two ``min_payment_*`` entries are always
        ``None`` because nothing in the document is read for them.
    """
    if pd.isna(xml_string):
        return {}

    try:
        root = ET.fromstring(xml_string)
    except (ET.ParseError, TypeError, ValueError):
        # Best effort: a malformed or non-text document yields no fields
        # instead of aborting the whole column transformation.
        return {}

    # ElementTree spells a namespaced tag as '{uri}LocalName'. Reuse the
    # root's '{uri}' prefix for child lookups; when the root carries no
    # namespace the prefix is empty, so plain documents are parsed too
    # (previously the root tag name itself was mistaken for the namespace
    # and every lookup came back empty).
    if root.tag.startswith('{'):
        tag_prefix = root.tag[:root.tag.index('}') + 1]
    else:
        tag_prefix = ''

    def namespace_find(tag):
        """Return the text of the direct child ``tag``, or None if absent."""
        el = root.find(f'{tag_prefix}{tag}')
        return el.text if el is not None else None

    return {
        'annual_sales': namespace_find('AnnualSales'),
        'annual_revenue': namespace_find('AnnualRevenue'),
        'bank_name': namespace_find('BankName'),
        'business_type': namespace_find('BusinessType'),
        'year_opened': namespace_find('YearOpened'),
        'product_line': namespace_find('Specialty'),
        'number_employees': namespace_find('NumberEmployees'),
        'min_payment_type': None,
        'min_payment_amount': None,
    }


In [21]:
# Turn each demographics XML document into a dict of survey fields,
# then spread the dict keys into DataFrame columns (one row per store).
parsed_demographics = df_reseller['demographics'].apply(parse_store_demographics)
demographics_df = parsed_demographics.apply(pd.Series)

In [22]:
# List the distinct business-type values present in the parsed survey data.
business_type_column = demographics_df['business_type']
tipos = pd.unique(business_type_column)
print(tipos)

['BM' 'BS' 'OS']


In [23]:
# Map the raw business-type codes to their descriptive labels.
business_type_map = {
    'BM': 'Warehouse',
    'OS': 'Value Added Reseller',
    'BS': 'Specialty Bike Shop'
}

# The column is overwritten in place, so this cell has to be safe to re-run:
# values that already are labels are kept as they are, everything else goes
# through the map (codes missing from the map become NaN, as before).
# A plain .map() on the second execution would turn every label into NaN.
business_type_labels = set(business_type_map.values())
current_business_type = demographics_df['business_type']
demographics_df['business_type'] = current_business_type.where(
    current_business_type.isin(business_type_labels),
    current_business_type.map(business_type_map),
)

print(demographics_df['business_type'].value_counts())


business_type
Warehouse               238
Value Added Reseller    232
Specialty Bike Shop     231
Name: count, dtype: int64


In [24]:
# Swap the raw XML column for the parsed survey columns (aligned on the row index).
reseller_without_xml = df_reseller.drop(columns=['demographics'])
df_with_demographics = pd.concat([reseller_without_xml, demographics_df], axis=1)
df_with_demographics

Unnamed: 0,business_entity_id,reseller_alternate_key,reseller_name,address_line1,address_line2,city,postal_code,state_province_name,country_region_name,first_order_year,...,phone,annual_sales,annual_revenue,bank_name,business_type,year_opened,product_line,number_employees,min_payment_type,min_payment_amount
0,934,AW00000001,A Bike Store,2251 Elliot Avenue,,Seattle,98104,Washington,United States,2011.0,...,245-555-0173,300000,30000,International Bank,Warehouse,1970,Road,2,,
1,1028,AW00000002,Progressive Sports,3207 S Grady Way,,Renton,98055,Washington,United States,2012.0,...,230-555-0100,800000,80000,International Security,Specialty Bike Shop,1972,Mountain,10,,
2,642,AW00000003,Advanced Bike Components,12345 Sterling Avenue,,Irving,75061,Texas,United States,2011.0,...,279-555-0130,1500000,150000,Primary International,Value Added Reseller,1974,Road,40,,
3,932,AW00000004,Modular Cycle Systems,800 Interchange Blvd.,Suite 2501,Austin,78701,Texas,United States,2012.0,...,710-555-0173,300000,30000,United Security,Warehouse,1976,Road,5,,
4,1026,AW00000005,Metropolitan Sports Supply,482505 Warm Springs Blvd.,,Fremont,94536,California,United States,2012.0,...,755-555-0100,800000,80000,Primary Bank & Reserve,Specialty Bike Shop,1978,Road,13,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696,1032,AW00000697,Brakes and Gears,9927 N. Main St.,,Tooele,84074,Utah,United States,2011.0,...,774-555-0133,800000,80000,United Security,Warehouse,1988,Mountain,12,,
697,640,AW00000698,Western Bike Supplies,566 S. Main,,Cedar City,84720,Utah,United States,2012.0,...,944-555-0148,1500000,150000,Primary Bank & Reserve,Specialty Bike Shop,1994,Road,48,,
698,842,AW00000699,Sensational Discount Store,6333 Cloverleaf Parkway,,Kannapolis,28081,North Carolina,United States,2014.0,...,716-555-0123,3000000,300000,Guardian Bank,Value Added Reseller,1987,Road,80,,
699,1030,AW00000700,Underglaze and Finish Company,8520 University City Blvd,,Charlotte,28202,North Carolina,United States,2013.0,...,703-555-0158,800000,80000,Reserve Security,Warehouse,1993,Road,11,,


In [25]:
# Display the combined frame (the same expression the previous cell ends with).
df_with_demographics

Unnamed: 0,business_entity_id,reseller_alternate_key,reseller_name,address_line1,address_line2,city,postal_code,state_province_name,country_region_name,first_order_year,...,phone,annual_sales,annual_revenue,bank_name,business_type,year_opened,product_line,number_employees,min_payment_type,min_payment_amount
0,934,AW00000001,A Bike Store,2251 Elliot Avenue,,Seattle,98104,Washington,United States,2011.0,...,245-555-0173,300000,30000,International Bank,Warehouse,1970,Road,2,,
1,1028,AW00000002,Progressive Sports,3207 S Grady Way,,Renton,98055,Washington,United States,2012.0,...,230-555-0100,800000,80000,International Security,Specialty Bike Shop,1972,Mountain,10,,
2,642,AW00000003,Advanced Bike Components,12345 Sterling Avenue,,Irving,75061,Texas,United States,2011.0,...,279-555-0130,1500000,150000,Primary International,Value Added Reseller,1974,Road,40,,
3,932,AW00000004,Modular Cycle Systems,800 Interchange Blvd.,Suite 2501,Austin,78701,Texas,United States,2012.0,...,710-555-0173,300000,30000,United Security,Warehouse,1976,Road,5,,
4,1026,AW00000005,Metropolitan Sports Supply,482505 Warm Springs Blvd.,,Fremont,94536,California,United States,2012.0,...,755-555-0100,800000,80000,Primary Bank & Reserve,Specialty Bike Shop,1978,Road,13,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696,1032,AW00000697,Brakes and Gears,9927 N. Main St.,,Tooele,84074,Utah,United States,2011.0,...,774-555-0133,800000,80000,United Security,Warehouse,1988,Mountain,12,,
697,640,AW00000698,Western Bike Supplies,566 S. Main,,Cedar City,84720,Utah,United States,2012.0,...,944-555-0148,1500000,150000,Primary Bank & Reserve,Specialty Bike Shop,1994,Road,48,,
698,842,AW00000699,Sensational Discount Store,6333 Cloverleaf Parkway,,Kannapolis,28081,North Carolina,United States,2014.0,...,716-555-0123,3000000,300000,Guardian Bank,Value Added Reseller,1987,Road,80,,
699,1030,AW00000700,Underglaze and Finish Company,8520 University City Blvd,,Charlotte,28202,North Carolina,United States,2013.0,...,703-555-0158,800000,80000,Reserve Security,Warehouse,1993,Road,11,,


In [26]:
# Link each reseller to DimGeography: read the geography keys through the
# ETL connection and left-join them on the four location columns, so
# resellers without a matching geography row are kept (geography_key is NaN).
geography_query = text("""
        SELECT geography_key, city, postal_code, state_province_name, english_country_region_name as country_region_name
        FROM dim_geography;
    """)
df_geo_with_keys = pd.read_sql(geography_query, etl_conn)

geography_join_columns = ['city', 'postal_code', 'state_province_name', 'country_region_name']
df_reseller_linked = df_with_demographics.merge(
    df_geo_with_keys,
    how='left',
    on=geography_join_columns,
)

In [27]:
# Report the (rows, columns) shape after the geography join, then show the frame.
merged_shape = df_reseller_linked.shape
print("Despues del merge con DimGeography:", merged_shape)
df_reseller_linked

Despues del merge con DimGeography: (701, 24)


Unnamed: 0,business_entity_id,reseller_alternate_key,reseller_name,address_line1,address_line2,city,postal_code,state_province_name,country_region_name,first_order_year,...,annual_sales,annual_revenue,bank_name,business_type,year_opened,product_line,number_employees,min_payment_type,min_payment_amount,geography_key
0,934,AW00000001,A Bike Store,2251 Elliot Avenue,,Seattle,98104,Washington,United States,2011.0,...,300000,30000,International Bank,Warehouse,1970,Road,2,,,18
1,1028,AW00000002,Progressive Sports,3207 S Grady Way,,Renton,98055,Washington,United States,2012.0,...,800000,80000,International Security,Specialty Bike Shop,1972,Mountain,10,,,210
2,642,AW00000003,Advanced Bike Components,12345 Sterling Avenue,,Irving,75061,Texas,United States,2011.0,...,1500000,150000,Primary International,Value Added Reseller,1974,Road,40,,,219
3,932,AW00000004,Modular Cycle Systems,800 Interchange Blvd.,Suite 2501,Austin,78701,Texas,United States,2012.0,...,300000,30000,United Security,Warehouse,1976,Road,5,,,209
4,1026,AW00000005,Metropolitan Sports Supply,482505 Warm Springs Blvd.,,Fremont,94536,California,United States,2012.0,...,800000,80000,Primary Bank & Reserve,Specialty Bike Shop,1978,Road,13,,,654
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696,1032,AW00000697,Brakes and Gears,9927 N. Main St.,,Tooele,84074,Utah,United States,2011.0,...,800000,80000,United Security,Warehouse,1988,Mountain,12,,,391
697,640,AW00000698,Western Bike Supplies,566 S. Main,,Cedar City,84720,Utah,United States,2012.0,...,1500000,150000,Primary Bank & Reserve,Specialty Bike Shop,1994,Road,48,,,649
698,842,AW00000699,Sensational Discount Store,6333 Cloverleaf Parkway,,Kannapolis,28081,North Carolina,United States,2014.0,...,3000000,300000,Guardian Bank,Value Added Reseller,1987,Road,80,,,263
699,1030,AW00000700,Underglaze and Finish Company,8520 University City Blvd,,Charlotte,28202,North Carolina,United States,2013.0,...,800000,80000,Reserve Security,Warehouse,1993,Road,11,,,313


In [28]:
# Clean values: replace missing entries with the placeholder text for each column.
fill_defaults = {
    'business_type': 'Unknown',
    'product_line': 'None',
}
for column_name, placeholder in fill_defaults.items():
    df_reseller_linked[column_name] = df_reseller_linked[column_name].fillna(placeholder)

In [29]:
# Final column set, following the warehouse's DimReseller layout.
# Reminder: account_number was removed from this list.
final_columns = [
    'geography_key',
    'reseller_alternate_key',
    'phone',
    'business_type',
    'reseller_name',
    'number_employees',
    'order_frequency',
    'order_month',
    'first_order_year',
    'last_order_year',
    'product_line',
    'address_line1',
    'address_line2',
    'annual_sales',
    'bank_name',
    'min_payment_type',
    'min_payment_amount',
    'annual_revenue',
    'year_opened',
]

# Keep only those columns, in that order.
df_to_load = df_reseller_linked.loc[:, final_columns]

In [30]:
# Preview the frame that the Load step writes to the warehouse.
df_to_load

Unnamed: 0,geography_key,reseller_alternate_key,phone,business_type,reseller_name,number_employees,order_frequency,order_month,first_order_year,last_order_year,product_line,address_line1,address_line2,annual_sales,bank_name,min_payment_type,min_payment_amount,annual_revenue,year_opened
0,18,AW00000001,245-555-0173,Warehouse,A Bike Store,2,0,3.0,2011.0,2012.0,Road,2251 Elliot Avenue,,300000,International Bank,,,30000,1970
1,210,AW00000002,230-555-0100,Specialty Bike Shop,Progressive Sports,10,0,3.0,2012.0,2014.0,Mountain,3207 S Grady Way,,800000,International Security,,,80000,1972
2,219,AW00000003,279-555-0130,Value Added Reseller,Advanced Bike Components,40,0,5.0,2011.0,2014.0,Road,12345 Sterling Avenue,,1500000,Primary International,,,150000,1974
3,209,AW00000004,710-555-0173,Warehouse,Modular Cycle Systems,5,0,3.0,2012.0,2014.0,Road,800 Interchange Blvd.,Suite 2501,300000,United Security,,,30000,1976
4,654,AW00000005,755-555-0100,Specialty Bike Shop,Metropolitan Sports Supply,13,0,5.0,2012.0,2014.0,Road,482505 Warm Springs Blvd.,,800000,Primary Bank & Reserve,,,80000,1978
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696,391,AW00000697,774-555-0133,Warehouse,Brakes and Gears,12,0,3.0,2011.0,2014.0,Mountain,9927 N. Main St.,,800000,United Security,,,80000,1988
697,649,AW00000698,944-555-0148,Specialty Bike Shop,Western Bike Supplies,48,0,2.0,2012.0,2013.0,Road,566 S. Main,,1500000,Primary Bank & Reserve,,,150000,1994
698,263,AW00000699,716-555-0123,Value Added Reseller,Sensational Discount Store,80,0,1.0,2014.0,2014.0,Road,6333 Cloverleaf Parkway,,3000000,Guardian Bank,,,300000,1987
699,313,AW00000700,703-555-0158,Warehouse,Underglaze and Finish Company,11,0,3.0,2013.0,2014.0,Road,8520 University City Blvd,,800000,Reserve Security,,,80000,1993


# Load

In [32]:
# Write the prepared rows into dw.dim_reseller through the ETL connection.
# if_exists='append' adds to the existing table, so executing this cell a
# second time inserts the same rows again (unless the table rejects them).
# The DataFrame index is not written.
df_to_load.to_sql(
    name='dim_reseller',
    con=etl_conn,
    schema='dw',
    if_exists='append',
    index=False,
)

701

In [None]:
# TODO: remember to remove account number from the structure