In [1]:
import pandas as pd
from sqlalchemy import text
from connection import connect
from utils.to_spanish import to_spanish

In [2]:
# Open the project's database connections. The first returned value is
# discarded; `co_sa` is the connection the source query is read from and
# `etl_conn` is the connection the load step writes to.
_, etl_conn, co_sa = connect()


Esquemas encontrados: ['hr', 'person', 'production', 'public', 'purchasing', 'sales']

--- Tablas de Negocio Encontradas ---
Esquema 'hr' (6 tablas):
  > ['department', 'employee', 'employee_department_history', 'employee_pay_history', 'job_candidate']...
Esquema 'person' (13 tablas):
  > ['business_entity', 'address', 'address_type', 'business_entity_address', 'business_entity_contact']...
Esquema 'production' (25 tablas):
  > ['illustration', 'bill_of_materials', 'culture', 'document', 'location']...
Esquema 'public' (4 tablas):
  > ['awbuild_version', 'database_log', 'error_log', 'sysdiagrams']...
Esquema 'purchasing' (5 tablas):
  > ['product_vendor', 'purchase_order_detail', 'purchase_order_header', 'ship_method', 'vendor']...
Esquema 'sales' (19 tablas):
  > ['country_region_currency', 'credit_card', 'customer', 'currency', 'currency_rate']...
Tables de datos en la dw (debe estar vacia al iniciar la conexión
[]


# Extract

In [3]:
# Source query for the geography dimension.
# Returns the distinct city / postal code / state-province / country / sales
# territory combinations of addresses that are linked, through
# person.business_entity_address, to a business entity matching either a
# sales.customer row (via person_id) or a sales.store row.
query_geography = text("""
    SELECT DISTINCT
        a.city,
        a.postal_code,
        sp.state_province_code,
        sp.name AS state_province_name,
        sp.country_region_code,
        cr.name AS country_region_name,
        st.territory_id AS sales_territory_key
    FROM
        person.address AS a
        INNER JOIN person.state_province AS sp
            ON a.state_province_id = sp.state_province_id
        INNER JOIN person.country_region AS cr
            ON sp.country_region_code = cr.country_region_code
        INNER JOIN sales.sales_territory AS st
            ON sp.territory_id = st.territory_id
        INNER JOIN person.business_entity_address AS bea
            ON a.address_id = bea.address_id
        LEFT JOIN sales.customer AS c
            ON bea.business_entity_id = c.person_id -- Unir si la entidad es un cliente individual
        LEFT JOIN sales.store AS s
            ON bea.business_entity_id = s.business_entity_id -- Unir si la entidad es un revendedor (tienda)
    WHERE
        c.customer_id IS NOT NULL OR s.business_entity_id IS NOT NULL;
""")

# Run the query on the source connection and materialise the result as a DataFrame.
df_geo = pd.read_sql(sql=query_geography, con=co_sa)

In [4]:
# Preview the first five rows of the extracted geography data.
df_geo.head(n=5)

Unnamed: 0,city,postal_code,state_province_code,state_province_name,country_region_code,country_region_name,sales_territory_key
0,Lake George,12845,NY,New York,US,United States,2
1,North Ryde,2113,NSW,New South Wales,AU,Australia,9
2,Clay,13041,NY,New York,US,United States,2
3,Bell Gardens,90201,CA,California,US,United States,4
4,Burbank,91502,CA,California,US,United States,4


In [23]:
# (rows, columns) of the extract, before any transformation.
# NOTE(review): this cell's execution count (In [23]) is out of sequence with
# its neighbours; restart the kernel and run all cells before sharing.
df_geo.shape

(655, 7)

# Transform

In [5]:
# Translation lookup for the country names found in the extract
# (English name -> Spanish name, as shown by the printed dict).
country_map = to_spanish(df_geo, 'country_region_name')
print(country_map)

# Add the Spanish country name alongside the original one. Series.map leaves
# NaN for any name that is missing from country_map.
spanish_names = df_geo['country_region_name'].map(country_map)
df_geo['spanish_country_region_name'] = spanish_names

# TODO: link the sales_territory_key key to the SALES_TERRITORY dimension

{'United States': 'Estados Unidos', 'Australia': 'Australia', 'Canada': 'Canadá', 'Germany': 'Alemania', 'United Kingdom': 'Reino Unido', 'France': 'Francia'}


In [6]:
# Preview the first five rows, now including the Spanish country name column.
df_geo.head(n=5)

Unnamed: 0,city,postal_code,state_province_code,state_province_name,country_region_code,country_region_name,sales_territory_key,spanish_country_region_name
0,Lake George,12845,NY,New York,US,United States,2,Estados Unidos
1,North Ryde,2113,NSW,New South Wales,AU,Australia,9,Australia
2,Clay,13041,NY,New York,US,United States,2,Estados Unidos
3,Bell Gardens,90201,CA,California,US,United States,4,Estados Unidos
4,Burbank,91502,CA,California,US,United States,4,Estados Unidos


In [7]:
# (rows, columns) after the transform; the row count should match the extract,
# with one extra column for the Spanish country name.
df_geo.shape

(655, 8)

# Load

In [8]:
# Write the transformed rows into dw.dim_geography through the ETL connection.
# The DataFrame index is not written as a column.
# NOTE(review): if_exists='append' inserts the rows again every time this cell
# runs, and nothing in this notebook clears the table first, so a re-run
# duplicates them -- confirm the table is emptied before reloading.
df_geo.to_sql(
    name='dim_geography',
    con=etl_conn,
    schema='dw',
    if_exists='append',
    index=False,
)

655