In [None]:
import pandas as pd
from sqlalchemy import text
from connection import connect
from utils.model_loader import ModelRegistry
from utils.translate_language import convert_language


In [2]:
# Open the database connections. connect() returns three handles, unpacked in order;
# below, co_oltp is used to read the source tables and etl_conn to read/write the dimensions.
co_oltp, etl_conn, etl_conn_or = connect()

In [None]:
# Load the translation models
registry = ModelRegistry()

# Preload both English -> target language pairs (Spanish first, then French).
for target_lang in ('es', 'fr'):
    registry.preload_model('en', target_lang)

# Each lookup yields a (tokenizer, model) pair for one language direction.
tokenizer_es, model_es = registry.get_model('en', 'es')
tokenizer_fr, model_fr = registry.get_model('en', 'fr')

# Extract

In [3]:
# Distinct geography rows (city, postal code, state/province, country, territory id)
# for addresses linked to a business entity that matches either a row in sales.customer
# (via person_id) or a row in sales.store; the WHERE clause drops addresses matching neither.
query_geography = text("""
    SELECT DISTINCT
        a.city,
        a.postal_code,
        sp.state_province_code,
        sp.name AS state_province_name,
        sp.country_region_code,
        cr.name AS country_region_name,
        st.territory_id AS sales_territory_alternate_key
    FROM
        person.address AS a
        INNER JOIN person.state_province AS sp
            ON a.state_province_id = sp.state_province_id
        INNER JOIN person.country_region AS cr
            ON sp.country_region_code = cr.country_region_code
        INNER JOIN sales.sales_territory AS st
            ON sp.territory_id = st.territory_id
        INNER JOIN person.business_entity_address AS bea
            ON a.address_id = bea.address_id
        LEFT JOIN sales.customer AS c
            ON bea.business_entity_id = c.person_id -- Unir si la entidad es un cliente individual
        LEFT JOIN sales.store AS s
            ON bea.business_entity_id = s.business_entity_id -- Unir si la entidad es un revendedor (tienda)
    WHERE
        c.customer_id IS NOT NULL OR s.business_entity_id IS NOT NULL;
""")

# Run the extract against the source (OLTP) connection.
df_geo = pd.read_sql(sql=query_geography, con=co_oltp)

In [4]:
# Preview the first rows of the extracted geography frame.
df_geo.head()

Unnamed: 0,city,postal_code,state_province_code,state_province_name,country_region_code,country_region_name,sales_territory_key
0,Lake George,12845,NY,New York,US,United States,2
1,North Ryde,2113,NSW,New South Wales,AU,Australia,9
2,Clay,13041,NY,New York,US,United States,2
3,Bell Gardens,90201,CA,California,US,United States,4
4,Burbank,91502,CA,California,US,United States,4


In [23]:
# (rows, columns) of the extracted geography frame.
df_geo.shape

(655, 7)

# Transform

In [5]:

# Make the source language explicit in the column name before adding the translations.
# Reassigning (instead of inplace=True) avoids mutating the frame object that earlier
# cells already displayed.
df_geo = df_geo.rename(columns={'country_region_name': 'english_country_region_name'})

# Add one translated country-name column per target language (French first, then Spanish),
# each derived from the English column by convert_language.
for target_column, lang_tokenizer, lang_model in (
    ('french_country_region_name', tokenizer_fr, model_fr),
    ('spanish_country_region_name', tokenizer_es, model_es),
):
    df_geo = convert_language(
        'english_country_region_name', target_column, lang_tokenizer, lang_model, df_geo
    )

df_geo

--- Loading model Helsinki-NLP/opus-mt-en-fr (this should only happen once) ---




Found 655 total rows, but only 6 unique values to translate.
Translation complete.
--- Loading model Helsinki-NLP/opus-mt-en-es (this should only happen once) ---




Found 655 total rows, but only 6 unique values to translate.
Translation complete.


Unnamed: 0,city,postal_code,state_province_code,state_province_name,country_region_code,english_country_region_name,sales_territory_key,french_country_region_name,spanish_country_region_name
0,Lake George,12845,NY,New York,US,United States,2,États-Unis,Estados Unidos
1,North Ryde,2113,NSW,New South Wales,AU,Australia,9,Australie,Australia
2,Clay,13041,NY,New York,US,United States,2,États-Unis,Estados Unidos
3,Bell Gardens,90201,CA,California,US,United States,4,États-Unis,Estados Unidos
4,Burbank,91502,CA,California,US,United States,4,États-Unis,Estados Unidos
...,...,...,...,...,...,...,...,...,...
650,Newport Hills,98006,WA,Washington,US,United States,1,États-Unis,Estados Unidos
651,Greeley,80631,CO,Colorado,US,United States,3,États-Unis,Estados Unidos
652,Gilroy,95020,CA,California,US,United States,4,États-Unis,Estados Unidos
653,Fremont,94536,CA,California,US,United States,4,États-Unis,Estados Unidos


In [6]:
# Preview the first rows of the geography frame after the translation step.
df_geo.head()

Unnamed: 0,city,postal_code,state_province_code,state_province_name,country_region_code,english_country_region_name,sales_territory_key,french_country_region_name,spanish_country_region_name
0,Lake George,12845,NY,New York,US,United States,2,États-Unis,Estados Unidos
1,North Ryde,2113,NSW,New South Wales,AU,Australia,9,Australie,Australia
2,Clay,13041,NY,New York,US,United States,2,États-Unis,Estados Unidos
3,Bell Gardens,90201,CA,California,US,United States,4,États-Unis,Estados Unidos
4,Burbank,91502,CA,California,US,United States,4,États-Unis,Estados Unidos


In [7]:
# (rows, columns) of the geography frame after the translation step.
df_geo.shape

(655, 8)

In [None]:
# Fetch the sales-territory dimension keys

# Pairs each territory's dimension key with its alternate key, which is the column
# the geography frame is later merged on.
territory_keys_sql = text("""
    SELECT sales_territory_key, sales_territory_alternate_key FROM dw.dim_sales_territory
    """)
df_territory_with_keys = pd.read_sql(sql=territory_keys_sql, con=etl_conn)

print(f"Se recuperaron {len(df_territory_with_keys)} registros de sales territory.")
print(df_territory_with_keys.head())

In [None]:
# Join geography with the sales-territory dimension

# The left join keeps every geography row; rows whose alternate key has no match in the
# dimension end up with a missing sales_territory_key.
# validate='many_to_one' makes pandas raise MergeError if the dimension contains duplicate
# alternate keys, which would otherwise silently multiply geography rows before the load.
df_geo_final = pd.merge(
    df_geo,
    df_territory_with_keys,
    on='sales_territory_alternate_key',
    how='left',
    validate='many_to_one',
)

In [None]:
# Columns written to the geography dimension, in load order.
columns_to_load = [
    'city',
    'state_province_code',
    'state_province_name',
    'country_region_code',
    'english_country_region_name',
    'spanish_country_region_name',
    'french_country_region_name',
    'postal_code',
    'sales_territory_key',
]

# Project the merged frame down to exactly those columns.
df_geo_to_load = df_geo_final[columns_to_load]

# Load

In [7]:
# Append the prepared rows to the dim_geography table through the ETL connection.
# NOTE(review): no schema is passed here, while the territory lookup reads from
# dw.dim_sales_territory — confirm the connection's default schema is the intended target.
# NOTE(review): if_exists='append' adds the rows again on every run of this cell —
# confirm the table is emptied (or otherwise deduplicated) before re-running.
df_geo_to_load.to_sql(
    name='dim_geography',
    con=etl_conn,
    if_exists='append',
    index=False,
)

655