In [3]:
import pandas as pd
import xml.etree.ElementTree as ET
from sqlalchemy import text
from connection import connect
from utils.to_spanish import to_spanish

In [6]:
# Open the project connections. `connect()` returns three objects and prints
# a summary of the schemas/tables it finds (see the cell output).
# In the cells below, `inspector_co` is used to list table columns and
# `co_sa` is the connection passed to `pd.read_sql`.
# `etl_con` is not used in the cells visible here — presumably the
# data-warehouse connection; TODO confirm against `connection.connect`.
inspector_co, etl_con, co_sa = connect()

Esquemas encontrados: ['hr', 'person', 'production', 'public', 'purchasing', 'sales']

--- Tablas de Negocio Encontradas ---
Esquema 'hr' (6 tablas):
  > ['department', 'employee', 'employee_department_history', 'employee_pay_history', 'job_candidate']...
Esquema 'person' (13 tablas):
  > ['business_entity', 'address', 'address_type', 'business_entity_address', 'business_entity_contact']...
Esquema 'production' (25 tablas):
  > ['illustration', 'bill_of_materials', 'culture', 'document', 'location']...
Esquema 'public' (4 tablas):
  > ['awbuild_version', 'database_log', 'error_log', 'sysdiagrams']...
Esquema 'purchasing' (5 tablas):
  > ['product_vendor', 'purchase_order_detail', 'purchase_order_header', 'ship_method', 'vendor']...
Esquema 'sales' (19 tablas):
  > ['country_region_currency', 'credit_card', 'customer', 'currency', 'currency_rate']...
Tables de datos en la dw (debe estar vacia al iniciar la conexión
['dim_geography']


# Data exploration

In [6]:
# Show the column metadata (name, type, nullable, default, ...) of person.person.
inspector_co.get_columns('person', schema='person')

[{'name': 'business_entity_id',
  'type': INTEGER(),
  'nullable': False,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'person_type',
  'type': CHAR(length=2),
  'nullable': False,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'name_style',
  'type': BOOLEAN(),
  'nullable': False,
  'default': 'false',
  'autoincrement': False,
  'comment': None},
 {'name': 'title',
  'type': TEXT(),
  'nullable': True,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'first_name',
  'type': TEXT(),
  'nullable': False,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'middle_name',
  'type': TEXT(),
  'nullable': True,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'last_name',
  'type': TEXT(),
  'nullable': False,
  'default': None,
  'autoincrement': False,
  'comment': None},
 {'name': 'suffix',
  'type': TEXT(),
  'nullable': True,
  'default': None,
  'autoi

In [8]:
# (schema, table) pairs whose column layout we want to inspect
candidate_tables = [
    ('person', 'person'),
    ('sales', 'customer'),
    ('person', 'address'),
    ('person', 'state_province'),
    ('person', 'email_address'),
    ('person', 'person_phone'),
    ('sales', 'sales_territory'),
    ('sales', 'sales_order_header'),
]

# Print a header plus one "name / type" line per column. A table that cannot
# be inspected is reported and the loop carries on with the next one.
for schema, table in candidate_tables:
    try:
        cols = inspector_co.get_columns(table, schema=schema)
        print(f"\n{schema}.{table} -> {len(cols)} columnas")
        for column in cols:
            print(f"  - {column['name']:30} {column['type']}")
    except Exception as err:
        print(f"\nNo se pudo obtener columnas de {schema}.{table}: {err}")



person.person -> 13 columnas
  - business_entity_id             INTEGER
  - person_type                    CHAR(2)
  - name_style                     BOOLEAN
  - title                          TEXT
  - first_name                     TEXT
  - middle_name                    TEXT
  - last_name                      TEXT
  - suffix                         TEXT
  - email_promotion                INTEGER
  - additional_contact_info        TEXT
  - demographics                   TEXT
  - rowguid                        CHAR(36)
  - modified_date                  TIMESTAMP

sales.customer -> 7 columnas
  - customer_id                    INTEGER
  - person_id                      INTEGER
  - store_id                       INTEGER
  - territory_id                   INTEGER
  - account_number                 TEXT
  - rowguid                        CHAR(36)
  - modified_date                  TIMESTAMP

person.address -> 9 columnas
  - address_id                     INTEGER
  - address_line_1       

Consulta de unión de los valores principales

    SELECT
        c.account_number as alternate_key,
        p.title,
        p.first_name,
        p.middle_name,
        p.last_name,
        p.suffix,
        p.name_style,
        p.demographics,
        e.email_address,
        pp.phone_number,
        a.address_line_1,
        a.address_line_2
    FROM sales.customer c
    INNER JOIN person.person p
        ON c.person_id = p.business_entity_id
    INNER JOIN person.email_address e
        ON p.business_entity_id = e.business_entity_id
    INNER JOIN person.person_phone pp
        ON p.business_entity_id = pp.business_entity_id
    INNER JOIN person.business_entity_address bea
        ON p.business_entity_id = bea.business_entity_id
    INNER JOIN person.address a
        ON bea.address_id = a.address_id


Tabla relacionada al cliente

    CREATE TABLE IF NOT EXISTS dw.dim_customer (
        customer_key SERIAL PRIMARY KEY,
        customer_alternate_key TEXT,
        geograph_key INT,
        title TEXT,
        first_name TEXT,
        middle_name TEXT,
        last_name TEXT,
        suffix TEXT,
        name_style INT,
        birth_date DATE,
        gender VARCHAR(1),
        marital_status VARCHAR(1),
        yearly_income INT,
        total_children INT,
        number_children_at_home INT,
        english_education TEXT,
        spanish_education TEXT,
        english_occupation TEXT,
        spanish_occupation TEXT,
        home_owner_flag INT,
        number_cars_owned INT,
        email_address TEXT,
        phone TEXT,
        address_line_1 TEXT,
        address_line_2 TEXT,
        date_first_purchase DATE,
        commute_distance TEXT
    )

# Extract

In [25]:
# Source query for the customer dimension: joins each customer to its person
# record, e-mail address, phone number and address.
# `demographics` is fetched as-is here and parsed in a later cell.
# NOTE(review): every join is an INNER JOIN, so customers lacking a person,
# e-mail, phone or address row are dropped, and a person with several
# e-mails/phones/addresses would produce several rows — confirm this is
# intended for a dimension keyed by account_number.
query_dimension = text("""
    SELECT
        c.account_number as alternate_key,
        p.title,
        p.first_name,
        p.middle_name,
        p.last_name,
        p.suffix,
        p.name_style,
        p.demographics,
        e.email_address,
        pp.phone_number,
        a.address_line_1,
        a.address_line_2
    FROM sales.customer c
    INNER JOIN person.person p
        ON c.person_id = p.business_entity_id
    INNER JOIN person.email_address e
        ON p.business_entity_id = e.business_entity_id
    INNER JOIN person.person_phone pp
        ON p.business_entity_id = pp.business_entity_id
    INNER JOIN person.business_entity_address bea
        ON p.business_entity_id = bea.business_entity_id
    INNER JOIN person.address a
        ON bea.address_id = a.address_id
""")

# Run the query on the `co_sa` connection and load the result into a DataFrame.
df_ctm = pd.read_sql(query_dimension, co_sa)


In [26]:
# List the columns returned by the extraction query.
df_ctm.columns

Index(['alternate_key', 'title', 'first_name', 'middle_name', 'last_name',
       'suffix', 'name_style', 'demographics', 'email_address', 'phone_number',
       'address_line_1', 'address_line_2'],
      dtype='object')

In [27]:
# Count the Python types found in the `demographics` column
# (the XML parsing in the next cell is applied to every value of it).
print(df_ctm['demographics'].apply(type).value_counts())

demographics
<class 'str'>    18508
Name: count, dtype: int64


In [28]:
# Extract the individual fields stored in the `demographics` XML column

def extract_text_or_none(element):
    """Return ``element.text``, or ``None`` when ``element`` is ``None``."""
    return element.text if element is not None else None

# Output column name -> child tag looked up under the XML root.
# Single source of truth for both the parsed rows and the fallback row, so the
# two can never drift apart again.
_DEMOGRAPHICS_TAGS = {
    'birth_date': 'BirthDate',
    'gender': 'Gender',
    'marital_status': 'MaritalStatus',
    'yearly_income': 'YearlyIncome',
    'total_children': 'TotalChildren',
    'number_children_at_home': 'NumberChildrenAtHome',
    'education': 'Education',
    'occupation': 'Occupation',
    'home_owner_flag': 'HomeOwnerFlag',
    'number_cars_owned': 'NumberCarsOwned',
    'date_first_purchase': 'DateFirstPurchase',
    'commute_distance': 'CommuteDistance',
}

# Fallback row used when the XML is missing or cannot be parsed. It has
# exactly the same keys as a successfully parsed row (the original version
# lacked 'total_children' and 'number_children_at_home').
none_xml_columns_dict = dict.fromkeys(_DEMOGRAPHICS_TAGS)

# Not referenced by the cells visible in this section; kept in case later
# cells rely on them.
education_list = []
occupations_list = []

def extract_demographics(xml_string):
    """Parse one demographics XML document into a flat dict of text fields.

    Parameters
    ----------
    xml_string : str or missing value
        XML document whose root element holds the demographic fields as
        direct children. The root may or may not carry an XML namespace.

    Returns
    -------
    dict
        One entry per key of ``_DEMOGRAPHICS_TAGS``; each value is the child
        element's text, or ``None`` when that child is absent. When
        ``xml_string`` is missing (``pd.isna``) or is not parseable XML, a
        fresh copy of ``none_xml_columns_dict`` (all ``None``) is returned.
    """
    if pd.isna(xml_string):
        # Return a copy so callers can never mutate the shared fallback.
        return dict(none_xml_columns_dict)

    try:
        root = ET.fromstring(xml_string)
    except (ET.ParseError, TypeError, ValueError):
        # Malformed/empty XML or a non-text value: best-effort fallback row.
        return dict(none_xml_columns_dict)

    # ElementTree spells namespaced tags as '{uri}Tag'. Only build a prefix
    # when the root really has a namespace; otherwise search for bare tags
    # (previously a namespace-less root produced all-None rows silently).
    prefix = ''
    if root.tag.startswith('{'):
        prefix = root.tag.split('}', 1)[0] + '}'

    return {
        column: extract_text_or_none(root.find(f'{prefix}{tag}'))
        for column, tag in _DEMOGRAPHICS_TAGS.items()
    }

# Expand the demographics XML into one column per field and swap those columns
# in for the raw XML column.
# The guard keeps the cell re-runnable: once 'demographics' has been replaced,
# running the cell again leaves df_ctm untouched instead of raising KeyError.
if 'demographics' in df_ctm.columns:
    # Build the frame once from the list of dicts; `.apply(pd.Series)` would
    # construct a separate Series for every row.
    demographics_df = pd.DataFrame(
        df_ctm['demographics'].map(extract_demographics).tolist(),
        index=df_ctm.index,
    )
    df_ctm = pd.concat([df_ctm.drop(columns='demographics'), demographics_df], axis=1)


In [29]:
# Preview the last rows, including the columns parsed from the demographics XML.
df_ctm.tail()

Unnamed: 0,alternate_key,title,first_name,middle_name,last_name,suffix,name_style,email_address,phone_number,address_line_1,...,marital_status,yearly_income,total_children,number_children_at_home,education,occupation,home_owner_flag,number_cars_owned,date_first_purchase,commute_distance
18503,AW00019379,,Crystal,,Guo,,False,crystal18@adventure-works.com,1 (11) 500 555-0171,988 Mt. Everest Court,...,S,0-25000,0,0,Partial High School,Manual,1,2,2004-04-19Z,0-1 Miles
18504,AW00013933,,Isabella,F,Richardson,,False,isabella91@adventure-works.com,910-555-0166,7413 Alpine Drive,...,M,50001-75000,1,0,Partial College,Skilled Manual,1,1,2003-08-30Z,2-5 Miles
18505,AW00024634,,Crystal,S,He,,False,crystal19@adventure-works.com,813-555-0148,4764 East Avenue,...,M,50001-75000,3,0,Bachelors,Management,1,2,2004-04-12Z,10+ Miles
18506,AW00021127,,Crystal,,Zheng,,False,crystal20@adventure-works.com,1 (11) 500 555-0171,"34334, rue Jean Mermoz",...,S,25001-50000,0,0,Partial College,Clerical,1,1,2004-02-15Z,2-5 Miles
18507,AW00027980,,Crystal,,Hu,,False,crystal21@adventure-works.com,1 (11) 500 555-0126,6022 La Salle Ct.,...,M,75001-100000,0,0,Bachelors,Professional,0,4,2003-11-17Z,10+ Miles


In [30]:
# Number of missing values in the parsed marital_status column
null_sum = df_ctm['marital_status'].isna().sum()
print(null_sum)

0


In [31]:
# Keep the first five non-null marital_status values.
# NOTE(review): the result is only assigned; it is neither displayed nor used
# in the cells visible here — confirm whether this cell is still needed.
no_null_values = df_ctm['marital_status'].dropna().head(5)

In [33]:
# Build the translation lookups for the two categorical columns.
# Judging by the printed output, `to_spanish` returns a dict mapping each
# original value to its Spanish text; its implementation lives in
# utils.to_spanish and is not visible here.
spanish_education_dict = to_spanish(df_ctm, 'education')
spanish_occupations_dict = to_spanish(df_ctm, 'occupation')

# Show both lookups, education first
for translation_lookup in (spanish_education_dict, spanish_occupations_dict):
    print(translation_lookup)

{'Graduate Degree': 'Licenciatura', 'Bachelors ': 'Licenciaturas', 'Partial College': 'Universidad Parcial', 'Partial High School': 'Preparatoria Parcial', 'High School': 'Escuela secundaria'}
{'Clerical': 'Clerical', 'Professional': 'Profesional', 'Skilled Manual': 'Manual especializado', 'Manual': 'Manual', 'Management': 'Gestión'}


In [34]:
# Add the Spanish translations as new columns by mapping each value through
# the lookups built in the previous cell (values missing from a lookup become NaN).
# NOTE(review): the new columns are named '*_dict', whereas the dim_customer
# DDL in this notebook names them spanish_education / spanish_occupation —
# confirm they are renamed before loading.
df_ctm['spanish_education_dict'] = df_ctm['education'].map(spanish_education_dict)
df_ctm['spanish_occupation_dict'] = df_ctm['occupation'].map(spanish_occupations_dict)

In [35]:
# Preview the first rows to check the newly added Spanish columns.
df_ctm.head()

Unnamed: 0,alternate_key,title,first_name,middle_name,last_name,suffix,name_style,email_address,phone_number,address_line_1,...,total_children,number_children_at_home,education,occupation,home_owner_flag,number_cars_owned,date_first_purchase,commute_distance,spanish_education_dict,spanish_occupation_dict
0,AW00011377,Mr.,David,R.,Robinett,,False,david22@adventure-works.com,238-555-0100,Pappelallee 6667,...,4,0,Graduate Degree,Clerical,1,0,2003-09-01Z,0-1 Miles,Licenciatura,Clerical
1,AW00011913,Ms.,Rebecca,A.,Robinson,,False,rebecca3@adventure-works.com,648-555-0100,1861 Chinquapin Ct,...,3,3,Bachelors,Professional,1,1,2004-06-05Z,5-10 Miles,Licenciaturas,Profesional
2,AW00011952,Ms.,Dorothy,B.,Robinson,,False,dorothy3@adventure-works.com,423-555-0100,4693 Mills Dr.,...,2,0,Partial College,Skilled Manual,0,2,2002-04-07Z,1-2 Miles,Universidad Parcial,Manual especializado
3,AW00020164,Ms.,Carol Ann,F.,Rockne,,False,carolann0@adventure-works.com,439-555-0100,1312 Skycrest Drive,...,1,0,Bachelors,Clerical,1,0,2001-10-27Z,0-1 Miles,Licenciaturas,Clerical
4,AW00020211,Mr.,Scott,M.,Rodgers,,False,scott10@adventure-works.com,989-555-0100,9860 Brookview Drive,...,2,2,Bachelors,Professional,1,1,2002-04-18Z,2-5 Miles,Licenciaturas,Profesional
