In [None]:
import pandas as pd
import xml.etree.ElementTree as ET
from sqlalchemy import text
from connection import connect
from translate_language import convert_language
import numpy as np

In [4]:
# Conexión
co_oltp,etl_conn,etl_conn_or= connect()

# Extract

In [5]:
# Recuperando dimension de geografía

df_geo_with_keys = pd.read_sql(
    text("""
        SELECT geography_key, city, postal_code, state_province_code, country_region_code
    FROM dim_geography;
    """),
    etl_conn
)

print(f"Se recuperaron {len(df_geo_with_keys)} registros de geografía.")
print(df_geo_with_keys.head())


Se recuperaron 655 registros de geografía.
   geography_key          city postal_code state_province_code  \
0            656   Lake George       12845                 NY    
1            657    North Ryde        2113                 NSW   
2            658          Clay       13041                 NY    
3            659  Bell Gardens       90201                 CA    
4            660       Burbank       91502                 CA    

  country_region_code  
0                  US  
1                  AU  
2                  US  
3                  US  
4                  US  


In [6]:
query_dimension = text("""

--- SOLO COMPRAS HECHAS DESDE INTERNET
WITH internet_customers AS (
    SELECT DISTINCT customer_id
    FROM sales.sales_order_header
    WHERE online_order_flag = true
)


SELECT
    c.account_number AS customer_alternate_key,
    p.title,
    p.first_name,
    p.middle_name,
    p.last_name,
    p.suffix,
    p.name_style,
    p.demographics,
    e.email_address,
    rp.phone_number AS phone,
    a.address_line_1 AS address_line1,
    a.address_line_2 AS address_line2,

    a.city,
    a.postal_code,
    sp.state_province_code,
    cr.country_region_code
FROM
    sales.customer AS c
    INNER JOIN internet_customers AS ic
        ON c.customer_id = ic.customer_id
    INNER JOIN person.person AS p
        ON c.person_id = p.business_entity_id
    INNER JOIN person.email_address AS e
        ON p.business_entity_id = e.business_entity_id
    LEFT JOIN person.person_phone AS rp
        ON p.business_entity_id = rp.business_entity_id
    INNER JOIN person.business_entity_address AS bea
        ON p.business_entity_id = bea.business_entity_id
    INNER JOIN person.address AS a
        ON bea.address_id = a.address_id
    INNER JOIN person.state_province AS sp
        ON a.state_province_id = sp.state_province_id
    INNER JOIN person.country_region AS cr
        ON sp.country_region_code = cr.country_region_code
    INNER JOIN person.address_type AS at
        ON bea.address_type_id = at.address_type_id
WHERE
    at.name = 'Home';
""")

df_ctm = pd.read_sql(query_dimension, co_oltp)


In [7]:
print(df_ctm.columns)
print(df_ctm.shape)

Index(['customer_alternate_key', 'title', 'first_name', 'middle_name',
       'last_name', 'suffix', 'name_style', 'demographics', 'email_address',
       'phone', 'address_line1', 'address_line2', 'city', 'postal_code',
       'state_province_code', 'country_region_code'],
      dtype='object')
(18484, 16)


In [8]:
df_ctm[df_ctm['customer_alternate_key'].isnull()]

Unnamed: 0,customer_alternate_key,title,first_name,middle_name,last_name,suffix,name_style,demographics,email_address,phone,address_line1,address_line2,city,postal_code,state_province_code,country_region_code


# Transform

In [9]:
print(df_ctm['demographics'].apply(type).value_counts())

demographics
<class 'str'>    18484
Name: count, dtype: int64


In [10]:
# Extraer elementos del xml demographics

def extract_text_or_none(element):
    return element.text if element is not None else None

none_xml_columns_dict = {
            'birth_date': None,
            'gender' : None,
            'marital_status' : None,
            'yearly_income' : None,
            'education' : None,
            'occupation' : None,
            'house_owner_flag' : None,
            'number_cars_owned' : None,
            'date_first_purchase' : None,
            'commute_distance' : None,
        }

def extract_demographics(xml_string):
    if pd.isna(xml_string):
        return none_xml_columns_dict

    try:
        root = ET.fromstring(xml_string)
        ns = {'ns': root.tag.split('}')[0].strip('{')}

        def namespace_find(tag):
            el = root.find(f'ns:{tag}', ns)
            return el.text if el is not None else None

        return {
            'birth_date': namespace_find('BirthDate'),
            'gender': namespace_find('Gender'),
            'marital_status': namespace_find('MaritalStatus'),
            'yearly_income': namespace_find('YearlyIncome'),
            'total_children': namespace_find('TotalChildren'),
            'number_children_at_home': namespace_find('NumberChildrenAtHome'),
            'education': namespace_find('Education'),
            'occupation': namespace_find('Occupation'),
            'house_owner_flag': namespace_find('HomeOwnerFlag'),
            'number_cars_owned': namespace_find('NumberCarsOwned'),
            'date_first_purchase': namespace_find('DateFirstPurchase'),
            'commute_distance': namespace_find('CommuteDistance'),
        }

    except Exception:
        return none_xml_columns_dict

demographics_df = df_ctm['demographics'].apply(extract_demographics).apply(pd.Series)
df_ctm = pd.concat([df_ctm.drop('demographics', axis=1), demographics_df], axis=1)


In [11]:
# Pasando valores a booleanos
df_ctm['house_owner_flag'] = df_ctm['house_owner_flag'].astype(bool)

df_ctm.tail()

Unnamed: 0,customer_alternate_key,title,first_name,middle_name,last_name,suffix,name_style,email_address,phone,address_line1,...,marital_status,yearly_income,total_children,number_children_at_home,education,occupation,house_owner_flag,number_cars_owned,date_first_purchase,commute_distance
18479,AW00019379,,Crystal,,Guo,,False,crystal18@adventure-works.com,1 (11) 500 555-0171,988 Mt. Everest Court,...,S,0-25000,0,0,Partial High School,Manual,True,2,2004-04-19Z,0-1 Miles
18480,AW00013933,,Isabella,F,Richardson,,False,isabella91@adventure-works.com,910-555-0166,7413 Alpine Drive,...,M,50001-75000,1,0,Partial College,Skilled Manual,True,1,2003-08-30Z,2-5 Miles
18481,AW00024634,,Crystal,S,He,,False,crystal19@adventure-works.com,813-555-0148,4764 East Avenue,...,M,50001-75000,3,0,Bachelors,Management,True,2,2004-04-12Z,10+ Miles
18482,AW00021127,,Crystal,,Zheng,,False,crystal20@adventure-works.com,1 (11) 500 555-0171,"34334, rue Jean Mermoz",...,S,25001-50000,0,0,Partial College,Clerical,True,1,2004-02-15Z,2-5 Miles
18483,AW00027980,,Crystal,,Hu,,False,crystal21@adventure-works.com,1 (11) 500 555-0126,6022 La Salle Ct.,...,M,75001-100000,0,0,Bachelors,Professional,True,4,2003-11-17Z,10+ Miles


In [12]:
# Verificando valores nulos
columns_with_nulls = df_ctm.columns[df_ctm.isnull().any()].tolist()
print("Columnas con valores nulos")
print(columns_with_nulls)

Columnas con valores nulos
['title', 'middle_name', 'suffix', 'address_line2']


In [13]:
df_ctm.rename(columns={'occupation':'english_occupation','education':'english_education'},inplace=True)

df_ctm=convert_language('en','fr','english_education','french_education',df_ctm)
df_ctm=convert_language('en','es','english_education','spanish_education',df_ctm)
df_ctm=convert_language('en','es','english_occupation','spanish_occupation',df_ctm)
df_ctm=convert_language('en','fr','english_occupation','french_occupation',df_ctm)

df_ctm

--- Loading model Helsinki-NLP/opus-mt-en-fr (this should only happen once) ---




Found 18484 total rows, but only 5 unique values to translate.
Translation complete.
--- Loading model Helsinki-NLP/opus-mt-en-es (this should only happen once) ---




Found 18484 total rows, but only 5 unique values to translate.
Translation complete.
Found 18484 total rows, but only 5 unique values to translate.
Translation complete.
Found 18484 total rows, but only 5 unique values to translate.
Translation complete.


Unnamed: 0,customer_alternate_key,title,first_name,middle_name,last_name,suffix,name_style,email_address,phone,address_line1,...,english_education,english_occupation,house_owner_flag,number_cars_owned,date_first_purchase,commute_distance,french_education,spanish_education,spanish_occupation,french_occupation
0,AW00011377,Mr.,David,R.,Robinett,,False,david22@adventure-works.com,238-555-0100,Pappelallee 6667,...,Graduate Degree,Clerical,True,0,2003-09-01Z,0-1 Miles,Diplôme d'études supérieures,Graduado,Clerical,Clercs
1,AW00011913,Ms.,Rebecca,A.,Robinson,,False,rebecca3@adventure-works.com,648-555-0100,1861 Chinquapin Ct,...,Bachelors,Professional,True,1,2004-06-05Z,5-10 Miles,Baccalauréat,Bachelors,Cuadro orgánico,Professionnel
2,AW00011952,Ms.,Dorothy,B.,Robinson,,False,dorothy3@adventure-works.com,423-555-0100,4693 Mills Dr.,...,Partial College,Skilled Manual,True,2,2002-04-07Z,1-2 Miles,Collège partiel,Colegio parcial,Manual especializado,Manuel qualifié
3,AW00020164,Ms.,Carol Ann,F.,Rockne,,False,carolann0@adventure-works.com,439-555-0100,1312 Skycrest Drive,...,Bachelors,Clerical,True,0,2001-10-27Z,0-1 Miles,Baccalauréat,Bachelors,Clerical,Clercs
4,AW00020211,Mr.,Scott,M.,Rodgers,,False,scott10@adventure-works.com,989-555-0100,9860 Brookview Drive,...,Bachelors,Professional,True,1,2002-04-18Z,2-5 Miles,Baccalauréat,Bachelors,Cuadro orgánico,Professionnel
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18479,AW00019379,,Crystal,,Guo,,False,crystal18@adventure-works.com,1 (11) 500 555-0171,988 Mt. Everest Court,...,Partial High School,Manual,True,2,2004-04-19Z,0-1 Miles,École secondaire partielle,Escuela secundaria parcial,Manual,Manuel
18480,AW00013933,,Isabella,F,Richardson,,False,isabella91@adventure-works.com,910-555-0166,7413 Alpine Drive,...,Partial College,Skilled Manual,True,1,2003-08-30Z,2-5 Miles,Collège partiel,Colegio parcial,Manual especializado,Manuel qualifié
18481,AW00024634,,Crystal,S,He,,False,crystal19@adventure-works.com,813-555-0148,4764 East Avenue,...,Bachelors,Management,True,2,2004-04-12Z,10+ Miles,Baccalauréat,Bachelors,Gestión,Gestion
18482,AW00021127,,Crystal,,Zheng,,False,crystal20@adventure-works.com,1 (11) 500 555-0171,"34334, rue Jean Mermoz",...,Partial College,Clerical,True,1,2004-02-15Z,2-5 Miles,Collège partiel,Colegio parcial,Clerical,Clercs


In [14]:
df_ctm.head()

Unnamed: 0,customer_alternate_key,title,first_name,middle_name,last_name,suffix,name_style,email_address,phone,address_line1,...,english_education,english_occupation,house_owner_flag,number_cars_owned,date_first_purchase,commute_distance,french_education,spanish_education,spanish_occupation,french_occupation
0,AW00011377,Mr.,David,R.,Robinett,,False,david22@adventure-works.com,238-555-0100,Pappelallee 6667,...,Graduate Degree,Clerical,True,0,2003-09-01Z,0-1 Miles,Diplôme d'études supérieures,Graduado,Clerical,Clercs
1,AW00011913,Ms.,Rebecca,A.,Robinson,,False,rebecca3@adventure-works.com,648-555-0100,1861 Chinquapin Ct,...,Bachelors,Professional,True,1,2004-06-05Z,5-10 Miles,Baccalauréat,Bachelors,Cuadro orgánico,Professionnel
2,AW00011952,Ms.,Dorothy,B.,Robinson,,False,dorothy3@adventure-works.com,423-555-0100,4693 Mills Dr.,...,Partial College,Skilled Manual,True,2,2002-04-07Z,1-2 Miles,Collège partiel,Colegio parcial,Manual especializado,Manuel qualifié
3,AW00020164,Ms.,Carol Ann,F.,Rockne,,False,carolann0@adventure-works.com,439-555-0100,1312 Skycrest Drive,...,Bachelors,Clerical,True,0,2001-10-27Z,0-1 Miles,Baccalauréat,Bachelors,Clerical,Clercs
4,AW00020211,Mr.,Scott,M.,Rodgers,,False,scott10@adventure-works.com,989-555-0100,9860 Brookview Drive,...,Bachelors,Professional,True,1,2002-04-18Z,2-5 Miles,Baccalauréat,Bachelors,Cuadro orgánico,Professionnel


In [15]:
# Vinculando dim_customer con dim_geography

linking_columns = ['city', 'postal_code', 'state_province_code', 'country_region_code']

df_customer_final = pd.merge(
    df_ctm,
    df_geo_with_keys,
    on=linking_columns,
    how='left'
)

print("Columnas después del merge:", df_customer_final.columns)
print(df_customer_final[['first_name', 'city', 'geography_key']].head())

Columnas después del merge: Index(['customer_alternate_key', 'title', 'first_name', 'middle_name',
       'last_name', 'suffix', 'name_style', 'email_address', 'phone',
       'address_line1', 'address_line2', 'city', 'postal_code',
       'state_province_code', 'country_region_code', 'birth_date', 'gender',
       'marital_status', 'yearly_income', 'total_children',
       'number_children_at_home', 'english_education', 'english_occupation',
       'house_owner_flag', 'number_cars_owned', 'date_first_purchase',
       'commute_distance', 'french_education', 'spanish_education',
       'spanish_occupation', 'french_occupation', 'geography_key'],
      dtype='object')
  first_name           city  geography_key
0      David       Solingen            905
1    Rebecca        Seaford           1277
2    Dorothy        Geelong            826
3  Carol Ann      Lancaster            955
4      Scott  East Brisbane            833


In [16]:
df_customer_final

Unnamed: 0,customer_alternate_key,title,first_name,middle_name,last_name,suffix,name_style,email_address,phone,address_line1,...,english_occupation,house_owner_flag,number_cars_owned,date_first_purchase,commute_distance,french_education,spanish_education,spanish_occupation,french_occupation,geography_key
0,AW00011377,Mr.,David,R.,Robinett,,False,david22@adventure-works.com,238-555-0100,Pappelallee 6667,...,Clerical,True,0,2003-09-01Z,0-1 Miles,Diplôme d'études supérieures,Graduado,Clerical,Clercs,905
1,AW00011913,Ms.,Rebecca,A.,Robinson,,False,rebecca3@adventure-works.com,648-555-0100,1861 Chinquapin Ct,...,Professional,True,1,2004-06-05Z,5-10 Miles,Baccalauréat,Bachelors,Cuadro orgánico,Professionnel,1277
2,AW00011952,Ms.,Dorothy,B.,Robinson,,False,dorothy3@adventure-works.com,423-555-0100,4693 Mills Dr.,...,Skilled Manual,True,2,2002-04-07Z,1-2 Miles,Collège partiel,Colegio parcial,Manual especializado,Manuel qualifié,826
3,AW00020164,Ms.,Carol Ann,F.,Rockne,,False,carolann0@adventure-works.com,439-555-0100,1312 Skycrest Drive,...,Clerical,True,0,2001-10-27Z,0-1 Miles,Baccalauréat,Bachelors,Clerical,Clercs,955
4,AW00020211,Mr.,Scott,M.,Rodgers,,False,scott10@adventure-works.com,989-555-0100,9860 Brookview Drive,...,Professional,True,1,2002-04-18Z,2-5 Miles,Baccalauréat,Bachelors,Cuadro orgánico,Professionnel,833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18479,AW00019379,,Crystal,,Guo,,False,crystal18@adventure-works.com,1 (11) 500 555-0171,988 Mt. Everest Court,...,Manual,True,2,2004-04-19Z,0-1 Miles,École secondaire partielle,Escuela secundaria parcial,Manual,Manuel,976
18480,AW00013933,,Isabella,F,Richardson,,False,isabella91@adventure-works.com,910-555-0166,7413 Alpine Drive,...,Skilled Manual,True,1,2003-08-30Z,2-5 Miles,Collège partiel,Colegio parcial,Manual especializado,Manuel qualifié,1098
18481,AW00024634,,Crystal,S,He,,False,crystal19@adventure-works.com,813-555-0148,4764 East Avenue,...,Management,True,2,2004-04-12Z,10+ Miles,Baccalauréat,Bachelors,Gestión,Gestion,693
18482,AW00021127,,Crystal,,Zheng,,False,crystal20@adventure-works.com,1 (11) 500 555-0171,"34334, rue Jean Mermoz",...,Clerical,True,1,2004-02-15Z,2-5 Miles,Collège partiel,Colegio parcial,Clerical,Clercs,1238


# Load

In [17]:
final_columns = [
    'customer_alternate_key', 'geography_key', 'title', 'first_name', 'middle_name',
    'last_name', 'suffix', 'name_style', 'birth_date', 'gender', 'marital_status',
    'yearly_income', 'total_children', 'number_children_at_home', 'english_education',
    'spanish_education', 'english_occupation', 'spanish_occupation', 'house_owner_flag',
    'number_cars_owned', 'email_address', 'phone', 'address_line1', 'address_line2',
    'date_first_purchase', 'commute_distance','french_occupation','french_education'
]

df_to_load = df_customer_final[final_columns]

df_to_load.head()

Unnamed: 0,customer_alternate_key,geography_key,title,first_name,middle_name,last_name,suffix,name_style,birth_date,gender,...,house_owner_flag,number_cars_owned,email_address,phone,address_line1,address_line2,date_first_purchase,commute_distance,french_occupation,french_education
0,AW00011377,905,Mr.,David,R.,Robinett,,False,1961-02-23Z,M,...,True,0,david22@adventure-works.com,238-555-0100,Pappelallee 6667,,2003-09-01Z,0-1 Miles,Clercs,Diplôme d'études supérieures
1,AW00011913,1277,Ms.,Rebecca,A.,Robinson,,False,1965-06-11Z,F,...,True,1,rebecca3@adventure-works.com,648-555-0100,1861 Chinquapin Ct,,2004-06-05Z,5-10 Miles,Professionnel,Baccalauréat
2,AW00011952,826,Ms.,Dorothy,B.,Robinson,,False,1954-09-23Z,M,...,True,2,dorothy3@adventure-works.com,423-555-0100,4693 Mills Dr.,,2002-04-07Z,1-2 Miles,Manuel qualifié,Collège partiel
3,AW00020164,955,Ms.,Carol Ann,F.,Rockne,,False,1943-07-15Z,M,...,True,0,carolann0@adventure-works.com,439-555-0100,1312 Skycrest Drive,,2001-10-27Z,0-1 Miles,Clercs,Baccalauréat
4,AW00020211,833,Mr.,Scott,M.,Rodgers,,False,1968-05-15Z,M,...,True,1,scott10@adventure-works.com,989-555-0100,9860 Brookview Drive,,2002-04-18Z,2-5 Miles,Professionnel,Baccalauréat


In [18]:
df_to_load.head()

Unnamed: 0,customer_alternate_key,geography_key,title,first_name,middle_name,last_name,suffix,name_style,birth_date,gender,...,house_owner_flag,number_cars_owned,email_address,phone,address_line1,address_line2,date_first_purchase,commute_distance,french_occupation,french_education
0,AW00011377,905,Mr.,David,R.,Robinett,,False,1961-02-23Z,M,...,True,0,david22@adventure-works.com,238-555-0100,Pappelallee 6667,,2003-09-01Z,0-1 Miles,Clercs,Diplôme d'études supérieures
1,AW00011913,1277,Ms.,Rebecca,A.,Robinson,,False,1965-06-11Z,F,...,True,1,rebecca3@adventure-works.com,648-555-0100,1861 Chinquapin Ct,,2004-06-05Z,5-10 Miles,Professionnel,Baccalauréat
2,AW00011952,826,Ms.,Dorothy,B.,Robinson,,False,1954-09-23Z,M,...,True,2,dorothy3@adventure-works.com,423-555-0100,4693 Mills Dr.,,2002-04-07Z,1-2 Miles,Manuel qualifié,Collège partiel
3,AW00020164,955,Ms.,Carol Ann,F.,Rockne,,False,1943-07-15Z,M,...,True,0,carolann0@adventure-works.com,439-555-0100,1312 Skycrest Drive,,2001-10-27Z,0-1 Miles,Clercs,Baccalauréat
4,AW00020211,833,Mr.,Scott,M.,Rodgers,,False,1968-05-15Z,M,...,True,1,scott10@adventure-works.com,989-555-0100,9860 Brookview Drive,,2002-04-18Z,2-5 Miles,Professionnel,Baccalauréat


In [19]:
#change house_owner_flag to binary value

df_to_load['house_owner_flag']=df_to_load['house_owner_flag'].apply(lambda x: 1 if x else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_to_load['house_owner_flag']=df_to_load['house_owner_flag'].apply(lambda x: 1 if x else 0)


In [20]:
def upper_income(x):
    if not isinstance(x,str):
        return np.nan
    parts=x.split('-')
    if len(parts)==2:
        try:
            return float(parts[1])
        except:
            return np.nan
    elif len(parts)==1:
        try:
            return float(parts[0])
        except:
            return np.nan
    else:
        return np.nan

 

df_to_load['yearly_income']=df_to_load['yearly_income'].apply(upper_income)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_to_load['yearly_income']=df_to_load['yearly_income'].apply(upper_income)


In [21]:
columns_with_nulls = df_ctm.columns[df_ctm.isnull().any()].tolist()
print("Columnas con valores nulos")
print(columns_with_nulls)

Columnas con valores nulos
['title', 'middle_name', 'suffix', 'address_line2']


In [22]:
df_to_load.to_sql(
    'dim_customer',
    etl_conn,
    if_exists='append',
    index=False
)

484