In [38]:
import pandas as pd
import numpy as np
from connection import connect
from sqlalchemy import text

from utils.model_loader import ModelRegistry
from utils.translate_language import convert_language

In [3]:
co_oltp,etl_conn,_=connect()

# Extract

In [165]:
# Extraccion de datos desde el OLTP
# Se basa en humanresources.employee + person.person + department
query_employee = text("""
WITH ranked_pay_history AS (
    SELECT
        business_entity_id,
        rate_change_date,
        rate AS base_rate,
        pay_frequency,
        ROW_NUMBER() OVER(
            PARTITION BY business_entity_id
            ORDER BY rate_change_date DESC
        ) as rn
    FROM hr.employee_pay_history
)
SELECT
    e.business_entity_id AS employee_alternate_key,
    e.national_idnumber AS employee_national_id_alternate_key,
    p.first_name,
    p.middle_name,
    p.last_name,
    p.suffix,
    e.gender,
    e.marital_status,
    e.birth_date,
    e.hire_date,
    e.salaried_flag,
    e.vacation_hours,
    e.sick_leave_hours,
    e.current_flag,
    e.organization_level,
    e.job_title AS title,
    e.login_id,
    ea.email_address,
    pp.phone_number AS phone,
    d.name AS department_name,
    h.start_date,
    h.end_date,
    sp.territory_id AS sales_territory_alternate_key,
    rph.pay_frequency AS pay_frequency,
    rph.base_rate,
    NULL AS employee_photo
FROM hr.employee AS e
INNER JOIN person.person AS p
    ON e.business_entity_id = p.business_entity_id
LEFT JOIN person.email_address AS ea
    ON p.business_entity_id = ea.business_entity_id
LEFT JOIN person.person_phone AS pp
    ON p.business_entity_id = pp.business_entity_id
LEFT JOIN hr.employee_department_history AS h
    ON e.business_entity_id = h.business_entity_id
LEFT JOIN hr.department AS d
    ON h.department_id = d.department_id
LEFT JOIN sales.sales_person AS sp
    ON e.business_entity_id = sp.business_entity_id
LEFT JOIN ranked_pay_history AS rph
    ON e.business_entity_id = rph.business_entity_id
    AND rph.rn = 1
""")

df_emp = pd.read_sql(query_employee, co_oltp)
print(f"Registros extraidos: {len(df_emp)}")
print(df_emp.head(3))

Registros extraidos: 296
   employee_alternate_key employee_national_id_alternate_key first_name  \
0                       1                          295847284        Ken   
1                       2                          245797967      Terri   
2                       3                          509647174    Roberto   

  middle_name   last_name suffix gender marital_status  birth_date  \
0           J     Sánchez   None      M              S  1969-01-29   
1         Lee       Duffy   None      F              S  1971-08-01   
2        None  Tamburello   None      M              M  1974-11-12   

    hire_date  ...                  login_id                 email_address  \
0  2009-01-14  ...      adventure-works\ken0      ken0@adventure-works.com   
1  2008-01-31  ...    adventure-works\terri0    terri0@adventure-works.com   
2  2007-11-11  ...  adventure-works\roberto0  roberto0@adventure-works.com   

          phone  department_name  start_date end_date  \
0  697-555-0142        

# Transform

In [166]:
# Enlaza con DimSalesTerritory (territorio de ventas)
df_terr = pd.read_sql(
    text("SELECT sales_territory_key, sales_territory_alternate_key FROM dim_sales_territory;"),
    etl_conn
)

In [167]:
# Algunos empleados pueden tener territorio asignado si son vendedores
# (se usa la tabla sales.sales_person)
sales_person = pd.read_sql(
    text("SELECT business_entity_id AS employee_alternate_key, territory_id FROM sales.sales_person;"),
    co_oltp
)

df_emp = df_emp.merge(
    sales_person,
    on='employee_alternate_key',
    how='left'
)

df_emp = pd.merge(
    df_emp,
    df_terr,
    on='sales_territory_alternate_key',  # Usamos 'on' porque los nombres de columna coinciden
    how='left'
)

In [168]:
df_emp['name_style'] = 0
df_emp['sales_person_flag'] = df_emp['sales_territory_key'].notnull().astype(int)
df_emp['current_flag'] = df_emp['current_flag'].astype(int)
df_emp['salaried_flag'] = df_emp['salaried_flag'].astype(int)

In [169]:
# Limpia columnas textuales nulas
for col in ['department_name', 'title']:
    df_emp[col] = df_emp[col].fillna('Unknown')

In [170]:
df_emp['status'] = np.where(
        pd.isnull(df_emp['end_date']),
        'Current',
        None
)

In [172]:
# Selecciona las columnas finales
final_columns = [
    #'employee_alternate_key',
    #'parent_employee_key',
    'employee_national_id_alternate_key',
    'sales_territory_key',
    'first_name',
    'last_name',
    'middle_name',
    'name_style',
    'title',
    'gender',
    'marital_status',
    'birth_date',
    'hire_date',
    'login_id',
    'email_address',
    'phone',
    #'emergency_contact_name',
    #'emergency_contact_phone',
    'salaried_flag',
    'vacation_hours',
    'sick_leave_hours',
    'current_flag',
    'sales_person_flag',
    'department_name',
    'pay_frequency',
    'base_rate',
    'start_date',
    'end_date',
    'status',
    'employee_photo',
]

In [173]:
df_to_load = df_emp[final_columns]
print("Columnas finales:", df_to_load.columns.tolist())

Columnas finales: ['employee_national_id_alternate_key', 'sales_territory_key', 'first_name', 'last_name', 'middle_name', 'name_style', 'title', 'gender', 'marital_status', 'birth_date', 'hire_date', 'login_id', 'email_address', 'phone', 'salaried_flag', 'vacation_hours', 'sick_leave_hours', 'current_flag', 'sales_person_flag', 'department_name', 'pay_frequency', 'base_rate', 'start_date', 'end_date', 'status', 'employee_photo']


In [174]:
df_to_load

Unnamed: 0,employee_national_id_alternate_key,sales_territory_key,first_name,last_name,middle_name,name_style,title,gender,marital_status,birth_date,...,sick_leave_hours,current_flag,sales_person_flag,department_name,pay_frequency,base_rate,start_date,end_date,status,employee_photo
0,295847284,,Ken,Sánchez,J,0,Chief Executive Officer,M,S,1969-01-29,...,69,1,0,Executive,2,125.5000,2009-01-14,,Current,
1,245797967,,Terri,Duffy,Lee,0,Vice President of Engineering,F,S,1971-08-01,...,20,1,0,Engineering,2,63.4615,2008-01-31,,Current,
2,509647174,,Roberto,Tamburello,,0,Engineering Manager,M,M,1974-11-12,...,21,1,0,Engineering,2,43.2692,2007-11-11,,Current,
3,112457891,,Rob,Walters,,0,Senior Tool Designer,M,S,1974-12-23,...,80,1,0,Tool Design,2,29.8462,2010-05-31,,Current,
4,112457891,,Rob,Walters,,0,Senior Tool Designer,M,S,1974-12-23,...,80,1,0,Engineering,2,29.8462,2007-12-05,2010-05-30,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
291,758596752,9.0,Lynn,Tsoflias,N,0,Sales Representative,F,S,1977-02-14,...,38,1,1,Sales,2,23.0769,2013-05-30,,Current,
292,982310417,,Amy,Alberts,E,0,European Sales Manager,F,M,1957-09-20,...,30,1,0,Sales,2,48.1010,2012-04-16,,Current,
293,954276278,8.0,Rachel,Valdez,B,0,Sales Representative,F,S,1975-07-09,...,37,1,1,Sales,2,23.0769,2013-05-30,,Current,
294,668991357,10.0,Jae,Pak,B,0,Sales Representative,F,M,1968-03-17,...,38,1,1,Sales,2,23.0769,2012-05-30,,Current,


# Load

In [175]:
# Carga al DW
df_to_load.to_sql(
    'dim_employee',
    etl_conn,
    schema='dw',
    if_exists='append',
    index=False
)

296