In [None]:
import pandas as pd
from connection import connect
from utils.model_loader import ModelRegistry
from utils.translate_language import convert_language

In [None]:
# Open the database connections returned by connect().
co_oltp, etl_conn, etl_conn_or = connect()

In [None]:
# Load the translation models (source language 'en' for every pair).
registry = ModelRegistry()

# Preload every language pair before fetching any tokenizer/model.
for target_lang in ('es', 'fr', 'jap', 'de', 'trk'):
    registry.preload_model('en', target_lang)

# Keep one (tokenizer, model) pair per target language under explicit names.
tokenizer_es, model_es = registry.get_model('en', 'es')
tokenizer_fr, model_fr = registry.get_model('en', 'fr')
tokenizer_jap, model_jap = registry.get_model('en', 'jap')
tokenizer_de, model_de = registry.get_model('en', 'de')
tokenizer_trk, model_trk = registry.get_model('en', 'trk')

## Extract

In [None]:
# Extract the source rows for the product dimension: production.product.
dim_product = pd.read_sql_table(table_name='product', con=co_oltp, schema='production')

# Show the source columns that are available for the transform step.
dim_product.columns


## Transform


In [None]:

# Rename the source columns to the names used by the dimension.
dim_product = dim_product.rename(
    columns={
        'product_number': 'product_alternate_key',
        'name': 'english_product_name',
    }
)

# Read the subcategory key pairs (surrogate key + alternate key) through etl_conn.
dim_product_subcategory = pd.read_sql_query(
    """SELECT product_subcategory_key,product_subcategory_alternate_key
                                          FROM dim_product_subcategory 
                                         """,
    etl_conn,
)

# Swap the source subcategory id for the matching product_subcategory_key.
# Left join: products without a matching subcategory are kept.
dim_product = (
    dim_product
    .merge(
        dim_product_subcategory[['product_subcategory_alternate_key', 'product_subcategory_key']],
        how='left',
        left_on='product_subcategory_id',
        right_on='product_subcategory_alternate_key',
    )
    .drop(columns=['product_subcategory_alternate_key', 'product_subcategory_id'])
)

dim_product.columns

In [None]:
# Derived columns added in the cells below: size_range, dealer_price, model_name, large_photo, start_date, end_date, status

In [None]:
#size_range
def get_size_range(size):
    """Map a raw product size to its size-range label.

    Args:
        size: Raw size value. May be numeric (or numeric text such as '44'),
            a non-numeric label such as 'M', or missing (None, NaN, '').

    Returns:
        str: The centimetre band containing a numeric size; 'NA' for a
        missing/blank size or a numeric size outside every band; the
        stripped original label for a non-numeric size.
    """
    # None, '' and 0 are falsy. NaN is truthy, so it is detected separately
    # through the fact that NaN is the only value not equal to itself.
    if not size or size != size:
        return 'NA'

    try:
        s = int(size)
    except (TypeError, ValueError):
        # Non-numeric size: previously this path fell through and returned
        # None implicitly. Return the label itself so the row keeps a value.
        # NOTE(review): confirm that passing letter sizes through unchanged
        # matches the size_range values expected by the target table.
        label = str(size).strip()
        return label if label else 'NA'

    # Inclusive (low, high, label) bands. Sizes between bands map to 'NA'.
    # NOTE(review): the labels use an en dash, not a hyphen; confirm that
    # this is what the target table expects.
    bands = (
        (38, 40, '38–40 CM'),
        (42, 46, '42–46 CM'),
        (48, 52, '48–52 CM'),
        (54, 58, '54–58 CM'),
        (60, 62, '60–62 CM'),
    )
    for low, high, band_label in bands:
        if low <= s <= high:
            return band_label
    return 'NA'
    
# Label every product with the size range derived from its raw size.
size_ranges = dim_product['size'].apply(get_size_range)
dim_product['size_range'] = size_ranges

# dim_product[dim_product['product_alternate_key']=='FR-M94S-44']

In [None]:
# dealer_price: per product, the average of unit_price * (1 - unit_price_discount)
# rounded to 4 decimals. The joins keep only order lines whose customer is
# linked to a store.
dealer_price_query = """
   
    SELECT 
    p.product_id,
    
    p.standard_cost,
    
    ROUND(AVG(sod.unit_price * (1 - sod.unit_price_discount)), 4) AS dealer_price
    FROM sales.sales_order_detail AS sod
    JOIN sales.sales_order_header AS soh 
        ON sod.sales_order_id = soh.sales_order_id
    JOIN sales.customer AS c 
        ON soh.customer_id = c.customer_id
    JOIN sales.store AS s 
        ON c.store_id = s.business_entity_id
    JOIN production.product AS p 
        ON sod.product_id = p.product_id
    GROUP BY p.product_id, p.standard_cost
    ORDER BY p.product_id;


    """

t_sales_order_d = pd.read_sql(dealer_price_query, co_oltp)

# Only dealer_price is carried over. Products without matching order lines
# get a missing dealer_price because of the left join.
dim_product = dim_product.merge(
    t_sales_order_d[['product_id', 'dealer_price']],
    how='left',
    on='product_id',
)

In [None]:
# model_name: look up each product's model name through product_model_id.
t_product_model = pd.read_sql(
    "SELECT product_model_id, name FROM production.product_model",
    co_oltp,
)

# Left join keeps products without a model; the joined 'name' column
# becomes 'model_name'.
dim_product = dim_product.merge(t_product_model, how='left', on='product_model_id')
dim_product = dim_product.rename(columns={'name': 'model_name'})

dim_product

In [None]:
# large_photo: photo payload per product, resolved through the
# product <-> photo link table.
t_large_photo = pd.read_sql(
    """SELECT ppp.product_id, pp.large_photo
                           
                           FROM production.product_photo AS pp, production.product_product_photo AS ppp

                           WHERE pp.product_photo_id=ppp.product_photo_id

                           
                           """,
    co_oltp,
)

# Left join: products without a linked photo keep a missing large_photo.
dim_product = dim_product.merge(t_large_photo, how='left', on='product_id')

# dim_product

In [11]:
# start_date, end_date
# TODO(review): the author flagged this step as an open question ("ask!");
# confirm that the list-price history is the right source for these dates.
t_product_price_list_history = pd.read_sql(
    """SELECT ph.product_id, ph.start_date, ph.end_date
                                         FROM production.product_list_price_history as ph   
                                         """,
    co_oltp,
)

# The history rows are joined as-is, so a product with several history
# rows produces one output row per history row.
dim_product = dim_product.merge(
    t_product_price_list_history,
    how='left',
    on='product_id',
)



# t_product_price_list_history

# dim_product


In [12]:
# status is derived from end_date: a missing end_date means 'Current',
# anything else leaves status empty (None).
def _status_from_end_date(end_date):
    """Return 'Current' when end_date is missing, otherwise None."""
    if pd.isna(end_date):
        return 'Current'
    return None


dim_product['status'] = dim_product['end_date'].apply(_status_from_end_date)


In [None]:
# Product descriptions per language: one row per product and culture name.
t_language_description = pd.read_sql(
    """
    SELECT p.product_id,  pd.description ,pc.name
    FROM production.product as p JOIN production.product_model_prod_desc_culture as pmpdc
    ON p.product_model_id=pmpdc.product_model_id
    JOIN production.product_description as pd
    ON pmpdc.product_description_id=pd.product_description_id
    JOIN production.culture as pc
    ON pmpdc.culture_id=pc.culture_id
""",
    co_oltp,
)

# Build '<culture name in lower case>_description' column names from the
# culture names present in the data.
columns = t_language_description['name'].unique().tolist()
new_columns = {culture: f'{culture.lower()}_description' for culture in columns}

# Go from long to wide (one description column per culture), then apply
# the new column names. product_id is not in the mapping and is left as-is.
t_language_description = (
    t_language_description
    .pivot(index='product_id', columns='name', values='description')
    .reset_index()
    .rename(columns=new_columns)
)

dim_product = dim_product.merge(
    t_language_description,
    how='left',
    on='product_id',
)

In [None]:
# Fill the translated columns from their English source column.
# Each entry is (source column, target column, tokenizer, model); the calls
# run in the listed order and each one receives the frame returned by the
# previous call.
translation_jobs = [
    ('english_product_name', 'french_product_name', tokenizer_fr, model_fr),
    ('english_product_name', 'spanish_product_name', tokenizer_es, model_es),
    ('english_description', 'japanese_description', tokenizer_jap, model_jap),
    ('english_description', 'german_description', tokenizer_de, model_de),
    ('english_description', 'turkish_description', tokenizer_trk, model_trk),
]

for source_col, target_col, job_tokenizer, job_model in translation_jobs:
    dim_product = convert_language(source_col, target_col, job_tokenizer, job_model, dim_product)

dim_product

In [None]:
# dim_product_olap=pd.read_sql_table('dim_product',etl_conn_or)
# columns_olap=set(dim_product_olap.columns)

# columns_oltp=set(dim_product.columns)

# columns_olap-columns_oltp


# Remove the source columns that are not written to the target table.
source_only_columns = [
    'product_id',
    'make_flag',
    'product_model_id',
    'discontinued_date',
    'rowguid',
    'modified_date',
    'sell_start_date',
    'sell_end_date',
]
dim_product = dim_product.drop(columns=source_only_columns)

In [None]:
# convert null to 'NA' in color and replace 'nan' objects

# dim_product_olap['color'].unique()

# Missing colors become the literal 'NA'.
dim_product['color'] = dim_product['color'].fillna('NA')

# Any cell holding the literal string 'nan' is replaced with None.
dim_product = dim_product.replace({'nan': None})

dim_product

## Load

In [64]:
# Append the rows to the dim_product table. With if_exists='append' an
# existing table is kept, so re-running this cell inserts the rows again.
dim_product.to_sql(
    name='dim_product',
    con=etl_conn,
    if_exists='append',
    index=False,
)

606