In [26]:
import pandas as pd
import gzip
import os
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

 # A la empresa le interesa el rastreo de lo que marketing considera los productos y lo que ventas considera los mejores clientes.

## Mejores Productos
- product_id = {20001, 20002, 20003, 20004, 20005, 20006, 20007, 20009, 20011, 20032} (diez productos)

## Mejores Clientes
- customer_id = {10001, 10002, 10003, 10004, 10005, 10006, 10007, 10008, 10009, 10011, 10012, 10013} (doce clientes)

Total de <producto, cliente> a predecir = 10 * 12 = 120

# Objetivo
- Es el 01-enero-2020 a las 00:01 y disponibilizamos las ventas del periodo 2021912.
- El 02-enero a las 18:00 nos deben entregar:
  - El primer forecast de ventas para cada producto que se harán durante el mes 202002, de forma que nuestras plantas puedan fabricarlos durante el mes de 202001.
  - El segundo forecast es las ventas esperadas en 202002, para los 120 pares de <mejores_clientes, mejores_productos>.

In [27]:
####################################################
############# Setear segun cada maquina ############
#os.chdir("C:/Users/herna/labo3_empresa3_repo/datasets")
os.chdir("C:/diego_tools/labo3/dataset")
####################################################

In [28]:
arch_sellout = "tb_sellout_02.txt.gz"
arch_maestro_prod = "maestro_productos_depurado.csv"
arch_exogenas = "emp3_exogenas.csv"

## Sellout

In [29]:
# Abrir el archivo .gz y cargarlo en un DataFrame
with gzip.open(arch_sellout, 'rt') as archivo:
    # Leer el archivo línea por línea
    df_sellout = pd.read_csv(archivo,sep="\t")

In [30]:
df_sellout.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2945818 entries, 0 to 2945817
Data columns (total 7 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   periodo                int64  
 1   customer_id            int64  
 2   product_id             int64  
 3   plan_precios_cuidados  int64  
 4   cust_request_qty       int64  
 5   cust_request_tn        float64
 6   tn                     float64
dtypes: float64(2), int64(5)
memory usage: 157.3 MB


In [31]:
# Por las dudas, eliminamos duplicados
print(len(df_sellout))
df_sellout.drop_duplicates(inplace=True)
print(len(df_sellout))

2945818
2945818


**--> sin duplicados**

In [44]:
df_sellout.isna().sum()

periodo                  0
customer_id              0
product_id               0
plan_precios_cuidados    0
cust_request_qty         0
cust_request_tn          0
tn                       0
periodo_fecha            0
mes                      0
dtype: int64

**--> sin nulos**

In [32]:
df_sellout.periodo.unique()                

array([201701, 201702, 201703, 201704, 201705, 201706, 201707, 201708,
       201709, 201710, 201711, 201712, 201801, 201802, 201803, 201804,
       201805, 201806, 201807, 201808, 201809, 201810, 201811, 201812,
       201901, 201902, 201903, 201904, 201905, 201906, 201907, 201908,
       201909, 201910, 201911, 201912], dtype=int64)

In [33]:
len(df_sellout.product_id.unique())          

1233

--> algunos productos no van a tener descripción

In [34]:
len(df_sellout.customer_id.unique())            

597

In [35]:
df_sellout.plan_precios_cuidados.unique()          

array([0, 1], dtype=int64)

In [36]:
df_sellout['periodo_fecha'] = pd.to_datetime(df_sellout['periodo'], format='%Y%m')
df_sellout["mes"] = pd.DatetimeIndex(df_sellout.periodo_fecha).month

In [37]:
df_sellout.head()

Unnamed: 0,periodo,customer_id,product_id,plan_precios_cuidados,cust_request_qty,cust_request_tn,tn,periodo_fecha,mes
0,201701,10234,20524,0,2,0.053,0.053,2017-01-01,1
1,201701,10032,20524,0,1,0.13628,0.13628,2017-01-01,1
2,201701,10217,20524,0,1,0.03028,0.03028,2017-01-01,1
3,201701,10125,20524,0,1,0.02271,0.02271,2017-01-01,1
4,201701,10012,20524,0,11,1.54452,1.54452,2017-01-01,1


In [38]:
# Como control, sumo tns
print("Toneladas Total Control:", round(sum(df_sellout.tn),0))

Toneladas Total Control: 1324989.0


In [40]:
# Agrupo por producto y por periodo
df_sellout_prod = df_sellout.groupby(['periodo','periodo_fecha','product_id']).agg({'tn': 'sum', 'cust_request_tn':'sum', 'cust_request_qty':'sum','plan_precios_cuidados':'max','mes':'max'}).reset_index()
print("Toneladas Total Control:", round(sum(df_sellout_prod.tn),0))

Toneladas Total Control: 1324989.0


**Ahora vamos a completar los meses que no tengan info para los productos**

In [42]:
periodos = df_sellout_prod.periodo.unique()
cant_periodos = len(periodos)
periodos, cant_periodos

(array([201701, 201702, 201703, 201704, 201705, 201706, 201707, 201708,
        201709, 201710, 201711, 201712, 201801, 201802, 201803, 201804,
        201805, 201806, 201807, 201808, 201809, 201810, 201811, 201812,
        201901, 201902, 201903, 201904, 201905, 201906, 201907, 201908,
        201909, 201910, 201911, 201912], dtype=int64),
 36)

In [43]:
productos = df_sellout_prod.product_id.unique()
cant_productos = len(productos)
cant_productos

1233

In [45]:
len(df_sellout_prod),cant_productos*cant_periodos

(31243, 44388)

**--> no todos los productos están en todos los períodos**

In [49]:
# Imputo por 0 a todos los periodos donde el producto no se vendio
df_cartesiano = pd.DataFrame(data={"product_id":productos}).merge(pd.DataFrame(data={"periodo":periodos}), how='cross')
df_cartesiano["imputado"] = 1
len(df_cartesiano)

44388

In [50]:
df_cartesiano.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44388 entries, 0 to 44387
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   product_id  44388 non-null  int64
 1   periodo     44388 non-null  int64
 2   imputado    44388 non-null  int64
dtypes: int64(3)
memory usage: 1.4 MB


In [48]:
df_cartesiano.head()

Unnamed: 0,product_id,periodo,imputado,tn_imputado
0,20001,201701,1,0
1,20001,201702,1,0
2,20001,201703,1,0
3,20001,201704,1,0
4,20001,201705,1,0


In [52]:
df_sellout_prod_completado = df_cartesiano.merge(df_sellout_prod, how="left",on=["product_id","periodo"])
len(df_sellout_prod_completado)

44388

In [53]:
df_sellout_prod_completado.head()

Unnamed: 0,product_id,periodo,imputado,periodo_fecha,tn,cust_request_tn,cust_request_qty,plan_precios_cuidados,mes
0,20001,201701,1,2017-01-01,934.77222,937.72717,479.0,0.0,1.0
1,20001,201702,1,2017-02-01,798.0162,833.72187,432.0,0.0,2.0
2,20001,201703,1,2017-03-01,1303.35771,1330.74697,509.0,0.0,3.0
3,20001,201704,1,2017-04-01,1069.9613,1132.9443,279.0,0.0,4.0
4,20001,201705,1,2017-05-01,1502.20132,1550.68936,701.0,0.0,5.0


## Incorporo Maestro y Exógenas

In [14]:
df_maestro_prod = pd.read_csv(arch_maestro_prod)

In [6]:
df_maestro_prod.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1251 entries, 0 to 1250
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   cat1               1251 non-null   object
 1   cat2               1251 non-null   object
 2   cat3               1251 non-null   object
 3   brand              1251 non-null   object
 4   sku_size           1251 non-null   int64 
 5   product_id         1251 non-null   int64 
 6   producto_estrella  1251 non-null   int64 
dtypes: int64(3), object(4)
memory usage: 68.5+ KB


In [None]:
df_maestro_prod.head()

In [None]:
print(len(df_sellout))
df_sellout = pd.merge(df_sellout, df_maestro_prod, on='product_id', how='left')
print(len(df_sellout))
df_sellout.head()

In [13]:
df_exogenas = pd.read_csv(arch_exogenas)
df_exogenas.periodo_fecha = pd.to_datetime(df_exogenas.periodo_fecha)

df_exogenas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48 entries, 0 to 47
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   periodo_fecha              48 non-null     datetime64[ns]
 1   temp_media                 48 non-null     float64       
 2   temp_max_media             48 non-null     float64       
 3   temp_min_media             48 non-null     float64       
 4   IPC                        48 non-null     float64       
 5   promedio_mens_dolar_venta  48 non-null     float64       
 6   catastrofe                 48 non-null     bool          
dtypes: bool(1), datetime64[ns](1), float64(5)
memory usage: 2.4 KB


## Generación de datasets base

**El primer forecast de ventas para cada producto que se harán durante el mes 202002, de forma que nuestras plantas puedan fabricarlos durante el mes de 202001.**

Toneladas Total Control: 1324989.0


In [34]:
# Lista con el nuevo orden de las columnas
new_order = ['periodo','periodo_fecha','mes','product_id','tn','cust_request_qty','cust_request_tn',
             'plan_precios_cuidados', 'cat1','cat2','cat3','sku_size','producto_estrella']

# Reordena las columnas del DataFrame
df_base_prod = df_base_prod[new_order]

In [35]:
df_base_prod.head()

Unnamed: 0,periodo,periodo_fecha,mes,product_id,tn,cust_request_qty,cust_request_tn,plan_precios_cuidados,cat1,cat2,cat3,sku_size,producto_estrella
0,201701,2017-01-01,1,20001,934.77222,479,937.72717,0,HC,ROPA LAVADO,Liquido,3000.0,1
1,201701,2017-01-01,1,20002,550.15707,391,555.18654,0,HC,ROPA LAVADO,Liquido,3000.0,1
2,201701,2017-01-01,1,20003,1063.45835,438,1067.81543,0,FOODS,ADEREZOS,Mayonesa,475.0,1
3,201701,2017-01-01,1,20004,555.91614,339,569.37394,0,FOODS,ADEREZOS,Mayonesa,240.0,1
4,201701,2017-01-01,1,20005,494.27011,249,494.60084,0,FOODS,ADEREZOS,Mayonesa,120.0,1


In [36]:
# Le agregamos las exogenas
print(len(df_base_prod))
df_base_prod = pd.merge(df_base_prod,df_exogenas,on="periodo_fecha",how="left")
print(len(df_base_prod))

31243
31243


In [37]:
df_base_prod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31243 entries, 0 to 31242
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   periodo                    31243 non-null  int64         
 1   periodo_fecha              31243 non-null  datetime64[ns]
 2   mes                        31243 non-null  int64         
 3   product_id                 31243 non-null  int64         
 4   tn                         31243 non-null  float64       
 5   cust_request_qty           31243 non-null  int64         
 6   cust_request_tn            31243 non-null  float64       
 7   plan_precios_cuidados      31243 non-null  int64         
 8   cat1                       31243 non-null  object        
 9   cat2                       31243 non-null  object        
 10  cat3                       31243 non-null  object        
 11  sku_size                   31243 non-null  float64       
 12  prod

In [38]:
# Exportar el DataFrame a un archivo CSV
df_base_prod.to_csv("emp3_sellout_base_period_product.csv", index=False)

**El segundo forecast es las ventas esperadas en 202002, para los 120 pares de <mejores_clientes, mejores_productos>.**

In [None]:
star_customers = [10001, 10002, 10003, 10004, 10005, 10006, 10007, 10008, 10009, 10011, 10012, 10013]

In [39]:
# Filtrar productos y clientes
filtered_df = df_sellout[(df_sellout['product_id'].isin(star_products))]
filtered_df = filtered_df[(filtered_df['customer_id'].isin(star_customers))]

In [40]:
# generamos nuevos tn y cust_request_tn con normalizacion

# Normalizacion MinMaxScaler lineal (0-1)
scaler = MinMaxScaler()
filtered_df[['tn_01', 'cust_request_tn_01']] = scaler.fit_transform(filtered_df[['tn', 'cust_request_tn']])

# Normalizacion standard con media y desvio
scaler = StandardScaler()
filtered_df[['tn_md', 'cust_request_tn_md']] = scaler.fit_transform(filtered_df[['tn', 'cust_request_tn']])

In [41]:
# Lista con el nuevo orden de las columnas
new_order = ['periodo','periodo_fecha', 'product_id', 'customer_id', 'tn','tn_01','tn_md','cust_request_qty','cust_request_tn',
             'cust_request_tn_01', 'cust_request_tn_md', 'plan_precios_cuidados', 'cat1','cat2','cat3','sku_size']

# Reordena las columnas del DataFrame
filtered_df = filtered_df[new_order]

In [42]:
filtered_df.head()

Unnamed: 0,periodo,periodo_fecha,product_id,customer_id,tn,tn_01,tn_md,cust_request_qty,cust_request_tn,cust_request_tn_01,cust_request_tn_md,plan_precios_cuidados,cat1,cat2,cat3,sku_size
4797,201701,2017-01-01,20007,10011,4.90379,0.008943,-0.647465,5,4.90379,0.008883,-0.650233,0,HC,ROPA ACONDICIONADOR,ACONDICIONADOR,900.0
4812,201701,2017-01-01,20007,10005,4.20325,0.007664,-0.66135,6,4.20325,0.007613,-0.663648,0,HC,ROPA ACONDICIONADOR,ACONDICIONADOR,900.0
4823,201701,2017-01-01,20007,10007,2.52973,0.00461,-0.694521,1,2.52973,0.004579,-0.695695,0,HC,ROPA ACONDICIONADOR,ACONDICIONADOR,900.0
4836,201701,2017-01-01,20007,10012,21.19132,0.038671,-0.324629,7,21.19132,0.038413,-0.338329,0,HC,ROPA ACONDICIONADOR,ACONDICIONADOR,900.0
4843,201701,2017-01-01,20007,10003,20.2281,0.036913,-0.343721,22,20.2281,0.036667,-0.356774,0,HC,ROPA ACONDICIONADOR,ACONDICIONADOR,900.0


In [43]:
# Le agregamos las exogenas
print(len(filtered_df))
filtered_df = pd.merge(filtered_df,df_exogenas,on="periodo_fecha",how="left")
print(len(filtered_df))

3744
3744


In [44]:
filtered_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3744 entries, 0 to 3743
Data columns (total 22 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   periodo                    3744 non-null   int64         
 1   periodo_fecha              3744 non-null   datetime64[ns]
 2   product_id                 3744 non-null   int64         
 3   customer_id                3744 non-null   int64         
 4   tn                         3744 non-null   float64       
 5   tn_01                      3744 non-null   float64       
 6   tn_md                      3744 non-null   float64       
 7   cust_request_qty           3744 non-null   int64         
 8   cust_request_tn            3744 non-null   float64       
 9   cust_request_tn_01         3744 non-null   float64       
 10  cust_request_tn_md         3744 non-null   float64       
 11  plan_precios_cuidados      3744 non-null   int64         
 12  cat1  

In [45]:
# Exportar el DataFrame a un archivo CSV
filtered_df.to_csv("emp3_sellout_base_period_filtered_customer_product.csv", index=False)