In [1]:
import gc

import pandas as pd
from scipy.stats import mode

from reducirDT import optimize_memory_usage

In [2]:
# Read the source parquet and hand it straight to the project helper
# `optimize_memory_usage`, which prints a before/after memory report, then
# preview the first rows.
df = optimize_memory_usage(
    pd.read_parquet("../../datasets/df_target_class.parquet")
)
df.head()

Memoria inicial: 4195.18 MB
Memoria final:   1080.95 MB
Reducción:       74.23%


Unnamed: 0,product_id,customer_id,periodo,periodo_producto,nacimiento_producto,tn,cust_request_tn,cust_request_qty,cat1,cat2,cat3,brand,sku_size,stock_final,plan_precios_cuidados,periodo_dt,periodo_target_dt,compro,target
0,20524,10234,201701,6.480469,201701,0.053009,0.053,2.0,HC,VAJILLA,Cristalino,Importado,500.0,,0.0,2017-01-01,2017-03-01,1,1.0
1,20524,10234,201702,3.998047,201701,0.0,,,,,,,,,,2017-02-01,2017-04-01,0,0.0
2,20524,10234,201703,7.148438,201701,0.015137,0.01514,1.0,HC,VAJILLA,Cristalino,Importado,500.0,,0.0,2017-03-01,2017-05-01,1,0.0
3,20524,10234,201704,6.820312,201701,0.0,,,,,,,,,,2017-04-01,2017-06-01,0,1.0
4,20524,10234,201705,9.257812,201701,0.0,,,,,,,,,,2017-05-01,2017-07-01,0,0.0


In [3]:
# --- Calendar features ------------------------------------------------------
# Parse the period column and order rows by (product, customer, time).
# The shift-based lags further down are positional, so they depend on this
# ordering.
df["periodo_dt"] = pd.to_datetime(df["periodo_dt"], format='%Y-%m-%d')
df = df.sort_values(['product_id', 'customer_id', 'periodo_dt'])

df["month"] = df["periodo_dt"].dt.month
df["year"] = df["periodo_dt"].dt.year

# Vectorised integer arithmetic: same values as a row-wise lambda, without a
# Python-level call per row.
df['quarter'] = (df['month'] - 1) // 3 + 1
df['semester'] = 2 - (df['month'] <= 6).astype('int8')

# NOTE: despite the column name, this flags months that have 31 days; it does
# not look at the day of the month. The name is kept so existing consumers of
# the output file keep working.
df['is_month_end'] = df['month'].isin([1, 3, 5, 7, 8, 10, 12])

# 1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov.
# The original note labelled 1 as winter and 2 as spring, which matches the
# northern hemisphere -- TODO confirm against the market this data covers.
df['season'] = df['month'] % 12 // 3 + 1

# SKU size relative to the mean size of its `cat3` category.
# `observed=True` does not change the per-row result of a transform; it is
# passed explicitly because `cat3` is presumably categorical after
# `optimize_memory_usage`, and recent pandas versions warn when `observed`
# is left implicit for categorical group keys.
df['size_vs_category'] = (
    df['sku_size'] / df.groupby('cat3', observed=True)['sku_size'].transform('mean')
)

# --- Lag features -----------------------------------------------------------
# Build the (product, customer) grouping once and reuse it for every lag.
# The lags are row offsets within each series, so `lag_3m` is three months
# back only if every series has exactly one row per month -- TODO confirm.
tn_by_pair = df.groupby(['product_id', 'customer_id'])['tn']
for n_periods in (1, 3, 6, 12):
    df[f'lag_{n_periods}m'] = tn_by_pair.shift(n_periods)



  df['size_vs_category'] = df['sku_size'] / df.groupby('cat3')['sku_size'].transform('mean')


In [4]:
# Length of the current run of consecutive rows with a non-zero `target`,
# computed per (product_id, customer_id) series. Rows outside a run get 0.
#
# NOTE(review): this feature is derived from `target`, which looks like the
# label column (the frame also carries `periodo_target_dt`). If so, feeding it
# to a model leaks the label -- confirm whether `compro` or `tn` was intended.

# Flag rows that belong to a run. A bare `!= 0` is True for missing values, so
# NaN targets are excluded explicitly; this is a no-op if `target` has no NaN.
condition = (df['target'].ne(0) & df['target'].notna()).astype(int)

# A new run id starts at every row whose flag differs from the previous row.
# This is computed over the whole frame and relies on rows being sorted by
# (product_id, customer_id, periodo_dt); the pair keys in the group-by below
# keep runs from spanning two different series.
groups = condition.ne(condition.shift()).cumsum()

# cumcount() numbers rows from 0 inside each run, so add 1: the first row of a
# run is 1 consecutive month, not 0 (0 is reserved for rows outside a run).
df['consecutive_months'] = (
    df.groupby(['product_id', 'customer_id', groups])
    .cumcount()
    .add(1)
    .where(condition == 1, 0)
)

In [5]:
# Length of the current run of consecutive rows whose `target` equals 0,
# computed per (product_id, customer_id) series. Rows outside a run get 0.
#
# NOTE(review): this feature is derived from `target`, which looks like the
# label column (the frame also carries `periodo_target_dt`). If so, feeding it
# to a model leaks the label -- confirm whether `compro` or `tn` was intended.

# Flag rows that belong to a zero run (`== 0` is already False for NaN).
condition = (df['target'] == 0).astype(int)

# A new run id starts at every row whose flag differs from the previous row.
# This is computed over the whole frame and relies on rows being sorted by
# (product_id, customer_id, periodo_dt); the pair keys in the group-by below
# keep runs from spanning two different series.
groups = condition.ne(condition.shift()).cumsum()

# cumcount() numbers rows from 0 inside each run, so add 1: the first row of a
# run is 1 consecutive month, not 0 (0 is reserved for rows outside a run).
df['consecutive_months_0'] = (
    df.groupby(['product_id', 'customer_id', groups])
    .cumcount()
    .add(1)
    .where(condition == 1, 0)
)

In [None]:
# Most frequent `tn` value per (product_id, customer_id), attached to every
# row as `moda_tn_temp`.
# Slow: the per-group Python callback takes about 6 minutes.
# `mode` is scipy.stats.mode, imported in the notebook's import cell.
pair_keys = ['product_id', 'customer_id']

moda_por_grupo = df.groupby(pair_keys)['tn'].agg(
    lambda x: mode(x, keepdims=False).mode if len(x) > 0 else 0
).reset_index(name='moda_tn_temp')

# Drop any `moda_tn_temp` left by a previous run of this cell first.
# Otherwise the merge would emit `moda_tn_temp_x` / `moda_tn_temp_y` and the
# cells that read `moda_tn_temp` would fail with a KeyError.
df = df.drop(columns='moda_tn_temp', errors='ignore').merge(
    moda_por_grupo, on=pair_keys, how='left'
)

In [6]:
# Ratio of each row's `tn` to the modal `tn` of its (product_id, customer_id)
# pair. Needs the `moda_tn_temp` column, so the mode cell must have run first.
moda_tn = df['moda_tn_temp']

# A modal value of 0 would turn the ratio into +/-inf (or NaN for 0/0).
# Mask zero denominators so the ratio is consistently NaN ("undefined") there.
df['tn_moda_tn'] = df['tn'] / moda_tn.where(moda_tn != 0)


In [6]:
# Trailing means of `tn` over the last 3, 6 and 12 rows of each
# (product_id, customer_id) series, current row included. With
# `min_periods=1` the first rows of a series average whatever is available.
# Takes about a minute and a half.
tn_by_pair = df.groupby(['product_id', 'customer_id'])['tn']

for window in (3, 6, 12):
    windowed_mean = tn_by_pair.rolling(window, min_periods=1).mean()
    # The rolling result is indexed by (product_id, customer_id, original
    # index); drop the two key levels so the assignment aligns on df's index.
    df[f'rolling_{window}m_mean'] = windowed_mean.reset_index(level=[0, 1], drop=True)

In [7]:
# Mean `tn` per (product_id, calendar month), broadcast back to every row.
# `transform` returns a result aligned to df's index, so no intermediate
# table, full-frame merge or temporary column is needed.
#
# NOTE(review): the mean is taken over every period in the frame, so each
# row's value also reflects later periods -- confirm this is acceptable for
# the validation scheme.
df['annual_trend'] = df.groupby(['product_id', 'month'])['tn'].transform('mean')

In [8]:
# Total `tn` bought by each customer in each period, broadcast back to every
# row. `transform` keeps the result aligned to df's index, so no merge or
# temporary column is needed.
total_cliente_mes = df.groupby(['customer_id', 'periodo'])['tn'].transform('sum')

# Share of the customer's period total that this product represents.
# When the total is 0 the ratio is 0/0 and therefore NaN.
df['proporcion_producto_en_total_mes'] = df['tn'] / total_cliente_mes

# Keep the total itself as a feature.
df['total_cliente_mes'] = total_cliente_mes

In [9]:
# Write the engineered frame to the same datasets directory the input was
# read from. `index=False` leaves the DataFrame index out of the file.
output_path = "../../datasets/dt_fe_class.parquet"
df.to_parquet(output_path, index=False)