In [2]:
import pandas as pd


In [3]:
df = pd.read_csv('../datasets/dt_target.csv',sep=',')
df.head()

Unnamed: 0,product_id,customer_id,periodo,periodo_producto,nacimiento_producto,muerte_cliente,tn,cust_request_tn,cust_request_qty,cat1,cat2,cat3,brand,sku_size,stock_final,plan_precios_cuidados,periodo_dt,target
0,20524,10234,201701,6.48085,201701,201912,0.053,0.053,2.0,HC,VAJILLA,Cristalino,Importado,500.0,,0.0,2017-01-01,-0.03786
1,20524,10234,201702,3.99755,201701,201912,0.0,,,,,,,,,,2017-02-01,0.0
2,20524,10234,201703,7.14711,201701,201912,0.01514,0.01514,1.0,HC,VAJILLA,Cristalino,Importado,500.0,,0.0,2017-03-01,-0.01514
3,20524,10234,201704,6.82163,201701,201912,0.0,,,,,,,,,,2017-04-01,0.03786
4,20524,10234,201705,9.25949,201701,201912,0.0,,,,,,,,,,2017-05-01,0.0


In [4]:
df.shape

(16998193, 18)

In [5]:
# Ordenar por producto, cliente y tiempo
df["periodo_dt"] = pd.to_datetime(df["periodo_dt"], format='%Y-%m-%d')
df = df.sort_values(['product_id', 'customer_id', 'periodo_dt'])


df["month"] = df["periodo_dt"].dt.month
df["year"] = df["periodo_dt"].dt.year
df['quarter'] = df['month'].apply(lambda x: (x-1)//3 + 1)
df['semester'] = df['month'].apply(lambda x: 1 if x <=6 else 2)
df['is_month_end'] = df['month'].isin([1, 3, 5, 7, 8, 10, 12])  # Meses con 31 días
df['season'] = df['month']%12 // 3 + 1  # 1:Invierno, 2:Primavera, etc.
df['size_vs_category'] = df['sku_size'] / df.groupby('cat3')['sku_size'].transform('mean')


# Crear lags
df['lag_1m'] = df.groupby(['product_id', 'customer_id'])['tn'].shift(1)
df['lag_2m'] = df.groupby(['product_id', 'customer_id'])['tn'].shift(2)
df['lag_3m'] = df.groupby(['product_id', 'customer_id'])['tn'].shift(3)
df['lag_11m'] = df.groupby(['product_id', 'customer_id'])['tn'].shift(11)

# Promedio móvil
df['rolling_3m_mean'] = df.groupby(['product_id', 'customer_id'])['tn'].transform(
    lambda x: x.rolling(3, min_periods=1).mean())

df['rolling_6m_mean'] = df.groupby(['product_id', 'customer_id'])['tn'].transform(
    lambda x: x.rolling(6, min_periods=1).mean())

df['rolling_12m_mean'] = df.groupby(['product_id', 'customer_id'])['tn'].transform(
    lambda x: x.rolling(12, min_periods=1).mean())


# Tendencia anual
df['annual_trend'] = df.groupby(['product_id', 'month'])['tn'].transform('mean')

# Variación estacional
df['seasonal_variation'] = df['tn'] / df['annual_trend']

# segunda tanda de features

# Desviación estándar y coeficiente de variación
df['std_6m'] = df.groupby(['product_id', 'customer_id'])['tn'].transform(
    lambda x: x.shift(1).rolling(6, min_periods=2).std())

df['mean_6m'] = df['rolling_6m_mean']
df['cv_6m'] = df['std_6m'] / (df['mean_6m'] + 1e-5)  # Coef. de variación: std/mean


# Periodos desde última compra
df['comprado'] = (df['tn'] > 0).astype(int)
# df['periodos_desde_ultima_compra'] = df.groupby(['product_id', 'customer_id'])['comprado'].apply(
#     lambda x: x[::-1].cumsum()[::-1].where(x==1).ffill().fillna(0))
df['periodos_desde_ultima_compra'] = df.groupby(['product_id', 'customer_id'])['comprado'].transform(
    lambda x: x[::-1].cumsum()[::-1].where(x==1).ffill().fillna(0))
# Cantidad de meses con compra en últimos N meses
for window in [3, 6, 12]:
    df[f'freq_compra_{window}m'] = df.groupby(['product_id', 'customer_id'])['comprado'].transform(
        lambda x: x.shift(1).rolling(window, min_periods=1).sum())
    
from scipy.stats import mode

# Modo de cantidad (más común en últimos 6 meses)
def rolling_mode(x):
    return x.shift(1).rolling(6, min_periods=1).apply(lambda x: mode(x, keepdims=True)[0][0], raw=False)
#Moda o patrón de cantidad
df['modo_6m'] = df.groupby(['product_id', 'customer_id'])['tn'].transform(rolling_mode)
df['modo_diff'] = df['tn'] - df['modo_6m']
#Tendencia reciente
df['trend_3m'] = df.groupby(['product_id', 'customer_id'])['tn'].transform(
    lambda x: x.shift(1).rolling(3).apply(lambda y: y.iloc[-1] - y.iloc[0] if len(y) == 3 else 0))
#Promedio histórico total
df['media_historica_cliente_producto'] = df.groupby(['product_id', 'customer_id'])['tn'].transform(
    lambda x: x.expanding().mean())
#Ratio de compra cliente vs. total producto
df['participacion_producto'] = df['tn'] / (df['periodo_producto'] + 1e-5)
#Uso de stock
df['stock_vs_venta'] = df['stock_final'] / (df['tn'] + 1e-5)

df['meses_desde_nacimiento'] = df['periodo'] - df['nacimiento_producto']
df['meses_hasta_muerte_cliente'] = df['muerte_cliente'] - df['periodo']
df['productos_distintos_cliente_mes'] = df.groupby(['customer_id', 'periodo'])['product_id'].transform('nunique')
df['total_cliente_mes'] = df.groupby(['customer_id', 'periodo'])['tn'].transform('sum')
df['proporcion_producto_en_total_mes'] = df['tn'] / (df['total_cliente_mes'] + 1e-5)
# tn vs. cust_request_tn → indicador de si le entregaron lo que pidió
df['ratio_entregado_sobre_pedido'] = df['tn'] / (df['cust_request_tn'] + 1e-5)
# tn vs. sku_size → toneladas por unidad
df['tn_por_unidad'] = df['tn'] / (df['sku_size'] + 1e-5)

In [6]:
df.to_csv("../datasets/dt_fe2.csv", index=False, sep=",")

In [7]:
df.dtypes

product_id                                   int64
customer_id                                  int64
periodo                                      int64
periodo_producto                           float64
nacimiento_producto                          int64
muerte_cliente                               int64
tn                                         float64
cust_request_tn                            float64
cust_request_qty                           float64
cat1                                        object
cat2                                        object
cat3                                        object
brand                                       object
sku_size                                   float64
stock_final                                float64
plan_precios_cuidados                      float64
periodo_dt                          datetime64[ns]
target                                     float64
month                                        int32
year                           

In [8]:
df.describe()

Unnamed: 0,product_id,customer_id,periodo,periodo_producto,nacimiento_producto,muerte_cliente,tn,cust_request_tn,cust_request_qty,sku_size,...,media_historica_cliente_producto,participacion_producto,stock_vs_venta,meses_desde_nacimiento,meses_hasta_muerte_cliente,productos_distintos_cliente_mes,total_cliente_mes,proporcion_producto_en_total_mes,ratio_entregado_sobre_pedido,tn_por_unidad
count,16998190.0,16998190.0,16998190.0,16998190.0,16998190.0,16998190.0,16998190.0,2940634.0,2940634.0,2933187.0,...,16998190.0,16998190.0,1126334.0,16998190.0,16998190.0,16998190.0,16998190.0,16998190.0,2940634.0,2933187.0
mean,20530.31,10283.05,201805.3,45.23226,201714.4,201906.4,0.08204286,0.4844765,2.181631,446.7299,...,0.08818597,0.00183097,844.4327,90.867,101.1014,864.4326,70.63342,0.0009701238,0.9975881,0.003468941
min,20001.0,10001.0,201701.0,0.00044,201701.0,201701.0,0.0,0.0001,1.0,1.0,...,0.0,0.0,-4584.258,0.0,0.0,781.0,0.0,0.0,0.0003785145,9.1e-07
25%,20235.0,10138.0,201709.0,2.35628,201701.0,201912.0,0.0,0.0104,1.0,90.0,...,0.0,0.0,28.62348,6.0,8.0,847.0,0.68137,0.0,0.9989837,5.507547e-05
50%,20488.0,10276.0,201806.0,9.99716,201701.0,201912.0,0.0,0.04013,1.0,240.0,...,0.0001257692,0.0,189.1654,102.0,104.0,856.0,6.5207,0.0,0.9997394,0.00019656
75%,20805.0,10422.0,201903.0,30.80259,201701.0,201912.0,0.0,0.16275,2.0,450.0,...,0.008088889,0.0,773.4967,194.0,201.0,897.0,31.53658,0.0,0.999936,0.0008057067
max,21299.0,10637.0,201912.0,2424.467,201909.0,201912.0,547.8785,551.5614,142.0,10000.0,...,243.7955,0.9999995,245333.7,211.0,211.0,932.0,4704.263,0.9999987,11.39297,20.51552
std,345.0224,169.217,81.45169,121.3034,42.11088,26.82285,1.398516,3.458918,3.686673,744.5894,...,1.15148,0.01335544,2240.954,78.93427,80.77464,42.70546,268.1095,0.009563166,0.02702053,0.05697891


In [None]:
def optimize_memory_usage(df, verbose=True, convert_bool=True):
    """
    Optimiza el uso de memoria de un DataFrame reduciendo tipos numéricos, booleanos, categóricos y fechas.
    
    Parámetros:
    -----------
    df : pd.DataFrame
        DataFrame original a optimizar.
    verbose : bool, opcional (default=True)
        Si True, muestra información sobre la reducción de memoria.
    convert_bool : bool, opcional (default=True)
        Si True, convierte columnas binarias a booleanas.
    
    Retorna:
    --------
    pd.DataFrame
        DataFrame optimizado con tipos de datos reducidos.
    """
    import pandas as pd
    import numpy as np

    df_optimized = df.copy()
    start_mem = df_optimized.memory_usage(deep=True).sum() / 1024**2

    for col in df_optimized.columns:
        col_type = df_optimized[col].dtypes

        if pd.api.types.is_numeric_dtype(col_type):
            col_min = df_optimized[col].min()
            col_max = df_optimized[col].max()

            if pd.api.types.is_integer_dtype(col_type):
                if col_min >= 0:
                    if col_max <= np.iinfo(np.uint8).max:
                        df_optimized[col] = df_optimized[col].astype(np.uint8)
                    elif col_max <= np.iinfo(np.uint16).max:
                        df_optimized[col] = df_optimized[col].astype(np.uint16)
                    elif col_max <= np.iinfo(np.uint32).max:
                        df_optimized[col] = df_optimized[col].astype(np.uint32)
                    else:
                        df_optimized[col] = df_optimized[col].astype(np.uint64)
                else:
                    if col_min >= np.iinfo(np.int8).min and col_max <= np.iinfo(np.int8).max:
                        df_optimized[col] = df_optimized[col].astype(np.int8)
                    elif col_min >= np.iinfo(np.int16).min and col_max <= np.iinfo(np.int16).max:
                        df_optimized[col] = df_optimized[col].astype(np.int16)
                    elif col_min >= np.iinfo(np.int32).min and col_max <= np.iinfo(np.int32).max:
                        df_optimized[col] = df_optimized[col].astype(np.int32)
                    else:
                        df_optimized[col] = df_optimized[col].astype(np.int64)

            elif pd.api.types.is_float_dtype(col_type):
                if not df_optimized[col].isnull().any():
                    if col_min >= np.finfo(np.float16).min and col_max <= np.finfo(np.float16).max:
                        df_optimized[col] = df_optimized[col].astype(np.float16)
                    elif col_min >= np.finfo(np.float32).min and col_max <= np.finfo(np.float32).max:
                        df_optimized[col] = df_optimized[col].astype(np.float32)
                    else:
                        df_optimized[col] = df_optimized[col].astype(np.float64)
                else:
                    # Con NaNs, evitamos float16
                    if col_min >= np.finfo(np.float32).min and col_max <= np.finfo(np.float32).max:
                        df_optimized[col] = df_optimized[col].astype(np.float32)

        elif pd.api.types.is_object_dtype(col_type):
            num_unique = df_optimized[col].nunique()
            num_total = len(df_optimized[col])
            if num_unique / num_total < 0.5:
                df_optimized[col] = df_optimized[col].astype('category')

        elif pd.api.types.is_bool_dtype(col_type):
            df_optimized[col] = df_optimized[col].astype('bool')

        elif convert_bool and df_optimized[col].dropna().nunique() == 2:
            # Convertir columnas binarias a booleanas si no lo son aún
            unique_vals = df_optimized[col].dropna().unique()
            if set(unique_vals) <= {0, 1} or set(unique_vals) <= {True, False}:
                df_optimized[col] = df_optimized[col].astype('bool')

        elif pd.api.types.is_datetime64_any_dtype(col_type):
            # Ya está optimizada
            continue

        elif col_type == 'object':
            try:
                parsed_dates = pd.to_datetime(df_optimized[col], errors='coerce')
                if parsed_dates.notna().sum() > 0.9 * len(df_optimized[col]):
                    df_optimized[col] = parsed_dates
            except Exception:
                pass

    end_mem = df_optimized.memory_usage(deep=True).sum() / 1024**2

    if verbose:
        print(f"Memoria inicial: {start_mem:.2f} MB")
        print(f"Memoria final:   {end_mem:.2f} MB")
        print(f"Reducción:       {100 * (start_mem - end_mem) / start_mem:.2f}%")

    return df_optimized
