In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import lightgbm as lgb

In [2]:
df = pd.read_parquet('../../datasets/dt_fe_class.parquet')

In [3]:
from reducirDT import optimize_memory_usage
df = optimize_memory_usage(df)

Memoria inicial: 2941.57 MB
Memoria final:   1842.91 MB
Reducción:       37.35%


In [10]:
ultimos_12_periodos = df['periodo'].unique()[-12:]  # Identificamos los últimos 12 períodos

suma_mensual  = (
    df.query("periodo in @ultimos_12_periodos & tn > 0")
    .groupby(['periodo', 'product_id', 'customer_id'], as_index=False)
    ['tn'].sum()
)
moda = (
    suma_mensual.groupby(['product_id', "customer_id"], as_index=False)
    ['tn'].agg(lambda x: x.mode().mean())  # Promedio de los modos si hay varios
    .rename(columns={'tn': 'moda'})
)
moda

Unnamed: 0,product_id,customer_id,moda
0,20001,10001,214.875000
1,20001,10002,51.281250
2,20001,10003,132.875000
3,20001,10004,170.375000
4,20001,10005,12.609375
...,...,...,...
244159,21276,10428,0.003345
244160,21276,10456,0.005196
244161,21276,10462,0.000750
244162,21276,10495,0.001480


In [11]:
moda["moda"].sum()

52060.0

In [12]:
# Codificar categóricas
import numpy as np
cat_cols = ['cat1', 'cat2', 'cat3', 'brand', 'plan_precios_cuidados']
for col in cat_cols:
    if col in df.columns:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col].astype(str))
        

df = df.drop(['periodo_dt'], axis=1, errors='ignore')
datetime_cols = df.select_dtypes(include=['datetime', 'datetime64']).columns
for col in datetime_cols:
    df[col] = df[col].astype(np.int64)  # converts to nanoseconds
    
df_kgl = df[df["periodo"] == 201912]
df = df[~df["periodo"].isin([201911, 201912])]
# Separar features y target
X = df.drop(columns=["target"])
y = df["target"].fillna(0).astype(int)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False)


In [13]:

df_kgl.shape

(528942, 37)

In [14]:
y.isna().sum()

0

In [15]:

# Entrenar una vez el modelo con esos parámetros (ej. sobre X_train si no querés usar todo)
model = lgb.train(
    #best_params,
    {
    "objective": "binary",
    "metric": ["auc", "binary_logloss"],
    "scale_pos_weight": 5.32,
    "verbosity": -1,
    "n_jobs": -1,
    "seed": 42
},
    lgb.Dataset(X_train, label=y_train),
    valid_sets=[lgb.Dataset(X_val, label=y_val)],
    num_boost_round=1000,
    #early_stopping_rounds=50
)




Predicciones para periodo 201912:
[1.         1.         1.         ... 0.00670654 0.00425692 0.00425692]


In [16]:
# Esto devuelve probabilidades entre 0 y 1
preds_proba = model.predict(X_val)
from sklearn.metrics import f1_score
import numpy as np

best_thresh = 0.5
best_f1 = 0

for thresh in np.arange(0.1, 0.9, 0.01):
    preds_label = (preds_proba >= thresh).astype(int)
    f1 = f1_score(y_val, preds_label)
    if f1 > best_f1:
        best_f1 = f1
        best_thresh = thresh

print(f"Mejor umbral: {best_thresh:.2f} con F1-score: {best_f1:.4f}")


Mejor umbral: 0.73 con F1-score: 0.6430


In [17]:
from sklearn.metrics import classification_report, confusion_matrix
# Calcular métricas de clasificación
y_pred = (preds_proba >= best_thresh).astype(int)
print("Reporte de clasificación:")
print(classification_report(y_val, y_pred))
print("Matriz de confusión:")
print(confusion_matrix(y_val, y_pred))

Reporte de clasificación:
              precision    recall  f1-score   support

           0       0.97      0.96      0.97   3222624
           1       0.60      0.69      0.64    280334

    accuracy                           0.94   3502958
   macro avg       0.79      0.82      0.80   3502958
weighted avg       0.94      0.94      0.94   3502958

Matriz de confusión:
[[3096253  126371]
 [  87643  192691]]


In [18]:
#accuracy 
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9389


In [19]:
final_preds = (preds_proba >= best_thresh).astype(int)

In [20]:
# Asegurar las mismas columnas
X_kgl = df_kgl[X.columns]  # Misma estructura

# === 9. Hacer predicción sobre nuevos datos ===
preds_kgl = model.predict(X_kgl)
final_preds = (preds_kgl >= best_thresh).astype(int)
# Mostrar o guardar resultados
print("Predicciones para periodo 201912:")
print(final_preds)

Predicciones para periodo 201912:
[1 1 1 ... 0 0 0]


In [21]:
productos_ok = pd.read_csv("https://storage.googleapis.com/open-courses/austral2025-af91/labo3v/product_id_apredecir201912.txt", sep="\t")

result = X_kgl.copy()
result["preds"] = final_preds
result = result[result["product_id"].isin(productos_ok["product_id"])]
result = result.merge(moda, on=["product_id", "customer_id"], how="left")
result["tn_pred"] = result["preds"] * result["moda"]
result 

Unnamed: 0,product_id,customer_id,periodo,periodo_producto,nacimiento_producto,tn,cust_request_tn,cust_request_qty,cat1,cat2,...,consecutive_months_0,rolling_3m_mean,rolling_6m_mean,rolling_12m_mean,annual_trend,proporcion_producto_en_total_mes,total_cliente_mes,preds,moda,tn_pred
0,20001,10001,201912,1505.000000,201701,180.250000,214.721848,18.0,1,10,...,0,197.625000,146.750000,214.750000,2.255859,0.076056,2370.0,1,214.875000,214.875000
1,20001,10002,201912,1505.000000,201701,113.312500,115.303223,20.0,1,10,...,0,58.781250,47.593750,47.031250,2.255859,0.038260,2962.0,1,51.281250,51.281250
2,20001,10003,201912,1505.000000,201701,102.250000,113.981369,9.0,1,10,...,0,88.125000,133.000000,121.875000,2.255859,0.099369,1029.0,1,132.875000,132.875000
3,20001,10004,201912,1505.000000,201701,34.656250,34.648102,8.0,1,10,...,0,185.125000,194.750000,170.375000,2.255859,0.051810,669.0,1,170.375000,170.375000
4,20001,10005,201912,1505.000000,201701,19.609375,19.603680,25.0,1,10,...,0,16.328125,11.671875,10.507812,2.255859,0.027486,713.5,1,12.609375,12.609375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465655,21276,10633,201912,0.008919,201903,0.000000,,,4,15,...,0,0.000000,0.000000,0.000000,0.000015,,0.0,0,,
465656,21276,10634,201912,0.008919,201903,0.000000,,,4,15,...,0,0.000000,0.000000,0.000000,0.000015,,0.0,0,,
465657,21276,10635,201912,0.008919,201903,0.000000,,,4,15,...,0,0.000000,0.000000,0.000000,0.000015,,0.0,0,,
465658,21276,10636,201912,0.008919,201903,0.000000,,,4,15,...,0,0.000000,0.000000,0.000000,0.000015,,0.0,0,,


In [19]:
result = result.groupby("product_id", as_index=False).agg({"tn_pred": "sum"})
result

Unnamed: 0,product_id,tn_pred
0,20001,1656.496773
1,20002,1390.286888
2,20003,959.997685
3,20004,848.242962
4,20005,924.177352
...,...,...
775,21263,0.035743
776,21265,0.125913
777,21266,0.077428
778,21267,0.104742


In [23]:
final_preds

array([1, 1, 1, ..., 0, 0, 0])

In [22]:
result.tn_pred.sum()

45448.595096349716

In [21]:
result.to_csv("../../results/clasificacion_mean12.csv", index=False)

In [None]:


moda_tn = df["tn"].mode()[0]
X_kgl["moda"] = moda_tn  # Asignar moda de tn a los datos de 201912
X_kgl["ypred"] = preds_kgl
X_kgl["ypred"] = X_kgl["ypred"].fillna(0)
X_kgl["tn"] = X_kgl["moda"] * X_kgl["ypred"]

# Agrupar por product_id y sumar tn
X_kgl = X_kgl[["product_id", "tn", "ypred"]]
# Asegurar que los productos a predecir estén en el DataFrame
X_kgl = X_kgl[X_kgl["product_id"].isin(productos_ok["product_id"])]

result = pd.DataFrame({"product_id": X_kgl["product_id"], "tn": X_kgl["tn"],  "ypred": preds_kgl})
result["tn"] = result["ypred"] + result["tn"]
result = result[result["product_id"].isin(productos_ok["product_id"])]
result = result.groupby("product_id").agg({"tn":"sum"}).reset_index()
result
