In [1]:
# Librerías y Configuración

import pandas as pd
import numpy as np
import json
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Configuraciones generales: número de folds, semilla y opciones de validación

RANDOM_SEED = 42
N_FOLDS = 10
USE_ADVERSARIAL_VALIDATION = True
USE_PSEUDO_LABELING = True
PSEUDO_THRESHOLD = 0.95  # Solo pseudo-labels con alta confianza

In [5]:
# Carga de Datos

def load_jsonlines(file_path):
    # Lee los archivos JSONLines (un objeto JSON por línea) y los convierte en DataFrame.
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return pd.DataFrame(data)

print("Cargando datos...")
train = load_jsonlines('archivos-analisis-predictivo-2025q2/train_data.jsonlines')
test = load_jsonlines('archivos-analisis-predictivo-2025q2/test_data.jsonlines')

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Cargando datos...
Train shape: (70000, 45)
Test shape: (30000, 44)


In [6]:
# Validación Adversarial (Detectar diferencias train/test)

if USE_ADVERSARIAL_VALIDATION:
    # Combinar train y test para detectar diferencias entre ambos
    train_adv = train.copy()
    test_adv = test.copy()
    train_adv['is_test'] = 0
    test_adv['is_test'] = 1
    combined = pd.concat([train_adv, test_adv], ignore_index=True)

    # Generar features numéricas y categóricas simples
    adv_features = []

    for col in ['price', 'base_price', 'initial_quantity', 'sold_quantity', 'available_quantity']:
        combined[f'{col}_log'] = np.log1p(combined[col])
        adv_features.append(f'{col}_log') # Se aplican logs para normalizar distribuciones

    for col in ['site_id', 'category_id', 'listing_type_id', 'buying_mode']:
        le = LabelEncoder()
        combined[col] = le.fit_transform(combined[col].fillna('missing'))
        adv_features.append(col)

    # Features textuales básicas
    combined['title_length'] = combined['title'].str.len()
    combined['title_words'] = combined['title'].str.split().str.len()
    adv_features.extend(['title_length', 'title_words'])

    # Entrenar un modelo simple para predecir si una fila es del test
    X_adv = combined[adv_features].fillna(-999)
    y_adv = combined['is_test']

    lgb_adv = lgb.LGBMClassifier(n_estimators=100, random_state=RANDOM_SEED, verbose=-1)
    lgb_adv.fit(X_adv, y_adv)

    # AUC mide qué tan distintas son las distribuciones train/test
    adv_score = roc_auc_score(y_adv, lgb_adv.predict_proba(X_adv)[:, 1])
    print(f"Adversarial Validation AUC: {adv_score:.5f}")

    if adv_score > 0.7:
        print("WARNING: Train y Test son diferentes (riesgo de overfitting)")
    else:
        print("Train y Test son similares (AUC < 0.7)")

    # Identificar samples más parecidos al test set
    train_similarity = lgb_adv.predict_proba(X_adv[:len(train)])[:, 1]
    train['similarity_to_test'] = train_similarity
    print(f"\nSamples de train más parecidos al test: {(train_similarity > 0.7).sum()}")


Adversarial Validation AUC: 0.63183
Train y Test son similares (AUC < 0.7)

Samples de train más parecidos al test: 0


In [7]:
# Feature Engineering Avanzado

def extract_advanced_features(df):
    df = df.copy()

    # Numéricas básicas y logarítmicas
    df['price_log'] = np.log1p(df['price'])
    df['base_price_log'] = np.log1p(df['base_price'])
    df['price_diff'] = df['price'] - df['base_price']
    df['price_ratio'] = df['price'] / (df['base_price'] + 1)

    # Temporales
    # Convertir strings a fechas y generar diferencias de días
    df['start_time'] = pd.to_datetime(df['start_time'])
    df['stop_time'] = pd.to_datetime(df['stop_time'])
    df['last_updated'] = pd.to_datetime(df['last_updated'])
    df['date_created'] = pd.to_datetime(df['date_created'])

    df['listing_duration_days'] = (df['stop_time'] - df['start_time']).dt.days
    df['days_since_created'] = (df['last_updated'] - df['date_created']).dt.days
    df['start_month'] = df['start_time'].dt.month
    df['start_year'] = df['start_time'].dt.year
    df['start_dayofweek'] = df['start_time'].dt.dayofweek
    df['start_hour'] = df['start_time'].dt.hour

    # Ventas y stock
    df['has_sales'] = (df['sold_quantity'] > 0).astype(int)
    df['sales_rate'] = df['sold_quantity'] / (df['initial_quantity'] + 1)
    df['stock_level'] = df['available_quantity'] / (df['initial_quantity'] + 1)

    # Título
    df['title_length'] = df['title'].str.len()
    df['title_words'] = df['title'].str.split().str.len()
    df['title_upper_ratio'] = df['title'].apply(lambda x: sum(1 for c in x if c.isupper()) / (len(x) + 1))
    df['title_digit_ratio'] = df['title'].apply(lambda x: sum(1 for c in x if c.isdigit()) / (len(x) + 1))

    # Palabras clave específicas
    df['title_has_nuevo'] = df['title'].str.lower().str.contains('nuevo|new|0km', na=False).astype(int)
    df['title_has_usado'] = df['title'].str.lower().str.contains('usado|used', na=False).astype(int)
    df['title_has_garantia'] = df['title'].str.lower().str.contains('garantía|warranty', na=False).astype(int)
    df['title_has_original'] = df['title'].str.lower().str.contains('original', na=False).astype(int)

    # Target encoding
    # Encoding de category_id por tasa de "new"
    if 'condition' in df.columns:
        category_new_rate = df.groupby('category_id')['condition'].apply(lambda x: (x == 'new').mean())
        df['category_new_rate'] = df['category_id'].map(category_new_rate)

        seller_new_rate = df.groupby('seller_id')['condition'].apply(lambda x: (x == 'new').mean())
        df['seller_new_rate'] = df['seller_id'].map(seller_new_rate)

    # Vendedor
    df['is_official_store'] = df['official_store_id'].notna().astype(int)
    seller_counts = df['seller_id'].value_counts()
    df['seller_frequency'] = df['seller_id'].map(seller_counts)
    df['seller_state'] = df['seller_address'].apply(lambda x: x.get('state', {}).get('name') if isinstance(x, dict) else None)

    # Garantía
    df['has_warranty'] = df['warranty'].notna().astype(int)
    df['warranty_length'] = df['warranty'].fillna('').str.len()

    # Envíos
    df['shipping_free'] = df['shipping'].apply(lambda x: x.get('free_shipping', False) if isinstance(x, dict) else False).astype(int)
    df['shipping_local_pickup'] = df['shipping'].apply(lambda x: x.get('local_pick_up', False) if isinstance(x, dict) else False).astype(int)

    # Tags
    df['tags_count'] = df['tags'].apply(lambda x: len(x) if isinstance(x, list) else 0)
    df['has_good_quality_tag'] = df['tags'].apply(lambda x: 'good_quality_thumbnail' in x if isinstance(x, list) else False).astype(int)

    # Imágenes
    df['pictures_count'] = df['pictures'].apply(lambda x: len(x) if isinstance(x, list) else 0)

    # Atributos
    df['attributes_count'] = df['attributes'].apply(lambda x: len(x) if isinstance(x, list) else 0)

    def extract_attribute(attrs, attr_id):
        if isinstance(attrs, list):
            for attr in attrs:
                if attr.get('id') == attr_id:
                    return attr.get('value_name', None)
        return None

    df['brand'] = df['attributes'].apply(lambda x: extract_attribute(x, 'BRAND'))
    df['model'] = df['attributes'].apply(lambda x: extract_attribute(x, 'MODEL'))
    df['item_condition'] = df['attributes'].apply(lambda x: extract_attribute(x, 'ITEM_CONDITION'))

    # Booleanas
    df['accepts_mercadopago_int'] = df['accepts_mercadopago'].astype(int)
    df['automatic_relist_int'] = df['automatic_relist'].astype(int)
    df['has_video'] = df['video_id'].notna().astype(int)

    # Interacciones
    df['price_x_pictures'] = df['price_log'] * df['pictures_count']
    df['price_x_warranty'] = df['price_log'] * df['has_warranty']
    df['official_x_free_shipping'] = df['is_official_store'] * df['shipping_free']

    return df

print("Extrayendo features...")
train = extract_advanced_features(train)
test = extract_advanced_features(test)

# Aplicar target encoding del train al test
if 'category_new_rate' in train.columns:
    category_map = train.groupby('category_id')['category_new_rate'].first().to_dict()
    test['category_new_rate'] = test['category_id'].map(category_map).fillna(train['category_new_rate'].mean())

    seller_map = train.groupby('seller_id')['seller_new_rate'].first().to_dict()
    test['seller_new_rate'] = test['seller_id'].map(seller_map).fillna(train['seller_new_rate'].mean())


Extrayendo features...


In [9]:
# Encoding y preparación

cat_features = [
    'site_id', 'listing_type_id', 'buying_mode', 'category_id',
    'currency_id', 'status', 'sub_status', 'listing_source',
    'brand', 'model', 'item_condition', 'seller_state', 'tags' # Added tags here
]

num_features = [
    'price_log', 'base_price_log', 'price_diff', 'price_ratio',
    'initial_quantity', 'sold_quantity', 'available_quantity',
    'listing_duration_days', 'days_since_created',
    'start_month', 'start_year', 'start_dayofweek', 'start_hour',
    'has_sales', 'sales_rate', 'stock_level',
    'title_length', 'title_words', 'title_upper_ratio', 'title_digit_ratio',
    'title_has_nuevo', 'title_has_usado', 'title_has_garantia', 'title_has_original',
    'is_official_store', 'seller_frequency',
    'has_warranty', 'warranty_length',
    'shipping_free', 'shipping_local_pickup',
    'tags_count', 'has_good_quality_tag',
    'pictures_count', 'attributes_count',
    'accepts_mercadopago_int', 'automatic_relist_int', 'has_video',
    'price_x_pictures', 'price_x_warranty', 'official_x_free_shipping',
    'category_new_rate', 'seller_new_rate', 'seller_id'
]

# Convertir listas a strings para LabelEncoding
for col in ['sub_status', 'tags']:
    train[col] = train[col].apply(lambda x: ','.join(x) if isinstance(x, list) else str(x))
    test[col] = test[col].apply(lambda x: ','.join(x) if isinstance(x, list) else str(x))

# Label Encoding
for col in cat_features:
    le = LabelEncoder()
    train[col] = train[col].fillna('missing')
    test[col] = test[col].fillna('missing')
    all_values = pd.concat([train[col], test[col]]).unique()
    le.fit(all_values)
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# TF-IDF sobre títulos
# Convertir texto a representación numérica basada en frecuencia
print("Generando TF-IDF...")
tfidf = TfidfVectorizer(max_features=100, ngram_range=(1, 3), min_df=3, max_df=0.9)
train_tfidf = tfidf.fit_transform(train['title'].fillna(''))
test_tfidf = tfidf.transform(test['title'].fillna(''))

tfidf_features = [f'tfidf_{i}' for i in range(train_tfidf.shape[1])]
tfidf_df_train = pd.DataFrame(train_tfidf.toarray(), columns=tfidf_features)
tfidf_df_test = pd.DataFrame(test_tfidf.toarray(), columns=tfidf_features)

train = pd.concat([train.reset_index(drop=True), tfidf_df_train], axis=1)
test = pd.concat([test.reset_index(drop=True), tfidf_df_test], axis=1)

# Features finales
all_features = num_features + cat_features + tfidf_features

X = train[all_features].fillna(-999)
y = (train['condition'] == 'new').astype(int)
X_test = test[all_features].fillna(-999)

print(f"\nTotal features: {len(all_features)}")

Generando TF-IDF...

Total features: 156


In [10]:
# Entrenamiento con mpultiples seeds

# Defino 5 seeds distintas para repetir el proceso y asegurar estabilidad.
# Entrenar con distintas semillas reduce la varianza de los resultados y confirma
# que el modelo no depende del azar.
SEEDS = [42, 123, 456, 789, 2024]  # 5 seeds diferentes

all_oof_preds = []    # guardará las predicciones out-of-fold (validación) por seed
all_test_preds = []   # guardará las predicciones sobre test por seed

# Recorro cada semilla y repito el entrenamiento completo
for seed_idx, seed in enumerate(SEEDS):
    print("\n" + "="*70)
    print(f"SEED {seed_idx + 1}/{len(SEEDS)}: {seed}")
    print("="*70)

    # Uso validación cruzada estratificada (mantiene la proporción de clases)
    skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=seed)

    # Inicializo vectores vacíos para guardar resultados de cada modelo
    oof_lgb = np.zeros(len(X))
    oof_xgb = np.zeros(len(X))
    oof_cat = np.zeros(len(X))

    predictions_lgb = np.zeros(len(X_test))
    predictions_xgb = np.zeros(len(X_test))
    predictions_cat = np.zeros(len(X_test))

    # Entrenamiento con validación cruzada
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
        print(f"  Fold {fold}/{N_FOLDS}...", end=' ')

        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # MODELO 1: LightGBM
        lgb_params = {
            'objective': 'binary',    # problema de clasificación binaria
            'metric': 'auc',          # metrica AUC para medir el rendimiento
            'learning_rate': 0.02,    # tasa de aprendizaje baja (modelo más estable)
            'num_leaves': 45,
            'max_depth': 9,
            'min_child_samples': 30,
            'subsample': 0.85,
            'colsample_bytree': 0.85,
            'reg_alpha': 1.0,
            'reg_lambda': 1.0,
            'random_state': seed,
            'verbose': -1
        }

        # Creo datasets para LightGBM
        lgb_train = lgb.Dataset(X_tr, y_tr)
        lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train)

        # Entreno el modelo con early stopping → se detiene si no mejora por 100 rondas
        model_lgb = lgb.train(lgb_params,
                            lgb_train,
                            num_boost_round=5000,  # máximo de iteraciones (usualmente corta antes)
                            valid_sets=[lgb_val],
                            callbacks=[lgb.early_stopping(100), lgb.log_evaluation(0)]
        )

        # Guardo predicciones sobre validación y test
        oof_lgb[val_idx] = model_lgb.predict(X_val)
        predictions_lgb += model_lgb.predict(X_test) / N_FOLDS

        # MODELO 2: XGBoost
        model_xgb = xgb.XGBClassifier(
            learning_rate=0.02, max_depth=9, min_child_weight=5,
            subsample=0.85, colsample_bytree=0.85,
            reg_alpha=1.0, reg_lambda=1.0,
            n_estimators=2500, random_state=seed,
            tree_method='hist', eval_metric='auc'
        )
        model_xgb.fit(X_tr, y_tr)

        oof_xgb[val_idx] = model_xgb.predict_proba(X_val)[:, 1]
        predictions_xgb += model_xgb.predict_proba(X_test)[:, 1] / N_FOLDS

        # MODELO 3: CatBoost
        model_cat = CatBoostClassifier(
            iterations=2500, learning_rate=0.02, depth=9,
            l2_leaf_reg=7, random_state=seed,
            verbose=0, early_stopping_rounds=150
        )
        model_cat.fit(X_tr, y_tr, eval_set=(X_val, y_val), use_best_model=True)

        oof_cat[val_idx] = model_cat.predict_proba(X_val)[:, 1]
        predictions_cat += model_cat.predict_proba(X_test)[:, 1] / N_FOLDS

    # Ensemble de los 3 modelos
    # Combinar los 3 modelos con pesos similares
    # La idea es promediar sus errores y obtener una predicción más estable.
    oof_ensemble = 0.35 * oof_lgb + 0.35 * oof_xgb + 0.30 * oof_cat
    pred_ensemble = 0.35 * predictions_lgb + 0.35 * predictions_xgb + 0.30 * predictions_cat

    all_oof_preds.append(oof_ensemble)
    all_test_preds.append(pred_ensemble)

    # Calcular el AUC de la combinación
    print(f"  AUC: {roc_auc_score(y, oof_ensemble):.5f}")

# Promedio final entre todas las seeds
final_oof = np.mean(all_oof_preds, axis=0)
final_test_pred = np.mean(all_test_preds, axis=0)

print("\n" + "="*70)
print("RESULTADO FINAL CON MÚLTIPLES SEEDS")
print("="*70)
print(f"AUC Final OOF: {roc_auc_score(y, final_oof):.5f}")



SEED 1/5: 42
  Fold 1/10... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[460]	valid_0's auc: 0.998288
  Fold 2/10... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[296]	valid_0's auc: 0.998352
  Fold 3/10... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[387]	valid_0's auc: 0.998488
  Fold 4/10... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[362]	valid_0's auc: 0.998406
  Fold 5/10... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[521]	valid_0's auc: 0.998694
  Fold 6/10... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[443]	valid_0's auc: 0.998612
  Fold 7/10... Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[535]	valid_0's auc: 0

In [11]:
# Pseudo-Labeling

if USE_PSEUDO_LABELING:

    # Seleccionar predicciones del test más confiables
    confident_mask = (final_test_pred > PSEUDO_THRESHOLD) | (final_test_pred < (1 - PSEUDO_THRESHOLD))
    n_confident = confident_mask.sum()

    print(f"Samples confiables (>{PSEUDO_THRESHOLD} o <{1-PSEUDO_THRESHOLD}): {n_confident}/{len(X_test)}")

    if n_confident > 0:
        # Agrego esas muestras al train con las etiquetas estimadas
        X_pseudo = X_test[confident_mask]
        y_pseudo = (final_test_pred[confident_mask] > 0.5).astype(int)

        X_extended = pd.concat([X, X_pseudo], ignore_index=True)
        y_extended = pd.concat([y, pd.Series(y_pseudo)], ignore_index=True)

        print(f"Entrenando con {len(X_extended)} samples (original + pseudo)...")

        # Re-entrenar un modelo LightGBM final con los datos extendidos
        model_final = lgb.LGBMClassifier(
            n_estimators=1500, learning_rate=0.02,
            num_leaves=45, max_depth=9,
            subsample=0.85, colsample_bytree=0.85,
            reg_alpha=1.0, reg_lambda=1.0,
            random_state=RANDOM_SEED, verbose=-1
        )
        model_final.fit(X_extended, y_extended)

        final_test_pred = model_final.predict_proba(X_test)[:, 1]
        print("\nPseudo-labeling aplicado")

Samples confiables (>0.95 o <0.050000000000000044): 23042/30000
Entrenando con 93042 samples (original + pseudo)...

Pseudo-labeling aplicado


In [12]:
# Optimización del Threshold

from sklearn.metrics import precision_recall_curve

# Busco el threshold que maximiza el F1-score
precision, recall, thresholds = precision_recall_curve(y, final_oof)
f1_scores = 2 * (precision * recall) / (precision + recall + 1e-10)
best_threshold = thresholds[np.argmax(f1_scores)]

# Aplico el threshold óptimo a las predicciones
oof_pred_binary = (final_oof > best_threshold).astype(int)
print(f"Accuracy OOF: {accuracy_score(y, oof_pred_binary):.5f}")
print(f"Mejor threshold: {best_threshold:.4f}")


Accuracy OOF: 0.97980
Mejor threshold: 0.4836


In [13]:
# Generar archivo final de submission
submission = pd.DataFrame({
    'ID': pd.Series(range(1,len(test)+1)),
    'condition': (final_test_pred > best_threshold).astype(int)
})

# Mapear las etiquetas binarias a los nombres del dataset original
submission['condition'] = submission['condition'].map({0: 'used', 1: 'new'})

submission.to_csv('TREVISAN_submission3.csv', index=False)
print("Archivo de submission generado correctamente.")

Archivo de submission generado correctamente.
