In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

In [10]:
# 1. FUNCI√ìN DE DIAGN√ìSTICO
def diagnostico_rapido(df, target='target_variable'):
    # 1. Check desbalanceo
    y = df[target]
    balance_ratio = y.value_counts().min() / len(y)
    print(f"üìä Balanceo: {balance_ratio:.2%}")
    print(f"Distribuci√≥n target:\n{y.value_counts()}")
    
    # 2. Check correlaciones con target
    correlations = df.corr()[target].abs().sort_values(ascending=False)
    print(f"\nüîó Top 5 correlaciones con target:")
    print(correlations.head(6))
    
    # 3. Modelo baseline r√°pido
    from sklearn.ensemble import RandomForestClassifier
    
    X = df.drop(columns=[target, 'id'] if 'id' in df.columns else [target])
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    
    scores = cross_val_score(rf, X, y, cv=5, scoring='f1')
    print(f"üéØ Baseline F1: {scores.mean():.3f}")
    
    return balance_ratio < 0.3, correlations

# 2. FEATURE ENGINEERING ESPEC√çFICO
def features_especificos_ventas(df):
    X = df.copy()
    
    # Eliminar columnas que no son features
    if 'target_variable' in X.columns:
        X = X.drop('target_variable', axis=1)
    if 'id' in X.columns:
        X = X.drop('id', axis=1)
    
    # 1. RATIOS CLAVE
    X['eficiencia_ventas'] = X['cust_hitrate'] / (X['cust_interactions'] + 1)
    X['valor_cliente'] = X['cust_contracts'] * X['cust_hitrate']
    
    # 2. INTERACCIONES PRODUCTO-CLIENTE
    X['product_A_affinity'] = X['product_A_sold_in_the_past'] * X['cust_hitrate']
    X['product_B_affinity'] = X['product_B_sold_in_the_past'] * X['cust_interactions']
    
    # 3. COMPETENCIA AGREGADA
    competitor_cols = [col for col in X.columns if 'competitor' in col]
    if competitor_cols:
        X['intensidad_competencia'] = X[competitor_cols].sum(axis=1)
        X['mercado_competitivo'] = (X['intensidad_competencia'] > 1).astype(int)
    
    # 4. ESTACIONALIDAD MEJORADA
    if 'opp_month' in X.columns:
        X['es_fin_mes'] = (X['opp_month'] > 0.5).astype(int)
    
    print(f"‚úÖ Features creados. Total: {X.shape[1]} variables")
    return X

# 3. BALANCEO INTELIGENTE
def balanceo_inteligente(X, y):
    # Solo si hay desbalanceo significativo
    if y.value_counts().min() / len(y) < 0.3:
        smote = SMOTE(random_state=42, k_neighbors=3)
        X_bal, y_bal = smote.fit_resample(X, y)
        print("‚úÖ Datos balanceados con SMOTE")
        return X_bal, y_bal
    else:
        print("‚öñÔ∏è Desbalanceo m√≠nimo, no se aplica balanceo")
        return X, y

# 4. OPTIMIZACI√ìN R√ÅPIDA
def optimizacion_rapida(X, y):
    # XGBoost con par√°metros b√°sicos optimizados
    xgb_opt = XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        scale_pos_weight=len(y[y==0])/len(y[y==1]) if len(y[y==1]) > 0 else 1
    )
    
    # Cross-validation
    scores = cross_val_score(xgb_opt, X, y, cv=5, scoring='f1')
    print(f"üéØ XGBoost optimizado - F1 CV: {scores.mean():.3f} (+/- {scores.std():.3f})")
    
    # Entrenar modelo final
    xgb_opt.fit(X, y)
    return xgb_opt

# 5. PIPELINE MEJORADO COMPLETO
def pipeline_mejorado_f1(df):
    print("üöÄ INICIANDO PIPELINE DE MEJORA...")
    
    # 1. DIAGN√ìSTICO
    print("\n" + "="*50)
    print("1. DIAGN√ìSTICO INICIAL")
    print("="*50)
    needs_balance, top_correlations = diagnostico_rapido(df)
    
    # 2. FEATURE ENGINEERING
    print("\n" + "="*50)
    print("2. FEATURE ENGINEERING")
    print("="*50)
    X_mejorado = features_especificos_ventas(df)
    y = df['target_variable']
    
    # 3. BALANCEO
    print("\n" + "="*50)
    print("3. BALANCEO DE CLASES")
    print("="*50)
    X_bal, y_bal = balanceo_inteligente(X_mejorado, y)
    
    # 4. DIVISI√ìN TRAIN/TEST
    X_train, X_test, y_train, y_test = train_test_split(
        X_bal, y_bal, test_size=0.2, random_state=42, stratify=y_bal
    )
    
    print(f"üìä Conjunto de entrenamiento: {X_train.shape}")
    print(f"üìä Conjunto de test: {X_test.shape}")
    
    # 5. MODELO OPTIMIZADO
    print("\n" + "="*50)
    print("4. ENTRENAMIENTO DEL MODELO")
    print("="*50)
    modelo_final = optimizacion_rapida(X_train, y_train)
    
    # 6. EVALUACI√ìN FINAL
    print("\n" + "="*50)
    print("5. EVALUACI√ìN FINAL")
    print("="*50)
    y_pred = modelo_final.predict(X_test)
    f1 = f1_score(y_test, y_pred)
    
    print(f"üéØ F1-score MEJORADO: {f1:.3f}")
    print(f"üìà Comparaci√≥n: 0.71 (anterior) ‚Üí {f1:.3f} (nuevo)")
    print(f"üí™ Mejora: {(f1 - 0.71) / 0.71 * 100:+.1f}%")
    
    print("\nüìã Classification Report:")
    print(classification_report(y_test, y_pred))
    
    # 7. IMPORTANCIA DE FEATURES
    print("\n" + "="*50)
    print("6. IMPORTANCIA DE VARIABLES")
    print("="*50)
    feature_importance = pd.DataFrame({
        'feature': X_mejorado.columns,
        'importance': modelo_final.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("Top 10 features m√°s importantes:")
    print(feature_importance.head(10))
    
    return modelo_final, X_mejorado

# USO FINAL
if __name__ == "__main__":
    # Cargar tus datos (ajusta la ruta)
    df = pd.read_csv('dataset.csv')  # Cambia por tu ruta real
    
    # Ejecutar pipeline completo
    modelo_final, features_mejorados = pipeline_mejorado_f1(df)
    
    print("\n" + "üéâ PIPELINE COMPLETADO!" + "üéâ")

üöÄ INICIANDO PIPELINE DE MEJORA...

1. DIAGN√ìSTICO INICIAL
üìä Balanceo: 47.05%
Distribuci√≥n target:
target_variable
0    19009
1    16890
Name: count, dtype: int64

üîó Top 5 correlaciones con target:
target_variable      1.000000
cust_hitrate         0.280532
opp_old              0.172064
cust_interactions    0.106199
cust_contracts       0.089844
cust_in_iberia       0.080362
Name: target_variable, dtype: float64
üéØ Baseline F1: 0.832

2. FEATURE ENGINEERING
‚úÖ Features creados. Total: 22 variables

3. BALANCEO DE CLASES
‚öñÔ∏è Desbalanceo m√≠nimo, no se aplica balanceo
üìä Conjunto de entrenamiento: (28719, 22)
üìä Conjunto de test: (7180, 22)

4. ENTRENAMIENTO DEL MODELO
üéØ XGBoost optimizado - F1 CV: 0.753 (+/- 0.007)

5. EVALUACI√ìN FINAL
üéØ F1-score MEJORADO: 0.767
üìà Comparaci√≥n: 0.71 (anterior) ‚Üí 0.767 (nuevo)
üí™ Mejora: +8.0%

üìã Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.74      0.77  