In [None]:
import os
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
# %pip install catboost
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import joblib

# Deep Learning for Text
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import Dataset, DataLoader

# Run banner: title framed by two 80-character rules.
rule = "=" * 80
print(rule, "TOP 10 STRATEGY - ADVANCED MULTI-MODAL ENSEMBLE", rule, sep="\n")

# Read both competition CSVs from the dataset folder one level above the notebook.
DATASET_FOLDER = '../dataset/'
train, test = (
    pd.read_csv(os.path.join(DATASET_FOLDER, csv_name))
    for csv_name in ('train.csv', 'test.csv')
)

# Row counts, then summary statistics of the regression target.
print("Train: {:,} | Test: {:,}".format(len(train), len(test)))
print("\nPrice Distribution:")
print(train['price'].describe())

In [None]:
"""
AMAZON ML CHALLENGE - TOP 5 SOLUTION
Complete pipeline in one file
Expected SMAPE: 45-50% → Target: < 47% for TOP 5
"""

import os
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import joblib

# Run banner.
banner_rule = "=" * 80
print(banner_rule)
print("AMAZON ML CHALLENGE - TOP 5 COMPLETE SOLUTION")
print(banner_rule)

# ============================================================================
# STEP 1: LOAD DATA
# ============================================================================
print("\n[1/6] Loading data...")
# NOTE(review): the first cell reads from '../dataset/' while this one reads
# from 'dataset/' - confirm which location is intended.
DATASET_FOLDER = 'dataset/'

train, test = [
    pd.read_csv(os.path.join(DATASET_FOLDER, filename))
    for filename in ('train.csv', 'test.csv')
]

# Sample counts per split, followed by a summary of the target column.
for split_label, frame in (("Train", train), ("Test", test)):
    print("{}: {:,} samples".format(split_label, len(frame)))
print("\nPrice Distribution (Train):")
print(train['price'].describe())

# ============================================================================
# STEP 2: ADVANCED FEATURE ENGINEERING
# ============================================================================
print("\n[2/6] Extracting features (this takes 2-3 minutes)...")

# ---------------------------------------------------------------------------
# Lookup tables and compiled patterns used by extract_all_features.
# They are built once here instead of being rebuilt for every row.
# ---------------------------------------------------------------------------

# Item-pack-quantity patterns, tried in order; the first in-range hit wins.
_IPQ_PATTERNS = tuple(re.compile(p) for p in (
    r'ipq[:\s]*(\d+)', r'pack[:\s]*of[:\s]*(\d+)', r'(\d+)[:\s]*pack',
    r'quantity[:\s]*(\d+)', r'set[:\s]*of[:\s]*(\d+)', r'(\d+)[:\s]*piece',
))
_NUMBER_RE = re.compile(r'\d+\.?\d*')
_STORAGE_RE = re.compile(r'(\d+)\s*(gb|tb|mb)')
_RAM_RE = re.compile(r'(\d+)\s*gb\s*ram')

# Hand-assigned brand weights; a higher score marks a pricier brand.
_BRAND_SCORES = {
    'apple': 5, 'samsung': 4, 'sony': 4, 'lg': 3, 'dell': 3, 'hp': 3,
    'lenovo': 3, 'asus': 3, 'microsoft': 5, 'google': 4, 'nike': 4,
    'adidas': 4, 'puma': 3, 'canon': 4, 'nikon': 4, 'bose': 5,
    'bosch': 4, 'philips': 3, 'xiaomi': 2, 'oneplus': 3
}
# Brands that additionally get their own indicator column.
_TOP_BRANDS = ('apple', 'samsung', 'sony', 'nike', 'microsoft', 'canon', 'bose')

_CATEGORY_KEYWORDS = {
    'electronics': ['phone', 'laptop', 'tablet', 'computer', 'tv', 'camera',
                    'headphone', 'speaker', 'watch', 'earphone', 'monitor'],
    'clothing': ['shirt', 'pant', 'dress', 'jean', 'jacket', 'shoe', 'sneaker'],
    'home': ['furniture', 'table', 'chair', 'sofa', 'bed', 'lamp'],
    'kitchen': ['cookware', 'pan', 'pot', 'knife', 'blender', 'oven'],
    'beauty': ['cosmetic', 'perfume', 'makeup', 'skincare', 'shampoo'],
    'sports': ['fitness', 'gym', 'yoga', 'dumbbell', 'cycle', 'ball'],
    'books': ['book', 'novel', 'textbook', 'guide'],
    'toys': ['toy', 'game', 'puzzle', 'doll']
}

_PREMIUM_WORDS = ('premium', 'luxury', 'pro', 'plus', 'ultra', 'max', 'deluxe',
                  'elite', 'supreme', 'professional', 'advanced')
_BUDGET_WORDS = ('basic', 'standard', 'lite', 'mini', 'essential', 'simple')

_SPECIAL_KEYWORDS = {
    'wireless': ['wireless', 'bluetooth', 'wifi'],
    'original': ['original', 'genuine'],
    'warranty': ['warranty', 'guarantee'],
    'waterproof': ['waterproof', 'water-resistant'],
    'rechargeable': ['rechargeable', 'battery'],
    'smart': ['smart', 'ai'],
    'hd': ['hd', '4k', 'uhd']
}

_EXPENSIVE_MATERIALS = ('leather', 'gold', 'silver', 'steel', 'aluminum', 'titanium', 'wood')
_CHEAP_MATERIALS = ('plastic', 'rubber', 'synthetic')
_NEW_WORDS = ('new', 'latest', '2024', '2025')
_COLORS = ('black', 'white', 'red', 'blue', 'green')
_SIZES = ('small', 'medium', 'large', 'xl')


def _row_features(raw):
    """Build the feature dict for a single catalog entry.

    Args:
        raw: The ``catalog_content`` value of one row. A missing value
            (None / NaN) is treated as an empty string rather than being
            stringified into the literal text "nan" or "none".

    Returns:
        dict mapping feature name to value. Key insertion order is fixed, so
        every row yields the same column order.

    NOTE(review): every keyword test below is a plain substring check on the
    lower-cased text, so short keywords also fire inside longer words ('pro'
    in 'product', 'hp' in 'shipping', 'ai' in 'air'). That behavior is kept
    as-is here; tightening it would change the features and needs validating
    against the CV score.
    """
    original = '' if pd.isna(raw) else str(raw)
    text = original.lower()

    feat = {}

    # === TEXT STATS ===
    words = text.split()
    feat['text_len'] = len(text)
    feat['word_count'] = len(words)
    feat['unique_words'] = len(set(words))
    # max(..., 1) guards the ratios against empty text.
    feat['avg_word_len'] = feat['text_len'] / max(feat['word_count'], 1)
    feat['capital_ratio'] = sum(1 for c in original if c.isupper()) / max(len(original), 1)
    feat['digit_ratio'] = sum(1 for c in text if c.isdigit()) / max(len(text), 1)

    # === IPQ (item pack quantity) ===
    ipq = 1
    for pattern in _IPQ_PATTERNS:
        match = pattern.search(text)
        if not match:
            continue
        try:
            val = int(match.group(1))
        except ValueError:
            # Only reachable for absurdly long digit runs; try the next pattern.
            continue
        if 1 <= val <= 100:
            ipq = val
            break

    feat['ipq'] = ipq
    feat['ipq_log'] = np.log1p(ipq)
    feat['ipq_sqrt'] = np.sqrt(ipq)
    feat['ipq_sq'] = ipq ** 2
    feat['is_multipack'] = 1 if ipq > 1 else 0

    # === NUMBERS ===
    # Every numeric token strictly between 0 and 1,000,000.
    numbers = [v for v in map(float, _NUMBER_RE.findall(text)) if 0 < v < 1000000]
    if numbers:
        feat['num_count'] = len(numbers)
        feat['num_max'] = max(numbers)
        feat['num_min'] = min(numbers)
        feat['num_mean'] = np.mean(numbers)
        feat['num_median'] = np.median(numbers)
        feat['num_std'] = np.std(numbers) if len(numbers) > 1 else 0
        feat['num_sum'] = sum(numbers)
        feat['num_range'] = max(numbers) - min(numbers)
    else:
        for k in ['num_count', 'num_max', 'num_min', 'num_mean', 'num_median',
                  'num_std', 'num_sum', 'num_range']:
            feat[k] = 0

    # === STORAGE (GB/TB/MB), summed over all mentions and expressed in GB ===
    storage_gb = 0
    for match in _STORAGE_RE.finditer(text):
        val = int(match.group(1))
        unit = match.group(2)
        if unit == 'tb':
            val *= 1000
        elif unit == 'mb':
            val *= 0.001
        storage_gb += val

    feat['storage_gb'] = storage_gb
    feat['storage_log'] = np.log1p(storage_gb)
    feat['has_storage'] = 1 if storage_gb > 0 else 0

    # RAM: first "<n> gb ram" mention, else 0.
    ram_match = _RAM_RE.search(text)
    feat['ram_gb'] = int(ram_match.group(1)) if ram_match else 0

    # === BRANDS ===
    feat['brand_score'] = sum(score for brand, score in _BRAND_SCORES.items() if brand in text)
    feat['brand_count'] = sum(1 for brand in _BRAND_SCORES if brand in text)
    feat['has_premium_brand'] = 1 if feat['brand_score'] >= 4 else 0

    for brand in _TOP_BRANDS:
        feat[f'brand_{brand}'] = 1 if brand in text else 0

    # === CATEGORIES ===
    cat_scores = {}
    for cat_name, keywords in _CATEGORY_KEYWORDS.items():
        count = sum(1 for kw in keywords if kw in text)
        feat[f'cat_{cat_name}'] = 1 if count > 0 else 0
        cat_scores[cat_name] = count

    # Primary category: most keyword hits (ties go to the category listed
    # first); 'other' when nothing matched.
    feat['primary_category'] = (
        max(cat_scores, key=cat_scores.get) if max(cat_scores.values()) > 0 else 'other'
    )

    # === QUALITY ===
    feat['premium_count'] = sum(1 for w in _PREMIUM_WORDS if w in text)
    feat['budget_count'] = sum(1 for w in _BUDGET_WORDS if w in text)
    feat['quality_score'] = feat['premium_count'] - feat['budget_count']
    feat['is_premium'] = 1 if feat['premium_count'] > feat['budget_count'] else 0

    # === SPECIAL FEATURES ===
    for feat_name, keywords in _SPECIAL_KEYWORDS.items():
        feat[f'feat_{feat_name}'] = 1 if any(kw in text for kw in keywords) else 0

    # === MATERIALS ===
    feat['expensive_mat'] = 1 if any(m in text for m in _EXPENSIVE_MATERIALS) else 0
    feat['cheap_mat'] = 1 if any(m in text for m in _CHEAP_MATERIALS) else 0

    # === MISC ===
    feat['is_new'] = 1 if any(w in text for w in _NEW_WORDS) else 0
    feat['color_count'] = sum(1 for c in _COLORS if c in text)
    feat['has_size'] = 1 if any(s in text for s in _SIZES) else 0

    return feat


def extract_all_features(df):
    """Extract handcrafted text features from every row of ``df``.

    Args:
        df: DataFrame with a ``catalog_content`` column holding the product
            text. Missing entries are treated as empty text.

    Returns:
        DataFrame with one row per input row (fresh 0..n-1 index) and one
        column per feature, including the string column ``primary_category``
        that the caller is expected to encode.
    """
    # Iterate the single needed column directly; iterrows() would build a
    # full Series for each row only to read one field from it.
    contents = df['catalog_content']
    return pd.DataFrame([
        _row_features(raw)
        for raw in tqdm(contents, total=len(df), desc="Processing")
    ])

# Extract features
train_features = extract_all_features(train)
test_features = extract_all_features(test)

# Encode categorical.
# The encoder is fitted on the categories of BOTH splits: LabelEncoder.transform
# raises on a label it has not seen, so a category present only in test would
# otherwise abort the run. When test adds no new category the codes are
# identical to fitting on train alone.
le = LabelEncoder()
le.fit(pd.concat([train_features['primary_category'],
                  test_features['primary_category']], ignore_index=True))
train_features['primary_category_encoded'] = le.transform(train_features['primary_category'])
test_features['primary_category_encoded'] = le.transform(test_features['primary_category'])

# Drop the raw string column now that it is encoded (reassign rather than mutate in place).
train_features = train_features.drop(columns='primary_category')
test_features = test_features.drop(columns='primary_category')

print(f"✓ Extracted {train_features.shape[1]} handcrafted features")

# ============================================================================
# STEP 3: TF-IDF TEXT FEATURES
# ============================================================================
print("\n[3/6] TF-IDF text vectorization...")

# Number of SVD components kept from the TF-IDF matrix.
N_TFIDF_COMPONENTS = 120

# TfidfVectorizer rejects NaN documents, whereas the feature extractor above
# tolerates missing text; normalise to plain strings so both behave the same.
train_text = train['catalog_content'].fillna('').astype(str)
test_text = test['catalog_content'].fillna('').astype(str)

tfidf = TfidfVectorizer(
    max_features=400,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)

tfidf_train = tfidf.fit_transform(train_text)
tfidf_test = tfidf.transform(test_text)

# Reduce dimensions
svd = TruncatedSVD(n_components=N_TFIDF_COMPONENTS, random_state=42)
tfidf_train_red = svd.fit_transform(tfidf_train)
tfidf_test_red = svd.transform(tfidf_test)

# Column names follow the actual width of the reduced matrix.
tfidf_columns = [f'tfidf_{i}' for i in range(tfidf_train_red.shape[1])]
tfidf_train_df = pd.DataFrame(tfidf_train_red, columns=tfidf_columns)
tfidf_test_df = pd.DataFrame(tfidf_test_red, columns=tfidf_columns)

print(f"✓ TF-IDF: {tfidf_train_df.shape[1]} features")

# ============================================================================
# STEP 4: COMBINE ALL FEATURES
# ============================================================================
print("\n[4/6] Combining features...")

X_train = pd.concat([train_features.reset_index(drop=True), tfidf_train_df], axis=1)
X_test = pd.concat([test_features.reset_index(drop=True), tfidf_test_df], axis=1)
y_train = train['price'].values

# The models are fed positional arrays, so train and test must share the exact
# same column order, and features must line up row-for-row with the target.
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"
assert len(X_train) == len(y_train), "feature rows do not match target rows"

print(f"✓ Total features: {X_train.shape[1]}")

# ============================================================================
# STEP 5: TRAIN ADVANCED ENSEMBLE
# ============================================================================
print("\n[5/6] Training advanced ensemble (7-fold CV)...")

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    an element where both values are zero contributes 0.

    Args:
        y_true: Actual values (array, list or Series).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The mean of the per-element terms multiplied by 100.
    """
    # Accept lists / Series as well as arrays; the arithmetic below needs ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    # Divide only where the denominator is non-zero; the remaining entries keep
    # their initial 0.0, so no 0/0 is ever evaluated.
    diff = np.zeros_like(denominator)
    np.divide(np.abs(y_true - y_pred), denominator, out=diff, where=denominator != 0)
    return 100 * np.mean(diff)

class TopEnsemble:
    """Cross-validated blend of LightGBM and XGBoost regressors.

    ``train`` fits one LightGBM and one XGBoost model per fold on standardized
    features; ``predict`` averages the weighted blend of every fold's pair.
    """

    # Blend weights applied to the two models' predictions.
    LGB_WEIGHT = 0.55
    XGB_WEIGHT = 0.45
    # Boosting rounds without validation improvement before training stops.
    EARLY_STOPPING_ROUNDS = 100

    def __init__(self):
        self.lgb_models = []
        self.xgb_models = []
        self.scaler = StandardScaler()

    def _blend(self, lgb_pred, xgb_pred):
        """Weighted average of the two models' predictions."""
        return self.LGB_WEIGHT * lgb_pred + self.XGB_WEIGHT * xgb_pred

    def train(self, X, y, n_folds=7):
        """Fit the per-fold models and return out-of-fold predictions.

        Args:
            X: 2-D feature matrix.
            y: Target values, one per row of ``X``.
            n_folds: Number of shuffled K-fold splits.

        Returns:
            Array of blended out-of-fold predictions, one per row of ``X``.

        Note:
            The scaler is fitted on all rows before splitting, and each
            validation fold doubles as the early-stopping set, so the printed
            SMAPE is somewhat optimistic.
        """
        # Fold indices are positional; a pandas Series would treat them as labels.
        y = np.asarray(y)
        # Start clean so a repeated call does not mix in models that were
        # fitted against a previously fitted scaler.
        self.lgb_models = []
        self.xgb_models = []

        X_scaled = self.scaler.fit_transform(X)
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
        oof_predictions = np.zeros(len(X_scaled))

        for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
            print(f"\nFold {fold + 1}/{n_folds}")

            X_tr, X_val = X_scaled[train_idx], X_scaled[val_idx]
            y_tr, y_val = y[train_idx], y[val_idx]

            # LightGBM
            lgb_model = lgb.LGBMRegressor(
                n_estimators=1500,
                learning_rate=0.025,
                num_leaves=45,
                max_depth=12,
                min_child_samples=25,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.05,
                reg_lambda=0.05,
                random_state=42,
                n_jobs=-1,
                verbose=-1
            )
            lgb_model.fit(X_tr, y_tr,
                          eval_set=[(X_val, y_val)],
                          callbacks=[lgb.early_stopping(self.EARLY_STOPPING_ROUNDS),
                                     lgb.log_evaluation(0)])

            # XGBoost
            # early_stopping_rounds gives the eval_set below a purpose: without
            # it all 1500 trees were always grown. As a constructor argument it
            # needs xgboost >= 1.6.
            xgb_model = xgb.XGBRegressor(
                n_estimators=1500,
                learning_rate=0.025,
                max_depth=12,
                min_child_weight=5,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.05,
                reg_lambda=0.05,
                random_state=42,
                tree_method='hist',
                n_jobs=-1,
                verbosity=0,
                early_stopping_rounds=self.EARLY_STOPPING_ROUNDS
            )
            xgb_model.fit(X_tr, y_tr,
                          eval_set=[(X_val, y_val)],
                          verbose=False)

            # Predictions
            lgb_pred = lgb_model.predict(X_val)
            xgb_pred = xgb_model.predict(X_val)

            # Ensemble
            fold_pred = self._blend(lgb_pred, xgb_pred)
            oof_predictions[val_idx] = fold_pred

            fold_smape = smape(y_val, fold_pred)
            print(f"  LGB: {smape(y_val, lgb_pred):.4f} | XGB: {smape(y_val, xgb_pred):.4f} | Ensemble: {fold_smape:.4f}")

            self.lgb_models.append(lgb_model)
            self.xgb_models.append(xgb_model)

        overall_smape = smape(y, oof_predictions)
        print(f"\n{'='*60}")
        print(f"⭐ Overall CV SMAPE: {overall_smape:.4f} ⭐")
        print(f"{'='*60}")

        return oof_predictions

    def predict(self, X):
        """Predict with every fold's model pair and average the blends.

        Args:
            X: 2-D feature matrix with the same columns used in ``train``.

        Returns:
            Array of predictions, floored at 0.01.
        """
        X_scaled = self.scaler.transform(X)
        fold_preds = [
            self._blend(lgb_m.predict(X_scaled), xgb_m.predict(X_scaled))
            for lgb_m, xgb_m in zip(self.lgb_models, self.xgb_models)
        ]
        final_pred = np.mean(fold_preds, axis=0)
        # Floor at a small positive value so no prediction is zero or negative.
        return np.maximum(final_pred, 0.01)

# Train model
model = TopEnsemble()
oof_preds = model.train(X_train.values, y_train, n_folds=7)
# Score the out-of-fold predictions once; the report below reuses the value.
cv_smape = smape(y_train, oof_preds)

# ============================================================================
# STEP 6: PREDICT & SAVE
# ============================================================================
print("\n[6/6] Predicting on test set...")

predictions = model.predict(X_test.values)

# Post-processing: clip extreme predictions to the 0.1%-99.9% quantile range
# of the training prices.
price_floor = train['price'].quantile(0.001)
price_cap = train['price'].quantile(0.999)
predictions = np.clip(predictions, price_floor, price_cap)

# Create submission
submission = pd.DataFrame({
    'sample_id': test['sample_id'],
    'price': predictions
})

submission.to_csv('test_out.csv', index=False)

# Save model
# NOTE(review): TopEnsemble is defined in this notebook, so loading the pickle
# later presumably requires the same class definition to be in scope - confirm
# before relying on it elsewhere.
joblib.dump(model, 'top5_model.pkl')

# ============================================================================
# RESULTS
# ============================================================================
# SMAPE that the "improvement" line is measured against.
# NOTE(review): the origin of 69.2 is not recorded in this notebook - confirm.
BASELINE_SMAPE = 69.2

print(f"\n{'='*80}")
print("✅ TRAINING COMPLETE!")
print(f"{'='*80}")

print(f"\nPrediction Statistics:")
print(f"  Mean: ${predictions.mean():.2f}")
print(f"  Median: ${np.median(predictions):.2f}")
print(f"  Min: ${predictions.min():.2f}")
print(f"  Max: ${predictions.max():.2f}")
print(f"  Std: ${predictions.std():.2f}")

print(f"\nTrain Price Statistics:")
print(f"  Mean: ${y_train.mean():.2f}")
print(f"  Median: ${np.median(y_train):.2f}")

print(f"\n{'='*80}")
print("FILES CREATED:")
print(f"{'='*80}")
print(f"✓ test_out.csv - Your submission file")
print(f"✓ top5_model.pkl - Trained model")

# Each check is computed once, printed, and then used for the final verdict.
rows_ok = len(submission) == len(test)
no_nan = bool(submission['price'].isna().sum() == 0)
all_positive = bool((submission['price'] > 0).all())

print(f"\n{'='*80}")
print("SUBMISSION VERIFICATION:")
print(f"{'='*80}")
print(f"✓ Rows: {len(submission)} (Expected: {len(test)})")
print(f"✓ Columns: {submission.columns.tolist()}")
print(f"✓ No NaN: {no_nan}")
print(f"✓ All positive: {all_positive}")

# A NaN price fails the "> 0" comparison, so all_positive also covers no_nan.
if rows_ok and all_positive:
    print(f"\n🎯 READY TO SUBMIT!")
    print(f"\n📊 EXPECTED PERFORMANCE:")
    print(f"  CV SMAPE: {cv_smape:.2f}%")
    print(f"  Target: < 47% for TOP 5")
    print(f"  Your improvement: {BASELINE_SMAPE - cv_smape:.1f}% better!")
else:
    print(f"\n⚠️ Check submission file!")

print(f"\n{'='*80}")
print("🚀 UPLOAD test_out.csv TO COMPETITION PORTAL!")
print(f"{'='*80}")

In [None]:
# TF-IDF for text understanding
# This cell reuses `train`, `test`, `train_features` and `test_features`
# produced by the earlier cells, so it only runs after them.
print("\n[2/6] TF-IDF vectorization...")

# Number of SVD components kept from the TF-IDF matrix.
N_TFIDF_COMPONENTS = 150

# TfidfVectorizer rejects NaN documents; normalise missing text to '' first.
train_text = train['catalog_content'].fillna('').astype(str)
test_text = test['catalog_content'].fillna('').astype(str)

tfidf = TfidfVectorizer(
    max_features=500,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
    strip_accents='unicode'
)

tfidf_train = tfidf.fit_transform(train_text)
tfidf_test = tfidf.transform(test_text)

# Reduce dimensions
svd = TruncatedSVD(n_components=N_TFIDF_COMPONENTS, random_state=42)
tfidf_train_svd = svd.fit_transform(tfidf_train)
tfidf_test_svd = svd.transform(tfidf_test)

# Column names follow the actual width of the reduced matrix.
tfidf_columns = [f'tfidf_{i}' for i in range(tfidf_train_svd.shape[1])]
tfidf_train_df = pd.DataFrame(tfidf_train_svd, columns=tfidf_columns)
tfidf_test_df = pd.DataFrame(tfidf_test_svd, columns=tfidf_columns)

print(f"✓ TF-IDF: {tfidf_train_df.shape[1]} features")

# Optional: Add BERT if you have time (improves score by 5-10%)
# Uncomment if you want to use BERT:
"""
print("\n[3/6] BERT embeddings (this takes time)...")

class TextDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=64):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze()
        }

def get_bert_embeddings(texts, batch_size=16):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
    model = AutoModel.from_pretrained('distilbert-base-uncased').to(device)
    model.eval()
    
    dataset = TextDataset(texts, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    
    embeddings = []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="BERT"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            cls_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
            embeddings.append(cls_embeddings)
    
    return np.vstack(embeddings)

train_bert = get_bert_embeddings(train['catalog_content'].values)
test_bert = get_bert_embeddings(test['catalog_content'].values)

train_bert_df = pd.DataFrame(train_bert, columns=[f'bert_{i}' for i in range(train_bert.shape[1])])
test_bert_df = pd.DataFrame(test_bert, columns=[f'bert_{i}' for i in range(test_bert.shape[1])])

# Combine all features
X_train_full = pd.concat([train_features.reset_index(drop=True), 
                          tfidf_train_df, 
                          train_bert_df], axis=1)
X_test_full = pd.concat([test_features.reset_index(drop=True), 
                         tfidf_test_df, 
                         test_bert_df], axis=1)
"""

# Without BERT (faster):
X_train_full = pd.concat([train_features.reset_index(drop=True), tfidf_train_df], axis=1)
X_test_full = pd.concat([test_features.reset_index(drop=True), tfidf_test_df], axis=1)

y_train = train['price'].values

# The models are fed positional arrays, so train and test must share the exact
# same column order, and features must line up row-for-row with the target.
assert list(X_train_full.columns) == list(X_test_full.columns), "train/test feature columns differ"
assert len(X_train_full) == len(y_train), "feature rows do not match target rows"

print(f"✓ Total features: {X_train_full.shape[1]}")


[2/6] TF-IDF vectorization...


KeyboardInterrupt: 

In [None]:
print("\n[4/6] Training advanced ensemble...")

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    an element where both values are zero contributes 0.

    Args:
        y_true: Actual values (array, list or Series).
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The mean of the per-element terms multiplied by 100.
    """
    # Accept lists / Series as well as arrays; the arithmetic below needs ndarrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    # Divide only where the denominator is non-zero; the remaining entries keep
    # their initial 0.0, so no 0/0 is ever evaluated.
    diff = np.zeros_like(denominator)
    np.divide(np.abs(y_true - y_pred), denominator, out=diff, where=denominator != 0)
    return 100 * np.mean(diff)

class AdvancedEnsemble:
    """Cross-validated blend of LightGBM, XGBoost and CatBoost regressors.

    ``train`` fits one model of each kind per fold on standardized features;
    ``predict`` averages the weighted blend of every fold's trio.
    """

    # Blend weights applied to the three models' predictions (tune these!).
    LGB_WEIGHT = 0.4
    XGB_WEIGHT = 0.35
    CAT_WEIGHT = 0.25
    # Boosting rounds without validation improvement before training stops.
    EARLY_STOPPING_ROUNDS = 100

    def __init__(self):
        self.lgb_models = []
        self.xgb_models = []
        self.cat_models = []
        self.scaler = StandardScaler()

    def _blend(self, lgb_pred, xgb_pred, cat_pred):
        """Weighted average of the three models' predictions."""
        return (self.LGB_WEIGHT * lgb_pred
                + self.XGB_WEIGHT * xgb_pred
                + self.CAT_WEIGHT * cat_pred)

    def train(self, X, y, n_folds=7):  # 7 folds for better stability
        """Fit the per-fold models and return out-of-fold predictions.

        Args:
            X: 2-D feature matrix.
            y: Target values, one per row of ``X``.
            n_folds: Number of shuffled K-fold splits.

        Returns:
            Array of blended out-of-fold predictions, one per row of ``X``.

        Note:
            The scaler is fitted on all rows before splitting, and each
            validation fold doubles as the early-stopping set, so the printed
            SMAPE is somewhat optimistic.
        """
        # Fold indices are positional; a pandas Series would treat them as labels.
        y = np.asarray(y)
        # Start clean so a repeated call does not mix in models that were
        # fitted against a previously fitted scaler.
        self.lgb_models = []
        self.xgb_models = []
        self.cat_models = []

        X_scaled = self.scaler.fit_transform(X)
        kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
        oof_predictions = np.zeros(len(X_scaled))

        for fold, (train_idx, val_idx) in enumerate(kf.split(X_scaled)):
            print(f"\n{'='*60}")
            print(f"Fold {fold + 1}/{n_folds}")
            print(f"{'='*60}")

            X_train_fold, X_val = X_scaled[train_idx], X_scaled[val_idx]
            y_train_fold, y_val = y[train_idx], y[val_idx]

            # Model 1: LightGBM
            print("Training LightGBM...")
            lgb_model = lgb.LGBMRegressor(
                n_estimators=2000,
                learning_rate=0.02,
                num_leaves=50,
                max_depth=12,
                min_child_samples=25,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.05,
                reg_lambda=0.05,
                random_state=42,
                n_jobs=-1,
                verbose=-1
            )
            lgb_model.fit(X_train_fold, y_train_fold,
                          eval_set=[(X_val, y_val)],
                          callbacks=[lgb.early_stopping(self.EARLY_STOPPING_ROUNDS),
                                     lgb.log_evaluation(0)])

            # Model 2: XGBoost
            # early_stopping_rounds gives the eval_set below a purpose: without
            # it all 2000 trees were always grown, unlike the other two models.
            # As a constructor argument it needs xgboost >= 1.6.
            print("Training XGBoost...")
            xgb_model = xgb.XGBRegressor(
                n_estimators=2000,
                learning_rate=0.02,
                max_depth=12,
                min_child_weight=5,
                subsample=0.8,
                colsample_bytree=0.8,
                reg_alpha=0.05,
                reg_lambda=0.05,
                random_state=42,
                tree_method='hist',
                n_jobs=-1,
                verbosity=0,
                early_stopping_rounds=self.EARLY_STOPPING_ROUNDS
            )
            xgb_model.fit(X_train_fold, y_train_fold,
                          eval_set=[(X_val, y_val)],
                          verbose=False)

            # Model 3: CatBoost
            print("Training CatBoost...")
            cat_model = CatBoostRegressor(
                iterations=2000,
                learning_rate=0.02,
                depth=10,
                l2_leaf_reg=5,
                random_seed=42,
                verbose=0
            )
            cat_model.fit(X_train_fold, y_train_fold,
                          eval_set=(X_val, y_val),
                          early_stopping_rounds=self.EARLY_STOPPING_ROUNDS,
                          verbose=False)

            # Predictions
            lgb_pred = lgb_model.predict(X_val)
            xgb_pred = xgb_model.predict(X_val)
            cat_pred = cat_model.predict(X_val)

            # Weighted ensemble
            fold_pred = self._blend(lgb_pred, xgb_pred, cat_pred)
            oof_predictions[val_idx] = fold_pred

            # Individual model scores
            lgb_smape = smape(y_val, lgb_pred)
            xgb_smape = smape(y_val, xgb_pred)
            cat_smape = smape(y_val, cat_pred)
            fold_smape = smape(y_val, fold_pred)

            print(f"  LGB: {lgb_smape:.4f}")
            print(f"  XGB: {xgb_smape:.4f}")
            print(f"  CAT: {cat_smape:.4f}")
            print(f"✓ Ensemble: {fold_smape:.4f}")

            self.lgb_models.append(lgb_model)
            self.xgb_models.append(xgb_model)
            self.cat_models.append(cat_model)

        overall_smape = smape(y, oof_predictions)
        print(f"\n{'='*60}")
        print(f"Overall CV SMAPE: {overall_smape:.4f}")
        print(f"{'='*60}")

        return oof_predictions

    def predict(self, X):
        """Predict with every fold's model trio and average the blends.

        Args:
            X: 2-D feature matrix with the same columns used in ``train``.

        Returns:
            Array of predictions, floored at 0.01.
        """
        X_scaled = self.scaler.transform(X)
        fold_preds = [
            self._blend(lgb_m.predict(X_scaled),
                        xgb_m.predict(X_scaled),
                        cat_m.predict(X_scaled))
            for lgb_m, xgb_m, cat_m in zip(self.lgb_models, self.xgb_models, self.cat_models)
        ]
        final_pred = np.mean(fold_preds, axis=0)
        # Floor at a small positive value so no prediction is zero or negative.
        return np.maximum(final_pred, 0.01)

# Fit the three-model ensemble on the combined feature matrix with 7-fold CV.
# `model` and `oof_preds` replace the objects of the same name from earlier cells.
model = AdvancedEnsemble()
oof_preds = model.train(
    X=X_train_full.values,
    y=y_train,
    n_folds=7,
)

In [None]:
# Verify submission
print("\n" + "="*80)
print("SUBMISSION VERIFICATION")
print("="*80)

# Re-read the file from disk so the checks cover what will actually be uploaded.
submission_check = pd.read_csv('test_out.csv')

# Each check is computed once, printed, and then used for the final verdict.
rows_ok = len(submission_check) == len(test)
columns_ok = submission_check.columns.tolist() == ['sample_id', 'price']
# Guard the price checks so a missing column reports a failure instead of raising.
has_price = 'price' in submission_check.columns
no_nan = has_price and bool(submission_check['price'].notna().all())
all_positive = has_price and bool((submission_check['price'] > 0).all())

print(f"✓ Rows: {len(submission_check)} (Expected: {len(test)})")
print(f"✓ Columns: {submission_check.columns.tolist()}")
print(f"✓ No NaN: {no_nan}")
print(f"✓ All positive: {all_positive}")

if rows_ok and columns_ok and no_nan and all_positive:
    print(f"\n✅ SUBMISSION READY FOR UPLOAD!")
else:
    print(f"\n⚠️ Fix issues before submitting!")