In [1]:
import zipfile
import os

zip_path = "68e8d1d70b66d_student_resource.zip"
extract_path = "amazon_ml_submission"  # target folder

# Create target folder if it doesn't exist
os.makedirs(extract_path, exist_ok=True)

# Extract all files into this folder
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

print(f"ZIP extracted into folder: {extract_path}")


ZIP extracted into folder: amazon_ml_submission


In [2]:
import os

# Path to your extracted folder
folder_path = "amazon_ml_submission"

# Walk through the folder and print files/subfolders
for root, dirs, files in os.walk(folder_path):
    level = root.replace(folder_path, "").count(os.sep)
    indent = " " * 4 * level
    print(f"{indent}{os.path.basename(root)}/")
    sub_indent = " " * 4 * (level + 1)
    for f in files:
        print(f"{sub_indent}{f}")


amazon_ml_submission/
    student_resource/
        Documentation_template.md
        .DS_Store
        README.md
        sample_code.py
        dataset/
            sample_test_out.csv
            sample_test.csv
            test.csv
            train.csv
        src/
            example.ipynb
            utils.py
            __pycache__/
                utils.cpython-310.pyc
                utils.cpython-37.pyc
            .ipynb_checkpoints/
                utils-checkpoint.py
    __MACOSX/
        ._student_resource
        student_resource/
            ._src
            ._dataset
            ._.DS_Store
            ._sample_code.py
            ._Documentation_template.md
            ._README.md
            dataset/
                ._train.csv
                ._sample_test.csv
                ._sample_test_out.csv
                ._test.csv
            src/
                ._utils.py
                .___pycache__
                ._example.ipynb
                __pycache__/
        

In [3]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


In [4]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers
!pip install pandas numpy
!pip install pillow
!pip install scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://download.pytorch.org/whl/cu121
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [5]:
# Install HuggingFace Transformers and EfficientNet
!pip install torch torchvision transformers efficientnet_pytorch tqdm --quiet


In [6]:
pip install torch torchvision sentence-transformers timm tqdm pandas numpy pillow


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sentence_transformers import SentenceTransformer
import re
import lightgbm as lgb
from tqdm import tqdm

# ===============================
# 1. Paths
# ===============================
base_path = "student_resource/dataset"
train_path = os.path.join(base_path, "train.csv")
test_path = os.path.join(base_path, "test.csv")

# ===============================
# 2. Load Data
# ===============================
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
print(f"Train samples: {train.shape[0]}, Test samples: {test.shape[0]}")

# ===============================
# 3. Remove Outliers (IQR)
# ===============================
Q1 = train['price'].quantile(0.25)
Q3 = train['price'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
train = train[(train['price'] >= lower_bound) & (train['price'] <= upper_bound)]
print(f"After IQR removal, train samples: {train.shape[0]}")

# ===============================
# 4. Feature Engineering
# ===============================
def extract_numeric(text):
    nums = re.findall(r'\d+', str(text))
    return max([int(n) for n in nums], default=1)

# Quantity / IPQ
train['quantity'] = train['catalog_content'].apply(extract_numeric)
test['quantity'] = test['catalog_content'].apply(extract_numeric)

# Text stats
train['text_len'] = train['catalog_content'].apply(lambda x: len(str(x)))
test['text_len'] = test['catalog_content'].apply(lambda x: len(str(x)))

train['num_count'] = train['catalog_content'].apply(lambda x: len(re.findall(r'\d+', str(x))))
test['num_count'] = test['catalog_content'].apply(lambda x: len(re.findall(r'\d+', str(x))))

# Brand extraction
train['brand'] = train['catalog_content'].apply(lambda x: str(x).split()[0])
test['brand'] = test['catalog_content'].apply(lambda x: str(x).split()[0])
le = LabelEncoder()
train['brand_enc'] = le.fit_transform(train['brand'])
test['brand_enc'] = le.transform(test['brand'])

# Word count and average word length
train['word_count'] = train['catalog_content'].apply(lambda x: len(str(x).split()))
test['word_count'] = test['catalog_content'].apply(lambda x: len(str(x).split()))
train['avg_word_len'] = train['catalog_content'].apply(lambda x: np.mean([len(w) for w in str(x).split()]) if len(str(x).split())>0 else 0)
test['avg_word_len'] = test['catalog_content'].apply(lambda x: np.mean([len(w) for w in str(x).split()]) if len(str(x).split())>0 else 0)

# ===============================
# 5. Text Embeddings
# ===============================
model_name = "all-MiniLM-L6-v2"
embedder = SentenceTransformer(model_name)

train_text_embeddings = embedder.encode(train['catalog_content'].tolist(), batch_size=64, show_progress_bar=True)
test_text_embeddings = embedder.encode(test['catalog_content'].tolist(), batch_size=64, show_progress_bar=True)

# ===============================
# 6. Combine Features
# ===============================
numeric_features = ['quantity','text_len','num_count','brand_enc','word_count','avg_word_len']

scaler = StandardScaler()
train_numeric = scaler.fit_transform(train[numeric_features])
test_numeric = scaler.transform(test[numeric_features])

X_train_features = np.hstack([train_text_embeddings, train_numeric])
X_test_features = np.hstack([test_text_embeddings, test_numeric])

# ===============================
# 7. Target Transformation
# ===============================
y = np.log1p(train['price'])  # log-transform stabilizes high prices

# ===============================
# 8. Custom SMAPE for LightGBM
# ===============================
def smape_lgb(y_pred, dataset):
    y_true = dataset.get_label()
    y_pred = np.expm1(y_pred)  # revert log1p
    y_true_exp = np.expm1(y_true)
    denominator = (np.abs(y_true_exp) + np.abs(y_pred)) / 2
    denominator[denominator == 0] = 1e-6
    grad = (y_pred - y_true_exp) / denominator
    hess = np.ones_like(grad)
    return grad, hess

def smape_metric(y_pred, dataset):
    y_true = np.expm1(dataset.get_label())
    y_pred = np.expm1(y_pred)
    smape = np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true))) * 100
    return 'SMAPE', smape, False

# ===============================
# 9. Train LightGBM with K-Fold
# ===============================
kf = KFold(n_splits=5, shuffle=True, random_state=42)
preds = np.zeros(X_test_features.shape[0])

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_features)):
    X_tr, X_val = X_train_features[train_idx], X_train_features[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    lgb_train = lgb.Dataset(X_tr, label=y_tr)
    lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)
    
    params = {
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'num_leaves': 64,
        'max_depth': -1,
        'seed': 42,
        'verbose': -1,
        'n_jobs': -1
    }
    
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_val],
        valid_names=['train','valid'],
        fobj=smape_lgb,
        feval=smape_metric,
        callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(100)]
    )
    
    preds += np.expm1(model.predict(X_test_features, num_iteration=model.best_iteration)) / kf.n_splits

# ===============================
# 10. Save Submission
# ===============================
submission = pd.DataFrame({
    'sample_id': test['sample_id'],
    'price': preds.clip(min=1)  # ensure positive prices
})
submission_path = os.path.join(base_path, "test1_out.csv")
submission.to_csv(submission_path, index=False)
print(f"✅ Submission saved as {submission_path}")


2025-10-20 14:14:24.810178: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-10-20 14:14:24.854128: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX512_FP16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-10-20 14:14:25.903573: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


ValueError: Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [8]:
def smape_metric(y_pred, dataset):
    y_true = np.expm1(dataset.get_label())
    y_pred = np.expm1(y_pred)
    smape = np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true))) * 100
    return 'SMAPE', smape, False

# Training loop
kf = KFold(n_splits=5, shuffle=True, random_state=42)
preds = np.zeros(X_test_features.shape[0])

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_features)):
    X_tr, X_val = X_train_features[train_idx], X_train_features[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    lgb_train = lgb.Dataset(X_tr, label=y_tr)
    lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)
    
    params = {
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'metric': 'mae',  # use MAE as the loss
        'learning_rate': 0.05,
        'num_leaves': 64,
        'max_depth': -1,
        'seed': 42,
        'verbose': -1,
        'n_jobs': -1
    }
    
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_val],
        valid_names=['train','valid'],
        feval=smape_metric,
        callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(100)]
    )
    
    preds += np.expm1(model.predict(X_test_features, num_iteration=model.best_iteration)) / kf.n_splits


NameError: name 'X_test_features' is not defined

In [None]:
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,  # train additional rounds
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train','valid'],
    feval=smape_metric,
    init_model=model,  # continue from existing model
    callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(100)]
)


In [None]:
lgb_train = lgb.Dataset(X_tr, label=y_tr, free_raw_data=False)
lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train, free_raw_data=False)


In [None]:
# Keep raw data
lgb_train = lgb.Dataset(X_tr, label=y_tr, free_raw_data=False)
lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train, free_raw_data=False)

# Continue training
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=1000,  # additional rounds
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train','valid'],
    feval=smape_metric,
    init_model=model,       # continue from previous model
    callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(100)]
)


In [None]:
params['learning_rate'] = 0.01
model = lgb.train(
    params,
    lgb_train,
    num_boost_round=2000,  # more rounds with smaller learning rate
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train','valid'],
    feval=smape_metric,
    init_model=model,
    callbacks=[lgb.early_stopping(stopping_rounds=200), lgb.log_evaluation(100)]
)


In [None]:
residuals = y_val - np.expm1(model.predict(X_val_features, num_iteration=model.best_iteration))
res_model = LGBMRegressor(...).fit(X_train_features, residuals)
preds += res_model.predict(X_test_features) / n_models


In [None]:
# Predictions on validation set
val_preds = np.expm1(model.predict(X_val, num_iteration=model.best_iteration))

# Residuals = true values - predicted
residuals = y_val - val_preds


In [None]:
# Residuals on training fold
train_preds = np.expm1(model.predict(X_tr, num_iteration=model.best_iteration))
residuals_tr = y_tr - train_preds

# Train residual model on X_tr
res_model.fit(X_tr, residuals_tr)

# Predict on test set
res_preds = res_model.predict(X_test_features)

# Combine with original predictions
final_preds = np.expm1(model.predict(X_test_features, num_iteration=model.best_iteration)) + res_preds


In [None]:
# For fold i
X_tr, X_val = X_train_features[train_idx], X_train_features[val_idx]
y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

# Predictions on the training fold, not validation
train_preds = np.expm1(model.predict(X_tr, num_iteration=model.best_iteration))

# Residuals = true - predicted on training fold
residuals_tr = y_tr - train_preds

# Fit residual model on the training fold
res_model.fit(X_tr, residuals_tr)


In [None]:
train_preds = np.expm1(model.predict(X_train_features, num_iteration=model.best_iteration))
residuals = y - train_preds


In [None]:
res_model = LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.01,
    num_leaves=64,
    random_state=42
)
res_model.fit(X_train_features, residuals)


In [None]:
test_preds_orig = np.expm1(model.predict(X_test_features, num_iteration=model.best_iteration))
res_preds = res_model.predict(X_test_features)
final_preds = test_preds_orig + res_preds


In [None]:
final_preds = np.clip(final_preds, 1, np.percentile(final_preds, 99.9))
final_preds = np.round(final_preds, 2)


In [None]:
from catboost import CatBoostRegressor

model2 = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=100
)
model2.fit(X_train_features, y)


In [None]:
import pandas as pd

# Make sure test_df is your test dataset containing sample_id
submission = pd.DataFrame({
    "sample_id": test_df['sample_id'],  
    "price": final_preds
})

# Save to CSV
submission.to_csv("final_submission.csv", index=False)
print("✅ Submission file saved successfully!")


In [None]:
preds_model1 = final_preds  # From your first model
preds_model2 = model2.predict(X_test_features)


In [None]:
final_ensemble_preds = 0.6*preds_model1 + 0.4*preds_model2
final_ensemble_preds = np.clip(final_ensemble_preds, 1, np.percentile(final_ensemble_preds, 99.9))
final_ensemble_preds = np.round(final_ensemble_preds, 2)


In [None]:
submission["price"] = final_ensemble_preds
submission.to_csv("final_submission_ensemble.csv", index=False)
print("✅ Ensemble submission saved successfully!")


In [None]:
test_df = pd.read_csv("student_resource/dataset/test.csv")
import pandas as pd

# Load the test file (adjust path if needed)
test_df = pd.read_csv("student_resource/dataset/test.csv")  

# Create submission DataFrame
submission_df = pd.DataFrame({
    "sample_id": test_df["sample_id"],  # keep the IDs from test set
    "price": final_ensemble_preds                 # your final predictions
})


In [None]:
# Save in the required format
submission_df.to_csv("test.csv", index=False)
print("Submission file saved successfully!")


In [None]:
y_train = np.log1p(y_train)  # log(1 + price)


In [None]:
import pandas as pd

# Load test file
test_df = pd.read_csv("student_resource/dataset/test.csv")

# Make sure final_ensemble_preds is same length as test set
assert len(final_ensemble_preds) == len(test_df), "Predictions length mismatch!"

# Create submission DataFrame
submission_df = pd.DataFrame({
    "sample_id": test_df["sample_id"],
    "price": final_ensemble_preds
})

# Save CSV
submission_df.to_csv("final_submission_ensemble.csv", index=False)
print("✅ Ensemble submission saved successfully!")


In [None]:
# LightGBM predictions in log scale
preds_lgb = model.predict(X_test_features, num_iteration=model.best_iteration)

# CatBoost predictions in log scale if needed
preds_cb = np.log1p(model2.predict(X_test_features))


In [None]:
final_preds_log = 0.6 * preds_lgb + 0.4 * preds_cb


In [None]:
final_ensemble_preds = np.expm1(final_preds_log)
final_ensemble_preds = np.clip(final_ensemble_preds, 1, np.percentile(final_ensemble_preds, 99.9))
final_ensemble_preds = np.round(final_ensemble_preds, 2)


In [None]:
submission_df = pd.DataFrame({
    "sample_id": test_df["sample_id"],
    "price": final_ensemble_preds
})

# Save as file1.csv
submission_df.to_csv("file1.csv", index=False)
print("✅ Ensemble submission saved as file1.csv")

In [None]:
preds_raw = model.predict(X_test_features, num_iteration=model.best_iteration)
print(np.min(preds_raw), np.max(preds_raw))  # are they negative?


In [None]:
# Optionally clip only extreme prices
train['price'] = train['price'].clip(1, train['price'].quantile(0.99))
y = np.log1p(train['price'])


In [None]:
preds_raw = model.predict(X_test_features, num_iteration=model.best_iteration)
print(np.min(preds_raw), np.max(preds_raw))


In [None]:
# ===============================
# 0. Imports
# ===============================
import os
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sentence_transformers import SentenceTransformer
import lightgbm as lgb
from catboost import CatBoostRegressor
from tqdm import tqdm

# ===============================
# 1. Paths
# ===============================
base_path = "student_resource/dataset"
train_path = os.path.join(base_path, "train.csv")
test_path = os.path.join(base_path, "test.csv")

# ===============================
# 2. Load Data
# ===============================
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)
print(f"Train samples: {train.shape[0]}, Test samples: {test.shape[0]}")

# ===============================
# 3. Feature Engineering
# ===============================

def extract_numeric(text):
    nums = re.findall(r'\d+', str(text))
    return max([int(n) for n in nums], default=1)

# Quantity / IPQ
train['quantity'] = train['catalog_content'].apply(extract_numeric)
test['quantity'] = test['catalog_content'].apply(extract_numeric)

# Text statistics
for df in [train, test]:
    df['text_len'] = df['catalog_content'].apply(lambda x: len(str(x)))
    df['num_count'] = df['catalog_content'].apply(lambda x: len(re.findall(r'\d+', str(x))))
    df['word_count'] = df['catalog_content'].apply(lambda x: len(str(x).split()))
    df['avg_word_len'] = df['catalog_content'].apply(lambda x: np.mean([len(w) for w in str(x).split()]) if len(str(x).split())>0 else 0)
    df['brand'] = df['catalog_content'].apply(lambda x: str(x).split()[0])

# Encode brands
le = LabelEncoder()
train['brand_enc'] = le.fit_transform(train['brand'])
test['brand_enc'] = le.transform(test['brand'])

numeric_features = ['quantity','text_len','num_count','word_count','avg_word_len','brand_enc']

scaler = StandardScaler()
train_numeric = scaler.fit_transform(train[numeric_features])
test_numeric = scaler.transform(test[numeric_features])

# ===============================
# 4. Text Embeddings
# ===============================
embedder = SentenceTransformer("all-MiniLM-L6-v2")

train_text_embeddings = embedder.encode(train['catalog_content'].tolist(), batch_size=64, show_progress_bar=True)
test_text_embeddings = embedder.encode(test['catalog_content'].tolist(), batch_size=64, show_progress_bar=True)

# Combine numeric + embeddings
X_train_features = np.hstack([train_text_embeddings, train_numeric])
X_test_features = np.hstack([test_text_embeddings, test_numeric])
y = train['price'].values  # use raw price, not log1p

# ===============================
# 5. K-Fold Training LightGBM
# ===============================
kf = KFold(n_splits=5, shuffle=True, random_state=42)
lgb_preds = np.zeros(X_test_features.shape[0])

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_features)):
    X_tr, X_val = X_train_features[train_idx], X_train_features[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]
    
    lgb_train = lgb.Dataset(X_tr, label=y_tr)
    lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)
    
    params = {
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'num_leaves': 128,
        'max_depth': -1,
        'metric': 'mae',
        'n_jobs': -1,
        'seed': 42,
        'verbose': -1
    }
    
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_val],
        valid_names=['train','valid'],
        early_stopping_rounds=100,
        verbose_eval=200
    )
    
    lgb_preds += model.predict(X_test_features, num_iteration=model.best_iteration) / kf.n_splits

# ===============================
# 6. CatBoost Training
# ===============================
cat_model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=200
)
cat_model.fit(X_train_features, y)

cat_preds = cat_model.predict(X_test_features)

# ===============================
# 7. Ensemble (Stacking)
# ===============================
final_preds = 0.6*lgb_preds + 0.4*cat_preds
final_preds = np.clip(final_preds, 1, np.percentile(final_preds, 99.9))
final_preds = np.round(final_preds, 2)

# ===============================
# 8. Save Submission
# ===============================
submission = pd.DataFrame({
    'sample_id': test['sample_id'],
    'price': final_preds
})
submission_path = os.path.join(base_path, "final_submission_ensemble.csv")
submission.to_csv(submission_path, index=False)
print(f"✅ Submission saved: {submission_path}")


In [None]:
import lightgbm as lgb

model = lgb.train(
    params,
    lgb_train,
    num_boost_round=5000,
    valid_sets=[lgb_train, lgb_val],
    valid_names=['train','valid'],
    feval=smape_metric,  # your custom SMAPE
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=200)
    ]
)


In [None]:
def smape_metric_log(y_pred, dataset):
    y_true = dataset.get_label()
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    denominator[denominator == 0] = 1e-6
    smape = np.mean(2 * np.abs(y_pred - y_true) / denominator) * 100
    return 'SMAPE', smape, False


In [None]:
from sklearn.model_selection import KFold
import lightgbm as lgb
import numpy as np
import pandas as pd

# ===============================
# 1. Prepare K-Fold
# ===============================
kf = KFold(n_splits=5, shuffle=True, random_state=42)
preds = np.zeros(X_test_features.shape[0])  # for test predictions

# ===============================
# 2. K-Fold Training Loop
# ===============================
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_features)):
    print(f"Training fold {fold+1}...")
    X_tr, X_val = X_train_features[train_idx], X_train_features[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    lgb_train = lgb.Dataset(X_tr, label=y_tr)
    lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)
    
    params = {
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'num_leaves': 64,
        'max_depth': -1,
        'seed': 42,
        'verbose': -1,
        'n_jobs': -1
    }
    
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_val],
        valid_names=['train','valid'],
        feval=smape_metric_log,  # use log-space SMAPE
        callbacks=[lgb.early_stopping(stopping_rounds=100), lgb.log_evaluation(100)]
    )
    
    # Predict on test set and average over folds
    preds += np.expm1(model.predict(X_test_features, num_iteration=model.best_iteration)) / kf.n_splits

# ===============================
# 3. Clip and round predictions
# ===============================
final_preds = np.clip(preds, 1, np.percentile(preds, 99.9))
final_preds = np.round(final_preds, 2)

# ===============================
# 4. Save submission
# ===============================
submission_df = pd.DataFrame({
    "sample_id": test['sample_id'],
    "price": final_preds
})

submission_df.to_csv("final_submission.csv", index=False)
print("✅ Submission saved as final_submission.csv")


In [None]:
y = np.log1p(train['price'].values)  # as array

# Then use array indexing instead of .iloc
y_tr, y_val = y[train_idx], y[val_idx]


In [None]:
from sklearn.model_selection import KFold
import lightgbm as lgb
import numpy as np

kf = KFold(n_splits=5, shuffle=True, random_state=42)
preds = np.zeros(X_test_features.shape[0])

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train_features)):
    print(f"Training fold {fold+1}...")
    
    X_tr, X_val = X_train_features[train_idx], X_train_features[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]
    
    lgb_train = lgb.Dataset(X_tr, label=y_tr)
    lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)
    
    params = {
        'objective': 'regression',
        'boosting_type': 'gbdt',
        'metric': 'mae',
        'learning_rate': 0.05,
        'num_leaves': 64,
        'max_depth': -1,
        'seed': 42,
        'verbose': -1,
        'n_jobs': -1
    }
    
    model = lgb.train(
        params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_val],
        valid_names=['train','valid'],
        feval=smape_metric_log,   # your custom SMAPE
        callbacks=[lgb.early_stopping(stopping_rounds=100),
                   lgb.log_evaluation(100)]
    )
    
    # Predict on test set and average over folds
    preds += np.expm1(model.predict(X_test_features, num_iteration=model.best_iteration)) / kf.n_splits

# Clip extreme predictions
final_preds = np.clip(preds, 1, np.percentile(preds, 99.9))
final_preds = np.round(final_preds, 2)

# Create submission
submission_df = pd.DataFrame({
    "sample_id": test['sample_id'],
    "price": final_preds
})
submission_df.to_csv("final_submission.csv", index=False)
print("✅ Submission saved successfully!")


In [None]:
submission_df = pd.DataFrame({
    "sample_id": test['sample_id'],
    "price": final_preds
})
submission_df.to_csv("final2.csv", index=False)
print("✅ Submission saved successfully!")
