In [4]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
import joblib
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

# --- 1. Define File Paths ---
# NOTE(review): absolute, machine-specific paths; adjust when running elsewhere.
train_file_path = '/Users/adityasharma/Github Projects/Amazon/input/train.csv'
final_test_file_path = '/Users/adityasharma/Github Projects/Amazon/input/test.csv'

# --- 2. Load Training Data ---
# Only the read itself is guarded: a missing file is reported and re-raised
# so the run stops here instead of failing later on an undefined `df`.
try:
    df = pd.read_csv(train_file_path)
except FileNotFoundError as e:
    print(e)
    raise

# Rows without a target value are dropped before training.
df = df.dropna(subset=['price'])
# Fill missing text BEFORE casting to str. Casting first can turn NaN into the
# literal string 'nan' (classic object-dtype behaviour), after which
# fillna('') has nothing left to replace.
df['catalog_content'] = df['catalog_content'].fillna('').astype(str)
print("Training data loaded successfully.")

# --- 3. Create Feature and Target Sets ---
# Text feature and numeric target, both taken from the loaded frame.
X, y = df['catalog_content'], df['price']
# log1p-transformed target for the full dataset (used when refitting on all rows).
y_log = np.log1p(y)

# Hold out 20% of the rows for validation scoring; the seed is fixed so the
# split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# The training portion of the target gets the same log1p transform.
y_train_log = np.log1p(y_train)

# --- 4. Define the Final Model with Best Parameters ---
# Hyperparameters are hard-coded here (presumably the output of an earlier
# tuning run that is not part of this cell — confirm).
best_params = dict(
    objective='regression_l1',
    metric='mae',
    n_estimators=761,
    learning_rate=0.1884930281597436,
    num_leaves=41,
    max_depth=17,
    lambda_l1=0.040587974507707486,
    lambda_l2=2.538242194369503e-06,
    feature_fraction=0.7302844764184871,
    bagging_fraction=0.8115111618141515,
    bagging_freq=5,
    min_child_samples=9,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)

# Step 1: unigram + bigram TF-IDF over the raw text, capped at 30,000 features.
text_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=30000,
    ngram_range=(1, 2),
)
# Step 2: LightGBM regressor configured with the parameters above.
price_regressor = lgb.LGBMRegressor(**best_params)

final_pipeline = make_pipeline(text_vectorizer, price_regressor)

# --- 5. Train the Model on the Training Portion ---
print("\nTraining final model on the 80% training split...")
# The pipeline is fitted against the log1p-transformed target.
final_pipeline.fit(X_train, y_train_log)
print("Training complete.")

# --- 6. Evaluate on the 20% Hold-Out Validation Set ---
print("\nEvaluating model on the hold-out validation set...")
# Predictions come out on the log1p scale; expm1 maps them back to prices.
val_preds_log = final_pipeline.predict(X_val)
val_preds_raw = np.expm1(val_preds_log)
# Any negative back-transformed value is replaced by zero.
val_preds = np.where(val_preds_raw < 0, 0, val_preds_raw)

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Each element contributes
    ``|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)``; elements whose
    denominator is zero (both values are zero) contribute 0 instead of
    producing a division-by-zero result.

    Args:
        y_true: Array-like of observed values (list, ndarray, pandas Series).
        y_pred: Array-like of predicted values, broadcast-compatible with
            ``y_true``.

    Returns:
        The mean of the per-element ratios multiplied by 100.
    """
    # Convert to plain float arrays so the arithmetic is purely positional:
    # plain lists work, and pandas objects are not re-aligned on index labels.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    abs_error = np.abs(predicted - actual)
    scale = (np.abs(actual) + np.abs(predicted)) / 2
    # `out` pre-fills zeros; `where` skips the division wherever scale == 0.
    ratios = np.divide(
        abs_error,
        scale,
        out=np.zeros_like(abs_error),
        where=scale != 0,
    )
    return np.mean(ratios) * 100

# Score the back-transformed validation predictions against the true prices.
final_validation_smape = smape(y_true=y_val, y_pred=val_preds)

print("\n--- Final Model Performance ---")
print(f"SMAPE on hold-out validation set: {final_validation_smape:.4f}")

# --- 7. Save the Model ---
# Persist the fitted pipeline (vectorizer + regressor) under a relative path,
# i.e. in whatever the current working directory is.
model_filename = 'lgbm_price_model_v1.joblib'
joblib.dump(value=final_pipeline, filename=model_filename)
print(f"\nModel trained on 80% of data saved to '{model_filename}'")

# --- 8. (Optional) Create Submission File ---
print("\nRetraining model on 100% of training data for final submission...")
# Refit the same pipeline object on every training row, using the
# log1p-transformed target defined for the full dataset.
final_pipeline.fit(X, y_log)

final_test_df = pd.read_csv(final_test_file_path)
# Fill missing text BEFORE casting to str. Casting first can turn NaN into the
# literal string 'nan' (classic object-dtype behaviour), after which
# fillna('') has nothing left to replace.
X_final_test = final_test_df['catalog_content'].fillna('').astype(str)

# Predictions are on the log1p scale; map them back and floor negatives at 0.
final_predictions_log = final_pipeline.predict(X_final_test)
final_predictions = np.expm1(final_predictions_log)
final_predictions[final_predictions < 0] = 0

# NOTE(review): 'id' is taken from the DataFrame's row index (the default
# index produced by read_csv), not from a column of test.csv. Confirm this is
# the identifier the submission format expects.
submission_df = pd.DataFrame({'id': final_test_df.index, 'price': final_predictions})
submission_df.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created.")

Training data loaded successfully.

Training final model on the 80% training split...
Training complete.

Evaluating model on the hold-out validation set...

--- Final Model Performance ---
SMAPE on hold-out validation set: 51.9241

Model trained on 80% of data saved to 'lgbm_price_model_v1.joblib'

Retraining model on 100% of training data for final submission...
Submission file 'submission.csv' created.
