The new plan is as follows:

1. Use a pre-trained sentence-transformer model to convert each `catalog_content` entry into a dense numerical vector (an embedding). This is a one-time, computationally expensive step.

2. Train our tuned LightGBM model on these new embedding features instead of the TF-IDF features.

In [1]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-5.1.1-py3-none-any.whl.metadata (16 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.8.0-cp311-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting huggingface-hub>=0.20.0 (from sentence-transformers)
  Downloading huggingface_hub-0.35.3-py3-none-any.whl.metadata (14 kB)
Collecting filelock (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading regex-2025.9.18-cp311-cp311-macosx_11_0_arm64.whl.metadata (40 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers<5.0.0,>=4.41.0->sentence-transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl.metadata (6.8 kB)


In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import re
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
import joblib

# --- 1. Load Data ---
# NOTE(review): machine-specific absolute path; point this at your own copy of train.csv.
TRAIN_CSV_PATH = '/Users/adityasharma/Github Projects/Amazon/input/train.csv'
# Name of the pre-trained sentence-transformer checkpoint used for the embeddings.
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'

train_df = pd.read_csv(TRAIN_CSV_PATH)
# Drop rows that have no price value.
train_df = train_df.dropna(subset=['price'])
# Fill missing text BEFORE casting to str: casting first turns NaN into the
# literal string 'nan', which makes the subsequent fillna('') a no-op.
train_df['catalog_content'] = train_df['catalog_content'].fillna('').astype(str)
print("Training data loaded successfully.")

# --- 2. Generate Semantic Embeddings ---
print("Loading pre-trained sentence-transformer model...")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

print("Generating embeddings for all catalog_content... (This will take several minutes)")
# Matches an "item name:" label at the start of any line, case-insensitively.
# [ \t]* (rather than \s*) keeps the capture on the label's own line, so an
# empty "Item Name:" line cannot swallow the following line as the title.
_ITEM_NAME_RE = re.compile(r'^item name:[ \t]*(.*)', re.IGNORECASE | re.MULTILINE)


def extract_title(text):
    """Return the product title found on the first "item name:" line of `text`.

    Args:
        text: The catalog text to search (a str).

    Returns:
        The remainder of the first line that starts with "item name:"
        (case-insensitive), stripped of surrounding whitespace. If no such
        line exists, or the title on it is empty, `text` is returned unchanged
        so the caller always has something to embed.
    """
    match = _ITEM_NAME_RE.search(text)
    title = match.group(1).strip() if match else ''
    # Fall back to the full text for both "no label" and "label with no title".
    return title or text
# Only the extracted title of each row is embedded, not the full catalog text.
train_df['title'] = train_df['catalog_content'].map(extract_title)

titles = train_df['title'].tolist()
embeddings = embedding_model.encode(titles, show_progress_bar=True)
print(f"Embeddings created with shape: {embeddings.shape}")


# --- 3. Create Hold-Out Set ---
# Hold out 20% of the rows with a fixed seed. Only the training target is
# log1p-transformed; the validation target stays on the original price scale.
X, y = embeddings, train_df['price']
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)
y_train_log = np.log1p(y_train)

# --- 4. Train LightGBM on the New Embedding Features ---
best_params = dict(
    objective='regression_l1',
    metric='mae',
    n_estimators=761,
    learning_rate=0.188,
    num_leaves=41,
    max_depth=17,
    lambda_l1=0.04,
    lambda_l2=2.53e-06,
    feature_fraction=0.73,
    bagging_fraction=0.81,
    bagging_freq=5,
    min_child_samples=9,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)

model_v4 = lgb.LGBMRegressor(**best_params)

print("\nTraining V4 model on semantic embeddings...")
model_v4.fit(X_train, y_train_log)
print("Training complete.")

# --- 5. Evaluate the V4 Model ---
print("\nEvaluating V4 model...")
val_preds_log = model_v4.predict(X_val)
# Undo the log1p transform, then clamp any negative prediction to zero.
val_preds = np.expm1(val_preds_log)
val_preds = np.where(val_preds < 0, 0, val_preds)

def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, expressed in percent.

    Each element contributes |y_pred - y_true| / ((|y_true| + |y_pred|) / 2);
    elements where that denominator is zero contribute 0.

    Args:
        y_true: Array-like of actual values (list, ndarray, pandas Series, ...).
        y_pred: Array-like of predicted values, broadcastable against `y_true`.

    Returns:
        The mean of the per-element ratios multiplied by 100.
    """
    # Coerce to plain float arrays so lists are accepted and pandas objects are
    # compared by position instead of being aligned on their index labels.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    # `out` pre-filled with zeros + `where` leaves 0 wherever the denominator is 0.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

v4_smape = smape(y_val, val_preds)

# The V1 figure is a hard-coded literal; only the V4 score is computed here.
report_lines = (
    "\n--- Model Performance Comparison ---",
    "V1 Model SMAPE (TF-IDF on raw text): 51.9241",
    f"V4 Model SMAPE (Semantic Embeddings): {v4_smape:.4f}",
)
print("\n".join(report_lines))

Training data loaded successfully.
Loading pre-trained sentence-transformer model...
Generating embeddings for all catalog_content... (This will take several minutes)


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

Embeddings created with shape: (75000, 384)

Training V4 model on semantic embeddings...
Training complete.

Evaluating V4 model...

--- Model Performance Comparison ---
V1 Model SMAPE (TF-IDF on raw text): 51.9241
V4 Model SMAPE (Semantic Embeddings): 58.8297


: 