In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
import re
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [11]:
df = pd.read_csv("train.csv")

In [12]:
print("Vectorizing text...")

tfidf = TfidfVectorizer(
    stop_words='english',
    lowercase=True,
    sublinear_tf=True,
    max_features=10000,
    ngram_range=(1,2)
)

X = tfidf.fit_transform(df['catalog_content'])
# X_valid_vec = tfidf.transform(X_valid)

print("TF-IDF shape:", X.shape)

Vectorizing text...
TF-IDF shape: (75000, 10000)


In [13]:
text_embeddings_file = "text_embeddings.npy"

# Check if cached embeddings exist
try:
    text_embeddings = np.load(text_embeddings_file)
    print("Loaded cached text embeddings")
except:
    text_model = SentenceTransformer('all-MiniLM-L6-v2')  # small & fast
    print("Encoding text with BERT...")
    text_embeddings = text_model.encode(df['catalog_content'].tolist(), show_progress_bar=True)
    np.save(text_embeddings_file, text_embeddings)
    print("Text embeddings saved")

print("Text embeddings shape:", text_embeddings.shape)

Loaded cached text embeddings
Text embeddings shape: (75000, 384)


In [14]:
# Step 1: extract numeric + unit
def extract_value_and_unit(text):
    value_match = re.search(r"Value:\s*([\d\.]+)", text)
    value = float(value_match.group(1)) if value_match else None
    
    unit_match = re.search(r"Unit:\s*([A-Za-z ]+)", text)
    unit = unit_match.group(1).strip() if unit_match else None
    
    return pd.Series([value, unit])

df[["numeric_value", "unit_extracted"]] = df["catalog_content"].apply(extract_value_and_unit)

# Step 2: normalize units
def normalize_units(value, unit):
    if pd.isna(unit) or pd.isna(value):
        return pd.Series([value, unit, "unknown"])
    
    u = unit.strip().lower()
    
    # Weight conversions to grams
    if u in ["ounce", "oz", "ounces"]:
        return pd.Series([value * 28.35, "g", "weight"])
    elif u in ["pound", "lb", "lbs"]:
        return pd.Series([value * 453.592, "g", "weight"])
    
    # Volume conversions to milliliters
    elif u in ["fl oz", "floz", "fluid ounce", "fluid ounces", "Fl Ounce"]:
        return pd.Series([value * 29.5735, "mL", "volume"])
    elif u in ["liter", "litre", "l","ltr"]:
        return pd.Series([value * 1000, "mL", "volume"])
    
    # Count-based (no conversion)
    elif u in ["count", "pack", "pcs", "piece", "pieces", "PACK", "can", "Carton", "Tea bags"]:
        return pd.Series([value, "count", "count"])
    
    # Unknown
    else:
        return pd.Series([value, unit, "unknown"])

df[["standardized_value", "standardized_unit", "unit_type"]] = df.apply(
    lambda x: normalize_units(x["numeric_value"], x["unit_extracted"]), axis=1
)

print(df[["numeric_value", "unit_extracted", "standardized_value", "standardized_unit", "unit_type"]])


       numeric_value unit_extracted  standardized_value standardized_unit  \
0              72.00          Fl Oz           2129.2920                mL   
1              32.00          Ounce            907.2000                 g   
2              11.40          Ounce            323.1900                 g   
3              11.25          Ounce            318.9375                 g   
4              12.00          Count             12.0000             count   
...              ...            ...                 ...               ...   
74995          12.00          Ounce            340.2000                 g   
74996         100.00          count            100.0000             count   
74997          80.00          Ounce           2268.0000                 g   
74998          16.00          Count             16.0000             count   
74999           2.47          Ounce             70.0245                 g   

      unit_type  
0        volume  
1        weight  
2        weight  
3  

In [15]:
categorical_cols = ["unit_type"]

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

new_feature = df[['standardized_value','unit_type']]

In [16]:
# Suppose X_tfidf is your TF-IDF matrix: (n_samples, 10000)
svd = TruncatedSVD(n_components=256, random_state=42)
X_tfidf_reduced = svd.fit_transform(X)
print("Reduced TF-IDF shape:", X_tfidf_reduced.shape)

# Scale TF-IDF reduced
scaler_tfidf = StandardScaler()
X_tfidf_scaled = scaler_tfidf.fit_transform(X_tfidf_reduced)

# Scale BERT embeddings (dense)
scaler_bert = StandardScaler()
X_bert_scaled = scaler_bert.fit_transform(text_embeddings)


# Combine all features
X_combined = np.hstack([X_tfidf_reduced, text_embeddings, new_feature])

print("Final combined feature shape:", X_combined.shape)


Reduced TF-IDF shape: (75000, 256)
Final combined feature shape: (75000, 642)


In [17]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X_combined, df['price'],
    test_size=0.1, random_state=42
)

In [18]:
y_train_log = np.log1p(y_train)
y_valid_log = np.log1p(y_valid)

In [19]:
def smape(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    diff = np.abs(y_pred - y_true) / denominator
    diff[denominator == 0] = 0  # avoid division by zero
    return np.mean(diff) * 100

def smape_eval(preds, train_data):
    y_true = train_data.get_label()
    denominator = (np.abs(y_true) + np.abs(preds)) / 2
    diff = np.abs(preds - y_true) / denominator
    diff[denominator == 0] = 0
    return 'smape', np.mean(diff) * 100, False  # False = lower is better

def refined_squared_smape_objective(preds, train_data):
    y_true = train_data.get_label()
    eps = 1e-6
    
    abs_y = np.abs(y_true)
    abs_pred = np.abs(preds)
    denom = (abs_y + abs_pred) / 2 + eps
    diff = preds - y_true
    
    # Refined gradient
    grad = 2 * diff / (denom ** 2) - (diff**2) * np.sign(preds) / (denom**3)
    
    # Refined hessian
    hess = 2 / (denom ** 2) + 3 * (diff**2) / (denom ** 4)
    
    return grad, hess

In [21]:
lgb_train = lgb.Dataset(X_train, y_train_log)
lgb_valid = lgb.Dataset(X_valid, y_valid_log, reference=lgb_train)

params = {
    'learning_rate': 0.05,
    "metric": "mae",
    "objective": refined_squared_smape_objective,
    'subsample': 0.7,
}

model = lgb.train(
    params,
    lgb_train,
    valid_sets=[lgb_valid],
    feval=smape_eval,
    num_boost_round=5000
)


In [22]:
y_pred = np.expm1(model.predict(X_valid))
final_smape = smape(y_valid, y_pred)
final_smape

np.float64(53.31871183377386)

In [26]:
y_final = np.log1p(df['price'])
final_dataset = lgb.Dataset(X_combined, y_final)

In [27]:
y_final.shape

(75000,)

In [28]:
final_model = lgb.train(
    params,
    train_set=final_dataset,
    num_boost_round=5000,
    feval=smape_eval,
    )

In [29]:
final_model.save_model('lgbm_model.txt') 

<lightgbm.basic.Booster at 0x24085f4f9d0>

In [30]:
import joblib

# save the fitted vectorizer
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
joblib.dump(svd, 'svd_transformer.pkl')
joblib.dump(scaler_bert, 'scaler_bert.pkl')
joblib.dump(scaler_tfidf, 'scaler_tfidf.pkl')

['scaler_tfidf.pkl']

In [31]:
joblib.dump(le, "labelencoder.pkl")

['labelencoder.pkl']