In [8]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sentence_transformers import SentenceTransformer
import re
import joblib

In [2]:
# Load the test set; every later cell reads from `df`.
df = pd.read_csv("test.csv")

# Fail fast with a clear message if a column used by later cells is absent,
# instead of surfacing as a KeyError several cells further down.
_missing_cols = {"sample_id", "catalog_content"} - set(df.columns)
if _missing_cols:
    raise KeyError(f"test.csv is missing required column(s): {sorted(_missing_cols)}")

In [3]:
print("Vectorizing text...")

# Restore the persisted vectorizer from disk; it is only used to transform
# here, never re-fitted.
# NOTE(review): joblib.load unpickles the file, so only load artifacts that
# come from a trusted source.
tfidf = joblib.load('tfidf_vectorizer.pkl')

# Turn the raw catalog text into the TF-IDF matrix used by later cells.
catalog_text = df['catalog_content']
X_new = tfidf.transform(catalog_text)

print("TF-IDF shape:", X_new.shape)

Vectorizing text...
TF-IDF shape: (75000, 10000)


In [4]:
text_embeddings_file = "test_text_embeddings.npy"

# Try the on-disk cache first. Only a missing file means "no cache yet";
# any other error (permissions, corrupt file, Ctrl-C) is allowed to surface
# instead of silently triggering an expensive re-encode.
try:
    text_embeddings = np.load(text_embeddings_file)
except FileNotFoundError:
    text_embeddings = None
else:
    # A cache written for a different test file would otherwise be paired
    # with the wrong rows, so require one embedding row per row of df.
    if text_embeddings.ndim == 2 and text_embeddings.shape[0] == len(df):
        print("Loaded cached text embeddings")
    else:
        print("Cached text embeddings do not match the number of rows in df; re-encoding")
        text_embeddings = None

if text_embeddings is None:
    text_model = SentenceTransformer('all-MiniLM-L6-v2')  # small & fast
    print("Encoding text with BERT...")
    text_embeddings = text_model.encode(df['catalog_content'].tolist(), show_progress_bar=True)
    np.save(text_embeddings_file, text_embeddings)
    print("Text embeddings saved")

print("Text embeddings shape:", text_embeddings.shape)

Loaded cached text embeddings
Text embeddings shape: (75000, 384)


In [5]:
# Step 1: extract numeric + unit
# A well-formed decimal: digits with an optional fractional part, or a bare
# fraction such as ".5". Unlike a loose [\d.]+ this can never capture "." or
# "1.2.3", so the float() conversion below cannot fail.
_VALUE_RE = re.compile(r"Value:\s*(\d+(?:\.\d*)?|\.\d+)")
_UNIT_RE = re.compile(r"Unit:\s*([A-Za-z ]+)")


def extract_value_and_unit(text):
    """Pull the "Value:" number and "Unit:" label out of one catalog entry.

    Parameters
    ----------
    text : str
        Raw catalog text. Anything that is not a string (for example a NaN
        from a missing cell) is treated as containing neither field.

    Returns
    -------
    pandas.Series
        Two elements: the value as a float (or None when absent/malformed)
        and the unit with surrounding whitespace stripped (or None).
    """
    if not isinstance(text, str):
        return pd.Series([None, None])

    value_match = _VALUE_RE.search(text)
    value = float(value_match.group(1)) if value_match else None

    unit_match = _UNIT_RE.search(text)
    unit = unit_match.group(1).strip() if unit_match else None

    return pd.Series([value, unit])

# Run the extractor over every catalog entry; the two returned fields
# become the new columns numeric_value and unit_extracted.
_value_and_unit = df["catalog_content"].apply(extract_value_and_unit)
df[["numeric_value", "unit_extracted"]] = _value_and_unit

# Step 2: normalize units
# Each rule is (aliases, multiplier, standardized unit, unit type).
# Aliases are stored lower-cased because the lookup key is lower-cased;
# mixed-case aliases such as "Fl Ounce" or "Carton" could never match.
# NOTE(review): if the training pipeline used the earlier case-sensitive
# alias lists, apply the same fix there so train and inference features agree.
_UNIT_RULES = (
    # Weight conversions to grams
    (frozenset({"ounce", "oz", "ounces"}), 28.35, "g", "weight"),
    (frozenset({"pound", "lb", "lbs"}), 453.592, "g", "weight"),
    # Volume conversions to milliliters
    (frozenset({"fl oz", "floz", "fluid ounce", "fluid ounces", "fl ounce"}), 29.5735, "mL", "volume"),
    (frozenset({"liter", "litre", "l", "ltr"}), 1000, "mL", "volume"),
    # Count-based (multiplier of 1 leaves the value unchanged)
    (frozenset({"count", "pack", "pcs", "piece", "pieces", "can", "carton", "tea bags"}), 1, "count", "count"),
)


def normalize_units(value, unit):
    """Convert a (value, unit) pair to a standardized value, unit and unit type.

    Parameters
    ----------
    value : float or None
        Numeric quantity extracted from the catalog text.
    unit : str or None
        Unit label extracted from the catalog text; matched case-insensitively
        after stripping surrounding whitespace.

    Returns
    -------
    pandas.Series
        Three elements: standardized value, standardized unit, unit type.
        Weights are expressed in "g", volumes in "mL", counts in "count".
        When either input is missing or the unit is not recognised, the
        inputs are returned unchanged with unit type "unknown".
    """
    if pd.isna(unit) or pd.isna(value):
        return pd.Series([value, unit, "unknown"])

    u = unit.strip().lower()

    for aliases, multiplier, standardized_unit, unit_type in _UNIT_RULES:
        if u in aliases:
            return pd.Series([value * multiplier, standardized_unit, unit_type])

    # Unknown
    return pd.Series([value, unit, "unknown"])

# Normalize each row's extracted (value, unit) pair and store the three
# resulting fields as new columns.
_standardized_cols = ["standardized_value", "standardized_unit", "unit_type"]
df[_standardized_cols] = df.apply(
    lambda row: normalize_units(row["numeric_value"], row["unit_extracted"]),
    axis=1,
)

# Show the raw fields next to their standardized counterparts.
print(df[["numeric_value", "unit_extracted"] + _standardized_cols])


       numeric_value unit_extracted  standardized_value standardized_unit  \
0               10.5          Ounce           297.67500                 g   
1                2.0          Fl Oz            59.14700                mL   
2               32.0          Ounce           907.20000                 g   
3                2.0          Count             2.00000             count   
4               32.0          Fl Oz           946.35200                mL   
...              ...            ...                 ...               ...   
74995            2.4          Ounce            68.04000                 g   
74996            7.0          Ounce           198.45000                 g   
74997           11.5          Fl Oz           340.09525                mL   
74998           16.0          Ounce           453.60000                 g   
74999           64.8          Ounce          1837.08000                 g   

      unit_type  
0        weight  
1        volume  
2        weight  
3  

In [6]:
# Restore the persisted label encoder.
# NOTE(review): joblib.load unpickles the file, so only load trusted artifacts.
le = joblib.load('labelencoder.pkl')

categorical_cols = ["unit_type"]

for col in categorical_cols:
    # The column is overwritten in place, so encode only while it still holds
    # raw labels. Without this guard, re-running the cell would pass the
    # already-encoded integers (as strings) to the encoder, which rejects
    # them as unseen labels.
    if not pd.api.types.is_numeric_dtype(df[col]):
        df[col] = le.transform(df[col].astype(str))

# Hand-crafted features that are stacked next to the text features later.
new_feature = df[['standardized_value','unit_type']]

In [7]:
# Reduce the TF-IDF matrix (X_new) with the persisted SVD transformer.
svd = joblib.load('svd_transformer.pkl')
X_tfidf_reduced = svd.transform(X_new)
print("Reduced TF-IDF shape:", X_tfidf_reduced.shape)

# Scaled copy of the reduced TF-IDF block.
scaler_tfidf = joblib.load('scaler_tfidf.pkl')
X_tfidf_scaled = scaler_tfidf.transform(X_tfidf_reduced)

# Scaled copy of the dense sentence embeddings.
scaler_bert = joblib.load('scaler_bert.pkl')
X_bert_scaled = scaler_bert.transform(text_embeddings)

# Stack every feature block column-wise into one matrix.
# NOTE(review): X_tfidf_scaled and X_bert_scaled are computed above but the
# stack uses the unscaled X_tfidf_reduced / text_embeddings. Confirm which
# variant the model was trained on before changing this.
feature_blocks = [X_tfidf_reduced, text_embeddings, new_feature]
X_combined = np.hstack(feature_blocks)

print("Final combined feature shape:", X_combined.shape)


Reduced TF-IDF shape: (75000, 256)
Final combined feature shape: (75000, 642)


In [10]:
# Load the saved LightGBM booster and score the combined feature matrix.
model = lgb.Booster(model_file='lgbm_model.txt')
log_predictions = model.predict(X_combined)

# expm1 is the inverse of log1p.
# NOTE(review): presumably the model was trained on log1p(price) - confirm.
predictions = np.expm1(log_predictions)


In [12]:
# Two-column frame: sample_id from the test set plus the predicted price.
submission = df[['sample_id']].assign(price=predictions)

# Write the result without the index column.
submission.to_csv('test_out.csv', index=False)

print("Saved to test_out.csv")

Saved to test_out.csv
