In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import os
import warnings
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import hstack, coo_matrix

# Silence every warning category for the rest of the session to keep cell output compact.
warnings.filterwarnings(action='ignore')

# SMAPE metric (custom eval function for LightGBM)
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error, computed after undoing the log transform.

    Both inputs are passed through ``np.expm1`` before the error is computed,
    so they are expected to be on a log1p scale.

    Parameters
    ----------
    y_true : array-like
        Ground-truth targets.
    y_pred : array-like
        Predicted targets.

    Returns
    -------
    tuple
        ``('smape', value, False)`` -- metric name, score in percent, and the
        "higher is better" flag (False: lower scores are better), i.e. the
        triple a LightGBM custom ``eval_metric`` returns.
    """
    actual = np.expm1(y_true)
    predicted = np.expm1(y_pred)

    abs_error = np.abs(predicted - actual)
    mean_magnitude = (np.abs(actual) + np.abs(predicted)) / 2

    # The small epsilon keeps the ratio finite when both values are zero.
    ratio = abs_error / (mean_magnitude + 1e-8)
    score = np.mean(ratio) * 100
    return 'smape', score, False

# Confirmation message: reaching this line means the imports and the smape definition above ran.
print("Libraries imported.")

Libraries imported.


In [2]:
# Locations of the preprocessed parquet files (relative to the notebook's working directory).
PROCESSED_DATA_FOLDER = '../data/processed'
TRAIN_FILE_PATH = os.path.join(PROCESSED_DATA_FOLDER, 'train_processed.parquet')
TEST_FILE_PATH = os.path.join(PROCESSED_DATA_FOLDER, 'test_processed.parquet')

# Load both splits.
train_df = pd.read_parquet(TRAIN_FILE_PATH)
test_df = pd.read_parquet(TEST_FILE_PATH)

# Make sure catalog_content is a plain string column for the text vectorizer.
# Missing values are replaced with '' first: a bare astype(str) would turn them
# into the literal words 'nan' / 'None', which TF-IDF would then count as real
# tokens. Rows without missing text are unaffected.
train_df['catalog_content'] = train_df['catalog_content'].fillna('').astype(str)
test_df['catalog_content'] = test_df['catalog_content'].fillna('').astype(str)

print(f"Loaded train data: {train_df.shape}")
print(f"Loaded test data: {test_df.shape}")

Loaded train data: (75000, 48)
Loaded test data: (75000, 46)


In [3]:
# Hold out 20% of the labelled rows for validation; the fixed seed makes the split repeatable.
X_train, X_val = train_test_split(
    train_df,
    random_state=42,
    test_size=0.2,
)

# Regression target for each split: the 'log_price' column.
y_train, y_val = X_train['log_price'], X_val['log_price']

print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")

Training data shape: (60000, 48)
Validation data shape: (15000, 48)


In [4]:
print("Starting TF-IDF vectorization...")

# Unigram + bigram TF-IDF, capped at 50,000 terms, English stop words removed,
# stored as float32.
tfidf_vec = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=50000,
    dtype=np.float32,
)

# Fit on the training text only, so the vocabulary and IDF weights never see validation rows ...
X_train_text = tfidf_vec.fit_transform(X_train['catalog_content'])

# ... then apply the already-fitted vectorizer to the validation text.
X_val_text = tfidf_vec.transform(X_val['catalog_content'])

print(f"TF-IDF training matrix shape: {X_train_text.shape}")
print(f"TF-IDF validation matrix shape: {X_val_text.shape}")

Starting TF-IDF vectorization...
TF-IDF training matrix shape: (60000, 50000)
TF-IDF validation matrix shape: (15000, 50000)


In [5]:
print("Starting TruncatedSVD...")

# Compress the sparse TF-IDF matrix down to 300 dense components.
svd = TruncatedSVD(random_state=42, n_components=300)

# The components are learned from the training matrix only ...
X_train_svd = svd.fit_transform(X_train_text)

# ... and the validation matrix is projected onto those same components.
X_val_svd = svd.transform(X_val_text)

print(f"SVD training matrix shape: {X_train_svd.shape}")
print(f"SVD validation matrix shape: {X_val_svd.shape}")

Starting TruncatedSVD...
SVD training matrix shape: (60000, 300)
SVD validation matrix shape: (15000, 300)


In [6]:
# Numerical feature columns: 'pack_size', 'total_measure', and every column
# whose name starts with 'unit_'. Order follows train_df.columns.
numerical_features = [
    col for col in train_df.columns
    if col in ('pack_size', 'total_measure') or col.startswith('unit_')
]
print(f"Found {len(numerical_features)} numerical features.")

# Pull those columns out of each split as plain arrays.
X_train_num = X_train[numerical_features].values
X_val_num = X_val[numerical_features].values

# Final design matrices: SVD components first, numerical columns appended on the right.
X_train_final = np.hstack((X_train_svd, X_train_num))
X_val_final = np.hstack((X_val_svd, X_val_num))

print(f"Final training feature matrix shape: {X_train_final.shape}")
print(f"Final validation feature matrix shape: {X_val_final.shape}")

Found 44 numerical features.
Final training feature matrix shape: (60000, 344)
Final validation feature matrix shape: (15000, 344)


In [9]:
print("Training LightGBM model on LSA features...")

# metric='None' turns off LightGBM's built-in metric, so the custom SMAPE given
# to fit() is the only score evaluated on the validation set.
# n_estimators is an upper bound; early stopping below may end training sooner.
lgbm_model_lsa = lgb.LGBMRegressor(
    learning_rate=0.05,
    n_estimators=2000,
    metric='None',
    random_state=42,
    n_jobs=-1,
)

# Fit, scoring the validation split with SMAPE: stop once the score has not
# improved for 100 rounds, and log it every 100 rounds.
lgbm_model_lsa.fit(
    X_train_final,
    y_train,
    eval_metric=smape,
    eval_set=[(X_val_final, y_val)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=100),
        lgb.log_evaluation(period=100),
    ],
)

print("\nModel training complete.")
print(f"Best SMAPE Score: {lgbm_model_lsa.best_score_['valid_0']['smape']}")

Training LightGBM model on LSA features...
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.105773 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76852
[LightGBM] [Info] Number of data points in the train set: 60000, number of used features: 313
[LightGBM] [Info] Start training from score 2.740904
Training until validation scores don't improve for 100 rounds
[100]	valid_0's smape: 59.3727
[200]	valid_0's smape: 57.4744
[300]	valid_0's smape: 56.6239
[400]	valid_0's smape: 56.1523
[500]	valid_0's smape: 55.7975
[600]	valid_0's smape: 55.5063
[700]	valid_0's smape: 55.2019
[800]	valid_0's smape: 54.9464
[900]	valid_0's smape: 54.7621
[1000]	valid_0's smape: 54.5708
[1100]	valid_0's smape: 54.4545
[1200]	valid_0's smape: 54.3235
[1300]	valid_0's smape: 54.1939
[1400]	valid_0's smape: 54.0943
[1500]	valid_0's smape: 53.9913
[1600]	valid_0's smape: 53.9051
[1700]	valid_0's smape: 53.8201
[1800]	valid_0's

In [10]:
print("Creating LSA submission file...")

# 1. Vectorize the test text with the already-fitted TF-IDF (transform only, no refit)
print("Applying TF-IDF to test set...")
test_text = tfidf_vec.transform(test_df['catalog_content'])

# 2. Project onto the already-fitted SVD components
print("Applying SVD to test set...")
test_svd = svd.transform(test_text)

# 3. Numerical columns, selected with the same column list used for training
test_num = test_df[numerical_features].values

# 4. Same layout as the training matrix: SVD components first, numerical columns after
X_test_final = np.hstack((test_svd, test_num))
print(f"Final test feature matrix shape: {X_test_final.shape}")

# 5. Predict (the model was trained on the 'log_price' target)
log_price_preds = lgbm_model_lsa.predict(X_test_final)

# 6. Convert back to 'price' with expm1, the same inverse the smape metric applies
price_preds = np.expm1(log_price_preds)

# 7. Make sure prices are not negative (floor at 0)
price_preds = np.maximum(price_preds, 0)

# 8. Create submission DataFrame
submission_df = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': price_preds
})

# 9. Save to CSV. Create the target folder first: to_csv does not create
#    missing directories and would otherwise fail after all the work above.
SUBMISSION_FILE_PATH = '../submissions/submission_lsa.csv'
os.makedirs(os.path.dirname(SUBMISSION_FILE_PATH), exist_ok=True)
submission_df.to_csv(SUBMISSION_FILE_PATH, index=False)

print(f"\nSubmission file saved to: {SUBMISSION_FILE_PATH}")
submission_df.head()

Creating LSA submission file...
Applying TF-IDF to test set...
Applying SVD to test set...
Final test feature matrix shape: (75000, 344)

Submission file saved to: ../submissions/submission_lsa.csv


Unnamed: 0,sample_id,price
0,100179,13.313899
1,245611,13.57297
2,146263,17.655464
3,95658,12.87558
4,36806,30.972991
