In [1]:
import numpy as np
import pandas as pd
import requests, re, time, datetime
from urllib.parse import urlparse
from PIL import Image
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingRegressor
import xgboost as xgb
import lightgbm as lgb
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

2025-10-12 09:29:29.089317: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760261369.316972      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760261369.402213      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.get_logger().setLevel('ERROR')

CONFIG = {
    "data_path": "/kaggle/input/dataset",
    "text_features_dim": 300,
    "model_save_path": "/kaggle/working/ensemble_model.pkl",
    "text_max_length": 200,
}

class EfficientFeatureExtractor:
    def __init__(self):
        self.text_vectorizer = TfidfVectorizer(
            max_features=CONFIG["text_features_dim"],
            ngram_range=(1, 2),
            stop_words='english'
        )
        self.scaler = StandardScaler()
        
    def extract_text_features(self, texts):
        if not hasattr(self, 'text_vectorizer_fitted'):
            self.text_vectorizer.fit(texts)
            self.text_vectorizer_fitted = True
        return self.text_vectorizer.transform(texts).toarray()
    
    def extract_simple_image_features(self, image_urls):
        """Extract simple image features without deep learning"""
        features = []
        for url in image_urls:
            try:
                if isinstance(url, str) and url.startswith('http'):
                    has_image = 1
                    
                    parsed_url = urlparse(url)
                    domain = parsed_url.netloc
                    
                    is_cdn = 1 if any(cdn in domain for cdn in ['cdn', 'cloudfront', 'amazonaws', 'googleapis']) else 0
                    is_secure = 1 if url.startswith('https') else 0
                    
                    file_ext = os.path.splitext(parsed_url.path)[1].lower()
                    is_jpg = 1 if file_ext in ['.jpg', '.jpeg'] else 0
                    is_png = 1 if file_ext == '.png' else 0
                    is_webp = 1 if file_ext == '.webp' else 0
                    
                    feature_vector = [has_image, is_cdn, is_secure, is_jpg, is_png, is_webp]
                    
                else:
                    feature_vector = [0, 0, 0, 0, 0, 0]
            except:
                feature_vector = [0, 0, 0, 0, 0, 0]
            
            features.append(feature_vector)
        return np.array(features)
    
    def extract_advanced_features(self, df):
        text_features = self.extract_text_features(df['cleaned_content'].fillna(''))
        image_features = self.extract_simple_image_features(df['image_link'].fillna(''))
        numerical_features = self._extract_numerical_features(df)
        
        all_features = np.hstack([text_features, image_features, numerical_features])
        
        return all_features
    
    def _extract_numerical_features(self, df):
        features = []
        
        features.append(df['cleaned_content'].str.len().fillna(0).values.reshape(-1, 1))
        features.append(df['cleaned_content'].str.split().str.len().fillna(0).values.reshape(-1, 1))
        
        features.append(df['ipq'].values.reshape(-1, 1))
        features.append((df['ipq'] > 1).astype(int).values.reshape(-1, 1))
        features.append((df['ipq'] > 10).astype(int).values.reshape(-1, 1))
        
        price_keywords = ['premium', 'luxury', 'designer', 'professional', 'quality', 'deluxe']
        for keyword in price_keywords:
            features.append(df['cleaned_content'].str.contains(keyword).astype(int).values.reshape(-1, 1))
        
        product_types = ['shirt', 'shoe', 'electronic', 'book', 'tool', 'toy', 'watch', 'phone', 'computer']
        for ptype in product_types:
            features.append(df['cleaned_content'].str.contains(ptype).astype(int).values.reshape(-1, 1))
        
        features.append(df['catalog_content'].str.contains(r'\$|\€|\£').astype(int).values.reshape(-1, 1))
        features.append(df['catalog_content'].str.contains(r'\d+\.\d{2}').astype(int).values.reshape(-1, 1))
        
        return np.hstack(features)

def extract_ipq(text):
    if not isinstance(text, str): 
        return 1
    
    text = text.lower()
    patterns = [
        r'pack\s+of\s+(\d+)', r'(\d+)\s*pack', r'(\d+)\s*count', 
        r'set\s+of\s+(\d+)', r'(\d+)\s*pcs', r'(\d+)\s*pieces', 
        r'(\d+)\s*ct', r'case\s+of\s+(\d+)', r'(\d+)\s*unit',
        r'(\d+)\s*-?piece', r'(\d+)\s*-?pack'
    ]
    
    for pattern in patterns:
        match = re.search(pattern, text)
        if match:
            try:
                value = int(match.group(1))
                return min(max(value, 1), 100)
            except ValueError: 
                continue
    return 1

def clean_text(text):
    if not isinstance(text, str): 
        return ""
    
    text = text.lower()
    text = re.sub(r'[^\w\s$€£¥%+]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

def preprocess_data(df, is_train=True):
    df_processed = df.copy()
    
    df_processed['catalog_content'] = df_processed['catalog_content'].fillna('')
    
    df_processed['ipq'] = df_processed['catalog_content'].apply(extract_ipq)
    df_processed['cleaned_content'] = df_processed['catalog_content'].apply(clean_text)
    
    df_processed['text_length'] = df_processed['cleaned_content'].str.len()
    df_processed['word_count'] = df_processed['cleaned_content'].str.split().str.len()
    
    if is_train and 'price' in df_processed.columns:
        df_processed['price'] = df_processed['price'].fillna(df_processed['price'].median())
        df_processed['log_price'] = np.log1p(df_processed['price'])
    
    return df_processed

class EnsemblePricePredictor:
    def __init__(self):
        self.models = {}
        self.feature_extractor = EfficientFeatureExtractor()
        self.is_trained = False
        
    def create_models(self):
        self.models = {
            'xgb': xgb.XGBRegressor(
                n_estimators=500,
                learning_rate=0.1,
                max_depth=8,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                n_jobs=-1
            ),
            'lgb': lgb.LGBMRegressor(
                n_estimators=500,
                learning_rate=0.1,
                max_depth=8,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=42,
                n_jobs=-1
            )
        }
        
        self.ensemble = VotingRegressor([
            ('xgb', self.models['xgb']),
            ('lgb', self.models['lgb'])
        ])
    
    def train(self, X, y):
        self.create_models()
        
        for name, model in self.models.items():
            model.fit(X, y)
        
        self.ensemble.fit(X, y)
        self.is_trained = True
        
    def predict(self, X):
        if not self.is_trained:
            raise ValueError("Model not trained yet")
        return self.ensemble.predict(X)
    
    def save(self, path):
        joblib.dump({
            'ensemble': self.ensemble,
            'feature_extractor': self.feature_extractor
        }, path)
    
    def load(self, path):
        saved_data = joblib.load(path)
        self.ensemble = saved_data['ensemble']
        self.feature_extractor = saved_data['feature_extractor']
        self.is_trained = True

def calculate_smape(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    denominator = (np.abs(y_true) + np.abs(y_pred))
    smape_values = np.where(denominator == 0, 0, 2 * np.abs(y_pred - y_true) / denominator)
    return 100 * np.mean(smape_values)

def prepare_data():
    train_path = os.path.join(CONFIG["data_path"], "train.csv")
    test_path = os.path.join(CONFIG["data_path"], "test.csv")
    
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    
    train_df = preprocess_data(train_df, is_train=True)
    test_df = preprocess_data(test_df, is_train=False)
    
    return train_df, test_df

def predictor(sample_id, catalog_content, image_link):
    sample_df = pd.DataFrame([{
        'sample_id': sample_id,
        'catalog_content': catalog_content,
        'image_link': image_link
    }])
    
    sample_df = preprocess_data(sample_df, is_train=False)
    
    predictor = EnsemblePricePredictor()
    predictor.load(CONFIG["model_save_path"])
    
    features = predictor.feature_extractor.extract_advanced_features(sample_df)
    
    log_prediction = predictor.predict(features)[0]
    predicted_price = np.expm1(log_prediction)
    
    predicted_price = max(0.1, min(predicted_price, 1000.0))
    
    return predicted_price

def train_and_predict():
    try:
        np.random.seed(42)
        
        train_df, test_df = prepare_data()
        
        feature_extractor = EfficientFeatureExtractor()
        predictor = EnsemblePricePredictor()
        predictor.feature_extractor = feature_extractor
        
        X_train = feature_extractor.extract_advanced_features(train_df)
        y_train = train_df['log_price'].values
        
        predictor.train(X_train, y_train)
        
        predictor.save(CONFIG["model_save_path"])
        
        X_test = feature_extractor.extract_advanced_features(test_df)
        log_predictions = predictor.predict(X_test)
        predictions = np.expm1(log_predictions)
        
        submission_df = pd.DataFrame({
            'sample_id': test_df['sample_id'],
            'price': predictions
        })
        
        train_prices = pd.read_csv(os.path.join(CONFIG["data_path"], "train.csv"))['price']
        median_price = train_prices.median()
        submission_df['price'] = submission_df['price'].fillna(median_price).clip(0.1, 1000.0)
        
        submission_df.to_csv("/kaggle/working/output.csv", index=False)
        
        val_predictions = predictor.predict(X_train[:100])
        val_true = y_train[:100]
        smape_score = calculate_smape(np.expm1(val_true), np.expm1(val_predictions))
        print(f"{smape_score:.2f}")
        
        return predictor, submission_df
        
    except Exception as e:
        print(f"Error: {e}")
        return None, None


In [3]:
if __name__ == "__main__":
    model, predictions = train_and_predict()

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.099240 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77044
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 319
[LightGBM] [Info] Start training from score 2.739217
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.101011 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 77044
[LightGBM] [Info] Number of data points in the train set: 75000, number of used features: 319
[LightGBM] [Info] Start training from score 2.739217
57.18
