In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

content scanning 

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy.sparse import save_npz
import joblib
import os
import warnings

warnings.filterwarnings('ignore')

# --- Helper functions (These are correct and do not need changes) ---

def extract_structured_data(text):
    if not isinstance(text, str): return pd.Series([np.nan, 'unknown'], index=['Value', 'Unit'])
    value_match = re.search(r'Value:\s*([\d\.]+)', text, re.IGNORECASE)
    value = float(value_match.group(1)) if value_match else np.nan
    unit_match = re.search(r'Unit:\s*(\w+)', text, re.IGNORECASE)
    unit = unit_match.group(1) if unit_match else 'unknown'
    return pd.Series([value, unit], index=['Value', 'Unit'])

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def clean_text(text):
    if not isinstance(text, str): return ""
    text = re.sub(r'Value:.*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'Unit:.*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words and len(w) > 2]
    return ' '.join(lemmatized_words)

def standardize_units(unit):
    if not isinstance(unit, str): return 'unknown'
    unit = unit.lower().strip()
    unit_map = {'oz': 'ounce', 'ounces': 'ounce', 'fl oz': 'fl_oz', 'fz': 'fl_oz', 'ct': 'count', 'none': 'unknown'}
    return unit_map.get(unit, unit)


# --- CORRECTED: Main Preprocessing and Feature Engineering Pipeline ---

def create_feature_pipeline(df, is_training=True):
    """
    Creates and applies a full preprocessing pipeline.
    If is_training=True, it fits the preprocessor and returns X, y, and the fitted preprocessor.
    If is_training=False, it loads a pre-fitted preprocessor and transforms the data, returning only X.
    """
    if df is None: return None

    print("\nExtracting 'Value' and 'Unit' from 'catalog_content'...")
    # Use reset_index to prevent potential alignment issues with concat
    extracted_data = df['catalog_content'].apply(extract_structured_data)
    df = pd.concat([df.reset_index(drop=True), extracted_data], axis=1)

    if is_training:
        # --- TRAINING MODE ---
        # This block runs ONLY when you are processing the `train.csv` file.
        df.dropna(subset=['price'], inplace=True)
        y = np.log1p(df['price'])
        sample_ids = df['sample_id']
        # We drop the price here because it's the target, not a feature
        features_df = df.drop(columns=['price', 'sample_id', 'image_link'])

        print("\nCleaning and preparing features for training...")
        features_df['cleaned_catalog'] = features_df['catalog_content'].apply(clean_text)
        features_df['Unit_standardized'] = features_df['Unit'].apply(standardize_units)
        
        # Define the steps for the preprocessor
        numeric_features = ['Value']
        numeric_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
        categorical_features = ['Unit_standardized']
        categorical_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
                                                  ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))])
        text_features_col = 'cleaned_catalog'
        text_transformer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
        
        # Create the master preprocessor
        preprocessor = ColumnTransformer(
            transformers=[('num', numeric_transformer, numeric_features),
                          ('cat', categorical_transformer, categorical_features),
                          ('text', text_transformer, text_features_col)],
            remainder='drop', n_jobs=-1)
        
        print("Fitting preprocessor and transforming data...")
        # CRITICAL: Use `fit_transform` to learn from the training data
        X = preprocessor.fit_transform(features_df)
        
        print(f"--- Training Preprocessing Complete ---")
        print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")
        
        return X, y, sample_ids, preprocessor
    
    else:
        # --- PREDICTION/TESTING MODE ---
        # This block runs ONLY when you are processing the `test.csv` file.
        sample_ids = df['sample_id']
        # The 'price' column does not exist in test.csv, so no need to drop it
        features_df = df.drop(columns=['sample_id', 'image_link'])

        print("\nCleaning and preparing features for testing...")
        features_df['cleaned_catalog'] = features_df['catalog_content'].apply(clean_text)
        features_df['Unit_standardized'] = features_df['Unit'].apply(standardize_units)
        
        # Load the pre-fitted preprocessor that was saved during training
        try:
            preprocessor = joblib.load('processed_data/preprocessor.joblib')
        except FileNotFoundError:
            print("FATAL ERROR: preprocessor.joblib not found. You must run this script in 'train' mode first.")
            return None, None

        print("Loading fitted preprocessor and transforming test data...")
        # CRITICAL: Use `transform` ONLY. Do not re-fit on test data.
        X = preprocessor.transform(features_df)
        
        print(f"--- Test Preprocessing Complete ---")
        print(f"Shape of X: {X.shape}")

        return X, sample_ids

# --- CORRECTED: Main execution block with a clear MODE switch ---
if __name__ == '__main__':
    # --- CHOOSE YOUR MODE ---
    # Set to 'train' to process the full training data and create the artifacts.
    # Set to 'test' to process the full testing data using the saved artifacts.
    MODE = 'train' # <-- CHANGE THIS TO 'test' WHEN YOU ARE READY TO PROCESS THE TEST FILE

    OUTPUT_DIR = 'processed_data'
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    if MODE == 'train':
        print("--- RUNNING IN TRAINING MODE ---")
        # Load the full training data which includes the price
        # The `load_and_merge_data` function is no longer needed as `train.csv` is complete
        try:
            train_df = pd.read_csv(os.path.join('dataset', 'train.csv'))
        except FileNotFoundError:
            print("Error: `dataset/train.csv` not found. Please place the training file in the 'dataset' directory.")
            exit()
            
        X_processed, y_processed, ids_processed, preprocessor_fitted = create_feature_pipeline(train_df, is_training=True)
        
        if X_processed is not None:
            # Save all the necessary artifacts for later
            print(f"\nSaving processed training data and artifacts to '{OUTPUT_DIR}'...")
            save_npz(os.path.join(OUTPUT_DIR, 'X_processed_train.npz'), X_processed)
            y_df = pd.DataFrame({'sample_id': ids_processed, 'price_log': y_processed})
            y_df.to_csv(os.path.join(OUTPUT_DIR, 'y_processed_train.csv'), index=False)
            joblib.dump(preprocessor_fitted, os.path.join(OUTPUT_DIR, 'preprocessor.joblib'))
            print("‚úÖ All training artifacts saved successfully.")

    elif MODE == 'test':
        print("--- RUNNING IN TEST/PREDICTION MODE ---")
        # Load the test data which does NOT have a price
        try:
            test_df = pd.read_csv(os.path.join('dataset', 'test.csv'))
        except FileNotFoundError:
            print("Error: `dataset/test.csv` not found. Please place the testing file in the 'dataset' directory.")
            exit()
        
        X_processed, ids_processed = create_feature_pipeline(test_df, is_training=False)
        
        if X_processed is not None:
            # Save the processed features for the test set
            print(f"\nSaving processed test data to '{OUTPUT_DIR}'...")
            save_npz(os.path.join(OUTPUT_DIR, 'X_processed_test.npz'), X_processed)
            ids_df = pd.DataFrame({'sample_id': ids_processed})
            ids_df.to_csv(os.path.join(OUTPUT_DIR, 'ids_test.csv'), index=False)
            print("‚úÖ All test artifacts saved successfully.")
    else:
        print("Invalid MODE selected. Please choose 'train' or 'test'.")

# The old save_processed_data function can be used as a helper if needed, but the logic is now in the main block.
def save_processed_data(X, y, ids, preprocessor, output_dir='processed_data'):
    # This function is now effectively replaced by the logic within the if/elif blocks
    # but can be kept for reference or modularity if you refactor later.
    pass

"extract_text_from_images.py"

In [None]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import os
import base64
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import csv
import time

# --- 1. API Interaction Function (This remains the same) ---
def get_local_vlm_description(image_url: str) -> str:
    api_url = "http://localhost:1234/v1/chat/completions"
    prompt_text = (
        "Analyze the image and complete the following fields based *only* on the visible text. "
        "Add a very small targetted informative description about the image and the product inside.\n"
        "Brand: \n" "Product: \n" "Size/Quantity: \n" "Features: \n"
        "above fileds are mandatory in that order itself"
    )
    try:
        with requests.Session() as session:
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = session.get(image_url, timeout=30, headers=headers)
            response.raise_for_status()
        
        base64_image = base64.b64encode(response.content).decode('utf-8')
        payload = {
            "model": "smolvlm-500m-instruct",
            "messages": [
                { "role": "user", "content": [{"type": "text", "text": prompt_text}, {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}]}
            ],
            "max_tokens": 250, "temperature": 0.0
        }
        api_response = requests.post(api_url, headers={"Content-Type": "application/json"}, json=payload, timeout=60)
        api_response.raise_for_status()
        response_json = api_response.json()
        if 'choices' in response_json and len(response_json['choices']) > 0:
            content = response_json['choices'][0]['message']['content']
            return " ".join(content.strip().splitlines())
        else:
            return ""
    except Exception:
        return ""

# --- Main execution block with LIGHTWEIGHT APPEND-ONLY REPAIR ---
if __name__ == '__main__':
    print("--- Starting TARGETED REPAIR for missing VLM descriptions (Lightweight Mode) ---")

    # --- Configuration ---
    BATCH_SIZE = 20
    TRAIN_CSV_PATH = os.path.join('dataset', 'train.csv')
    TEST_CSV_PATH = os.path.join('dataset', 'test.csv')
    OUTPUT_DIR = 'processed_data'
    VLM_CSV_PATH = os.path.join(OUTPUT_DIR, 'smolvlm_extracted_features.csv') # Use a consistent name

    # --- Step 1: Find missing descriptions ---
    print(f"Loading existing VLM file to find gaps: {VLM_CSV_PATH}")
    try:
        df_vlm = pd.read_csv(VLM_CSV_PATH, engine='python')
        missing_mask = df_vlm['smolvlm_description'].isna() | (df_vlm['smolvlm_description'].str.strip() == '')
        missing_ids = set(df_vlm[missing_mask]['sample_id'])
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print("VLM file not found or is empty. Please run the initial generation script first.")
        exit()

    if not missing_ids:
        print("‚úÖ No missing descriptions found. The file is complete!")
        exit()
    print(f"Found {len(missing_ids)} products with missing descriptions to repair.")

    # --- Step 2: Create master lookup for image links ---
    print("Creating a master lookup for image links...")
    df_train = pd.read_csv(TRAIN_CSV_PATH)
    df_test = pd.read_csv(TEST_CSV_PATH)
    df_all = pd.concat([
        df_train[['sample_id', 'image_link']],
        df_test[['sample_id', 'image_link']]
    ]).drop_duplicates(subset=['sample_id']).set_index('sample_id')
    
    df_todo = df_all[df_all.index.isin(missing_ids)]

    if df_todo.empty:
        print("Could not find image links for any of the missing IDs. Exiting.")
        exit()
    print(f"Found {len(df_todo)} matching image links to re-process.")

    # --- Step 3: Process missing items and APPEND to the same CSV ---
    image_urls = df_todo['image_link'].tolist()
    sample_ids = df_todo.index.tolist()
    
    # Open the file in append mode. This is lightweight.
    with open(VLM_CSV_PATH, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
        
        with ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
            future_to_id = {executor.submit(get_local_vlm_description, url): sid for url, sid in zip(image_urls, sample_ids)}
            
            for future in tqdm(as_completed(future_to_id), total=len(image_urls), desc="Repairing Descriptions"):
                sample_id = future_to_id[future]
                try:
                    description = future.result()
                    # Append the new (or re-processed) result. This will create duplicates.
                    writer.writerow([sample_id, description])
                except Exception as exc:
                    print(f'\nGenerated an exception for item {sample_id}: {exc}')
                    writer.writerow([sample_id, ""]) # Append a blank row on error

    print("\n‚úÖ Repair process complete. Appended new results. Now de-duplicating the file...")

    # --- Step 4: Final Cleanup Step ---
    # This reads the now-larger file once, de-duplicates it, and saves the clean version.
    # This is much more efficient than doing it in a loop.
    df_final = pd.read_csv(VLM_CSV_PATH, engine='python')
    # Keep the LAST entry for each sample_id, as it's the most recent (and hopefully correct) one
    df_cleaned = df_final.drop_duplicates(subset=['sample_id'], keep='last')
    
    # Overwrite the file with the final, clean version
    df_cleaned.to_csv(VLM_CSV_PATH, index=False)
    
    final_missing = df_cleaned['smolvlm_description'].isna().sum() + (df_cleaned['smolvlm_description'].str.strip() == '').sum()
    print(f"‚úÖ Cleanup complete. The file '{VLM_CSV_PATH}' is now updated and de-duplicated.")
    print(f"   ‚Üí Total missing descriptions remaining: {final_missing}")

handling missing image data 

In [None]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import os
import base64
import json
from concurrent.futures import ThreadPoolExecutor, as_completed
import csv
import time

# --- 1. API Interaction Function (This remains the same) ---
def get_local_vlm_description(image_url: str) -> str:
    api_url = "http://localhost:1234/v1/chat/completions"
    prompt_text = (
        "Analyze the image and complete the following fields based *only* on the visible text. "
        "Add a very small targetted informative description about the image and the product inside.\n"
        "Brand: \n" "Product: \n" "Size/Quantity: \n" "Features: \n"
        "above fileds are mandatory in that order itself"
    )
    try:
        with requests.Session() as session:
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = session.get(image_url, timeout=30, headers=headers)
            response.raise_for_status()
        
        base64_image = base64.b64encode(response.content).decode('utf-8')
        payload = {
            "model": "smolvlm-256m-instruct",
            "messages": [
                { "role": "user", "content": [{"type": "text", "text": prompt_text}, {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}]}
            ],
            "max_tokens": 250, "temperature": 0.0
        }
        api_response = requests.post(api_url, headers={"Content-Type": "application/json"}, json=payload, timeout=60)
        api_response.raise_for_status()
        response_json = api_response.json()
        if 'choices' in response_json and len(response_json['choices']) > 0:
            content = response_json['choices'][0]['message']['content']
            return " ".join(content.strip().splitlines())
        else:
            return ""
    except Exception:
        return ""

# --- Main execution block with LIGHTWEIGHT APPEND-ONLY REPAIR ---
if __name__ == '__main__':
    print("--- Starting TARGETED REPAIR for missing VLM descriptions (Lightweight Mode) ---")

    # --- Configuration ---
    BATCH_SIZE = 20
    TRAIN_CSV_PATH = os.path.join('dataset', 'train.csv')
    TEST_CSV_PATH = os.path.join('dataset', 'test.csv')
    OUTPUT_DIR = 'processed_data'
    VLM_CSV_PATH = os.path.join(OUTPUT_DIR, 'smolvlm_extracted_features[1].csv') # Use a consistent name

    # --- Step 1: Find missing descriptions ---
    print(f"Loading existing VLM file to find gaps: {VLM_CSV_PATH}")
    try:
        df_vlm = pd.read_csv(VLM_CSV_PATH, engine='python')
        missing_mask = df_vlm['smolvlm_description'].isna() | (df_vlm['smolvlm_description'].str.strip() == '')
        missing_ids = set(df_vlm[missing_mask]['sample_id'])
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print("VLM file not found or is empty. Please run the initial generation script first.")
        exit()

    if not missing_ids:
        print("‚úÖ No missing descriptions found. The file is complete!")
        exit()
    print(f"Found {len(missing_ids)} products with missing descriptions to repair.")

    # --- Step 2: Create master lookup for image links ---
    print("Creating a master lookup for image links...")
    df_train = pd.read_csv(TRAIN_CSV_PATH)
    df_test = pd.read_csv(TEST_CSV_PATH)
    df_all = pd.concat([
        df_train[['sample_id', 'image_link']],
        df_test[['sample_id', 'image_link']]
    ]).drop_duplicates(subset=['sample_id']).set_index('sample_id')
    
    df_todo = df_all[df_all.index.isin(missing_ids)]

    if df_todo.empty:
        print("Could not find image links for any of the missing IDs. Exiting.")
        exit()
    print(f"Found {len(df_todo)} matching image links to re-process.")

    # --- Step 3: Process missing items and APPEND to the same CSV ---
    image_urls = df_todo['image_link'].tolist()
    sample_ids = df_todo.index.tolist()
    
    # Open the file in append mode. This is lightweight.
    with open(VLM_CSV_PATH, 'a', newline='', encoding='utf-8') as f:
        writer = csv.writer(f, quoting=csv.QUOTE_MINIMAL)
        
        with ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
            future_to_id = {executor.submit(get_local_vlm_description, url): sid for url, sid in zip(image_urls, sample_ids)}
            
            for future in tqdm(as_completed(future_to_id), total=len(image_urls), desc="Repairing Descriptions"):
                sample_id = future_to_id[future]
                try:
                    description = future.result()
                    # Append the new (or re-processed) result. This will create duplicates.
                    writer.writerow([sample_id, description])
                except Exception as exc:
                    print(f'\nGenerated an exception for item {sample_id}: {exc}')
                    writer.writerow([sample_id, ""]) # Append a blank row on error

    print("\n‚úÖ Repair process complete. Appended new results. Now de-duplicating the file...")

    # --- Step 4: Final Cleanup Step ---
    # This reads the now-larger file once, de-duplicates it, and saves the clean version.
    # This is much more efficient than doing it in a loop.
    df_final = pd.read_csv(VLM_CSV_PATH, engine='python')
    # Keep the LAST entry for each sample_id, as it's the most recent (and hopefully correct) one
    df_cleaned = df_final.drop_duplicates(subset=['sample_id'], keep='last')
    
    # Overwrite the file with the final, clean version
    df_cleaned.to_csv(VLM_CSV_PATH, index=False)
    
    final_missing = df_cleaned['smolvlm_description'].isna().sum() + (df_cleaned['smolvlm_description'].str.strip() == '').sum()
    print(f"‚úÖ Cleanup complete. The file '{VLM_CSV_PATH}' is now updated and de-duplicated.")
    print(f"   ‚Üí Total missing descriptions remaining: {final_missing}")

In [None]:
import pandas as pd
import os

def clean_and_deduplicate_csv(file_path):
    """
    Cleans a VLM/OCR feature CSV by:
    1. Removing ALL rows that have a missing or empty description.
    2. De-duplicating any remaining sample_ids, keeping the last entry.
    """
    print(f"--- Starting Aggressive Cleanup for: {file_path} ---")

    # --- Step 1: Load the CSV file ---
    try:
        df = pd.read_csv(file_path, engine='python')
        initial_rows = len(df)
        print(f"Successfully loaded {initial_rows} rows.")
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found. Nothing to clean.")
        return
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return

    # --- Step 2: Remove ALL rows with missing descriptions ---
    # Create a mask to identify rows where 'smolvlm_description' is NaN (empty) or just whitespace
    empty_mask = df['smolvlm_description'].isna() | (df['smolvlm_description'].str.strip() == '')
    
    num_empty = empty_mask.sum()
    
    if num_empty > 0:
        print(f"Found {num_empty} rows with empty or missing descriptions.")
        # The `~` operator inverts the mask, keeping all rows that are NOT empty.
        df_cleaned = df[~empty_mask].copy() # Use .copy() to avoid SettingWithCopyWarning
        print(f"   ‚Üí Removed {num_empty} empty rows. Remaining rows: {len(df_cleaned)}")
    else:
        print("‚úÖ No empty rows found to remove.")
        df_cleaned = df.copy()

    # --- Step 3: De-duplicate the remaining data ---
    # After removing the empty rows, we might still have duplicates (e.g., if an ID was processed twice successfully).
    # We will keep the LAST entry for each sample_id, as it is the most recent.
    
    initial_dedupe_rows = len(df_cleaned)
    if df_cleaned.duplicated(subset=['sample_id']).any():
        print(f"Found duplicate sample_ids in the remaining data. Performing final de-duplication...")
        df_final = df_cleaned.drop_duplicates(subset=['sample_id'], keep='last')
        num_duplicates_removed = initial_dedupe_rows - len(df_final)
        print(f"   ‚Üí Removed {num_duplicates_removed} older duplicate entries.")
    else:
        print("‚úÖ No remaining duplicates found.")
        df_final = df_cleaned

    # --- Step 4: Save the cleaned DataFrame back to the same file ---
    # This overwrites the old, messy file with the new, clean one.
    df_final.to_csv(file_path, index=False)
    
    final_rows = len(df_final)
    print("\n--- ‚úÖ Cleanup Complete ---")
    print(f"   ‚Üí Initial rows: {initial_rows}")
    print(f"   ‚Üí Final rows:   {final_rows}")
    print(f"   ‚Üí Total rows removed: {initial_rows - final_rows}")
    print(f"Successfully saved the cleaned data back to '{file_path}'.")


# --- Main execution block ---
if __name__ == '__main__':
    # Define the path to your CSV file
    # I've updated the filename to match the one you used in your previous code.
    VLM_CSV_PATH = os.path.join('processed_data', 'smolvlm_extracted_features[1].csv') 
    
    clean_and_deduplicate_csv(VLM_CSV_PATH)

In [None]:
import pandas as pd
import ollama
from tqdm import tqdm
import os
import csv
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# --- 1. API Interaction Function for Ollama ---

def get_ollama_summary(catalog_text: str) -> str:
    """
    Sends catalog text to a local Ollama model and gets a single-line summary.
    """
    if not isinstance(catalog_text, str) or not catalog_text.strip():
        return "" # Return empty if the input is empty

    prompt = f"""
    Based on the following product text, generate a single, concise, one-sentence description.
    Mention the brand, the main product type, and one or two key features if possible.
    Do not use bullet points. Do not repeat the input.

    Product Text:
    ---
    {catalog_text[:2000]} 
    ---

    One-sentence description:
    """
    
    try:
        response = ollama.chat(
            model='tinyllama:latest',
            messages=[{'role': 'user', 'content': prompt}],
            options={'temperature': 0.2}
        )
        summary = response['message']['content']
        cleaned_summary = summary.strip().replace('\n', ' ').replace('"', '')
        return cleaned_summary
    except Exception:
        return ""

# --- Main execution block with TARGETED REPAIR logic ---
if __name__ == '__main__':
    print("--- Starting SMART REPAIR for smolvlm.csv using Ollama ---")
    print("This will fill gaps using summarized catalog_content.")

    # --- Configuration ---
    CONCURRENT_REQUESTS = 8 # Number of parallel requests to Ollama
    TRAIN_CSV_PATH = os.path.join('dataset', 'train.csv')
    TEST_CSV_PATH = os.path.join('dataset', 'test.csv')
    OUTPUT_DIR = 'processed_data'
    VLM_CSV_PATH = os.path.join(OUTPUT_DIR, 'smolvlm_extracted_features.csv') 

    # --- Step 1: Find missing descriptions in the existing VLM file ---
    print(f"Loading existing VLM file to find gaps: {VLM_CSV_PATH}")
    try:
        df_vlm = pd.read_csv(VLM_CSV_PATH, engine='python')
        # Identify rows where description is missing or blank
        missing_mask = df_vlm['smolvlm_description'].isna() | (df_vlm['smolvlm_description'].str.strip() == '')
        missing_ids = set(df_vlm[missing_mask]['sample_id'])
    except (FileNotFoundError, pd.errors.EmptyDataError):
        print(f"VLM file not found or is empty. Cannot repair. Please run the initial VLM script first.")
        exit()

    if not missing_ids:
        print("‚úÖ No missing descriptions found. The file is already complete!")
        exit()
    print(f"Found {len(missing_ids)} products with missing descriptions to repair.")

    # --- Step 2: Create master lookup to get the `catalog_content` for the missing IDs ---
    print("Creating a master lookup for catalog content...")
    df_train = pd.read_csv(TRAIN_CSV_PATH, usecols=['sample_id', 'catalog_content'])
    df_test = pd.read_csv(TEST_CSV_PATH, usecols=['sample_id', 'catalog_content'])
    df_all = pd.concat([df_train, df_test]).drop_duplicates(subset=['sample_id']).set_index('sample_id')
    
    # Filter the master list to get ONLY the items we need to process
    df_todo = df_all[df_all.index.isin(missing_ids)]

    if df_todo.empty:
        print("Could not find catalog_content for any of the missing IDs. Exiting.")
        exit()
    print(f"Found {len(df_todo)} matching catalog_content entries to summarize.")

    # --- Step 3: Process only the missing items' content and APPEND to the CSV ---
    catalog_texts = df_todo['catalog_content'].tolist()
    sample_ids = df_todo.index.tolist()
    
    # Open the file in append mod

backup for solvlm2-500m-video-instruct

In [None]:
import pandas as pd
import os
import re

def flatten_catalog_content(text: str) -> str:
    """
    Cleans and consolidates the raw catalog_content into a single line of text.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r'<.*?>', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

def find_and_replace_raw_catalog_content(
    train_csv_path=os.path.join('dataset', 'train.csv'),
    test_csv_path=os.path.join('dataset', 'test.csv'),
    vlm_csv_path=os.path.join('processed_data', 'smolvlm_extracted_features.csv')
):
    """
    Finds rows in the VLM file that were filled with raw catalog_content
    and replaces them with a cleaned, single-line version.
    """
    print(f"--- Starting 'Find and Replace' for: {vlm_csv_path} ---")

    # --- Step 1: Load the existing VLM file that needs fixing ---
    print(f"1. Loading existing VLM file from: {vlm_csv_path}")
    try:
        df_vlm = pd.read_csv(vlm_csv_path)
    except FileNotFoundError:
        print(f"Error: The file '{vlm_csv_path}' was not found. Nothing to fix.")
        return
    print(f"   ‚úì Successfully loaded {len(df_vlm)} rows.")

    # --- Step 2: Identify the "bad" rows ---
    # A row is considered "bad" if its description contains characters that suggest it's raw,
    # uncleaned catalog_content, like newlines or HTML tags.
    # We also check for very long descriptions as another signal.
    print("2. Identifying rows that need to be cleaned...")
    bad_row_mask = (
        df_vlm['smolvlm_description'].str.contains('\n|<br>', regex=True, na=False) |
        (df_vlm['smolvlm_description'].str.len() > 1000) # Descriptions longer than 1000 chars are likely raw content
    )
    
    num_bad_rows = bad_row_mask.sum()

    if num_bad_rows == 0:
        print("‚úÖ No rows with raw catalog_content detected. The file appears to be clean.")
        return

    print(f"   ‚Üí Found {num_bad_rows} rows that need to be replaced with a clean, single-line summary.")
    
    # Get the list of IDs we need to fix
    ids_to_fix = df_vlm[bad_row_mask]['sample_id'].tolist()

    # --- Step 3: Load the master scaffold to get the original catalog_content ---
    print("3. Loading master data to get the source catalog_content...")
    try:
        df_train = pd.read_csv(train_csv_path, usecols=['sample_id', 'catalog_content'])
        df_test = pd.read_csv(test_csv_path, usecols=['sample_id', 'catalog_content'])
        df_all_products = pd.concat([df_train, df_test]).drop_duplicates(subset=['sample_id'])
    except FileNotFoundError as e:
        print(f"FATAL ERROR: Could not find a master CSV file. {e}")
        return

    # Filter the master list to get ONLY the content for the rows we need to fix
    df_source_content = df_all_products[df_all_products['sample_id'].isin(ids_to_fix)]
    
    if df_source_content.empty:
        print("Warning: Could not find source content for the identified bad rows. Cannot proceed.")
        return

    # --- Step 4: Generate the clean, single-line replacements ---
    print(f"4. Generating {len(df_source_content)} single-line replacements...")
    # Create a dictionary mapping sample_id to its new, clean description
    replacements = {
        row['sample_id']: flatten_catalog_content(row['catalog_content'])
        for _, row in df_source_content.iterrows()
    }
    print("   ‚úì Replacements generated.")

    # --- Step 5: Update the original DataFrame and save ---
    print("5. Replacing bad rows and saving the final file...")
    # Set 'sample_id' as the index for efficient updating
    df_vlm.set_index('sample_id', inplace=True)
    
    # Create a pandas Series from the replacements dictionary
    replacement_series = pd.Series(replacements, name='smolvlm_description')
    
    # The .update() method will overwrite the values in df_vlm for the matching indices
    df_vlm.update(replacement_series)
    
    # Bring 'sample_id' back as a column
    df_vlm.reset_index(inplace=True)

    # Overwrite the old file with the now-repaired version
    df_vlm.to_csv(vlm_csv_path, index=False)

    print("\n--- ‚úÖ Process Complete ---")
    print(f"The file '{vlm_csv_path}' has been successfully repaired.")
    print(f"   ‚Üí Replaced {len(replacements)} messy descriptions with clean, single-line versions.")

if __name__ == '__main__':
    # Make sure this path is correct
    repair_and_fill_vlm_csv(vlm_csv_path=os.path.join('processed_data', 'smolvlm_extracted_features[1].csv'))

In [None]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm
import os
import base64
import json
from concurrent.futures import ThreadPoolExecutor, as_completed

# --- 1. API Interaction Function (with the new, simplified prompt) ---

def get_local_vlm_description(image_url: str) -> str:
    """
    Downloads an image, encodes it, and gets a structured description from a local
    OpenAI-compatible VLM server (like LM Studio).
    """
    api_url = "http://localhost:1234/v1/chat/completions"
    
    # --- THIS IS THE NEW, SIMPLIFIED PROMPT ---
    # It is designed for smaller models that struggle with complex instructions.
    # This "fill-in-the-blanks" format is much easier for them to follow.
    prompt_text = (
        "Analyze the image and extract the exact text visible in the image. "
        "Include all visible text.\n\n"
        "Brand: \n"
        "Product: \n"
        "Size/Quantity: \n"
        "Features: \n"
        "a short description of the product"
        "also give a short description of the features from the image and the product"
    )

    try:
        headers = {'User-Agent': 'Mozilla/5.0'}
        response = requests.get(image_url, timeout=20, headers=headers)
        response.raise_for_status()

        base64_image = base64.b64encode(response.content).decode('utf-8')

        payload = {
            "model": "smolvlm-256m-instruct",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt_text},
                        {
                            "type": "image_url",
                            "image_url": { "url": f"data:image/jpeg;base64,{base64_image}" }
                        }
                    ]
                }
            ],
            "max_tokens": 250,
            "temperature": 0.0 # Keep temperature at 0 for factual extraction
        }

        api_response = requests.post(api_url, headers={"Content-Type": "application/json"}, json=payload)
        api_response.raise_for_status()
        response_json = api_response.json()

        if 'choices' in response_json and len(response_json['choices']) > 0:
            content = response_json['choices'][0]['message']['content']
            # We combine the lines into a single string for easy processing later
            return " ".join(content.strip().splitlines())
        else:
            return ""

    except Exception:
        return ""

# --- Main execution block with Parallel Batch Processing and CHECKPOINTING ---
if __name__ == '__main__':
    print("--- Starting PARALLEL BATCH Feature Extraction with CHECKPOINTING ---")
    print("IMPORTANT: Ensure your LM Studio server is running.")

    # --- Configuration ---
    BATCH_SIZE = 20
    TRAIN_CSV_PATH = os.path.join('dataset', 'train.csv')
    TEST_CSV_PATH = os.path.join('dataset', 'test.csv')
    OUTPUT_DIR = 'processed_data'
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    
    # --- NEW: Define the checkpoint file path ---
    CHECKPOINT_FILE = os.path.join(OUTPUT_DIR, 'smolvlm_extracted_features[1].csv')

    print(f"Loading CSV files: {TRAIN_CSV_PATH} and {TEST_CSV_PATH}...")
    try:
        df_train = pd.read_csv(TRAIN_CSV_PATH)
        df_test = pd.read_csv(TEST_CSV_PATH)
    except FileNotFoundError as e:
        print(f"Error: {e}. Make sure your train.csv and test.csv are in the 'dataset' folder.")
        exit()

    df_all = pd.concat([
        df_train[['sample_id', 'image_link']],
        df_test[['sample_id', 'image_link']]
    ]).drop_duplicates(subset=['sample_id']).reset_index(drop=True)

    processed_ids = set()
    # --- NEW: Check if a checkpoint file exists and load it ---
    if os.path.exists(CHECKPOINT_FILE):
        print(f"Resuming from existing checkpoint file: {CHECKPOINT_FILE}")
        df_checkpoint = pd.read_csv(CHECKPOINT_FILE)
        processed_ids = set(df_checkpoint['sample_id'])
        print(f"Found {len(processed_ids)} products already processed.")
    else:
        print("No checkpoint file found. Starting a new run.")
        # Create an empty file with headers if it doesn't exist
        pd.DataFrame(columns=['sample_id', 'smolvlm_description']).to_csv(CHECKPOINT_FILE, index=False)

    # --- NEW: Filter out the products that have already been processed ---
    df_todo = df_all[~df_all['sample_id'].isin(processed_ids)]

    if df_todo.empty:
        print("All products have already been processed. Nothing to do.")
        exit()

    print(f"Found a total of {len(df_todo)} new products to process.")

    # --- Process the remaining items in parallel batches ---
    image_urls = df_todo['image_link'].tolist()
    sample_ids = df_todo['sample_id'].tolist()
    
    with ThreadPoolExecutor(max_workers=BATCH_SIZE) as executor:
        # Map futures to sample_ids to keep track of them
        future_to_id = {executor.submit(get_local_vlm_description, url): sid for url, sid in zip(image_urls, sample_ids)}
        
        results_to_append = []

        for future in tqdm(as_completed(future_to_id), total=len(image_urls), desc="Processing Batches"):
            sample_id = future_to_id[future]
            try:
                description = future.result()
                results_to_append.append({'sample_id': sample_id, 'smolvlm_description': description})

                # --- NEW: Save to CSV in batches to create checkpoints ---
                if len(results_to_append) >= BATCH_SIZE:
                    # Append the batch of results to the CSV file
                    pd.DataFrame(results_to_append).to_csv(CHECKPOINT_FILE, mode='a', header=False, index=False)
                    results_to_append = [] # Clear the batch
                    
            except Exception as exc:
                print(f'\nGenerated an exception for item {sample_id}: {exc}')
                results_to_append.append({'sample_id': sample_id, 'smolvlm_description': ""})

    # --- NEW: Save any remaining results after the loop finishes ---
    if results_to_append:
        pd.DataFrame(results_to_append).to_csv(CHECKPOINT_FILE, mode='a', header=False, index=False)

    print(f"\n‚úÖ Parallel Batch VLM feature extraction complete. All descriptions saved to '{CHECKPOINT_FILE}'")

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz, hstack
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import lightgbm as lgb
from sklearn.metrics import r2_score
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os

# --- 1. Re-usable Components ---

def smape(y_true, y_pred):
    """Calculates the Symmetric Mean Absolute Percentage Error (SMAPE)."""
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    ratio = np.where(denominator == 0, 0, numerator / denominator)
    return np.mean(ratio) * 100

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def clean_text(text):
    """A consistent cleaning function for both text sources."""
    if not isinstance(text, str): return ""
    text = re.sub(r'<.*?>|Value:.*|Unit:.*', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words and len(w) > 2]
    return ' '.join(lemmatized_words)

# --- 2. Load and Combine All Data Sources ---

def load_and_combine_all_features(data_dir='processed_data'):
    """
    Loads all preprocessed features and combines them.
    Returns the combined feature matrix and the original DataFrame with prices.
    """
    try:
        print("--- Loading All Preprocessed Data ---")
        
        X_text_struct = load_npz(f'{data_dir}/X_processed.npz')
        y_df = pd.read_csv(f'{data_dir}/y_processed.csv').set_index('sample_id')
        
        vlm_df = pd.read_csv(f'{data_dir}/smolvlm_extracted_features.csv').set_index('sample_id')
        vlm_df = vlm_df.reindex(y_df.index).fillna('')
        vlm_df['cleaned_vlm'] = vlm_df['smolvlm_description'].apply(clean_text)
        
        vlm_vectorizer = TfidfVectorizer(max_features=1500, ngram_range=(1, 2))
        X_vlm = vlm_vectorizer.fit_transform(vlm_df['cleaned_vlm'])
        
        X_images = np.load(f'{data_dir}/X_image_features.npy')
        all_ids_df = pd.read_csv(f'{data_dir}/all_sample_ids.csv').set_index('sample_id')
        image_features_df = pd.DataFrame(X_images, index=all_ids_df.index)
        X_images_aligned = image_features_df.reindex(y_df.index).values
        
        X_final_combined = hstack([X_text_struct, X_vlm, X_images_aligned]).tocsr()

        print(f"‚úÖ Data loaded and combined successfully! Final shape: {X_final_combined.shape}")
        
        # We need the original dataframe with IDs and prices for splitting and scoring
        full_df = pd.read_csv(f'{data_dir}/y_processed.csv')
        print("Vectorizing VLM text...")
        vlm_vectorizer = TfidfVectorizer(max_features=1500, ngram_range=(1, 2))
        # This line was changed: we fit_transform on the training data
        X_vlm = vlm_vectorizer.fit_transform(vlm_df['cleaned_vlm'])
        
        # --- ADD THIS LINE ---
        joblib.dump(vlm_vectorizer, f'{data_dir}/vlm_vectorizer.joblib')
        print("Saved VLM TF-IDF vectorizer.")
        return X_final_combined, full_df
        
    except FileNotFoundError as e:
        print(f"Error loading data: {e}. Please ensure you have run all three preprocessing scripts first.")
        return None, None

# --- 3. Full Train, Predict, and Evaluate Pipeline ---

def run_full_pipeline(X, df):
    """
    Trains on a subset, predicts on a holdout, saves a submission file, and scores it.
    """
    if X is None or df is None: return

    print("\n--- Simulating Competition Workflow ---")
    
    # --- Step 1: Split the data into a training set and a "pretend" test set ---
    # We split based on the dataframe to keep track of IDs
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
    
    # Get the corresponding rows from the sparse matrix X
    X_train = X[train_df.index]
    y_train = train_df['price_log']
    
    X_test = X[test_df.index]
    # We will use this later to score our predictions
    y_test_ground_truth_log = test_df['price_log'] 

    print(f"Training on {len(train_df)} samples, predicting on {len(test_df)} samples.")

    # --- Step 2: Train the model ONLY on the training set ---
    print("\nTraining a LightGBM Regressor model...")
    lgbm = lgb.LGBMRegressor(
        objective='regression_l1', metric='mae', n_estimators=1000,
        learning_rate=0.05, num_leaves=31, random_state=42, n_jobs=-1
    )
    lgbm.fit(X_train, y_train, eval_set=[(X_test, y_test_ground_truth_log)], callbacks=[lgb.early_stopping(50, verbose=False)])
    print("Model training complete.")

    # --- Step 3: Make predictions on the "pretend" test set ---
    print("Making predictions on the holdout test set...")
    predictions_log = lgbm.predict(X_test)
    
    # Inverse transform to get actual prices
    predictions_actual = np.expm1(predictions_log)
    predictions_actual[predictions_actual < 0] = 0 # Enforce non-negative constraint

    # --- Step 4: Create and save the submission CSV file ---
    submission_df = pd.DataFrame({
        'sample_id': test_df['sample_id'],
        'price': predictions_actual
    })
    
    submission_path = 'sample_submission.csv'
    submission_df.to_csv(submission_path, index=False)
    print(f"‚úÖ Submission file created at: '{submission_path}'")

    # --- Step 5: Score the submission file against the ground truth ---
    print("\n--- Scoring Submission File ---")
    
    # Load the ground truth from the original file
    ground_truth_df = pd.read_csv('68e8d1d70b66d_student_resource/student_resource/dataset/sample_test_out.csv')
    
    # Merge our predictions with the ground truth
    # This mimics exactly how the competition leaderboard is calculated
    merged_score_df = pd.merge(submission_df, ground_truth_df, on='sample_id', suffixes=('_pred', '_true'))
    
    if len(merged_score_df) != len(test_df):
        print("Warning: The number of predictions does not match the test set size!")

    # Calculate the final SMAPE score
    final_smape_score = smape(merged_score_df['price_true'], merged_score_df['price_pred'])
    
    print("\n--- FINAL SIMULATED SCORE ---")
    print(f"SMAPE against sample_test_out.csv: {final_smape_score:.4f}%")
    
    return lgbm

# --- Main execution block ---
if __name__ == '__main__':
    # Load all features and the corresponding dataframe with IDs and prices
    X_final, df_final = load_and_combine_all_features()
    
    # Run the full simulation
    final_model = run_full_pipeline(X_final, df_final)
    
    if final_model:
        joblib.dump(final_model, 'final_lgbm_model_for_submission.joblib')
        print("\n‚úÖ Final trained model saved as 'final_lgbm_model_for_submission.joblib'")
        print("You can use this model to predict on the real 'test.csv' for the competition.")

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
import joblib
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
import os # Added for path joining
from sklearn.metrics import r2_score # Added for R-squared score

# --- 1. Load All Necessary Artifacts and Helper Functions ---
print("--- Loading Artifacts for Prediction ---")

# --- Helper Functions (must be identical to training) ---
# NOTE: The SMAPE function is now defined here for scoring
def smape(y_true, y_pred):
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    ratio = np.where(denominator == 0, 0, numerator / denominator)
    return np.mean(ratio) * 100

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def clean_text(text):
    if not isinstance(text, str): return ""
    text = re.sub(r'<.*?>|Value:.*|Unit:.*', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words and len(w) > 2]
    return ' '.join(lemmatized_words)

def extract_structured_data(text):
    if not isinstance(text, str): return pd.Series([np.nan, 'unknown'], index=['Value', 'Unit'])
    value_match = re.search(r'Value:\s*([\d\.]+)', text)
    value = float(value_match.group(1)) if value_match else np.nan
    unit_match = re.search(r'Unit:\s*(\w+)', text)
    unit = unit_match.group(1) if unit_match else 'unknown'
    return pd.Series([value, unit], index=['Value', 'Unit'])

def standardize_units(unit):
    if not isinstance(unit, str): return 'unknown'
    unit = unit.lower().strip()
    unit_map = {'oz': 'ounce', 'ounces': 'ounce', 'fl oz': 'fl_oz', 'fz': 'fl_oz', 'ct': 'count', 'none': 'unknown'}
    return unit_map.get(unit, unit)

# --- Load Saved Objects ---
try:
    PREPROCESSOR_PATH = 'processed_data/preprocessor.joblib'
    VLM_VECTORIZER_PATH = 'processed_data/vlm_vectorizer.joblib'
    MODEL_PATH = 'final_lgbm_model_for_submission.joblib'
    
    preprocessor = joblib.load(PREPROCESSOR_PATH)
    vlm_vectorizer = joblib.load(VLM_VECTORIZER_PATH)
    model = joblib.load(MODEL_PATH)
    print("‚úÖ All models and preprocessors loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading artifact: {e}")
    print("Please ensure you have run the full training pipeline on the complete dataset first.")
    exit()

# --- 2. Feature Engineering Function for New Data (using robust reindexing) ---
def create_features_for_test_data(df, data_dir='processed_data'):
    """
    Applies the complete feature engineering pipeline to new, unseen test data.
    """
    print("\n--- Applying Feature Engineering to Test Data ---")
    
    df_indexed = df.set_index('sample_id')

    print("Processing catalog content...")
    extracted_data = df['catalog_content'].apply(extract_structured_data)
    df_features = pd.concat([df, extracted_data], axis=1)
    df_features['cleaned_catalog'] = df_features['catalog_content'].apply(clean_text)
    df_features['Unit_standardized'] = df_features['Unit'].apply(standardize_units)
    X_text_struct = preprocessor.transform(df_features)

    print("Processing VLM text features...")
    vlm_df_all = pd.read_csv(os.path.join(data_dir, 'smolvlm_extracted_features.csv')).set_index('sample_id')
    vlm_df_aligned = vlm_df_all.reindex(df_indexed.index).fillna('')
    vlm_df_aligned['cleaned_vlm'] = vlm_df_aligned['smolvlm_description'].apply(clean_text)
    X_vlm = vlm_vectorizer.transform(vlm_df_aligned['cleaned_vlm'])

    print("Processing visual features...")
    all_image_features = np.load(os.path.join(data_dir, 'X_image_features.npy'))
    all_ids_df = pd.read_csv(os.path.join(data_dir, 'all_sample_ids.csv')).set_index('sample_id')
    image_features_df = pd.DataFrame(all_image_features, index=all_ids_df.index)
    image_features_aligned = image_features_df.reindex(df_indexed.index)
    
    if image_features_aligned.isnull().values.any():
        print("Warning: Missing visual features for some IDs. Imputing with mean.")
        image_features_aligned = image_features_aligned.fillna(image_features_aligned.mean())
    X_images_test = image_features_aligned.values

    print("Combining all feature sets...")
    X_test_final = hstack([X_text_struct, X_vlm, X_images_test]).tocsr()
    
    if X_text_struct.shape[0] != X_images_test.shape[0]:
         raise ValueError(f"Row count mismatch! Text features: {X_text_struct.shape[0]}, Image features: {X_images_test.shape[0]}")
    
    print(f"‚úÖ Final test feature matrix created with shape: {X_test_final.shape}")
    return X_test_final

# --- 3. Main Prediction & Evaluation Block ---
if __name__ == '__main__':
    # --- Define file paths for the sample data evaluation ---
    # We will PREDICT on `sample_test.csv` and SCORE against `sample_test_out.csv`
    TEST_DATA_PATH = '68e8d1d70b66d_student_resource\student_resource\dataset\sample_test.csv'
    GROUND_TRUTH_PATH = '68e8d1d70b66d_student_resource\student_resource\dataset\sample_test_out.csv'
    SUBMISSION_FILE_PATH = 'sample_test_prediction_output.csv' # Give it a different name
    
    print(f"\nLoading SAMPLE test data from: {TEST_DATA_PATH}")
    test_df = pd.read_csv(TEST_DATA_PATH)
    
    # --- Create features for the test data ---
    X_test = create_features_for_test_data(test_df.copy())
    
    # --- Make Predictions ---
    print("\nMaking final predictions on sample data...")
    predictions_log = model.predict(X_test)
    
    predictions_actual = np.expm1(predictions_log)
    predictions_actual[predictions_actual < 0] = 0

    # --- Create and Save the Prediction File ---
    submission_df = pd.DataFrame({
        'sample_id': test_df['sample_id'],
        'price': predictions_actual
    })
    submission_df.to_csv(SUBMISSION_FILE_PATH, index=False)
    print(f"Prediction file for sample data generated at: '{SUBMISSION_FILE_PATH}'")
    
    # --- NEW: Score the predictions against the ground truth ---
    print("\n--- Scoring Predictions Against Ground Truth ---")
    try:
        ground_truth_df = pd.read_csv(GROUND_TRUTH_PATH)
        
        # Merge our predictions with the true prices
        merged_score_df = pd.merge(submission_df, ground_truth_df, on='sample_id', suffixes=('_pred', '_true'))
        
        if len(merged_score_df) != len(test_df):
            print("Warning: Not all sample IDs could be scored.")

        # Calculate and print the final scores
        final_smape_score = smape(merged_score_df['price_true'], merged_score_df['price_pred'])
        final_r2_score = r2_score(merged_score_df['price_true'], merged_score_df['price_pred'])

        print("\n--- FINAL PERFORMANCE ON SAMPLE DATA ---")
        print(f"SMAPE Score: {final_smape_score:.4f}%")
        print(f"R-squared (R¬≤): {final_r2_score:.4f}")

    except FileNotFoundError:
        print(f"\nCould not find ground truth file at '{GROUND_TRUTH_PATH}'. Skipping scoring.")
        
    print(f"\n\nüéâ SUCCESS! üéâ")
    print("The script has successfully generated predictions and scored them.")
    print("To generate the REAL submission, change TEST_DATA_PATH to 'dataset/test.csv' and remove the scoring section.")

extract_image_features.py

In [None]:
import torch
import torchvision.models as models
from torchvision import transforms
from PIL import Image
import requests
from io import BytesIO
import numpy as np
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import os
import time
import warnings
warnings.filterwarnings('ignore')

# ==========================================
# GPU SETUP
# ==========================================

def setup_device():
    """Setup PyTorch device with optimization."""
    if torch.cuda.is_available():
        device = torch.device('cuda')
        gpu_name = torch.cuda.get_device_name(0)
        gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9

        print(f"‚úÖ GPU: {gpu_name}")
        print(f"   Memory: {gpu_memory:.1f} GB")
        print(f"   CUDA: {torch.version.cuda}")

        # Optimization for RTX 4060
        torch.backends.cudnn.benchmark = True  # Auto-tune convolutions
        torch.cuda.empty_cache()

    else:
        device = torch.device('cpu')
        print("‚ö†Ô∏è  Using CPU")

    return device

# ==========================================
# MODEL SETUP
# ==========================================

def setup_model(device):
    """Load ResNet50 optimized for inference."""
    print("Loading ResNet50...")

    # Load pretrained model
    model = models.resnet50(weights='IMAGENET1K_V2')

    # Remove classification head (keep features only)
    model = torch.nn.Sequential(*list(model.children())[:-1])

    # Move to GPU and set to eval mode
    model = model.to(device)
    model.eval()

    # Enable mixed precision for faster inference (RTX 4060 supports this)
    if device.type == 'cuda':
        model = model.half()  # FP16 for 2x speedup
        print("‚úÖ Model loaded with FP16 optimization (2048D features)")
    else:
        print("‚úÖ Model loaded (2048D features)")

    return model

# ==========================================
# IMAGE PREPROCESSING
# ==========================================

transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

def download_image(url, max_retries=3):
    """Download and preprocess image with retries."""
    for attempt in range(max_retries):
        try:
            response = requests.get(
                url,
                timeout=12,
                headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
            )
            response.raise_for_status()

            # Load and convert to RGB
            img = Image.open(BytesIO(response.content)).convert('RGB')

            # Apply transforms
            return transform(img)

        except Exception:
            if attempt < max_retries - 1:
                time.sleep(0.3)
            else:
                return None

    return None

# ==========================================
# BATCH PROCESSING
# ==========================================

def process_batch(urls, sample_ids, model, device, max_workers=20):
    """Process batch with parallel downloads + GPU inference."""

    # Parallel download
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        images = list(executor.map(download_image, urls))

    # Separate valid/invalid
    valid_images = []
    valid_indices = []

    for i, img in enumerate(images):
        if img is not None:
            valid_images.append(img)
            valid_indices.append(i)

    # Initialize with NaN
    features = np.full((len(urls), 2048), np.nan, dtype=np.float32)

    # GPU inference
    if valid_images:
        batch_tensor = torch.stack(valid_images).to(device)

        # Use FP16 if GPU
        if device.type == 'cuda':
            batch_tensor = batch_tensor.half()

        with torch.no_grad():
            batch_features = model(batch_tensor)
            batch_features = batch_features.squeeze(-1).squeeze(-1)
            batch_features = batch_features.float().cpu().numpy()  # Back to FP32

        # Assign features
        for i, idx in enumerate(valid_indices):
            features[idx] = batch_features[i]

    success = [img is not None for img in images]

    return sample_ids, features, success

# ==========================================
# CHECKPOINTING
# ==========================================

class Checkpoint:
    """Incremental saving with resume capability."""

    def __init__(self, output_dir):
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

        self.features_file = os.path.join(output_dir, 'X_image_features.npy')
        self.ids_file = os.path.join(output_dir, 'all_sample_ids.csv')

    def load_progress(self):
        """Get already processed IDs."""
        if os.path.exists(self.ids_file):
            df = pd.read_csv(self.ids_file)
            processed = set(df['sample_id'].values)
            print(f"üìÇ Resuming: {len(processed)} processed")
            return processed
        return set()

    def save_batch(self, sample_ids, features):
        """Append batch."""
        # Features
        if os.path.exists(self.features_file):
            existing = np.load(self.features_file)
            combined = np.vstack([existing, features])
        else:
            combined = features

        np.save(self.features_file, combined)

        # IDs
        pd.DataFrame({'sample_id': sample_ids}).to_csv(
            self.ids_file,
            mode='a',
            header=not os.path.exists(self.ids_file),
            index=False
        )

# ==========================================
# MAIN PIPELINE
# ==========================================

def extract_features(
    train_csv='dataset/train.csv',
    test_csv='dataset/test.csv',
    output_dir='processed_data',
    batch_size=96,
    max_workers=20
):
    """Main extraction pipeline."""

    print("="*70)
    print("Amazon ML Challenge 2025 - Image Feature Extraction")
    print("="*70)

    # Setup
    device = setup_device()
    model = setup_model(device)
    checkpoint = Checkpoint(output_dir)

    # Load data
    print(f"\nüìÇ Loading data...")
    df_train = pd.read_csv(train_csv)
    df_test = pd.read_csv(test_csv)

    df_all = pd.concat([
        df_train[['sample_id', 'image_link']],
        df_test[['sample_id', 'image_link']]
    ]).drop_duplicates(subset=['sample_id']).reset_index(drop=True)

    print(f"   Total: {len(df_all)}")

    # Resume
    processed = checkpoint.load_progress()
    df_todo = df_all[~df_all['sample_id'].isin(processed)].reset_index(drop=True)

    if df_todo.empty:
        print("\n‚úÖ All processed!")
        return

    print(f"   Remaining: {len(df_todo)}\n")

    # Process
    num_batches = (len(df_todo) + batch_size - 1) // batch_size

    print(f"üöÄ Processing batches of {batch_size} (ETA: ~{num_batches*2.5/60:.0f} min)")
    print(f"   Workers: {max_workers}\n")

    start = time.time()
    stats = {'success': 0, 'failed': 0}

    pbar = tqdm(range(0, len(df_todo), batch_size), total=num_batches, desc="Progress")

    for i in pbar:
        batch_df = df_todo.iloc[i:i + batch_size]

        batch_ids, batch_features, success = process_batch(
            batch_df['image_link'].tolist(),
            batch_df['sample_id'].tolist(),
            model, device, max_workers
        )

        checkpoint.save_batch(batch_ids, batch_features)

        # Stats
        stats['success'] += sum(success)
        stats['failed'] += len(success) - sum(success)

        # Update progress bar
        total = stats['success'] + stats['failed']
        rate = stats['success'] / total * 100 if total > 0 else 0
        speed = total / (time.time() - start)
        pbar.set_postfix({
            'Success': f"{rate:.1f}%",
            'Speed': f"{speed:.1f} img/s"
        })

        # Clear GPU cache periodically
        if i % 50 == 0 and device.type == 'cuda':
            torch.cuda.empty_cache()

    elapsed = time.time() - start

    # Summary
    print(f"\n{'='*70}")
    print("‚úÖ Extraction Complete!")
    print(f"{'='*70}")
    total = stats['success'] + stats['failed']
    print(f"Total: {total:,}")
    print(f"Success: {stats['success']:,} ({stats['success']/total*100:.1f}%)")
    print(f"Failed: {stats['failed']:,}")
    print(f"Time: {elapsed/60:.1f} min")
    print(f"Speed: {total/elapsed:.1f} images/sec")

    # Impute
    print(f"\nüîß Final processing...")
    features = np.load(checkpoint.features_file)

    if np.isnan(features).any():
        n_missing = np.isnan(features).any(axis=1).sum()
        print(f"   Imputing {n_missing:,} samples...")

        col_means = np.nanmean(features, axis=0)
        for col in range(features.shape[1]):
            mask = np.isnan(features[:, col])
            features[mask, col] = col_means[col]

        np.save(checkpoint.features_file, features)

    print(f"\n‚úÖ Saved: {checkpoint.features_file}")
    print(f"   Shape: {features.shape}")
    print(f"{'='*70}\n")

# ==========================================
# RUN
# ==========================================

if __name__ == '__main__':
    CONFIG = {
        'train_csv': 'dataset/train.csv',
        'test_csv': 'dataset/test.csv',
        'output_dir': 'processed_data',
        'batch_size': 96,      # Optimized for RTX 4060
        'max_workers': 20      # Max parallel downloads
    }

    extract_features(**CONFIG)


In [None]:
!pip install optuna joblib scipy 

dilkash

In [None]:
!pip install lightgbm --config-settings=cmake.define.USE_GPU=ON


In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold
from scipy.sparse import hstack
import joblib
import os
import warnings
import time
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm

# Initialize tqdm for pandas apply functions
tqdm.pandas(desc="Applying")

warnings.filterwarnings('ignore')

# --- 1. Configuration & Setup ---
CONFIG = {
    'n_folds': 5, # Start with 5 for faster iteration, can increase for final run
    'random_state': 42,
    'st_batch_size': 256, # Batch size for Sentence Transformer encoding
}

# --- GPU Check ---
if torch.cuda.is_available():
    DEVICE = 'cuda'
    print(f"‚úÖ GPU found: {torch.cuda.get_device_name(0)}. Training will be GPU-accelerated.")
else:
    DEVICE = 'cpu'
    print("‚ö†Ô∏è No GPU found. Training will run on CPU.")

# Download NLTK data
nltk.download('stopwords', quiet=True); nltk.download('wordnet', quiet=True)

# --- 2. Helper Functions (Advanced Regex and Feature Creation) ---
lemmatizer = WordNetLemmatizer(); stop_words = set(stopwords.words('english'))
# (All helper functions from your provided script are excellent and remain here)
# ... [Keeping all your great helper functions: measurement_patterns, extract_from_text_with_patterns, etc.] ...
def smape(y_true, y_pred):
    y_true_actual = np.expm1(y_true); y_pred_actual = np.expm1(y_pred)
    numerator = np.abs(y_pred_actual - y_true_actual); denominator = (np.abs(y_true_actual) + np.abs(y_pred_actual)) / 2
    ratio = np.where(denominator == 0, 0, numerator / denominator); return np.mean(ratio) * 100
measurement_patterns = {'weight': {'regex': r'(?:weight|net weight|gross weight|capacity|pack weight)\s*[:\-\(\[]?\s*(\d+\.?\d*)\s*(kg|g|lbs|lb|oz|ounces|gram|kilogram)\b', 'units': {'kg': 1000, 'g': 1, 'lbs': 453.592, 'lb': 453.592, 'oz': 28.3495, 'ounces': 28.3495, 'gram': 1, 'kilogram': 1000}, 'default_unit': 'g'}, 'volume': {'regex': r'(?:volume|liquid|capacity|ml|l|liter|fl oz|floz|fluid ounce)\s*[:\-\(\[]?\s*(\d+\.?\d*)\s*(ml|l|liter|fl\s*oz|floz|fluid\s*ounce)\b', 'units': {'ml': 1, 'l': 1000, 'liter': 1000, 'fl oz': 29.5735, 'floz': 29.5735, 'fluid ounce': 29.5735}, 'default_unit': 'ml'}, 'dimension': {'regex': r'(?:dimension|size|length|width|height|depth)\s*[:\-\(\[]?\s*(\d+\.?\d*)\s*(cm|m|inch|in|ft|foot|meter|centimeter)\b', 'units': {'cm': 1, 'm': 100, 'inch': 2.54, 'in': 2.54, 'ft': 30.48, 'foot': 30.48, 'meter': 100, 'centimeter': 1}, 'default_unit': 'cm'}, 'wattage': {'regex': r'(?:power|wattage|watts)\s*[:\-\(\[]?\s*(\d+\.?\d*)\s*(w|watt|watts|kw|kilowatt)\b', 'units': {'w': 1, 'watt': 1, 'watts': 1, 'kw': 1000, 'kilowatt': 1000}, 'default_unit': 'w'}, 'count': {'regex': r'(\d+)\s*(?:pack|pcs|pieces|units|ct|count)\b', 'units': {'pack': 1, 'pcs': 1, 'pieces': 1, 'units': 1, 'ct': 1, 'count': 1}, 'default_unit': 'count'}, 'percentage': {'regex': r'(\d+\.?\d*)\s*%', 'units': {'%': 1}, 'default_unit': '%'}, 'value_and_unit': {'regex': r'value\s*[:\-\(\[]?\s*([\d\.]+)\s*unit\s*[:\-\(\[]?\s*(\w+)', 'units': {}, 'default_unit': 'unknown'}, 'general_number_unit': {'regex': r'(\d+\.?\d*)\s*([a-zA-Z]{1,5})\b', 'units': {}, 'default_unit': 'unknown'}}
def extract_from_text_with_patterns(text, pattern_type):
    if not isinstance(text, str): return np.nan, 'unknown'
    text_lower = text.lower(); pattern_info = measurement_patterns.get(pattern_type)
    if not pattern_info: return np.nan, 'unknown'
    regex = pattern_info['regex']; matches = re.findall(regex, text_lower)
    if matches:
        try: value = float(matches[0][0]); unit = matches[0][1]; return value, unit
        except (ValueError, IndexError): return np.nan, 'unknown'
    return np.nan, 'unknown'
def extract_structured_data_advanced(text):
    if not isinstance(text, str):
        default_vals = {'extracted_value': np.nan, 'extracted_unit_value': np.nan, 'extracted_weight': np.nan, 'extracted_volume': np.nan, 'extracted_dimension': np.nan, 'extracted_wattage': np.nan, 'extracted_count': np.nan, 'extracted_percentage': np.nan, 'extracted_unit': 'unknown', 'extracted_weight_unit': 'unknown', 'extracted_volume_unit': 'unknown', 'extracted_dimension_unit': 'unknown', 'extracted_general_num_val': np.nan, 'extracted_general_num_unit': 'unknown'}
        return pd.Series(default_vals)
    text_lower = text.lower(); value, unit = extract_from_text_with_patterns(text, 'value_and_unit'); weight, weight_unit = extract_from_text_with_patterns(text, 'weight'); volume, volume_unit = extract_from_text_with_patterns(text, 'volume'); dimension, dimension_unit = extract_from_text_with_patterns(text, 'dimension'); wattage, wattage_unit = extract_from_text_with_patterns(text, 'wattage'); count, count_unit = extract_from_text_with_patterns(text, 'count'); percentage, percentage_unit = extract_from_text_with_patterns(text, 'percentage'); general_num_val, general_num_unit = extract_from_text_with_patterns(text, 'general_number_unit')
    return pd.Series({'extracted_value': value, 'extracted_unit_value': float(re.search(r'(\d+\.?\d*)', text_lower).group(1)) if re.search(r'(\d+\.?\d*)', text_lower) else np.nan, 'extracted_weight': weight, 'extracted_volume': volume, 'extracted_dimension': dimension, 'extracted_wattage': wattage, 'extracted_count': count, 'extracted_percentage': percentage, 'extracted_unit': unit, 'extracted_weight_unit': weight_unit, 'extracted_volume_unit': volume_unit, 'extracted_dimension_unit': dimension_unit, 'extracted_general_num_val': general_num_val, 'extracted_general_num_unit': general_num_unit})
def clean_text_improved(text):
    if not isinstance(text, str): return ""
    text = re.sub(r'Value:\s*[\d\.]+\s*Unit:\s*\w+', ' ', text, flags=re.IGNORECASE); text = re.sub(r'Brand:.*?Product:', ' ', text, flags=re.IGNORECASE | re.DOTALL); text = re.sub(r'Size/Quantity:.*?Features:', ' ', text, flags=re.IGNORECASE | re.DOTALL)
    text = re.sub(r'\b\d+\.?\d*\s*(?:kg|g|lbs?|oz|ml|l|liter|fl\s?oz|floz|cm|m|inch|in|ft|foot|w|watt|watts|kw|pack|pcs|pieces|units|ct|count|%)?\b', ' ', text, flags=re.IGNORECASE); text = re.sub(r'\b(?:xl|l|m|s|xs|large|medium|small|extra large|extra small)\b', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'<.*?>', ' ', text); text = re.sub(r'[^a-zA-Z\s]', ' ', text).lower(); words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words and len(w) > 1]; return ' '.join(lemmatized_words)
def create_better_features(df):
    features = pd.DataFrame(index=df.index); features['catalog_length'] = df['catalog_content'].str.len().fillna(0)
    if 'smolvlm_description' in df.columns: features['vlm_length'] = df['smolvlm_description'].str.len().fillna(0)
    features['catalog_word_count'] = df['catalog_content'].str.split().str.len().fillna(0)
    if 'smolvlm_description' in df.columns: features['vlm_word_count'] = df['smolvlm_description'].str.split().str.len().fillna(0)
    features['catalog_num_count'] = df['catalog_content'].apply(lambda x: len(re.findall(r'\d+\.?\d*', str(x)))).fillna(0)
    if 'smolvlm_description' in df.columns: features['vlm_num_count'] = df['smolvlm_description'].apply(lambda x: len(re.findall(r'\d+\.?\d*', str(x)))).fillna(0)
    features['catalog_numeric_ratio'] = df['catalog_content'].astype(str).apply(lambda x: sum(c.isdigit() for c in x) / len(x) if len(x) > 0 else 0)
    if 'smolvlm_description' in df.columns: features['vlm_numeric_ratio'] = df['smolvlm_description'].astype(str).apply(lambda x: sum(c.isdigit() for c in x) / len(x) if len(x) > 0 else 0)
    keywords = ['electronic', 'furniture', 'apparel', 'food', 'book', 'toy', 'tool', 'home', 'garden', 'kitchen', 'beauty', 'health', 'automotive']
    for kw in keywords:
        features[f'has_kw_{kw}'] = df['cleaned_catalog'].apply(lambda x: 1 if kw in x else 0)
        if 'cleaned_vlm' in df.columns: features[f'vlm_has_kw_{kw}'] = df['cleaned_vlm'].apply(lambda x: 1 if kw in x else 0)
    if 'has_vlm_data' in df.columns and 'has_kw_electronic' in features.columns: features['vlm_electronic_interaction'] = df['has_vlm_data'] * features['has_kw_electronic']
    features['has_extracted_weight'] = df['extracted_weight'].notna().astype(int); features['has_extracted_volume'] = df['extracted_volume'].notna().astype(int); features['has_extracted_dimension'] = df['extracted_dimension'].notna().astype(int); features['has_extracted_wattage'] = df['extracted_wattage'].notna().astype(int); features['has_extracted_count'] = df['extracted_count'].notna().astype(int); features['has_extracted_percentage'] = df['extracted_percentage'].notna().astype(int)
    return features

# --- 3. Main Feature Engineering and Training Pipeline ---
def run_complete_pipeline():
    """Main pipeline to load, feature engineer, and train models on the full dataset."""
    OUTPUT_DIR = 'processed_data'
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    
    # --- Load FULL training data ---
    try:
        print("Loading full training data..."); train_df = pd.read_csv(os.path.join('dataset', 'train.csv'))
        print(f"‚úÖ Loaded {len(train_df)} training samples.")
    except FileNotFoundError:
        print("Error: dataset/train.csv not found!"); return

    # --- VLM Feature Loading ---
    try:
        print("Loading VLM features..."); vlm_df = pd.read_csv(os.path.join(OUTPUT_DIR, 'smolvlm_extracted_features.csv'))
        merged_df = train_df.merge(vlm_df, on='sample_id', how='left')
        merged_df['has_vlm_data'] = ~merged_df['smolvlm_description'].isna() & (merged_df['smolvlm_description'].str.strip() != '')
    except FileNotFoundError:
        print("‚ö†Ô∏è VLM features not found, proceeding without them."); merged_df = train_df.copy()
        merged_df['has_vlm_data'] = False; merged_df['smolvlm_description'] = ""
    
    features_df = merged_df.copy()
    
    # --- On-the-fly Feature Engineering with Progress Bars ---
    print("\n--- Starting Feature Engineering on Full Dataset ---")
    print("1. Performing advanced structured data extraction..."); extracted_data = features_df['catalog_content'].progress_apply(extract_structured_data_advanced)
    features_df = pd.concat([features_df.reset_index(drop=True), extracted_data], axis=1)
    
    print("2. Cleaning text features..."); features_df['cleaned_catalog'] = features_df['catalog_content'].progress_apply(clean_text_improved)
    features_df['cleaned_vlm'] = features_df['smolvlm_description'].progress_apply(clean_text_improved)
    
    print("3. Creating meta-features..."); additional_features = create_better_features(features_df)
    features_df = pd.concat([features_df, additional_features], axis=1)
    
    features_df['combined_text'] = features_df.apply(
        lambda x: x['cleaned_vlm'] if x['has_vlm_data'] else x['cleaned_catalog'], axis=1
    )
    
    # --- Sentence Transformer Embeddings ---
    print("4. Generating Sentence Transformer embeddings...")
    st_model = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)
    catalog_embeddings = st_model.encode(features_df['cleaned_catalog'].tolist(), batch_size=CONFIG['st_batch_size'], show_progress_bar=True)
    vlm_embeddings = st_model.encode(features_df['cleaned_vlm'].tolist(), batch_size=CONFIG['st_batch_size'], show_progress_bar=True)
    
    # --- Prepare for Preprocessing ---
    y = np.log1p(features_df['price'].values)
    cols_to_drop = ['price', 'sample_id', 'image_link', 'catalog_content', 'smolvlm_description', 'cleaned_catalog', 'cleaned_vlm']
    features_df.drop(columns=[c for c in cols_to_drop if c in features_df.columns], inplace=True)
    
    # --- Build Preprocessing Pipeline ---
    print("5. Building preprocessing pipeline...");
    numeric_features = [c for c in features_df.columns if features_df[c].dtype in ['int64', 'float64', 'bool']]
    categorical_features = [c for c in features_df.columns if features_df[c].dtype == 'object' and c != 'combined_text']
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), numeric_features),
            ('cat', Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='unknown')), ('onehot', OneHotEncoder(handle_unknown='ignore', max_categories=50))]), categorical_features),
            ('text', TfidfVectorizer(max_features=3000, ngram_range=(1, 2), min_df=5, max_df=0.7), 'combined_text')
        ], remainder='drop', n_jobs=-1)
    
    print("6. Fitting preprocessor and transforming data...");
    X_base = preprocessor.fit_transform(features_df)
    
    # --- Final Combination of ALL Features ---
    X_final = hstack([X_base, catalog_embeddings, vlm_embeddings]).tocsr()
    print(f"   ‚úì Final training data shape: {X_final.shape}")

    # --- Train Ensemble with Cross-Validation ---
    print("\n" + "="*50); print("üöÄ Starting GPU-Accelerated Ensemble Training üöÄ"); print(f"   Folds: {CONFIG['n_folds']}, Device: {DEVICE.upper()}"); print("="*50)
    
    kf = KFold(n_splits=CONFIG['n_folds'], shuffle=True, random_state=CONFIG['random_state'])
    lgb_params = {'objective': 'regression_l1', 'metric': 'mae', 'random_state': CONFIG['random_state'], 'n_estimators': 4000, 'learning_rate': 0.01, 'num_leaves': 50, 'subsample': 0.7, 'colsample_bytree': 0.7, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'n_jobs': -1, 'device': DEVICE}
    xgb_params = {'objective': 'reg:squarederror', 'eval_metric': 'mae', 'seed': CONFIG['random_state'], 'n_estimators': 4000, 'learning_rate': 0.01, 'max_depth': 8, 'subsample': 0.7, 'colsample_bytree': 0.7, 'tree_method': 'hist', 'device': DEVICE, 'early_stopping_rounds': 200, 'n_jobs': -1}
    cat_params = {'loss_function': 'MAE', 'eval_metric': 'MAE', 'random_seed': CONFIG['random_state'], 'iterations': 4000, 'learning_rate': 0.05, 'depth': 8, 'verbose': 0, 'task_type': 'GPU' if DEVICE == 'cuda' else 'CPU'}

    model_configs = {
        'lgbm': {'model': lgb.LGBMRegressor(**lgb_params), 'preds': np.zeros(len(y))},
        'xgb': {'model': xgb.XGBRegressor(**xgb_params), 'preds': np.zeros(len(y))},
        'cat': {'model': cb.CatBoostRegressor(**cat_params), 'preds': np.zeros(len(y))}
    }
    oof_scores = {name: [] for name in model_configs}

    for fold, (train_idx, val_idx) in enumerate(tqdm(kf.split(X_final, y), total=CONFIG['n_folds'], desc="Overall CV Progress")):
        X_train, X_val = X_final[train_idx], X_final[val_idx]; y_train, y_val = y[train_idx], y[val_idx]
        for name, config in model_configs.items():
            print(f"  - Training {name} on Fold {fold+1}..."); start_time = time.time()
            try:
                if name == 'lgbm': config['model'].fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(200, verbose=False)])
                elif name == 'xgb': config['model'].fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
                else: config['model'].fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=200, verbose=False)
            except Exception as e:
                if name == 'lgbm' and "CUDA" in str(e): print("    ‚ö†Ô∏è LGBM GPU failed, falling back to CPU..."); cpu_params = lgb_params.copy(); del cpu_params['device']; config['model'] = lgb.LGBMRegressor(**cpu_params); config['model'].fit(X_train, y_train, eval_set=[(X_val, y_val)], callbacks=[lgb.early_stopping(200, verbose=False)])
                else: print(f"    ‚ö†Ô∏è Error training {name}: {e}. Skipping."); continue
            
            val_preds = config['model'].predict(X_val); config['preds'][val_idx] = val_preds
            fold_smape = smape(y, val_preds); oof_scores[name].append(fold_smape); elapsed = time.time() - start_time
            print(f"    ‚úì {name} Fold {fold+1} SMAPE: {fold_smape:.4f}% ({elapsed:.1f}s)")
            
    print("\n" + "="*50); print("üìä Overall Cross-Validation Results (OOF)"); print("="*50)
    for name, scores in oof_scores.items():
        if scores: print(f"{name.upper():<6} | Mean SMAPE: {np.mean(scores):.4f}% (Std: {np.std(scores):.4f})")
    
    ensemble_preds = (0.5 * model_configs['lgbm']['preds'] + 0.3 * model_configs['xgb']['preds'] + 0.2 * model_configs['cat']['preds'])
    ensemble_smape = smape(y, ensemble_preds); print(f"\n{'ENSEMBLE':<6} | Final OOF SMAPE: {ensemble_smape:.4f}%")
    
    print("\n" + "="*50); print("Retraining final LGBM model on all data..."); print("="*50)
    final_model = lgb.LGBMRegressor(**lgb_params)
    try: final_model.fit(X_final, y)
    except Exception: print("   ‚ö†Ô∏è GPU failed for final model, retraining on CPU..."); cpu_params = lgb_params.copy(); del cpu_params['device']; final_model = lgb.LGBMRegressor(**cpu_params); final_model.fit(X_final, y)
            
    joblib.dump(final_model, 'final_ensemble_model_for_submission.joblib')
    joblib.dump(preprocessor, os.path.join(OUTPUT_DIR, 'preprocessor_final.joblib'))
    print("\n‚úÖ Final model and preprocessor saved.")

if __name__ == '__main__':
    run_complete_pipeline()

sentence tranformer 

In [None]:
# Save this file as: preprocess_final.py

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy.sparse import save_npz, hstack
import joblib
import os
from sentence_transformers import SentenceTransformer
import torch

# --- GPU Check ---
if torch.cuda.is_available():
    DEVICE = 'cuda'
    print(f"‚úÖ GPU found: {torch.cuda.get_device_name(0)}. Embeddings will be generated on GPU.")
else:
    DEVICE = 'cpu'
    print("‚ö†Ô∏è No GPU found. Embeddings will be generated on CPU.")

# --- Helper Functions ---
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# (Functions clean_text, extract_structured_data, standardize_units are correct and remain the same)
def clean_text(text):
    if not isinstance(text, str): return ""
    text = re.sub(r'Value:.*', '', text, flags=re.IGNORECASE); text = re.sub(r'Unit:.*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'<.*?>', ' ', text); text = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    words = text.split()
    lemmatized = [lemmatizer.lemmatize(w) for w in words if w not in stop_words and len(w) > 1]
    return ' '.join(lemmatized)
def extract_structured_data(text):
    if not isinstance(text, str): return pd.Series([np.nan, 'unknown'], index=['Value', 'Unit'])
    value_match = re.search(r'Value:\s*([\d\.]+)', text, re.IGNORECASE); value = float(value_match.group(1)) if value_match else np.nan
    unit_match = re.search(r'Unit:\s*(\w+)', text, re.IGNORECASE); unit = unit_match.group(1) if unit_match else 'unknown'
    return pd.Series([value, unit], index=['Value', 'Unit'])
def standardize_units(unit):
    if not isinstance(unit, str): return 'unknown'
    unit = unit.lower().strip(); unit_map = {'oz': 'ounce', 'ounces': 'ounce', 'fl oz': 'fl_oz', 'fz': 'fl_oz', 'ct': 'count', 'none': 'unknown'}
    return unit_map.get(unit, unit)

# --- Main Preprocessing Pipeline ---
def create_all_features(df, is_training=True, output_dir='processed_data'):
    
    print("\nExtracting structured data from catalog...")
    extracted_data = df['catalog_content'].apply(extract_structured_data)
    df = pd.concat([df.reset_index(drop=True), extracted_data], axis=1)

    # --- Setup common DataFrame for features ---
    if is_training:
        df.dropna(subset=['price'], inplace=True)
        y = np.log1p(df['price'])
        sample_ids = df['sample_id']
        features_df = df.drop(columns=['price', 'sample_id', 'image_link'])
    else: # is_training == False
        sample_ids = df['sample_id']
        features_df = df.drop(columns=['sample_id', 'image_link'])
        if 'price' in features_df.columns: features_df = features_df.drop(columns=['price'])

    print("Cleaning all text sources (Catalog and VLM)...")
    features_df['cleaned_catalog'] = features_df['catalog_content'].apply(clean_text)
    features_df['Unit_standardized'] = features_df['Unit'].apply(standardize_units)
    
    # --- 1. Base TF-IDF and Scalers ---
    if is_training:
        print("1. Fitting base preprocessor (TF-IDF, Scalers)...")
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', Pipeline(steps=[('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())]), ['Value']),
                ('cat', Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='unknown')), ('onehot', OneHotEncoder(handle_unknown='ignore'))]), ['Unit_standardized']),
                ('text', TfidfVectorizer(max_features=5000, ngram_range=(1, 2)), 'cleaned_catalog')
            ], remainder='drop', n_jobs=-1)
        X_base = preprocessor.fit_transform(features_df)
        joblib.dump(preprocessor, os.path.join(output_dir, 'preprocessor.joblib'))
    else:
        print("1. Loading base preprocessor and transforming test data...")
        preprocessor = joblib.load(os.path.join(output_dir, 'preprocessor.joblib'))
        X_base = preprocessor.transform(features_df)
    
    # --- 2. Sentence Transformer Embeddings for Catalog ---
    print("\n2. Generating Sentence Transformer embeddings for Catalog...")
    st_model = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)
    catalog_embeddings = st_model.encode(
        features_df['cleaned_catalog'].tolist(), batch_size=256, show_progress_bar=True, convert_to_numpy=True
    )

    # --- 3. VLM Text Processing (TF-IDF and Embeddings) ---
    print("\n3. Processing VLM text (TF-IDF and Embeddings)...")
    vlm_file = 'smolvlm_extracted_features[1].csv' # Hardcoded to your filename
    vlm_df = pd.read_csv(os.path.join(output_dir, vlm_file)).set_index('sample_id')
    vlm_df_aligned = vlm_df.reindex(sample_ids).fillna('')
    vlm_df_aligned['cleaned_vlm'] = vlm_df_aligned['smolvlm_description'].apply(clean_text)
    
    if is_training:
        vlm_vectorizer = TfidfVectorizer(max_features=1500, ngram_range=(1, 2))
        X_vlm_tfidf = vlm_vectorizer.fit_transform(vlm_df_aligned['cleaned_vlm'])
        joblib.dump(vlm_vectorizer, os.path.join(output_dir, 'vlm_vectorizer.joblib'))
    else:
        vlm_vectorizer = joblib.load(os.path.join(output_dir, 'vlm_vectorizer.joblib'))
        X_vlm_tfidf = vlm_vectorizer.transform(vlm_df_aligned['cleaned_vlm'])

    vlm_embeddings = st_model.encode(
        vlm_df_aligned['cleaned_vlm'].tolist(), batch_size=256, show_progress_bar=True, convert_to_numpy=True
    )
    
    # --- 4. Visual (Image) Feature Processing ---
    print("\n4. Processing visual (image) features...")
    all_image_features = np.load(os.path.join(output_dir, 'X_image_features.npy'))
    all_ids_df = pd.read_csv(os.path.join(output_dir, 'all_sample_ids.csv'))
    image_features_df = pd.DataFrame(all_image_features, index=all_ids_df['sample_id'])
    X_images_aligned = image_features_df.reindex(sample_ids).values
    
    if is_training:
        image_scaler = StandardScaler()
        X_images_scaled = image_scaler.fit_transform(X_images_aligned)
        joblib.dump(image_scaler, os.path.join(output_dir, 'image_scaler.joblib'))
    else:
        image_scaler = joblib.load(os.path.join(output_dir, 'image_scaler.joblib'))
        X_images_scaled = image_scaler.transform(X_images_aligned)
        
    # --- 5. Combine and Save All Features ---
    print("\n5. Combining and saving all feature sets...")
    X_final = hstack([X_base, catalog_embeddings, X_vlm_tfidf, vlm_embeddings, X_images_scaled]).tocsr()
    
    # Save the final combined matrix
    mode_str = 'train' if is_training else 'test'
    save_npz(os.path.join(output_dir, f'X_final_{mode_str}.npz'), X_final)

    # Save corresponding labels or IDs
    if is_training:
        y_df = pd.DataFrame({'sample_id': sample_ids, 'price_log': y})
        y_df.to_csv(os.path.join(output_dir, f'y_{mode_str}.csv'), index=False)
    else:
        ids_df = pd.DataFrame({'sample_id': sample_ids})
        ids_df.to_csv(os.path.join(output_dir, f'ids_{mode_str}.csv'), index=False)
        
    print(f"\n‚úÖ All artifacts for '{mode_str.upper()}' mode are saved. Final shape: {X_final.shape}")

# --- Main execution block ---
if __name__ == '__main__':
    MODE = 'train' # or 'test'
    
    if MODE == 'train':
        train_df = pd.read_csv(os.path.join('dataset', 'train.csv'))
        create_all_features(train_df, is_training=True)
    elif MODE == 'test':
        test_df = pd.read_csv(os.path.join('dataset', 'test.csv'))
        create_all_features(test_df, is_training=False)


sentence tranformer training kfold

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz, hstack
import joblib
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import time
import torch
import warnings
import os
from tqdm import tqdm

warnings.filterwarnings('ignore')

# --- 1. Configuration & Setup ---
CONFIG = {
    'n_folds': 5, # Start with 5 folds for faster iteration, can increase to 20 later
    'random_state': 42,
    'data_dir': 'processed_data',
}
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- 2. Helper Functions ---
def smape(y_true, y_pred):
    y_true_actual = np.expm1(y_true)
    y_pred_actual = np.expm1(y_pred)
    numerator = np.abs(y_pred_actual - y_true_actual)
    denominator = (np.abs(y_true_actual) + np.abs(y_pred_actual)) / 2
    ratio = np.where(denominator == 0, 0, numerator / denominator)
    return np.mean(ratio) * 100

# --- 3. Main Training and Ensembling Pipeline ---
def train_ensemble(X, y):
    print("\n" + "="*50)
    print("üöÄ Starting GPU-Accelerated Ensemble Training üöÄ")
    print(f"   Folds: {CONFIG['n_folds']}, Device: {DEVICE.upper()}")
    print("="*50)
    
    kf = KFold(n_splits=CONFIG['n_folds'], shuffle=True, random_state=CONFIG['random_state'])
    
    # Base Hyperparameters with GPU settings
    lgb_params = {'objective': 'regression_l1', 'metric': 'mae', 'random_state': CONFIG['random_state'], 'n_estimators': 4000, 'learning_rate': 0.01, 'num_leaves': 50, 'subsample': 0.7, 'colsample_bytree': 0.7, 'reg_alpha': 0.1, 'reg_lambda': 0.1, 'n_jobs': -1, 'device': DEVICE}
    xgb_params = {'objective': 'reg:squarederror', 'eval_metric': 'mae', 'seed': CONFIG['random_state'], 'n_estimators': 4000, 'learning_rate': 0.01, 'max_depth': 8, 'subsample': 0.7, 'colsample_bytree': 0.7, 'tree_method': 'hist', 'device': DEVICE, 'early_stopping_rounds': 200, 'n_jobs': -1}
    cat_params = {'loss_function': 'MAE', 'eval_metric': 'MAE', 'random_seed': CONFIG['random_state'], 'iterations': 4000, 'learning_rate': 0.05, 'depth': 8, 'task_type': 'GPU' if DEVICE == 'cuda' else 'CPU'}

    model_configs = {
        'lgbm': {'model': lgb.LGBMRegressor(**lgb_params), 'preds': np.zeros(len(y))},
        'xgb': {'model': xgb.XGBRegressor(**xgb_params), 'preds': np.zeros(len(y))},
        'cat': {'model': cb.CatBoostRegressor(**cat_params), 'preds': np.zeros(len(y))}
    }
    oof_scores = {name: [] for name in model_configs}

    # Wrap the KFold loop with tqdm for a master progress bar
    for fold, (train_idx, val_idx) in enumerate(tqdm(kf.split(X, y), total=CONFIG['n_folds'], desc="Overall CV Progress")):
        print(f"\n--- Fold {fold+1}/{CONFIG['n_folds']} ---")
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        for name, config in model_configs.items():
            print(f"  - Training {name}...")
            start_time = time.time()
            
            try:
                # Added verbose logging to all training calls
                if name == 'lgbm':
                    config['model'].fit(X_train, y_train, eval_set=[(X_val, y_val)], 
                                        callbacks=[lgb.early_stopping(200, verbose=False), lgb.log_evaluation(period=500)])
                elif name == 'xgb':
                    config['model'].fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=500)
                else: # CatBoost
                    config['model'].fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=200, verbose=500)
            
            except Exception as e:
                if name == 'lgbm' and "CUDA Tree Learner was not enabled" in str(e):
                    print("    ‚ö†Ô∏è LightGBM GPU failed. Falling back to CPU (this will be slow)...")
                    cpu_params = lgb_params.copy(); del cpu_params['device']
                    config['model'] = lgb.LGBMRegressor(**cpu_params)
                    config['model'].fit(X_train, y_train, eval_set=[(X_val, y_val)], 
                                        callbacks=[lgb.early_stopping(200, verbose=False), lgb.log_evaluation(period=500)])
                else:
                    print(f"    ‚ö†Ô∏è An error occurred training {name}: {e}. Skipping model for this fold.")
                    oof_scores[name].append(np.inf) # Add a high score to indicate failure
                    continue

            val_preds = config['model'].predict(X_val)
            config['preds'][val_idx] = val_preds
            fold_smape = smape(y_val, val_preds)
            oof_scores[name].append(fold_smape)
            elapsed = time.time() - start_time
            print(f"    ‚úì Done in {elapsed:.1f}s. Fold SMAPE: {fold_smape:.4f}%")
            
    # --- Final Evaluation ---
    print("\n" + "="*50); print("üìä Overall Cross-Validation Results (OOF)"); print("="*50)
    for name, scores in oof_scores.items():
        if scores:
            print(f"{name.upper():<6} | Mean SMAPE: {np.mean(scores):.4f}% (Std: {np.std(scores):.4f})")
    
    ensemble_preds = (0.5 * model_configs['lgbm']['preds'] + 
                      0.3 * model_configs['xgb']['preds'] + 
                      0.2 * model_configs['cat']['preds'])
    ensemble_smape = smape(y, ensemble_preds)
    print(f"\n{'ENSEMBLE':<6} | Final OOF SMAPE: {ensemble_smape:.4f}% (Weighted Average)")
    
    # --- Retrain Final Model ---
    print("\n" + "="*50); print("Retraining best model (LGBM) on 100% of data for submission..."); print("="*50)
    try:
        final_model = lgb.LGBMRegressor(**lgb_params)
        final_model.fit(X, y)
    except Exception as e:
        if "CUDA Tree Learner was not enabled" in str(e):
            print("    ‚ö†Ô∏è GPU failed for final model. Retraining on CPU.")
            cpu_params = lgb_params.copy(); del cpu_params['device']
            final_model = lgb.LGBMRegressor(**cpu_params)
            final_model.fit(X, y)
        else: raise e
            
    joblib.dump(final_model, 'final_ensemble_model_for_submission.joblib')
    print("\n‚úÖ Final retrained model saved as 'final_ensemble_model_for_submission.joblib'")

# --- Main execution block ---
if __name__ == '__main__':
    # This assumes your full feature matrix has been saved by a master preprocessing script
    def load_precomputed_data(data_dir):
        try:
            print("--- Loading final precomputed feature matrix for training ---")
            X = load_npz(os.path.join(data_dir, 'X_final_train.npz'))
            y_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))
            print(f"   ‚úì Features loaded. Shape: {X.shape}")
            print(f"   ‚úì Labels loaded. Count: {len(y_df)}")
            return X, y_df['price_log'].values
        except FileNotFoundError as e:
            print(f"FATAL ERROR: Could not load precomputed files. {e}")
            print("Please run the final, combined preprocessing script first to generate 'X_final_train.npz' and 'y_train.csv'.")
            return None, None

    X_final_train, y_final_train = load_precomputed_data(CONFIG['data_dir'])
    
    if X_final_train is not None:
        train_ensemble(X_final_train, y_final_train)

check for connected slaves

In [None]:
import pandas as pd
from dask.distributed import Client, TimeoutError

# --- Configuration ---
# Use the same Dask scheduler address as your main script
DASK_ADDRESS = "tcp://10.0.22.229:8790"

def check_worker_status():
    """
    Connects to the Dask scheduler and prints the status of all connected workers.
    """
    print("="*60)
    print(f"Attempting to connect to Dask scheduler at: {DASK_ADDRESS}")
    print("="*60)

    try:
        # Connect to the scheduler with a timeout to avoid hanging indefinitely
        client = Client(DASK_ADDRESS, timeout="10s")
    except (TimeoutError, OSError) as e:
        print(f"\n‚ùå FAILED TO CONNECT TO SCHEDULER.")
        print(f"   Error: {e}")
        print("\n   Troubleshooting Steps:")
        print("   1. Is the Dask scheduler running at the specified address?")
        print("   2. Is the address and port correct?")
        print("   3. Is there a firewall blocking the connection between this machine and the scheduler?")
        return

    try:
        # Get information about the cluster
        scheduler_info = client.scheduler_info()
        workers = scheduler_info.get('workers', {})

        if not workers:
            print("\nüü° STATUS: Connected to scheduler, but NO workers are registered.")
            print("   Please ensure your dask-worker processes are running and pointing to the correct scheduler address.")
        else:
            print(f"\n‚úÖ SUCCESS: Connected to scheduler. Found {len(workers)} worker(s).")
            print("-" * 60)
            
            # Prepare data for a clean table display
            worker_data = []
            for address, info in workers.items():
                worker_data.append({
                    "Address": address,
                    "Status": info.get('status', 'N/A'),
                    "Cores": info.get('nthreads', 'N/A'),
                    "Memory Limit": info.get('memory_limit', 0) / (1024**3), # Convert bytes to GB
                    "Hostname": info.get('host', 'N/A')
                })
            
            # Display as a formatted table using pandas
            df = pd.DataFrame(worker_data)
            df['Memory Limit'] = df['Memory Limit'].map('{:,.2f} GB'.format)
            print(df.to_string(index=False))
            print("-" * 60)

    except Exception as e:
        print(f"\n‚ùå An error occurred while fetching worker information: {e}")
    finally:
        # Always close the client connection
        client.close()
        print("\nClient disconnected.")

if __name__ == '__main__':
    check_worker_status()

training code for lightgbm using gpu only 

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz
import joblib
from sklearn.model_selection import KFold
import lightgbm as lgb
import time
import torch
import warnings
import os
from tqdm import tqdm

warnings.filterwarnings('ignore')

# --- 1. Configuration & Setup ---
CONFIG = {
    'n_folds': 20,  # As requested, set to 20 folds
    'random_state': 42,
    'data_dir': 'processed_data',
}

# Enforce GPU usage and print device info
try:
    assert torch.cuda.is_available()
    DEVICE = 'cuda'
    print(f"‚úÖ GPU Detected: {torch.cuda.get_device_name(0)}")
except AssertionError:
    raise RuntimeError("‚ùå GPU NOT AVAILABLE! This script requires a CUDA-enabled GPU to run.")


# --- 2. Helper Function (SMAPE Metric) ---
def smape(y_true, y_pred):
    """Calculates SMAPE score on the log-transformed predictions."""
    y_true_actual = np.expm1(y_true)
    y_pred_actual = np.expm1(y_pred)
    
    # To prevent division by zero, especially if both true and pred are 0
    numerator = np.abs(y_pred_actual - y_true_actual)
    denominator = (np.abs(y_true_actual) + np.abs(y_pred_actual)) / 2
    
    ratio = np.where(denominator == 0, 0, numerator / denominator)
    return np.mean(ratio) * 100


# --- 3. Main Training Pipeline ---
def train_lgbm_with_checkpoints(X, y):
    """
    Trains a LightGBM model using 20-fold CV on a GPU.
    Saves a checkpoint of the best model after each fold.
    """
    print("\n" + "="*50)
    print("üöÄ Starting LightGBM GPU Training with 20-Fold CV üöÄ")
    print(f"   Folds: {CONFIG['n_folds']}, Device: {DEVICE.upper()}")
    print("="*50)
    
    kf = KFold(n_splits=CONFIG['n_folds'], shuffle=True, random_state=CONFIG['random_state'])
    
    # LightGBM Hyperparameters optimized for GPU memory efficiency
    lgb_params = {
        'objective': 'regression_l1',
        'metric': 'mae',
        'random_state': CONFIG['random_state'],
        'n_estimators': 4000,
        'learning_rate': 0.01,
        'num_leaves': 31,      # Good balance for performance and memory
        'max_bin': 63,         # Key parameter for reducing GPU memory usage
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'n_jobs': -1,
        'device': 'gpu',       # Enforce GPU usage
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
    }

    oof_preds = np.zeros(len(y))
    oof_scores = []
    best_smape = np.inf
    best_fold = -1
    model_save_path = 'best_lgbm_model.joblib'

    # Training loop with fold-wise model saving
    for fold, (train_idx, val_idx) in enumerate(tqdm(kf.split(X, y), total=CONFIG['n_folds'], desc="Training Progress")):
        print(f"\n{'='*50}\n--- Fold {fold+1}/{CONFIG['n_folds']} ---\n{'='*50}")
        
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        
        print(f"  Training samples: {len(train_idx)}, Validation samples: {len(val_idx)}")
        start_time = time.time()
        
        # Train model for the current fold
        model = lgb.LGBMRegressor(**lgb_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[
                lgb.early_stopping(200, verbose=False),
                lgb.log_evaluation(period=500)
            ]
        )
        
        # Predict on validation set and calculate score
        val_preds = model.predict(X_val)
        oof_preds[val_idx] = val_preds
        fold_smape = smape(y_val, val_preds)
        oof_scores.append(fold_smape)
        elapsed = time.time() - start_time
        
        print(f"\n  ‚úì Fold {fold+1} completed in {elapsed:.1f}s")
        print(f"  üìä Fold SMAPE: {fold_smape:.4f}%")
        
        # --- MODEL SAVING LOGIC ---
        # If the current model is the best one so far, save (or overwrite) it
        if fold_smape < best_smape:
            best_smape = fold_smape
            best_fold = fold + 1
            joblib.dump(model, model_save_path)
            print(f"  üíæ New best model saved! (SMAPE: {best_smape:.4f}%)")
        else:
            print(f"  ‚è≠Ô∏è  Model not saved (current best is {best_smape:.4f}% from fold {best_fold})")
        
        print(f"  üìà Running Mean SMAPE: {np.mean(oof_scores):.4f}% (¬±{np.std(oof_scores):.4f}%)")
    
    # --- Final Evaluation ---
    print("\n" + "="*50); print("üìä Final Cross-Validation Results"); print("="*50)
    print(f"Mean SMAPE across {CONFIG['n_folds']} folds: {np.mean(oof_scores):.4f}%")
    print(f"Std Dev of SMAPE:  {np.std(oof_scores):.4f}%")
    overall_oof_smape = smape(y, oof_preds)
    print(f"\nOverall OOF SMAPE (all predictions): {overall_oof_smape:.4f}%")
    print(f"\nüèÜ Best single model was from Fold {best_fold} with SMAPE: {best_smape:.4f}%")
    print(f"   -> This model is saved at: '{model_save_path}'")
    
    # --- Retrain Final Model on Full Data ---
    print("\n" + "="*50); print("üîÑ Retraining on 100% of data for submission..."); print("="*50)
    final_model = lgb.LGBMRegressor(**lgb_params)
    final_model.fit(X, y)
    final_model_path = 'final_lgbm_model_submission.joblib'
    joblib.dump(final_model, final_model_path)
    print(f"\n‚úÖ Final model for submission trained and saved as '{final_model_path}'")
    
    return oof_scores

# --- Main execution block ---
if __name__ == '__main__':
    def load_precomputed_data(data_dir):
        """Loads the final feature matrix and labels."""
        try:
            print("\n--- Loading final precomputed feature matrix for training ---")
            X = load_npz(os.path.join(data_dir, 'X_final_train.npz'))
            y_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))
            print(f"   ‚úì Features loaded. Shape: {X.shape}")
            print(f"   ‚úì Labels loaded. Count: {len(y_df)}")
            return X, y_df['price_log'].values
        except FileNotFoundError as e:
            print(f"‚ùå FATAL ERROR: Could not load precomputed files. {e}")
            print("Please run the preprocessing script first to generate 'X_final_train.npz' and 'y_train.csv'.")
            return None, None

    # Load the data generated by the optimized preprocessing script
    X_final_train, y_final_train = load_precomputed_data(CONFIG['data_dir'])
    
    if X_final_train is not None:
        oof_scores = train_lgbm_with_checkpoints(X_final_train, y_final_train)
        print("\n" + "="*50); print("üéâ Training Pipeline Complete!"); print("="*50)

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import load_npz, csr_matrix
import joblib
import lightgbm as lgb
import time
import warnings
import os
from tqdm import tqdm
import scipy.sparse
import gc
from sklearn.model_selection import KFold

warnings.filterwarnings('ignore')

# --- 1. Configuration & Setup ---
CONFIG = {
    'n_folds': 20,
    'random_state': 42,
    'data_dir': 'processed_data',
    'gpu_verbose': 1,
    'batch_size': None,  # Set to int (e.g., 100_000) if you want to batch predictions
}

# --- 2. Helper Functions ---
def smape(y_true, y_pred):
    y_true_actual = np.expm1(y_true)
    y_pred_actual = np.expm1(y_pred)
    numerator = np.abs(y_pred_actual - y_true_actual)
    denominator = (np.abs(y_true_actual) + np.abs(y_pred_actual)) / 2
    ratio = np.where(denominator == 0, 0, numerator / denominator)
    return np.mean(ratio) * 100

def batch_predict(model, X, batch_size=100_000):
    """Predict in batches to reduce memory pressure."""
    if batch_size is None or X.shape[0] <= batch_size:
        return model.predict(X)
    preds = []
    for i in range(0, X.shape[0], batch_size):
        batch = X[i:i + batch_size]
        pred = model.predict(batch)
        preds.append(pred)
        del batch, pred
        gc.collect()
    return np.concatenate(preds)

# --- 3. GPU-Optimized Training Pipeline with Memory Cleanup ---
def train_lightgbm_gpu(X, y):
    print("\n" + "="*50)
    print("üöÄ Starting LightGBM GPU Training (20 Folds + Memory Safe) üöÄ")
    print(f"   Folds: {CONFIG['n_folds']}, Device: GPU")
    print("="*50)
    
    if not scipy.sparse.isspmatrix_csr(X):
        print("‚ö†Ô∏è Converting to CSR sparse format for GPU compatibility...")
        X = csr_matrix(X)
    
    kf = KFold(n_splits=CONFIG['n_folds'], shuffle=True, random_state=CONFIG['random_state'])

    params = {
        'objective': 'regression_l1',
        'metric': 'mae',
        'random_state': CONFIG['random_state'],
        'learning_rate': 0.01,
        'num_leaves': 50,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'device': 'gpu',
        'gpu_platform_id': 0,
        'gpu_device_id': 0,
        'verbose': CONFIG['gpu_verbose'],
    }

    oof_preds = np.zeros(len(y))
    oof_scores = []
    models = []

    for fold, (train_idx, val_idx) in enumerate(tqdm(kf.split(X, y), total=CONFIG['n_folds'], desc="Overall CV Progress")):
        print(f"\n--- Fold {fold+1}/{CONFIG['n_folds']} ---")
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        if not scipy.sparse.isspmatrix_csr(X_train):
            X_train = csr_matrix(X_train)
        if not scipy.sparse.isspmatrix_csr(X_val):
            X_val = csr_matrix(X_val)

        print(f"  - Data shapes: Train={X_train.shape}, Val={X_val.shape}")
        print(f"  - Sparsity: {100 * (1 - X_train.nnz / np.prod(X_train.shape)):.2f}%")

        train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=True)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data, free_raw_data=True)

        print("  - Starting GPU training...")
        start_time = time.time()

        try:
            model = lgb.train(
                params,
                train_data,
                valid_sets=[val_data],
                num_boost_round=4000,
                callbacks=[
                    lgb.early_stopping(stopping_rounds=200, verbose=True),
                    lgb.log_evaluation(period=50)
                ]
            )
        except Exception as e:
            print(f"‚ùå GPU Training FAILED! Error: {str(e)}")
            raise

        # Predict in batches if configured
        val_preds = batch_predict(model, X_val, batch_size=CONFIG['batch_size'])
        oof_preds[val_idx] = val_preds
        fold_smape = smape(y_val, val_preds)
        oof_scores.append(fold_smape)
        elapsed = time.time() - start_time
        print(f"    ‚úì Done in {elapsed:.1f}s. Fold SMAPE: {fold_smape:.4f}%")

        models.append(model)

        # --- Aggressive memory cleanup ---
        del X_train, X_val, y_train, y_val, train_data, val_data, val_preds
        gc.collect()

    # --- Final Evaluation ---
    print("\n" + "="*50)
    print("üìä Overall Cross-Validation Results (OOF)")
    print("="*50)
    mean_smape = np.mean(oof_scores)
    std_smape = np.std(oof_scores)
    print(f"LGBM | Mean SMAPE: {mean_smape:.4f}% (Std: {std_smape:.4f})")

    overall_oof_smape = smape(y, oof_preds)
    print(f"\nFINAL | Overall OOF SMAPE: {overall_oof_smape:.4f}%")

    # --- Retrain Final Model on Full Data ---
    print("\n" + "="*50)
    print("Retraining final LightGBM model on 100% of data for submission...")
    print("="*50)

    if not scipy.sparse.isspmatrix_csr(X):
        X = csr_matrix(X)

    train_data_full = lgb.Dataset(X, label=y, free_raw_data=True)
    final_model = lgb.train(
        params,
        train_data_full,
        num_boost_round=4000,
        callbacks=[lgb.log_evaluation(period=100)]
    )

    final_model.save_model('final_lgbm_gpu_model.txt')
    print("\n‚úÖ Final retrained model saved as 'final_lgbm_gpu_model.txt'")

    # Clean up final dataset
    del train_data_full
    gc.collect()

    return models

# --- Main execution block ---
if __name__ == '__main__':
    print("üîç Verifying GPU setup...")
    try:
        test_data = np.random.rand(100, 10)
        test_label = np.random.rand(100)
        test_dataset = lgb.Dataset(test_data, label=test_label)
        test_params = {
            'device': 'gpu',
            'verbose': 1,
            'num_leaves': 4,
            'min_data_in_leaf': 1,
            'num_iterations': 2
        }
        lgb.train(test_params, test_dataset)
        print("  ‚úÖ GPU test PASSED!")
        del test_data, test_label, test_dataset
        gc.collect()
    except Exception as e:
        print(f"  ‚ùå GPU test FAILED! Error: {str(e)}")
        exit(1)

    def load_precomputed_data(data_dir):
        try:
            print("\n--- Loading final precomputed feature matrix for training ---")
            X = load_npz(os.path.join(data_dir, 'X_final_train.npz'))
            y_df = pd.read_csv(os.path.join(data_dir, 'y_train.csv'))
            print(f"   ‚úì Features loaded. Shape: {X.shape}")
            print(f"   ‚úì Labels loaded. Count: {len(y_df)}")
            return X, y_df['price_log'].values
        except FileNotFoundError as e:
            print(f"FATAL ERROR: Could not load precomputed files. {e}")
            return None, None

    X_final_train, y_final_train = load_precomputed_data(CONFIG['data_dir'])

    if X_final_train is not None:
        models = train_lightgbm_gpu(X_final_train, y_final_train)
        # Optional: delete models if not needed to free memory
        del models
        gc.collect()

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



üîç Verifying GPU setup...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 350
[LightGBM] [Info] Number of data points in the train set: 100, number of used features: 10
[LightGBM] [Info] Using GPU Device: Intel(R) RaptorLake-S Mobile Graphics Controller, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 64 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 10 dense feature groups (0.00 MB) transferred to GPU in 0.001648 secs. 0 sparse feature groups
[LightGBM] [Info] Start training from score 0.559410
  ‚úÖ GPU test PASSED!

--- Loading final precomputed feature matrix for training ---
   ‚úì Features loaded. Shape: (75000, 9370)
   ‚úì Labels loaded. Count: 75000

üöÄ Starting LightGBM GPU Training (20 Folds + Memory Safe) üöÄ
   Folds: 20, Device: GPU


Overall CV Progress:   0%|          | 0/20 [00:00<?, ?it/s]


--- Fold 1/20 ---
  - Data shapes: Train=(71250, 9370), Val=(3750, 9370)
  - Sparsity: 69.03%
  - Starting GPU training...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1654600
[LightGBM] [Info] Number of data points in the train set: 71250, number of used features: 9299
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Intel(R) RaptorLake-S Mobile Graphics Controller, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 2682 dense feature groups (182.38 MB) transferred to GPU in 0.072243 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 2.708050
Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.704925
[100]	valid_0's l1: 0.659465
[150]	valid_0's l1: 0.631268
[200]	valid_0's l1: 0.612141
[250]	valid_0's l1: 0.

Overall CV Progress:   5%|‚ñå         | 1/20 [1:09:19<21:57:16, 4159.80s/it]


--- Fold 2/20 ---
  - Data shapes: Train=(71250, 9370), Val=(3750, 9370)
  - Sparsity: 69.03%
  - Starting GPU training...
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1655881
[LightGBM] [Info] Number of data points in the train set: 71250, number of used features: 9299
[LightGBM] [Info] Using requested OpenCL platform 0 device 0
[LightGBM] [Info] Using GPU Device: Intel(R) RaptorLake-S Mobile Graphics Controller, Vendor: Intel(R) Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 2681 dense feature groups (182.38 MB) transferred to GPU in 0.268200 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 2.709383
Training until validation scores don't improve for 200 rounds
[50]	valid_0's l1: 0.689993
[100]	valid_0's l1: 0.64664
[150]	valid_0's l1: 0.619046
[200]	valid_0's l1: 0.600307
[250]	valid_0's l1: 0.5

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack
import joblib
import lightgbm as lgb
import os
import torch
from sentence_transformers import SentenceTransformer

print("--- All-in-One Prediction Script Initialized ---")

# ==========================================
# CONFIGURATION
# ==========================================
# This section defines where to find all the necessary files.
class CFG:
    PROCESSED_DIR = 'processed_data'
    RAW_DATA_DIR = 'dataset'
    
    # --- INPUT FILES ---
    # The raw test data you want to predict on
    TEST_CSV = os.path.join(RAW_DATA_DIR, 'test.csv')
    
    # The final model you trained
    MODEL_FILE = 'final_lgbm_gpu_model.txt'
    
    # All the helper files and transformers saved during training
    TEXT_PREPROCESSOR = os.path.join(PROCESSED_DIR, 'preprocessor.joblib')
    SENTENCE_PCA_TRANSFORMER = os.path.join(PROCESSED_DIR, 'sentence_pca_transformer.joblib')
    IMAGE_PCA_TRANSFORMER = os.path.join(PROCESSED_DIR, 'image_pca_transformer.joblib')
    
    # Supporting data files needed for feature creation
    VLM_FEATURES_CSV = os.path.join(PROCESSED_DIR, 'smolvlm_extracted_features.csv')
    IMAGE_FEATURES_NPY = os.path.join(PROCESSED_DIR, 'X_image_features.npy')
    IMAGE_IDS_CSV = os.path.join(PROCESSED_DIR, 'all_sample_ids.csv')
    
    # --- OUTPUT FILE ---
    SUBMISSION_FILE = 'submission.csv'
    
    # --- MODEL CONFIG (must match training) ---
    SENTENCE_MODEL_NAME = 'all-MiniLM-L6-v2'


# ==========================================
# MAIN EXECUTION BLOCK
# ==========================================
if __name__ == '__main__':
    # --- 1. Load All Necessary Models and Data ---
    print("\n--- Phase 1: Loading all models, transformers, and data ---")
    try:
        df_test = pd.read_csv(CFG.TEST_CSV)
        print(f"  ‚úì Loaded '{CFG.TEST_CSV}'")
        
        model = lgb.Booster(model_file=CFG.MODEL_FILE)
        print(f"  ‚úì Loaded trained model '{CFG.MODEL_FILE}'")

        text_preprocessor = joblib.load(CFG.TEXT_PREPROCESSOR)
        print(f"  ‚úì Loaded text preprocessor '{CFG.TEXT_PREPROCESSOR}'")
        
        pca_text = joblib.load(CFG.SENTENCE_PCA_TRANSFORMER)
        print(f"  ‚úì Loaded sentence PCA transformer '{CFG.SENTENCE_PCA_TRANSFORMER}'")
        
        pca_image = joblib.load(CFG.IMAGE_PCA_TRANSFORMER)
        print(f"  ‚úì Loaded image PCA transformer '{CFG.IMAGE_PCA_TRANSFORMER}'")

        df_vlm = pd.read_csv(CFG.VLM_FEATURES_CSV)
        df_image_ids = pd.read_csv(CFG.IMAGE_IDS_CSV)
        X_image_full_mmap = np.load(CFG.IMAGE_FEATURES_NPY, mmap_mode='r')
        print("  ‚úì Loaded all supporting data files.")
        
    except FileNotFoundError as e:
        print(f"\n‚ùå FATAL ERROR: A required file was not found.")
        print(f"   Missing file: {e.filename}")
        print("   Please ensure all training artifacts and data files are in the correct directories.")
        exit()

    # --- 2. Re-create Test Features IN MEMORY ---
    print("\n--- Phase 2: Preprocessing test data in memory ---")
    
    # Step A: Generate Base TF-IDF Features
    print("  - Step 2A: Generating base TF-IDF features...")
    X_text_base_tfidf = text_preprocessor.transform(df_test)

    # Step B: Generate Semantic Text Embeddings
    print("  - Step 2B: Generating semantic text embeddings...")
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    sentence_model = SentenceTransformer(CFG.SENTENCE_MODEL_NAME, device=device)
    df_test_text = pd.merge(df_test[['sample_id', 'catalog_content']], df_vlm, on='sample_id', how='left')
    df_test_text.fillna('', inplace=True)
    corpus = (df_test_text['catalog_content'] + " " + df_test_text['smolvlm_description']).tolist()
    sentence_embeddings = sentence_model.encode(corpus, show_progress_bar=True, device=device)
    X_sentence_pca = pca_text.transform(sentence_embeddings)

    # Step C: Align and Reduce Image Features
    print("  - Step 2C: Aligning and reducing image features...")
    id_to_row_idx = pd.Series(index=df_image_ids['sample_id'], data=np.arange(len(df_image_ids)))
    test_indices = id_to_row_idx.loc[df_test['sample_id']].values
    X_image_test_aligned = X_image_full_mmap[test_indices, :]
    X_image_pca = pca_image.transform(X_image_test_aligned)

    # Step D: Combine all features into a single in-memory matrix
    print("  - Step 2D: Combining all features into a final matrix...")
    X_final_test = hstack([
        X_text_base_tfidf,
        X_sentence_pca,
        X_image_pca
    ]).tocsr()
    X_final_test.data = X_final_test.data.astype(np.float32)
    print(f"  ‚úì Preprocessing complete. Final in-memory feature shape: {X_final_test.shape}")

    # --- 3. Generate Predictions ---
    print("\n--- Phase 3: Generating predictions ---")
    log_predictions = model.predict(X_final_test)
    print("  ‚úì Predictions generated.")

    # --- 4. Create and Save Submission File ---
    print("\n--- Phase 4: Creating submission file ---")
    final_prices = np.expm1(log_predictions)
    final_prices[final_prices < 0] = 0 # Enforce non-negative price constraint

    submission_df = pd.DataFrame({
        'sample_id': df_test['sample_id'],
        'price': final_prices
    })

    submission_df.to_csv(CFG.SUBMISSION_FILE, index=False)
    
    print("\n" + "="*50)
    print(f"üéâ Success! Submission file created at: '{CFG.SUBMISSION_FILE}'")
    print("="*50)
    print("\nFirst 5 rows of your submission file:")
    print(submission_df.head())

In [1]:
import pandas as pd
import numpy as np
import joblib
import lightgbm as lgb
import os
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import warnings

warnings.filterwarnings('ignore')
print("--- Final Prediction Script (Using Base Preprocessor Only) ---")

# ==========================================
# CONFIGURATION
# ==========================================
class CFG:
    PROCESSED_DIR = 'processed_data'
    RAW_DATA_DIR = 'dataset'
    
    # --- INPUT FILES ---
    TEST_CSV = os.path.join(RAW_DATA_DIR, 'test.csv')
    MODEL_FILE = 'best_cpu_lgbm_model.joblib'
    
    # The ONLY transformer your model was trained on
    TEXT_PREPROCESSOR = os.path.join(PROCESSED_DIR, 'preprocessor.joblib')
    
    # --- OUTPUT FILE ---
    SUBMISSION_FILE = 'submission.csv'

# ==========================================
# HELPER FUNCTIONS (must match your training preprocessing)
# ==========================================
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def extract_structured_data(text):
    if not isinstance(text, str): return pd.Series([np.nan, 'unknown'], index=['Value', 'Unit'])
    value_match = re.search(r'Value:\s*([\d\.]+)', text, re.IGNORECASE); value = float(value_match.group(1)) if value_match else np.nan
    unit_match = re.search(r'Unit:\s*(\w+)', text, re.IGNORECASE); unit = unit_match.group(1) if unit_match else 'unknown'
    return pd.Series([value, unit], index=['Value', 'Unit'])

def clean_text(text):
    if not isinstance(text, str): return ""
    text = re.sub(r'Value:.*|Unit:.*|<.*?>|[^a-zA-Z\s]', ' ', text, flags=re.IGNORECASE).lower()
    words = text.split()
    lemmatized = [lemmatizer.lemmatize(w) for w in words if w not in stop_words and len(w) > 1]
    return ' '.join(lemmatized)

def standardize_units(unit):
    if not isinstance(unit, str): return 'unknown'
    unit = unit.lower().strip(); unit_map = {'oz': 'ounce', 'ounces': 'ounce', 'fl oz': 'fl_oz', 'fz': 'fl_oz', 'ct': 'count', 'none': 'unknown'}
    return unit_map.get(unit, unit)

# ==========================================
# MAIN EXECUTION BLOCK
# ==========================================
if __name__ == '__main__':
    # --- 1. Load Model and Preprocessor ---
    print("\n--- Phase 1: Loading model and the single preprocessor ---")
    try:
        df_test = pd.read_csv(CFG.TEST_CSV)
        model = joblib.load(CFG.MODEL_FILE)
        preprocessor = joblib.load(CFG.TEXT_PREPROCESSOR)
        print("  ‚úì All required files loaded successfully.")
    except FileNotFoundError as e:
        print(f"\n‚ùå FATAL ERROR: A required file was not found: {e.filename}")
        exit()

    # --- 2. Create Test Features using ONLY the Base Preprocessor ---
    print("\n--- Phase 2: Preprocessing test data to match model's expectation ---")
    
    extracted_data = df_test['catalog_content'].apply(extract_structured_data)
    features_df = pd.concat([df_test.reset_index(drop=True), extracted_data], axis=1)
    features_df['cleaned_catalog'] = features_df['catalog_content'].apply(clean_text)
    features_df['Unit_standardized'] = features_df['Unit'].apply(standardize_units)
    
    # This is the only transformation that was used for the model you have
    X_final_test = preprocessor.transform(features_df)
    
    print(f"  ‚úì Preprocessing complete. Final feature shape: {X_final_test.shape}")
    
    # Final sanity check
    if X_final_test.shape[1] != model.n_features_:
        print(f"\n‚ùå FATAL ERROR: Feature mismatch even with the simplest approach.")
        print(f"   Generated {X_final_test.shape[1]} features, but model expects {model.n_features_}.")
        print("   This may mean the `preprocessor.joblib` file is from a different experiment.")
        exit()
    else:
        print("  ‚úì Feature count matches the trained model. Ready to predict.")

    # --- 3. Generate Predictions ---
    print("\n--- Phase 3: Generating predictions ---")
    log_predictions = model.predict(X_final_test)
    print("  ‚úì Predictions generated.")

    # --- 4. Create Submission File ---
    print("\n--- Phase 4: Creating submission file ---")
    final_prices = np.expm1(log_predictions)
    final_prices[final_prices < 0] = 0

    submission_df = pd.DataFrame({'sample_id': df_test['sample_id'], 'price': final_prices})
    submission_df.to_csv(CFG.SUBMISSION_FILE, index=False)
    
    print("\n" + "="*50)
    print(f"üéâ Success! Submission file created at: '{CFG.SUBMISSION_FILE}'")
    print("="*50)
    print("\nFirst 5 rows of your submission file:")
    print(submission_df.head())

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



--- Final Prediction Script (Using Base Preprocessor Only) ---

--- Phase 1: Loading model and the single preprocessor ---
  ‚úì All required files loaded successfully.

--- Phase 2: Preprocessing test data to match model's expectation ---
  ‚úì Preprocessing complete. Final feature shape: (75000, 5054)
  ‚úì Feature count matches the trained model. Ready to predict.

--- Phase 3: Generating predictions ---
  ‚úì Predictions generated.

--- Phase 4: Creating submission file ---

üéâ Success! Submission file created at: 'submission.csv'

First 5 rows of your submission file:
   sample_id      price
0     100179  10.591078
1     245611   9.450141
2     146263  17.000130
3      95658   7.454436
4      36806  16.120088
