In [3]:
# Final Prediction Script - Loads bundled model and scaler from a single .pth file
# Focuses on prediction, resumability, and saving the final output CSV.

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
import pickle # MUST be imported to handle unpickling of StandardScaler
from tqdm import tqdm
import os
import sys

# Run on the GPU when one is visible to torch; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# CSV that receives the predictions (and is consulted when resuming a run).
OUTPUT_CSV_PATH = "test_predictions.csv"
# Name of the identifier column used to track which samples were predicted.
SAMPLE_ID_COLUMN = "sample_id"


# --- Neural Network Architecture (Must match the one used for training) ---
class SimpleNN(nn.Module):
    """Fully connected regressor: input_dim -> 1024 -> 512 -> 256 -> 64 -> 32 -> 1.

    The first four hidden layers are Linear + BatchNorm1d + ReLU + Dropout;
    the fifth is Linear + ReLU + Dropout; the head is a single Linear unit.
    Sub-module names (fc1, bn1, dropout1, ..., out) are kept as-is because
    they are the keys of the saved state_dict.
    """

    def __init__(self, input_dim):
        super().__init__()

        # (in_features, out_features, dropout probability) for the four
        # batch-normalised blocks. Registration order is fc -> bn -> dropout.
        layer_sizes = (input_dim, 1024, 512, 256, 64)
        drop_probs = (0.2, 0.2, 0.2, 0.1)
        blocks = zip(layer_sizes[:-1], layer_sizes[1:], drop_probs)
        for idx, (n_in, n_out, p_drop) in enumerate(blocks, start=1):
            setattr(self, f"fc{idx}", nn.Linear(n_in, n_out))
            setattr(self, f"bn{idx}", nn.BatchNorm1d(n_out))
            setattr(self, f"dropout{idx}", nn.Dropout(p_drop))

        # Last hidden layer has no batch norm.
        self.fc5 = nn.Linear(64, 32)
        self.dropout5 = nn.Dropout(0.1)

        # Single-value regression head.
        self.out = nn.Linear(32, 1)

    def forward(self, x):
        """Map a (batch, input_dim) tensor to a (batch, 1) tensor."""
        h = self.dropout1(F.relu(self.bn1(self.fc1(x))))
        h = self.dropout2(F.relu(self.bn2(self.fc2(h))))
        h = self.dropout3(F.relu(self.bn3(self.fc3(h))))
        h = self.dropout4(F.relu(self.bn4(self.fc4(h))))
        h = self.dropout5(F.relu(self.fc5(h)))
        return self.out(h)

def predict_on_test_data(test_data_path, model_path):
    """
    Loads the bundled model/scaler, runs predictions on test data,
    saves the predicted prices to a CSV, and supports resumability.

    Samples whose id already appears in OUTPUT_CSV_PATH are skipped, so a
    re-run only predicts the rows that are still missing and appends them.
    If the existing output file cannot be read, it is moved aside to
    OUTPUT_CSV_PATH + '.bak' and a fresh file (with header) is written.

    Args:
        test_data_path: Path to the test CSV. It must contain the
            SAMPLE_ID_COLUMN column plus the feature columns gemma_0..767,
            laion_text_0..767 and laion_image_0..767.
        model_path: Path to a torch checkpoint (a dict) holding the keys
            'scaler', 'model_state_dict' and 'input_dim'.

    Returns:
        0 on success. None when the run is aborted after printing an error
        (missing test file or columns, unreadable or incomplete checkpoint,
        or a feature-count mismatch with the checkpoint).
    """

    # --- STEP 1: Load All Test Data ---
    print(f"--- Loading ALL Test Data from: {test_data_path} ---")
    try:
        all_test_data = pd.read_csv(test_data_path)
    except FileNotFoundError:
        print(f"Error: Test data not found at {test_data_path}. Please check the path.")
        return

    if SAMPLE_ID_COLUMN not in all_test_data.columns:
        print(f"CRITICAL ERROR: Test data must contain a '{SAMPLE_ID_COLUMN}' column.")
        return

    # --- STEP 2: Check for Existing Predictions (Resumability) ---
    predicted_ids = set()
    # True only when the existing file was read successfully, i.e. it is safe
    # to append rows to it without writing a header.
    can_append = False
    if os.path.exists(OUTPUT_CSV_PATH):
        print(f"Resuming prediction: Found existing file at {OUTPUT_CSV_PATH}")
        try:
            existing_predictions = pd.read_csv(OUTPUT_CSV_PATH)
            predicted_ids = set(existing_predictions[SAMPLE_ID_COLUMN].tolist())
        except Exception as e:
            # Best effort: an empty/corrupt/foreign file must not block the run.
            print(f"Warning: Could not read existing predictions. Starting fresh. Error: {e}")
        else:
            can_append = True
            print(f"Found {len(predicted_ids)} already predicted samples.")

    # --- STEP 3: Filter Data to Predict Only Missing Samples ---
    pending_mask = ~all_test_data[SAMPLE_ID_COLUMN].isin(predicted_ids)
    test_data_to_predict = all_test_data[pending_mask].copy()
    prediction_mode = not test_data_to_predict.empty

    if not prediction_mode:
        print("All test samples have already been predicted. Final output CSV is complete.")
    else:
        print(f"Starting prediction on {len(test_data_to_predict)} remaining samples...")

        # Feature layout: gemma block, then laion text block, then laion image block.
        feature_cols = (
            [f'gemma_{i}' for i in range(768)]
            + [f'laion_text_{i}' for i in range(768)]
            + [f'laion_image_{i}' for i in range(768)]
        )

        # Fail with a readable message instead of a raw KeyError.
        available_cols = set(all_test_data.columns)
        missing_cols = [col for col in feature_cols if col not in available_cols]
        if missing_cols:
            print(
                f"CRITICAL ERROR: Test data is missing {len(missing_cols)} "
                f"feature column(s), e.g. {missing_cols[:5]}."
            )
            return

        fused_features_test = test_data_to_predict[feature_cols].values
        test_ids_to_predict = test_data_to_predict[SAMPLE_ID_COLUMN].values

    # --- STEP 4: Load Model and Scaler (Required even for prediction_mode=False to ensure successful load) ---
    print(f"--- Loading Bundled Model/Scaler from: {model_path} ---")
    try:
        # weights_only=False is needed because the checkpoint bundles a pickled
        # StandardScaler next to the weights.
        # SECURITY NOTE: this unpickles arbitrary objects and can execute code
        # embedded in the file -- only load checkpoints from a trusted source.
        checkpoint = torch.load(
            model_path,
            map_location=device,
            weights_only=False,
            pickle_module=pickle
        )
    except Exception as e:
        print(f"Error loading model checkpoint. Details: {e}")
        return

    # Extract the components (a non-dict checkpoint is treated as missing everything).
    if isinstance(checkpoint, dict):
        loaded_scaler = checkpoint.get('scaler')
        loaded_model_state_dict = checkpoint.get('model_state_dict')
        loaded_input_dim = checkpoint.get('input_dim')
    else:
        loaded_scaler = loaded_model_state_dict = loaded_input_dim = None

    if loaded_scaler is None or loaded_model_state_dict is None or loaded_input_dim is None:
        print("Error: Checkpoint file is missing 'scaler', 'model_state_dict', or 'input_dim'. Cannot proceed.")
        return

    # --- STEP 5: Run Prediction (Only if samples are pending) ---
    if prediction_mode:

        # Catch a feature-count mismatch before it surfaces as an opaque
        # shape error inside the scaler or the first Linear layer.
        if fused_features_test.shape[1] != loaded_input_dim:
            print(
                f"Error: Test data provides {fused_features_test.shape[1]} features "
                f"but the checkpoint expects input_dim={loaded_input_dim}. Cannot proceed."
            )
            return

        # Scale remaining Test Features using the LOADED Scaler
        print("Scaling pending test features using the loaded StandardScaler...")
        fused_features_test_scaled = loaded_scaler.transform(fused_features_test)

        # Initialize model and load weights; eval() freezes BatchNorm statistics
        # and disables Dropout.
        model = SimpleNN(input_dim=loaded_input_dim).to(device)
        model.load_state_dict(loaded_model_state_dict)
        model.eval()

        test_dataset = TensorDataset(torch.tensor(fused_features_test_scaled, dtype=torch.float32))
        # shuffle=False keeps predictions aligned with test_ids_to_predict.
        test_loader = DataLoader(test_dataset, batch_size=512, shuffle=False)

        batch_preds = []
        with torch.no_grad():
            for (xb,) in tqdm(test_loader, desc="Generating Predictions"):
                batch_preds.append(model(xb.to(device)).cpu().numpy())

        # Each batch is (batch, 1); join them and flatten to one value per sample.
        test_preds_transformed = np.concatenate(batch_preds, axis=0).reshape(-1)

        # Inverse transformation: expm1 undoes a log1p target transform,
        # i.e. log(1 + y) -> y (assumes the model was trained on log1p(price)).
        test_preds_original = np.expm1(test_preds_transformed)

        # --- Save New Predictions Incrementally ---
        new_predictions_df = pd.DataFrame({
            SAMPLE_ID_COLUMN: test_ids_to_predict,
            'price': test_preds_original # Predicted Price
        })

        if can_append:
            # Existing file was parsed successfully: append rows, header already present.
            new_predictions_df.to_csv(OUTPUT_CSV_PATH, mode='a', header=False, index=False)
        else:
            # Starting fresh. Never append to a file we could not read (that
            # would yield a headerless/corrupt CSV); keep it as a backup instead.
            if os.path.exists(OUTPUT_CSV_PATH):
                backup_path = OUTPUT_CSV_PATH + '.bak'
                os.replace(OUTPUT_CSV_PATH, backup_path)
                print(f"Moved unreadable predictions file to {backup_path} before starting fresh.")
            new_predictions_df.to_csv(OUTPUT_CSV_PATH, mode='w', header=True, index=False)
        print(f"Successfully saved {len(new_predictions_df)} new predictions to {OUTPUT_CSV_PATH}")

    # --- FINAL REPORT ---
    print("\n" + "="*50)
    print("                 Prediction Complete")
    print("="*50)
    print(f"Final prediction output saved to: {OUTPUT_CSV_PATH}")
    print("This file contains 'sample_id' and the predicted 'price' column.")
    print("="*50)

    return 0

# --- Script entry point ---
# Kaggle input path of the test-set embeddings CSV; adjust if the dataset moves.
TEST_DATA_KAGGLE_PATH = "/kaggle/input/test-embedding/all_test_embeddings.csv"

# Kaggle input path of the bundled checkpoint (.pth) that holds the model
# weights together with the scaler; adjust to match your own Kaggle dataset.
MODEL_KAGGLE_PATH = "/kaggle/input/test-embedding/best_nn_model.pth"

if __name__ == "__main__":
    predict_on_test_data(
        test_data_path=TEST_DATA_KAGGLE_PATH,
        model_path=MODEL_KAGGLE_PATH,
    )


--- Loading ALL Test Data from: /kaggle/input/test-embedding/all_test_embeddings.csv ---
Resuming prediction: Found existing file at test_predictions.csv
Found 40000 already predicted samples.
Starting prediction on 35000 remaining samples...
--- Loading Bundled Model/Scaler from: /kaggle/input/test-embedding/best_nn_model.pth ---
Scaling pending test features using the loaded StandardScaler...


Generating Predictions: 100%|██████████| 69/69 [00:01<00:00, 58.77it/s]


Successfully saved 35000 new predictions to test_predictions.csv

                 Prediction Complete
Final prediction output saved to: test_predictions.csv
This file contains 'sample_id' and the predicted 'price' column.
