In [None]:
import polars as pl
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from transformers import DistilBertTokenizer, DistilBertModel, T5Tokenizer, T5ForConditionalGeneration
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import LinearLR
import re
import os
from datetime import datetime
import pickle
import gc
import random
from torch.amp import autocast, GradScaler
import psutil

# Enable MPS fallback for M1 compatibility
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

# Diagnostic: Print memory usage
def print_memory():
    """Print the resident set size (RSS) of the current process in megabytes."""
    bytes_per_mb = 1024 ** 2
    rss_bytes = psutil.Process().memory_info().rss
    mem = rss_bytes / bytes_per_mb
    print(f"Current memory usage: {mem:.2f} MB")

# SMAPE
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Args:
        y_true: Array-like of ground-truth values.
        y_pred: Array-like of predicted values, same shape as ``y_true``.

    Returns:
        ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred|))``.
        Pairs where both values are zero count as a perfect prediction
        (error 0) instead of turning the whole metric into NaN through 0/0.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = np.abs(y_true) + np.abs(y_pred)
    numer = 2 * np.abs(y_pred - y_true)
    # Divide only where the denominator is non-zero; the pre-filled zeros in
    # `out` are kept for the 0/0 pairs.
    ratio = np.divide(numer, denom, out=np.zeros_like(denom), where=denom != 0)
    return 100 * np.mean(ratio)

# Feature extraction
def extract_quantity(text):
    """Return the item count from a phrase such as "pack of 12".

    The search is case-insensitive and uses the first phrase of the form
    "<pack|box|set|bundle|case|dozen|carton|bulk> of <digits>". When no such
    phrase is present the quantity defaults to 1.
    """
    found = re.search(r'(?:pack|box|set|bundle|case|dozen|carton|bulk) of (\d+)', text, re.I)
    if found is None:
        return 1
    return int(found.group(1))

def extract_numeric(text, pattern):
    """Return the first number captured by ``pattern`` in ``text`` as a float.

    Args:
        text: String to search (matching is case-insensitive).
        pattern: Regular expression whose first capture group is the number.

    Returns:
        The captured value as ``float``, or ``0.0`` when the pattern does not
        match. The fallback is a float as well, so the return type is the same
        on both paths.
    """
    match = re.search(pattern, text, re.I)
    return float(match.group(1)) if match else 0.0

# Regex per measurement unit; capture group 1 is the numeric value.
# Each unit ends with a word boundary (\b) so a unit abbreviation is not
# matched as the prefix of an unrelated word: without it "2 wheels" was read
# as 2 watts, "10 mph" as 10 mp, and "3 install" as 3 inches.
# NOTE: a standalone "in" (e.g. "3 in 1") still counts as inches; the text
# alone cannot disambiguate that case.
units = {
    'gb': r'(\d+\.?\d*)\s*gb\b',
    'oz': r'(\d+\.?\d*)\s*oz\b',
    'inch': r'(\d+\.?\d*)\s*in(?:ch(?:es)?)?\b',
    'mp': r'(\d+\.?\d*)\s*mp\b',
    'lbs': r'(\d+\.?\d*)\s*lbs?\b',
    'mah': r'(\d+\.?\d*)\s*mah\b',
    'watts': r'(\d+\.?\d*)\s*w(?:atts?)?\b',
    'kg': r'(\d+\.?\d*)\s*kgs?\b',
    'ml': r'(\d+\.?\d*)\s*ml\b',
}

# Whole-word matcher for premium-tier marketing terms. Substring counting
# treated "product" and "protein" as hits for "pro" and counted
# "professional" twice (once as "pro", once as itself).
_PREMIUM_KEYWORD_RE = re.compile(
    r'\b(?:premium|luxury|high-end|professional|pro|ultra|elite|deluxe)\b'
)


def extract_features(text):
    """Derive hand-crafted numeric features from a catalog text.

    Args:
        text: Raw catalog content string.

    Returns:
        Dict with the keys ``quantity``, one ``feat_<unit>`` per entry in
        ``units``, ``premium_keyword_count`` (whole-word occurrences of the
        premium terms), ``title_length`` (characters before the first ". " or
        ": " separator) and ``content_word_count`` (whitespace-split tokens).
    """
    text_lower = text.lower()
    feats = {'quantity': extract_quantity(text_lower)}
    for unit, pattern in units.items():
        feats[f'feat_{unit}'] = extract_numeric(text_lower, pattern)
    feats['premium_keyword_count'] = len(_PREMIUM_KEYWORD_RE.findall(text_lower))
    # The "title" is everything before the first sentence/field separator.
    title = re.split(r'[.:]\s', text_lower)[0]
    feats['title_length'] = len(title)
    feats['content_word_count'] = len(text_lower.split())
    return feats

# Diagnostic: read the training CSV and report its head, columns, and shape
print("=== Train.csv Head ===")
try:
    train = pl.read_csv('input/train.csv', infer_schema_length=10000)
except FileNotFoundError:
    # Best-effort diagnostic: report the missing file and keep going.
    print("ERROR: 'input/train.csv' not found. Please confirm file path.")
else:
    print(train.head(5))
    print("Columns:", train.columns)
    print("Shape:", train.shape)
print_memory()

print("\n=== Test.csv Head ===")
try:
    test = pl.read_csv('input/test.csv', infer_schema_length=10000)
except FileNotFoundError:
    # Best-effort diagnostic: report the missing file and keep going.
    print("ERROR: 'input/test.csv' not found. Please confirm file path.")
else:
    print(test.head(5))
    print("Columns:", test.columns)
    print("Shape:", test.shape)
print_memory()

print("\n=== final_embeddings.pkl Inspection ===")
try:
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only open pickles that come from a trusted source.
    with open('input/final_embeddings.pkl', 'rb') as f:
        embeddings_data = pickle.load(f)
    print("Type:", type(embeddings_data))
    if isinstance(embeddings_data, np.ndarray):
        print("NumPy Array Shape:", embeddings_data.shape)
        print("Dtype:", embeddings_data.dtype)
        if embeddings_data.dtype.names:
            print("Structured Array Fields:", embeddings_data.dtype.names)
        elif embeddings_data.ndim >= 2 and embeddings_data.shape[0] > 0:
            print("First Row Sample:", embeddings_data[0, :5])
        elif embeddings_data.size > 0:
            # 0-D / 1-D arrays cannot be indexed with [0, :5]; show a flat sample.
            print("First Row Sample:", embeddings_data.ravel()[:5])
        else:
            print("First Row Sample:", "Empty")
    elif isinstance(embeddings_data, dict):
        keys = list(embeddings_data.keys())[:5]
        print("Dict Keys (first 5):", keys)
        if keys:
            first_value = embeddings_data[keys[0]]
            # Values may be lists or None, which have no `.shape` attribute.
            print("Value Shape (first key):",
                  getattr(first_value, 'shape', type(first_value).__name__))
        else:
            print("Value Shape (first key):", "Empty")
        # Tally every value's shape: mixed shapes are what make np.stack fail
        # with "all input arrays must have the same shape".
        shape_counts = {}
        for value in embeddings_data.values():
            if hasattr(value, 'shape'):
                shape = tuple(value.shape)
            elif hasattr(value, '__len__'):
                shape = (len(value),)
            else:
                shape = type(value).__name__
            shape_counts[shape] = shape_counts.get(shape, 0) + 1
        print("Distinct Value Shapes (shape, count):",
              sorted(shape_counts.items(), key=lambda item: -item[1])[:10])
    elif isinstance(embeddings_data, pd.DataFrame):
        print("Pandas DataFrame Columns:", embeddings_data.columns.tolist())
        print("Head (5 rows):\n", embeddings_data.head(5))
    else:
        print("Unexpected format, content sample:", str(embeddings_data)[:200])
except FileNotFoundError:
    print("ERROR: 'input/final_embeddings.pkl' not found. Please confirm file path.")
print_memory()

# Load and preprocess embeddings into a polars DataFrame with an 'image_link'
# key column plus one column per embedding dimension.
try:
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only open pickles that come from a trusted source.
    with open('input/final_embeddings.pkl', 'rb') as f:
        embeddings_data = pickle.load(f)
    if isinstance(embeddings_data, np.ndarray):
        if embeddings_data.dtype.names:
            embeddings = pl.DataFrame(embeddings_data)
            fields = list(embeddings_data.dtype.names)
            if 'index' in fields:
                embeddings = embeddings.rename({'index': 'image_link'})
            else:
                embeddings = embeddings.rename({fields[0]: 'image_link'})
        else:
            # Plain 2-D array: first column is taken as the key, the rest as
            # the embedding values.
            embeddings = pl.DataFrame({
                'image_link': embeddings_data[:, 0].astype(str),
                **{f'emb_{i}': embeddings_data[:, i+1].astype(np.float32) for i in range(embeddings_data.shape[1]-1)}
            })
    elif isinstance(embeddings_data, dict):
        if not embeddings_data:
            raise ValueError("Embeddings dict is empty.")
        # np.stack requires identical shapes, but the dict values are not
        # guaranteed to be uniform (missing/failed embeddings, extra leading
        # axes). Flatten each value, keep the dominant vector size, and skip
        # the rest instead of failing with
        # "all input arrays must have the same shape".
        keys = []
        vectors = []
        skipped = 0
        for key, value in embeddings_data.items():
            if value is None:
                skipped += 1
                continue
            try:
                vector = np.asarray(value, dtype=np.float32).reshape(-1)
            except (TypeError, ValueError):
                skipped += 1
                continue
            if vector.size == 0:
                skipped += 1
                continue
            keys.append(key)
            vectors.append(vector)
        if not vectors:
            raise ValueError("No usable embedding vectors found in the dict.")
        sizes = np.array([vector.size for vector in vectors])
        distinct_sizes, size_counts = np.unique(sizes, return_counts=True)
        emb_dim = int(distinct_sizes[np.argmax(size_counts)])
        keep = sizes == emb_dim
        skipped += int((~keep).sum())
        if skipped:
            # Skipped keys get null embedding columns after the left join;
            # presumably these are imputed downstream - TODO confirm.
            print(f"WARNING: skipped {skipped} embeddings that are missing or not of size {emb_dim}")
        keys = [key for key, ok in zip(keys, keep) if ok]
        arrays = np.stack([vector for vector, ok in zip(vectors, keep) if ok])
        embeddings = pl.DataFrame({
            'image_link': pl.Series(keys).cast(pl.Utf8),
            **{f'emb_{i}': arrays[:, i].astype(np.float32) for i in range(arrays.shape[1])}
        })
    elif isinstance(embeddings_data, pd.DataFrame):
        embeddings = pl.from_pandas(embeddings_data)
        if 'index' in embeddings.columns:
            embeddings = embeddings.rename({'index': 'image_link'})
    else:
        raise ValueError("Unsupported pickle format. Please share inspection output.")
    # Fail here with a clear message rather than later, in the join, with a
    # ColumnNotFoundError.
    if 'image_link' not in embeddings.columns:
        raise ValueError(
            f"Embeddings have no 'image_link' key column; found columns: {embeddings.columns[:5]}"
        )
    embed_cols = [col for col in embeddings.columns if col != 'image_link']
    print(f"Embedding Columns: {len(embed_cols)} dimensions")
except Exception as e:
    print(f"ERROR loading embeddings: {e}")
    raise
print_memory()

# Load CSVs lazily and left-join the pre-computed image embeddings on
# 'image_link' (rows without an embedding keep null embedding columns).
train = pl.read_csv('input/train.csv', infer_schema_length=10000).lazy()
test = pl.read_csv('input/test.csv', infer_schema_length=10000).lazy()
# Make the join key the same dtype (string) on both sides.
train = train.with_columns(pl.col('image_link').cast(pl.Utf8))
test = test.with_columns(pl.col('image_link').cast(pl.Utf8))
embeddings = embeddings.with_columns(pl.col('image_link').cast(pl.Utf8))
# LazyFrame.join only accepts another LazyFrame; passing the eager
# `embeddings` DataFrame raises a TypeError, so join against its lazy view.
train = train.join(embeddings.lazy(), on='image_link', how='left')
test = test.join(embeddings.lazy(), on='image_link', how='left')
gc.collect()

# Extract features: run extract_features on every catalog text and expand the
# resulting struct into one Float32 column per feature. The field order
# mirrors the key order of the dict returned by extract_features.
feature_struct = pl.Struct([pl.Field(k, pl.Float32) for k in ['quantity'] + [f'feat_{u}' for u in units] + ['premium_keyword_count', 'title_length', 'content_word_count']])
train = train.with_columns(
    pl.col('catalog_content').map_elements(extract_features, return_dtype=feature_struct).alias('features')
).unnest('features')
# The test frame gets exactly the same treatment as train (this statement was
# previously cut off before the `.unnest` call).
test = test.with_columns(
    pl.col('catalog_content').map_elements(extract_features, return_dtype=feature_struct).alias('features')
).unnest('features')

System: You didn't provide the requested information about the dataset (e.g., the output of the diagnostic print statements for `train.csv`, `test.csv`, and `final_embeddings.pkl`). This makes it hard to ensure the code matches your data's structure, especially for the embeddings pickle file: the earlier `ColumnNotFoundError` came from an incorrect assumption about its format (e.g., expecting an 'index' column). Without the exact structure of these files, I had to make some educated guesses based on the context of your original code and typical dataset formats for price prediction tasks.

To proceed effectively, please run the diagnostic section of the code (everything up to and including the `final_embeddings.pkl` inspection) and share the output for the following:
- **train.csv**: The `head(5)`, `columns`, and `shape`.
- **test.csv**: The `head(5)`, `columns`, and `shape`.
- **final_embeddings.pkl**: The type, shape, dtype, and a sample of the data (e.g., first row or keys).
- **Memory usage**: The memory printouts after each section to gauge RAM impact.

Without this, the code above assumes:
- `train.csv` and `test.csv` have the columns you specified.
- `final_embeddings.pkl` is likely a NumPy array or DataFrame with `image_link` as the first column (possibly unnamed or named differently, causing the error) and float32 embeddings.
- The dataset is of moderate size (thousands of rows), suitable for your MacBook M1 Pro’s 16GB RAM.

The code was interrupted mid-execution, so I’ll provide a complete version that addresses the `ColumnNotFoundError` by handling multiple pickle formats flexibly and includes diagnostics to clarify data structure. It retains your core requirements:
- DistilBERT for `catalog_content` text embeddings.
- Extracted numerical features (quantity, units like oz/gb, premium keywords, text lengths) via regex.
- Pre-computed image embeddings from `final_embeddings.pkl`.
- T5-small for paraphrasing high-price (>100) items in augmentation.
- On-the-fly augmentation for high/low price tails (high: T5 paraphrase + "luxury edition"; low: append "bulk carton" and double quantity).
- Log1p price transformation, MSE loss, SMAPE evaluation.
- M1 Pro optimizations (MPS device, float16, AMP, frozen DistilBERT, gradient checkpointing, quantization).

### Instructions to Provide Data Insights
Run the code up to the end of the diagnostic section (before the embeddings are loaded and the features are extracted) and share the output. Here’s a snippet to isolate diagnostics if you prefer running just that part:

```python
import polars as pl
import numpy as np
import pandas as pd
import pickle
import psutil

def print_memory():
    """Print the resident set size (RSS) of the current process in megabytes."""
    bytes_per_mb = 1024 ** 2
    rss_bytes = psutil.Process().memory_info().rss
    mem = rss_bytes / bytes_per_mb
    print(f"Current memory usage: {mem:.2f} MB")

print("=== Train.csv Head ===")
try:
    train = pl.read_csv('input/train.csv', infer_schema_length=10000)
except FileNotFoundError:
    # Best-effort diagnostic: report the missing file and keep going.
    print("ERROR: 'input/train.csv' not found. Please confirm file path.")
else:
    print(train.head(5))
    print("Columns:", train.columns)
    print("Shape:", train.shape)
print_memory()

print("\n=== Test.csv Head ===")
try:
    test = pl.read_csv('input/test.csv', infer_schema_length=10000)
except FileNotFoundError:
    # Best-effort diagnostic: report the missing file and keep going.
    print("ERROR: 'input/test.csv' not found. Please confirm file path.")
else:
    print(test.head(5))
    print("Columns:", test.columns)
    print("Shape:", test.shape)
print_memory()

print("\n=== final_embeddings.pkl Inspection ===")
try:
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only open pickles that come from a trusted source.
    with open('input/final_embeddings.pkl', 'rb') as f:
        embeddings_data = pickle.load(f)
    print("Type:", type(embeddings_data))
    if isinstance(embeddings_data, np.ndarray):
        print("NumPy Array Shape:", embeddings_data.shape)
        print("Dtype:", embeddings_data.dtype)
        if embeddings_data.dtype.names:
            print("Structured Array Fields:", embeddings_data.dtype.names)
        elif embeddings_data.ndim >= 2 and embeddings_data.shape[0] > 0:
            print("First Row Sample:", embeddings_data[0, :5])
        elif embeddings_data.size > 0:
            # 0-D / 1-D arrays cannot be indexed with [0, :5]; show a flat sample.
            print("First Row Sample:", embeddings_data.ravel()[:5])
        else:
            print("First Row Sample:", "Empty")
    elif isinstance(embeddings_data, dict):
        keys = list(embeddings_data.keys())[:5]
        print("Dict Keys (first 5):", keys)
        if keys:
            first_value = embeddings_data[keys[0]]
            # Values may be lists or None, which have no `.shape` attribute.
            print("Value Shape (first key):",
                  getattr(first_value, 'shape', type(first_value).__name__))
        else:
            print("Value Shape (first key):", "Empty")
        # Tally every value's shape: mixed shapes are what make np.stack fail
        # with "all input arrays must have the same shape".
        shape_counts = {}
        for value in embeddings_data.values():
            if hasattr(value, 'shape'):
                shape = tuple(value.shape)
            elif hasattr(value, '__len__'):
                shape = (len(value),)
            else:
                shape = type(value).__name__
            shape_counts[shape] = shape_counts.get(shape, 0) + 1
        print("Distinct Value Shapes (shape, count):",
              sorted(shape_counts.items(), key=lambda item: -item[1])[:10])
    elif isinstance(embeddings_data, pd.DataFrame):
        print("Pandas DataFrame Columns:", embeddings_data.columns.tolist())
        print("Head (5 rows):\n", embeddings_data.head(5))
    else:
        print("Unexpected format, content sample:", str(embeddings_data)[:200])
except FileNotFoundError:
    print("ERROR: 'input/final_embeddings.pkl' not found. Please confirm file path.")
print_memory()

```

Memory after Start: 141.95 MB
Train columns OK, shape: (75000, 4)
Test columns OK, shape: (75000, 3)
Memory after CSV Load: 318.91 MB
Embeddings type: <class 'dict'>
Dict with 140587 items
Error loading embeddings: all input arrays must have the same shape


ValueError: all input arrays must have the same shape