# Step 1: Load Data (train and test CSV)



In [1]:
import pandas as pd

# Locations of the train / test CSV files on the local machine
train_path = 'C:/Users/sanja/Desktop/Amazon/data/train.csv'
test_path = 'C:/Users/sanja/Desktop/Amazon/data/test.csv'

# Read both splits into DataFrames
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

# Preview the first rows of each split (train first, then test)
for frame in (train_df, test_df):
    print(frame.head())

   sample_id                                    catalog_content  \
0      33127  Item Name: La Victoria Green Taco Sauce Mild, ...   
1     198967  Item Name: Salerno Cookies, The Original Butte...   
2     261251  Item Name: Bear Creek Hearty Soup Bowl, Creamy...   
3      55858  Item Name: Judee’s Blue Cheese Powder 11.25 oz...   
4     292686  Item Name: kedem Sherry Cooking Wine, 12.7 Oun...   

                                          image_link  price  
0  https://m.media-amazon.com/images/I/51mo8htwTH...   4.89  
1  https://m.media-amazon.com/images/I/71YtriIHAA...  13.12  
2  https://m.media-amazon.com/images/I/51+PFEe-w-...   1.97  
3  https://m.media-amazon.com/images/I/41mu0HAToD...  30.34  
4  https://m.media-amazon.com/images/I/41sA037+Qv...  66.49  
   sample_id                                    catalog_content  \
0     100179  Item Name: Rani 14-Spice Eshamaya's Mango Chut...   
1     245611  Item Name: Natural MILK TEA Flavoring extract ...   
2     146263  Item Name:


# Step 2: Build Custom PyTorch Dataset to Fetch Images on-the-fly

This Dataset will:

* Read image URLs from the dataframe
* Download and transform images on demand
* Process catalog_content into text tokens/embeddings
* Return combined features and target (for train)

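Note: We fetch images on the fly from their URLs during training instead of downloading the complete dataset to the PC first.
A custom PyTorch Dataset that fetches images dynamically during training is scalable because it needs no local image storage. The implementation below does not cache images, so every image is downloaded again each epoch; optional caching could be added to make it more efficient.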

In [2]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import requests
from io import BytesIO
from transformers import AutoTokenizer  # tokenizer
import torchvision.transforms as transforms

class ProductDataset(Dataset):
    """Dataset pairing tokenized catalog text with the product image fetched from its URL.

    Each item is a dict holding ``input_ids``, ``attention_mask`` and ``image``,
    plus ``price`` when ``train`` is true or ``sample_id`` otherwise.

    Args:
        df: DataFrame providing ``catalog_content`` and ``image_link`` columns,
            plus ``price`` (when ``train`` is true) or ``sample_id`` (otherwise).
        tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` whose
            result exposes ``input_ids`` and ``attention_mask``.
        max_length: Length every text is padded / truncated to.
        train: Selects whether items carry ``price`` or ``sample_id``.
        transform: Optional callable applied to the loaded PIL image.
    """

    # Seconds to wait for an image download before giving up.
    REQUEST_TIMEOUT = 5
    # Size of the gray stand-in image used when a download or decode fails.
    PLACEHOLDER_SIZE = (224, 224)

    def __init__(self, df, tokenizer, max_length=128, train=True, transform=None):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.train = train
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    @staticmethod
    def _as_text(value):
        """Coerce a ``catalog_content`` cell to ``str``; missing cells become ``''``."""
        if isinstance(value, str):
            return value
        # An empty CSV cell is read back as NaN (a float); passing that to the
        # tokenizer would fail, so treat it as empty text instead.
        if pd.isna(value):
            return ''
        return str(value)

    def _load_image(self, image_url):
        """Download ``image_url`` as an RGB PIL image, or return a gray placeholder on failure."""
        try:
            response = requests.get(image_url, timeout=self.REQUEST_TIMEOUT)
            # Surface 4xx/5xx responses as errors instead of trying to decode
            # an error page as an image.
            response.raise_for_status()
            return Image.open(BytesIO(response.content)).convert('RGB')
        except Exception:
            # Deliberately best-effort: one bad URL must not abort an epoch.
            # `Exception` (not a bare `except`) still lets KeyboardInterrupt
            # and SystemExit stop a long-running loop.
            return Image.new('RGB', self.PLACEHOLDER_SIZE, color='gray')

    def __getitem__(self, idx):
        """Build the sample dict for the row at positional index ``idx``."""
        row = self.df.iloc[idx]

        # Text processing
        encoding = self.tokenizer(
            self._as_text(row['catalog_content']),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Image fetching & processing (no caching)
        img = self._load_image(row['image_link'])
        if self.transform:
            img = self.transform(img)

        sample = {
            # squeeze(0) removes only the batch axis added by return_tensors='pt';
            # a bare squeeze() would also drop the sequence axis if it had length 1.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'image': img,
        }
        if self.train:
            sample['price'] = torch.tensor(row['price'], dtype=torch.float)
        else:
            sample['sample_id'] = row['sample_id']
        return sample


# Step 3: Model Building (Multi-Modal: Text + Image)

We'll use:

* A pretrained DistilBERT model to extract text embeddings.
* A pretrained CNN (e.g., ResNet18) to extract image embeddings.
* Concatenate the two feature vectors.
* Feed them into fully connected layers for regression output.

In [3]:
import torch
import torch.nn as nn
from transformers import AutoModel
from torchvision.models import resnet18
from torchvision.models import ResNet18_Weights


class MultiModalPricePredictor(nn.Module):
    """Price regressor over concatenated text-encoder and ResNet18 image features.

    Args:
        text_model_name: Checkpoint name passed to ``AutoModel.from_pretrained``.
        cnn_model_name: Currently unused; the image backbone is always
            ResNet18. Kept so existing callers keep working.
        cnn_pretrained: Load ``ResNet18_Weights.DEFAULT`` when true, otherwise
            start from random weights.
        fine_tune_text: Keep the text encoder trainable when true; freeze it
            otherwise.
        fine_tune_cnn: Keep the image backbone trainable when true; freeze it
            otherwise. ``None`` (the default) follows ``fine_tune_text``, which
            is how the backbone was controlled before this flag existed.
    """

    def __init__(self, text_model_name='distilbert-base-uncased', cnn_model_name='resnet18',
                 cnn_pretrained=True, fine_tune_text=False, fine_tune_cnn=None):
        super().__init__()

        if fine_tune_cnn is None:
            # Backward compatible: the CNN used to be tied to the text flag.
            fine_tune_cnn = fine_tune_text
        self.fine_tune_text = bool(fine_tune_text)
        self.fine_tune_cnn = bool(fine_tune_cnn)

        # Text model - DistilBERT by default
        self.text_model = AutoModel.from_pretrained(text_model_name)
        for param in self.text_model.parameters():
            param.requires_grad = self.fine_tune_text
        text_embedding_dim = self.text_model.config.hidden_size  # 768 for DistilBERT

        # Image model - ResNet18 with its classification head removed
        cnn = resnet18(weights=ResNet18_Weights.DEFAULT if cnn_pretrained else None)
        # Width of the pooled features, read from the head being dropped
        # rather than hard-coded (512 for ResNet18).
        image_embedding_dim = cnn.fc.in_features
        self.cnn_model = nn.Sequential(*list(cnn.children())[:-1])
        for param in self.cnn_model.parameters():
            param.requires_grad = self.fine_tune_cnn

        # Combine embeddings and output regression
        self.fc = nn.Sequential(
            nn.Linear(text_embedding_dim + image_embedding_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 1)  # Output single float (price)
        )

    def train(self, mode=True):
        """Set train/eval mode while keeping frozen backbones in eval mode.

        Freezing weights via ``requires_grad`` does not stop dropout from
        firing or BatchNorm from updating its running statistics; both depend
        on the module's training flag. Pinning frozen backbones to eval mode
        keeps their features deterministic and their statistics unchanged.
        """
        super().train(mode)
        if not self.fine_tune_text:
            self.text_model.eval()
        if not self.fine_tune_cnn:
            self.cnn_model.eval()
        return self

    def forward(self, input_ids, attention_mask, image):
        """Return one predicted price per sample, shape ``[batch]``."""
        # Text embeddings: hidden state at the first ([CLS]) position
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_embeds = text_outputs.last_hidden_state[:, 0, :]

        # Image embeddings: flatten [batch, C, 1, 1] to [batch, C]. flatten(…, 1)
        # keeps the batch axis even for a batch of one, unlike squeeze().
        img_embeds = torch.flatten(self.cnn_model(image), 1)

        # Concatenate both modalities and regress
        combined = torch.cat((text_embeds, img_embeds), dim=1)
        output = self.fc(combined).squeeze(1)  # Output shape [batch]
        return output


# Step 4: Training Loop Setup

We will build a PyTorch training loop using the AdamW optimizer (no learning-rate scheduler is used yet). We use MSE loss for regression and compute SMAPE for evaluation:

In [4]:
import torch.optim as optim
from torch.utils.data import DataLoader

def smape(y_true, y_pred, epsilon=1e-6):
    """Symmetric mean absolute percentage error, returned as a 0-dim tensor.

    The value is a fraction (not multiplied by 100). ``epsilon`` is added to
    the summed magnitudes so that a pair of zeros does not divide by zero.
    """
    abs_error = (y_pred - y_true).abs()
    half_scale = (y_pred.abs() + y_true.abs() + epsilon) / 2.0
    return (abs_error / half_scale).mean()

def train_model(model, train_loader, val_loader, epochs=3, lr=2e-5, device='cpu'):
    """Train ``model`` with AdamW on MSE loss and report validation MSE / SMAPE per epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask, image)``
            and returning one prediction per sample.
        train_loader: Iterable of batches (dicts with ``input_ids``,
            ``attention_mask``, ``image`` and ``price`` tensors).
        val_loader: Same batch format, used for evaluation only.
        epochs: Number of passes over ``train_loader``.
        lr: AdamW learning rate.
        device: Device the model and every batch are moved to.

    Returns:
        The same ``model`` instance, trained in place.
    """
    def _mean(values):
        # An empty loader yields NaN instead of a ZeroDivisionError.
        return sum(values) / len(values) if values else float('nan')

    def _unpack(batch):
        # Move the four tensors the loop needs onto the target device.
        return tuple(batch[key].to(device)
                     for key in ('input_ids', 'attention_mask', 'image', 'price'))

    model.to(device)
    # Only trainable parameters go to the optimizer; frozen ones never receive
    # gradients, so tracking them there serves no purpose.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.AdamW(trainable_params, lr=lr)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        train_losses = []
        for batch in train_loader:
            input_ids, attention_mask, images, prices = _unpack(batch)

            optimizer.zero_grad()
            preds = model(input_ids, attention_mask, images)
            loss = criterion(preds, prices)
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())

        avg_train_loss = _mean(train_losses)

        # Validation step
        model.eval()
        val_losses = []
        val_smapes = []
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, images, prices = _unpack(batch)

                preds = model(input_ids, attention_mask, images)
                val_losses.append(criterion(preds, prices).item())
                # float() accepts both a 0-dim tensor and a plain Python float,
                # so the loop keeps working if `smape` is later redefined in
                # the notebook to return a float.
                val_smapes.append(float(smape(prices, preds)))

        avg_val_loss = _mean(val_losses)
        avg_val_smape = _mean(val_smapes)

        print(f"Epoch {epoch+1}: Train Loss={avg_train_loss:.4f}, Val Loss={avg_val_loss:.4f}, Val SMAPE={avg_val_smape:.4f}")

    return model


# Step 5: Data Loaders and Training Execution



In [None]:
import torch
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer
import torchvision.transforms as transforms
import pandas as pd
import time

if __name__ == "__main__":
    # Set up tokenizer (DistilBERT)
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

    # Set up image transforms (for ResNet and similar CNNs):
    # resize to 224x224 and normalise with the ImageNet channel statistics
    image_transforms = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    # Load CSV data
    train_df = pd.read_csv("C:/Users/sanja/Desktop/Amazon/data/train.csv")
    test_df = pd.read_csv("C:/Users/sanja/Desktop/Amazon/data/test.csv")

    # Set up your dataset
    full_dataset = ProductDataset(df=train_df, tokenizer=tokenizer, transform=image_transforms, train=True)

    # Train-validation split (80/20). The seeded generator keeps the split
    # identical across kernel restarts, so validation scores stay comparable.
    train_size = int(0.8 * len(full_dataset))
    val_size = len(full_dataset) - train_size
    train_dataset, val_dataset = random_split(
        full_dataset,
        [train_size, val_size],
        generator=torch.Generator().manual_seed(42),
    )

    # DataLoaders with num_workers=0 for Windows/Jupyter
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=0)

    # Initialize model and device
    model = MultiModalPricePredictor()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Start the clock immediately before training. It was previously never
    # started, so the elapsed-time computation below raised NameError once
    # training had finished.
    start_time = time.time()

    # Train your model
    trained_model = train_model(model, train_loader, val_loader, epochs=3, device=device)

    # Calculate the time taken for training
    end_time = time.time()
    total_time = end_time - start_time  # Time in seconds

    # Convert total time to minutes and seconds
    minutes, seconds = divmod(total_time, 60)

    # Print training time in minutes and seconds
    print("Training completed")
    print(f"Total training time: {minutes:.0f} minutes and {seconds:.2f} seconds")



# Step 6: Validation Metric and Prediction



## 6.1: Calculate SMAPE on Validation Set



In [None]:
# After training, run evaluation.
#
# `smape` is deliberately not redefined in this cell; the version defined next
# to `train_model` is reused. A second definition returning a plain float
# would shadow that one and make `train_model` fail on `score.item()` if
# training were re-run afterwards.
model.eval()
smape_list = []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        images = batch['image'].to(device)
        prices = batch['price'].to(device)

        preds = model(input_ids, attention_mask, images)
        # float() converts the 0-dim tensor returned by `smape` to a Python float
        smape_list.append(float(smape(prices, preds)))

# Mean of per-batch SMAPE values; NaN (instead of ZeroDivisionError) when the loader is empty
final_val_smape = sum(smape_list) / len(smape_list) if smape_list else float('nan')
print(f"Validation SMAPE: {final_val_smape:.4f}")


## 6.2 Generate Predictions for Test Set



In [None]:
# Wrap the test frame (it has no price column) in an inference-mode dataset
test_dataset = ProductDataset(df=test_df, tokenizer=tokenizer, transform=image_transforms, train=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=0)

model.eval()
predictions = []
sample_ids = []

with torch.no_grad():
    for batch in test_loader:
        batch_preds = model(
            batch['input_ids'].to(device),
            batch['attention_mask'].to(device),
            batch['image'].to(device),
        )

        # Floor predictions at 0.01 so every exported price is positive
        predictions.extend(batch_preds.clamp(min=0.01).cpu().numpy())
        sample_ids.extend(batch['sample_id'].cpu().numpy())

# Write one (sample_id, price) row per test sample to CSV
import pandas as pd
output_df = pd.DataFrame({
    'sample_id': sample_ids,
    'price': list(map(float, predictions)),
})
output_df.to_csv('test_out.csv', index=False)
print("Exported predictions to test_out.csv!")
