In [1]:
pip install torch torchvision transformers pandas scikit-learn matplotlib pillow

Note: you may need to restart the kernel to use updated packages.


In [2]:
## Step 2: Prepare Dataset Loader





import os
import pandas as pd
import torch
from torch.utils.data import Dataset
from PIL import Image
from torchvision import transforms
from transformers import BertTokenizer

class ProductPriceDataset(Dataset):
    def __init__(self, dataframe, image_dir, tokenizer, max_length=256):
        self.df = dataframe.reset_index(drop=True)
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.image_transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],  # ImageNet mean
                std=[0.229, 0.224, 0.225]   # ImageNet std
            )
        ])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]

        # Load and transform image
        img_path = os.path.join(self.image_dir, row['image_filename'])
        image = Image.open(img_path).convert("RGB")
        image = self.image_transform(image)

        # Tokenize text
        text = row['catalog_content']
        encoded = self.tokenizer(text,
                                 truncation=True,
                                 padding='max_length',
                                 max_length=self.max_length,
                                 return_tensors='pt')
        input_ids = encoded['input_ids'].squeeze(0)
        attention_mask = encoded['attention_mask'].squeeze(0)

        # Get price (target)
        price = torch.tensor(row['price'], dtype=torch.float32)

        return {
            'image': image,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'price': price
        }



In [3]:
## Step 3: Build the Multimodal Model



import torch.nn as nn
from transformers import BertModel
from torchvision import models

class MultimodalPricePredictor(nn.Module):
    def __init__(self, text_model_name='bert-base-uncased'):
        super().__init__()

        # Image model: use EfficientNet or ResNet
        self.image_model = models.resnet50(pretrained=True)
        num_features = self.image_model.fc.in_features
        self.image_model.fc = nn.Identity()  # remove classification head

        # Text model: BERT
        self.text_model = BertModel.from_pretrained(text_model_name)
        self.text_proj = nn.Linear(self.text_model.config.hidden_size, 512)

        # Final regressor
        self.regressor = nn.Sequential(
            nn.Linear(num_features + 512, 256),
            nn.ReLU(),
            nn.Linear(256, 1)  # output: price
        )

    def forward(self, image, input_ids, attention_mask):
        image_features = self.image_model(image)  # [B, 2048]
        text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_features = self.text_proj(text_outputs.pooler_output)  # [B, 512]

        # Concatenate image + text
        combined = torch.cat((image_features, text_features), dim=1)  # [B, 2560]
        price = self.regressor(combined).squeeze(1)  # [B]

        return price


In [4]:
## Step 4: Training Loop






import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

# Load data
df = pd.read_csv("D:/ML Amazon Project/train.csv")

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Split train/val
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

train_dataset = ProductPriceDataset(train_df, "D:/ML Amazon Project/Img_10000", tokenizer)
val_dataset = ProductPriceDataset(val_df, "D:/ML Amazon Project/Img_10000", tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

# Model, optimizer, loss
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = MultimodalPricePredictor().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
criterion = nn.MSELoss()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to C:\Users\hp/.cache\torch\hub\checkpoints\resnet50-0676ba61.pth


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 97.8M/97.8M [02:50<00:00, 600kB/s]
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [6]:
print(df.columns)

Index(['sample_id', 'catalog_content', 'image_link', 'price'], dtype='object')


In [7]:
df['image_filename'] = df['sample_id'].astype(str) + '.jpg'

In [9]:
from sklearn.model_selection import train_test_split

# Split data
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Load tokenizer
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Create dataset
train_dataset = ProductPriceDataset(train_df, "D:/ML Amazon Project/Img_10000", tokenizer)
val_dataset = ProductPriceDataset(val_df, "D:/ML Amazon Project/Img_10000", tokenizer)

from torch.utils.data import DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)


In [10]:
## img_path = os.path.join(self.image_dir, row['image_filename'])


NameError: name 'self' is not defined

In [15]:
import os

# Just test the first few rows
for i in range(5):
    row = df.iloc[i]
    img_path = os.path.join("D:/ML Amazon Project/Img_10000", row['sample_id'].astype(str) + '.jpg')
    
    if not os.path.exists(img_path):
        print(f"❌ Missing image: {img_path}")
    else:
        print(f"✅ Found image: {img_path}")


❌ Missing image: D:/ML Amazon Project/Img_10000\33127.jpg
❌ Missing image: D:/ML Amazon Project/Img_10000\198967.jpg
❌ Missing image: D:/ML Amazon Project/Img_10000\261251.jpg
❌ Missing image: D:/ML Amazon Project/Img_10000\55858.jpg
❌ Missing image: D:/ML Amazon Project/Img_10000\292686.jpg


In [14]:

# Training Loop
for epoch in range(5):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        image = batch['image'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        price = batch['price'].to(device)

        preds = model(image, input_ids, attention_mask)
        loss = criterion(preds, price)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Training Loss: {total_loss / len(train_loader):.4f}")

    # Validation
    model.eval()
    preds_list = []
    true_list = []
    with torch.no_grad():
        for batch in val_loader:
            image = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            price = batch['price'].to(device)

            preds = model(image, input_ids, attention_mask)
            preds_list.extend(preds.cpu().numpy())
            true_list.extend(price.cpu().numpy())

    r2 = r2_score(true_list, preds_list)
    mae = mean_absolute_error(true_list, preds_list)
    print(f"Validation R²: {r2:.4f}, MAE: {mae:.2f}")


FileNotFoundError: [Errno 2] No such file or directory: 'D:/ML Amazon Project/Img_10000\\186920.jpg'

In [24]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset

class ProductPriceDataset(Dataset):
    def __init__(self, df, image_dir, tokenizer, image_transform=None):
        self.df = df.reset_index(drop=True)  # reset index to match image filenames
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.image_transform = image_transform

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Construct image filename using idx + 1 (images start at 1.jpg)
        img_filename = f"{idx + 1}.jpg"
        img_path = os.path.join(self.image_dir, img_filename)

        # Load image
        image = Image.open(img_path).convert("RGB")
        if self.image_transform:
            image = self.image_transform(image)

        # Get row data
        row = self.df.iloc[idx]
        text = row['catalog_content']
        price = row['price']

        # Tokenize text
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=128
        )
        input_ids = encoding['input_ids'].squeeze(0)  # remove batch dimension
        attention_mask = encoding['attention_mask'].squeeze(0)

        return {
            'image': image,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'price': torch.tensor(price, dtype=torch.float)
        }


In [26]:
from torchvision import transforms

image_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # resize images to 224x224 (or your model’s input size)
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],  # standard ImageNet normalization
                         std=[0.229, 0.224, 0.225]),
])


In [27]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')


In [29]:
train_dataset = ProductPriceDataset(train_df, image_dir="D:/ML Amazon Project/Img_10000",
                                   tokenizer=tokenizer, image_transform=image_transform)

val_dataset = ProductPriceDataset(val_df, image_dir="D:/ML Amazon Project/Img_10000",
                                 tokenizer=tokenizer, image_transform=image_transform)


In [30]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)


In [31]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = YourModel()  # Replace with your model
model.to(device)

criterion = torch.nn.MSELoss()  # or another suitable loss for regression
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


NameError: name 'YourModel' is not defined

In [32]:
import torch
import torch.nn as nn
from transformers import BertModel
from torchvision.models import resnet18

class MultiModalPricePredictor(nn.Module):
    def __init__(self):
        super(MultiModalPricePredictor, self).__init__()

        # Image model (pretrained ResNet18)
        self.cnn = resnet18(pretrained=True)
        self.cnn.fc = nn.Linear(self.cnn.fc.in_features, 256)

        # Text model (BERT)
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.text_fc = nn.Linear(self.bert.config.hidden_size, 256)

        # Combined
        self.fc = nn.Sequential(
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1)  # Output: predicted price
        )

    def forward(self, image, input_ids, attention_mask):
        # Extract image features
        img_features = self.cnn(image)

        # Extract text features
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        text_features = bert_outputs.pooler_output  # CLS token
        text_features = self.text_fc(text_features)

        # Combine both
        combined = torch.cat((img_features, text_features), dim=1)
        out = self.fc(combined)
        return out.squeeze(1)  # shape: (batch_size,)


In [33]:
model = MultiModalPricePredictor()
model.to(device)




Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\hp/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 44.7M/44.7M [00:04<00:00, 11.7MB/s]


MultiModalPricePredictor(
  (cnn): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=Tr

In [34]:
import torch.optim as optim
from torch.utils.data import DataLoader

def smape(y_true, y_pred, epsilon=1e-6):
    numerator = torch.abs(y_pred - y_true)
    denominator = (torch.abs(y_pred) + torch.abs(y_true) + epsilon) / 2.0
    return torch.mean(numerator / denominator)

def train_model(model, train_loader, val_loader, epochs=3, lr=2e-5, device='cpu'):
    model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            prices = batch['price'].to(device)

            optimizer.zero_grad()
            preds = model(input_ids, attention_mask, images)
            loss = criterion(preds, prices)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)

        # Validation step
        model.eval()
        val_losses = []
        val_smapes = []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                images = batch['image'].to(device)
                prices = batch['price'].to(device)

                preds = model(input_ids, attention_mask, images)
                loss = criterion(preds, prices)
                val_losses.append(loss.item())

                score = smape(prices, preds)
                val_smapes.append(score.item())

        avg_val_loss = sum(val_losses) / len(val_losses)
        avg_val_smape = sum(val_smapes) / len(val_smapes)

        print(f"Epoch {epoch+1}: Train Loss={avg_train_loss:.4f}, Val Loss={avg_val_loss:.4f}, Val SMAPE={avg_val_smape:.4f}")

    return model


In [35]:
import torch
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer
import torchvision.transforms as transforms
import pandas as pd

if __name__ == "__main__":
    # Set up tokenizer (DistilBERT)
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

    # Set up image transforms (for ResNet and similar CNNs)
    image_transforms = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    # Load CSV data
    train_df = pd.read_csv("/content/drive/MyDrive/Amazon_ML_Challenge/train.csv")
    test_df = pd.read_csv("/content/drive/MyDrive/Amazon_ML_Challenge/test.csv")

    # Set up your dataset
    full_dataset = ProductDataset(df=train_df, tokenizer=tokenizer, transform=image_transforms, train=True)

    # Train-validation split
    train_size = int(0.8 * len(full_dataset))
    val_size = len(full_dataset) - train_size
    train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

    # DataLoaders with num_workers=0 for Windows/Jupyter
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=0)

    # Initialize model and device
    model = MultiModalPricePredictor()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Train your model
    trained_model = train_model(model, train_loader, val_loader, epochs=3, device=device)

    # Print when training is complete
    print("Training completed")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Amazon_ML_Challenge/train.csv'

In [37]:
import torch
from torch.utils.data import DataLoader, random_split
from transformers import AutoTokenizer
import torchvision.transforms as transforms
import pandas as pd

if __name__ == "__main__":
    # Set up tokenizer (DistilBERT)
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

    # Set up image transforms (for ResNet and similar CNNs)
    image_transforms = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225])
    ])

    # ✅ Correct local paths for the CSV files
    train_csv_path = r"D:\ML Amazon Project\train.csv"
    test_csv_path = r"D:\ML Amazon Project\test.csv"

    # Load CSV data
    train_df = pd.read_csv(train_csv_path)
    test_df = pd.read_csv(test_csv_path)

    # Set up your dataset (make sure this matches your actual dataset class)
    full_dataset = ProductDataset(df=train_df, tokenizer=tokenizer, transform=image_transforms, train=True)

    # Train-validation split
    train_size = int(0.8 * len(full_dataset))
    val_size = len(full_dataset) - train_size
    train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

    # DataLoaders with num_workers=0 (safe for Windows)
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, num_workers=0)

    # Initialize model and device
    model = MultiModalPricePredictor()  # Make sure this is defined above or imported
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Loss and optimizer
    criterion = torch.nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    # Define the training function if not already
    def train_model(model, train_loader, val_loader, epochs, device):
        from sklearn.metrics import r2_score, mean_absolute_error

        for epoch in range(epochs):
            model.train()
            total_loss = 0
            for batch in train_loader:
                optimizer.zero_grad()
                image = batch['image'].to(device)
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                price = batch['price'].to(device)

                preds = model(image, input_ids, attention_mask).squeeze()
                loss = criterion(preds, price)
                loss.backward()
                optimizer.step()
                total_loss += loss.item()

            print(f"Epoch {epoch + 1}, Training Loss: {total_loss / len(train_loader):.4f}")

            # Validation
            model.eval()
            preds_list = []
            true_list = []
            with torch.no_grad():
                for batch in val_loader:
                    image = batch['image'].to(device)
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    price = batch['price'].to(device)

                    preds = model(image, input_ids, attention_mask).squeeze()
                    preds_list.extend(preds.cpu().numpy())
                    true_list.extend(price.cpu().numpy())

            r2 = r2_score(true_list, preds_list)
            mae = mean_absolute_error(true_list, preds_list)
            print(f"Validation R²: {r2:.4f}, MAE: {mae:.2f}")

        return model

    # Train your model
    trained_model = train_model(model, train_loader, val_loader, epochs=3, device=device)

    print("✅ Training completed")


NameError: name 'ProductDataset' is not defined

In [39]:
class ProductDataset(Dataset):
    def __init__(self, df, tokenizer, transform=None, train=True):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.transform = transform
        self.train = train
        self.image_dir = "D:/ML Amazon Project/Img_10000"  # adjust if needed

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Get image path
        img_filename = f"{idx + 1}.jpg"
        img_path = os.path.join(self.image_dir, img_filename)
        image = Image.open(img_path).convert("RGB")

        if self.transform:
            image = self.transform(image)

        # Tokenize text
        text = self.df.loc[idx, 'catalog_content']
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=128
        )
        input_ids = encoding['input_ids'].squeeze(0)
        attention_mask = encoding['attention_mask'].squeeze(0)

        item = {
            'image': image,
            'input_ids': input_ids,
            'attention_mask': attention_mask,
        }

        if self.train:
            item['price'] = torch.tensor(self.df.loc[idx, 'price'], dtype=torch.float)

        return item


In [40]:
full_dataset = ProductDataset(df=train_df, tokenizer=tokenizer, transform=image_transforms, train=True)

In [None]:
def smape(y_true, y_pred, epsilon=1e-6):
    numerator = torch.abs(y_pred - y_true)
    denominator = (torch.abs(y_pred) + torch.abs(y_true) + epsilon) / 2.0
    return torch.mean(numerator / denominator).item()

# After training, run evaluation:
model.eval()
smape_list = []
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        images = batch['image'].to(device)
        prices = batch['price'].to(device)

        preds = model(input_ids, attention_mask, images)
        smape_list.append(smape(prices, preds))

final_val_smape = sum(smape_list) / len(smape_list)
print(f"Validation SMAPE: {final_val_smape:.4f}")

In [None]:
# Prepare test dataset (no price column)
test_dataset = ProductDataset(df=test_df, tokenizer=tokenizer, transform=image_transforms, train=False)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=0)

model.eval()
predictions = []
sample_ids = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        images = batch['image'].to(device)
        preds = model(input_ids, attention_mask, images)

        # Ensure predicted prices are positive
        preds = torch.clamp(preds, min=0.01)
        predictions.extend(preds.cpu().numpy())
        sample_ids.extend(batch['sample_id'].cpu().numpy())

# Final export as a CSV
import pandas as pd
output_df = pd.DataFrame({
    'sample_id': sample_ids,
    'price': [float(p) for p in predictions]
})
output_df.to_csv('test_out.csv', index=False)
print("Exported predictions to test_out.csv!")
