In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file below the input directory, one full path per line.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amazon-ml/sample_test.csv
/kaggle/input/amazon-ml/sample_test_out_fail.csv
/kaggle/input/amazon-ml/sample_test_out.csv
/kaggle/input/amazon-ml/train.csv
/kaggle/input/amazon-ml/test.csv


In [None]:
import os
import warnings
import requests
from io import BytesIO

import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from PIL import Image
import albumentations as A
from albumentations.pytorch import ToTensorV2
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from torchvision import models

from transformers import AutoTokenizer, AutoModel, AdamW, get_linear_schedule_with_warmup

print("Libraries Imported!")

# Suppress library warnings in the cell output and enable cuDNN benchmarking.
warnings.filterwarnings('ignore')
torch.backends.cudnn.benchmark = True

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Hyperparameters
BATCH_SIZE = 16        # samples per DataLoader batch
EPOCHS = 10            # passes over the training split
LEARNING_RATE = 2e-5   # learning rate handed to the optimizer
MAX_LEN = 128          # tokenizer padding / truncation length
IMAGE_SIZE = 224       # images are resized to IMAGE_SIZE x IMAGE_SIZE

# Units shared by several entity types; every entity still gets its own set object.
_LENGTH_UNITS = ("centimetre", "foot", "millimetre", "metre", "inch", "yard")
_WEIGHT_UNITS = ("milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound")

# Allowed units for each entity name.
entity_unit_map = {
    **{name: set(_LENGTH_UNITS) for name in ("width", "depth", "height")},
    **{name: set(_WEIGHT_UNITS) for name in ("item_weight", "maximum_weight_recommendation")},
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": {"cubic foot", "microlitre", "cup", "fluid ounce", "centilitre", "imperial gallon", "pint", "decilitre", "litre", "millilitre", "quart", "cubic inch", "gallon"},
}

# Every known unit plus an "unknown" bucket used when no unit can be parsed.
all_units = {"unknown"}.union(*entity_unit_map.values())

# Encoder for unit strings; fitted immediately on the full unit vocabulary.
unit_encoder = LabelEncoder()
unit_encoder.fit(list(all_units))

# Encoder for entity names; fitted later, once the training data is loaded.
label_encoder = LabelEncoder()

class ProductImageDataset(Dataset):
    """Dataset yielding a transformed image and the tokenized entity name per row.

    Every item is a dict with ``image``, ``input_ids``, ``attention_mask`` and
    ``entity_name``. Unless ``is_test`` is set, it also carries the training
    targets ``entity_label``, ``entity_value`` and ``entity_unit``.

    Args:
        df: Frame with ``image_link`` and ``entity_name`` columns, plus
            ``entity_value`` when ``is_test`` is False.
        tokenizer: Tokenizer object exposing ``encode_plus``.
        max_len: Length every text encoding is padded / truncated to.
        is_test: When True, no target columns are read or produced.
    """

    def __init__(self, df, tokenizer, max_len, is_test=False):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_test = is_test
        self.transform = A.Compose([
            A.Resize(IMAGE_SIZE, IMAGE_SIZE),
            A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ToTensorV2()
        ])
        if is_test:
            self.df = df
        else:
            # ``assign`` returns a new frame, so the caller's DataFrame (often a
            # slice produced by train_test_split) is never mutated.
            self.df = df.assign(
                encoded_entity_name=label_encoder.transform(df['entity_name'])
            )

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    @staticmethod
    def download_image(url):
        """Fetch ``url`` and return the picture as an RGB ``uint8`` array.

        Any download or decode failure yields an all-zero
        IMAGE_SIZE x IMAGE_SIZE placeholder instead of raising.
        """
        try:
            response = requests.get(url, timeout=5)
            response.raise_for_status()
            img = Image.open(BytesIO(response.content)).convert('RGB')
            return np.array(img)
        except Exception:
            # ``Exception`` instead of a bare ``except`` so KeyboardInterrupt and
            # SystemExit can still stop a long-running loop.
            return np.zeros((IMAGE_SIZE, IMAGE_SIZE, 3), dtype=np.uint8)

    def __getitem__(self, idx):
        """Build the model inputs (and targets, when training) for row ``idx``."""
        row = self.df.iloc[idx]  # single positional lookup reused below
        entity_name = row['entity_name']

        img = self.download_image(row['image_link'])
        img = self.transform(image=img)['image']

        combined_text = f"{entity_name}"

        encoding = self.tokenizer.encode_plus(
            combined_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )

        item = {
            'image': img,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'entity_name': entity_name
        }

        if not self.is_test:
            item['entity_label'] = torch.tensor(row['encoded_entity_name'], dtype=torch.long)

            value, unit = self.parse_entity_value(row['entity_value'])
            item['entity_value'] = torch.tensor(value, dtype=torch.float)

            # Empty or otherwise unseen units map to the "unknown" class instead
            # of making ``unit_encoder.transform`` raise.
            if unit not in unit_encoder.classes_:
                unit = "unknown"
            item['entity_unit'] = torch.tensor(unit_encoder.transform([unit])[0], dtype=torch.long)

        return item

    @staticmethod
    def parse_entity_value(entity_value):
        """Split an entity value string into ``(value, unit)``.

        Square brackets are removed and commas around tokens are stripped, so a
        range such as ``"[1.0, 2.0] inch"`` parses to the mean of its numbers.
        Tokens that are not plain decimal numbers form the unit, joined by
        single spaces.

        Args:
            entity_value: Text such as ``"10.5 gram"``. Non-string input
                (for example NaN) is treated as missing.

        Returns:
            Tuple ``(value, unit)``: ``value`` is the mean of the numeric tokens
            (``0.0`` when there are none) and ``unit`` is the remaining text
            (``""`` when there is none).
        """
        if not isinstance(entity_value, str):
            return 0.0, ""

        cleaned = entity_value.replace('[', '').replace(']', '')
        tokens = [token.strip(',') for token in cleaned.split()]
        tokens = [token for token in tokens if token]

        def is_number(token):
            # At most one decimal point, otherwise "1.2.3" would reach float()
            # and raise ValueError.
            return token.count('.') <= 1 and token.replace('.', '').isdecimal()

        values = [float(token) for token in tokens if is_number(token)]
        value = sum(values) / len(values) if values else 0.0
        unit = ' '.join(token for token in tokens if not is_number(token)).strip()
        return value, unit

class EntityExtractionModel(nn.Module):
    """Text + image model with three output heads.

    A pretrained ``microsoft/deberta-v3-small`` encoder and an EfficientNet-B0
    backbone (classifier replaced by ``nn.Identity``) are concatenated, passed
    through one hidden layer and fed to:

    * ``classifier``      -- ``num_labels`` entity-name logits,
    * ``regressor``       -- one scalar value per sample,
    * ``unit_classifier`` -- ``num_units`` unit logits.
    """

    def __init__(self, num_labels, num_units):
        super().__init__()
        self.num_labels, self.num_units = num_labels, num_units

        # Text tower.
        self.bert = AutoModel.from_pretrained('microsoft/deberta-v3-small')

        # Image tower: drop the classification head so the backbone's features
        # are returned as-is.
        image_backbone = models.efficientnet_b0(pretrained=True)
        image_backbone.classifier = nn.Identity()
        self.cnn = image_backbone

        self.dropout = nn.Dropout(0.3)

        text_dim = self.bert.config.hidden_size
        image_dim = 1280  # width of the image features concatenated in forward()
        hidden_dim = 512
        self.fc = nn.Linear(text_dim + image_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, num_labels)
        self.regressor = nn.Linear(hidden_dim, 1)
        self.unit_classifier = nn.Linear(hidden_dim, num_units)

    def forward(self, input_ids, attention_mask, images):
        """Return ``(logits, value, unit_logits)`` for one batch."""
        text_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 serves as the sentence embedding.
        text_embedding = text_out.last_hidden_state[:, 0, :]

        image_embedding = self.cnn(images)

        fused = torch.cat((text_embedding, image_embedding), dim=1)
        hidden = self.dropout(F.relu(self.fc(self.dropout(fused))))

        logits = self.classifier(hidden)
        value = self.regressor(hidden).squeeze(-1)  # drop the trailing size-1 dim
        unit_logits = self.unit_classifier(hidden)
        return logits, value, unit_logits

def train_model(model, train_loader, val_loader, optimizer, scheduler, classification_criterion, regression_criterion, unit_criterion, epochs, max_grad_norm=None):
    """Train ``model`` with mixed precision and checkpoint the best epoch.

    After every epoch the model is scored with ``validate_model``; whenever the
    validation loss improves, the weights are written to ``best_model.pth``.

    Args:
        model: Network returning ``(logits, values, unit_logits)``.
        train_loader: Loader yielding dicts with ``input_ids``,
            ``attention_mask``, ``image``, ``entity_label``, ``entity_value``
            and ``entity_unit``.
        val_loader: Loader with the same batch layout, used for validation.
        optimizer: Optimizer stepped through the gradient scaler.
        scheduler: Learning-rate scheduler, stepped once per applied
            optimizer step.
        classification_criterion: Loss for the entity-name logits.
        regression_criterion: Loss for the predicted values.
        unit_criterion: Loss for the unit logits.
        epochs: Number of passes over ``train_loader``.
        max_grad_norm: Optional gradient-norm clip applied after unscaling.
            ``None`` (the default) disables clipping, which matches the
            previous behaviour. The regression loss is computed on raw target
            values, so clipping can help when the loss becomes very large.
    """
    scaler = GradScaler()
    best_val_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad(set_to_none=True)

            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            labels = batch['entity_label'].to(device)
            values = batch['entity_value'].to(device)
            units = batch['entity_unit'].to(device)

            with autocast():
                logits, predicted_values, unit_logits = model(input_ids=input_ids, attention_mask=attention_mask, images=images)
                classification_loss = classification_criterion(logits, labels)
                regression_loss = regression_criterion(predicted_values, values)
                unit_loss = unit_criterion(unit_logits, units)
                loss = classification_loss + regression_loss + unit_loss

            scaler.scale(loss).backward()

            if max_grad_norm is not None:
                # Gradients must be unscaled before their norm is measured.
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

            # GradScaler skips optimizer.step() when it finds inf/NaN gradients
            # and then lowers its scale. Only advance the LR schedule when the
            # optimizer step was actually applied.
            scale_before_step = scaler.get_scale()
            scaler.step(optimizer)
            scaler.update()
            if scaler.get_scale() >= scale_before_step:
                scheduler.step()

            total_loss += loss.item()

        # max(..., 1) avoids a ZeroDivisionError on an empty loader.
        avg_train_loss = total_loss / max(len(train_loader), 1)

        val_loss = validate_model(model, val_loader, classification_criterion, regression_criterion, unit_criterion)

        print(f'Epoch {epoch + 1}/{epochs}, Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}')

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pth')
            print("Saved best model.")

def validate_model(model, val_loader, classification_criterion, regression_criterion, unit_criterion):
    """Return the mean per-batch loss of ``model`` over ``val_loader``.

    The model is put in eval mode and run without gradient tracking. Each
    batch loss is the sum of the classification, regression and unit losses.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in val_loader:
            # Move the tensors this step needs onto the target device.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            labels = batch['entity_label'].to(device)
            values = batch['entity_value'].to(device)
            units = batch['entity_unit'].to(device)

            with autocast():
                logits, predicted_values, unit_logits = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    images=images,
                )
                loss = (
                    classification_criterion(logits, labels)
                    + regression_criterion(predicted_values, values)
                    + unit_criterion(unit_logits, units)
                )

            batch_losses.append(loss.item())

    return sum(batch_losses) / len(val_loader)

def inference(model, test_loader, tokenizer):
    """Predict a formatted ``"<value> <unit>"`` string for every test sample.

    Args:
        model: Network returning ``(logits, values, unit_logits)``.
        test_loader: Loader yielding dicts with ``input_ids``,
            ``attention_mask``, ``image`` and ``entity_name``. It must not
            shuffle, because row ids are assigned by running position.
        tokenizer: Unused; kept so existing call sites keep working.

    Returns:
        DataFrame with one row per sample and the columns ``index`` and
        ``prediction``. ``index`` comes from the ``index`` column of
        ``test_loader.dataset.df`` when that exists, otherwise it is the
        sample's running position across all batches.
    """
    model.eval()
    predictions = []

    # Previously the per-batch enumerate() counter was written as 'index', so
    # ids restarted at 0 for every batch. Use the dataset's own ids instead.
    source_df = getattr(getattr(test_loader, 'dataset', None), 'df', None)
    if source_df is not None and 'index' in getattr(source_df, 'columns', ()):
        row_ids = source_df['index'].tolist()
    else:
        row_ids = None
    position = 0  # running sample position across batches

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Inference"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            entity_names = batch['entity_name']

            with autocast():
                # The entity-name logits are not needed: the entity name is an
                # input of every test row.
                _, predicted_values, unit_logits = model(input_ids=input_ids, attention_mask=attention_mask, images=images)

            pred_units = torch.argmax(unit_logits, dim=1)
            pred_units = unit_encoder.inverse_transform(pred_units.cpu().numpy())

            for entity_name, value, unit in zip(entity_names, predicted_values.cpu().numpy(), pred_units):
                if row_ids is not None and position < len(row_ids):
                    row_id = row_ids[position]
                else:
                    row_id = position
                position += 1

                predictions.append({
                    'index': row_id,
                    'prediction': format_value(value, unit, entity_name)
                })

    return pd.DataFrame(predictions)

def format_value(value, unit, entity_name):
    """Format a prediction as ``"<value with 2 decimals> <unit>"``.

    Args:
        value: Predicted numeric value.
        unit: Predicted unit string.
        entity_name: Entity the prediction is for.

    Returns:
        The formatted string, or ``""`` when ``entity_name`` is unknown, when
        ``unit`` is not allowed for that entity in ``entity_unit_map``, or when
        ``value`` is NaN or infinite.
    """
    import math  # local import keeps this helper usable from any cell

    if unit not in entity_unit_map.get(entity_name, ()):
        return ""
    # A non-finite prediction would otherwise be emitted as "nan <unit>" or
    # "inf <unit>".
    if not math.isfinite(value):
        return ""
    return f"{value:.2f} {unit}".strip()

def main():
    """Train the model on a filtered training subset and write ``submission.csv``.

    Steps: load the CSVs, keep rows whose ``entity_value`` is
    ``"<number> <known unit>"``, split into train/validation, train with
    ``train_model``, reload the best checkpoint and run ``inference`` on the
    test set.

    Raises:
        ValueError: If no training row survives the value/unit filter.
    """
    # Fixed seed so weight initialisation and batch shuffling repeat across runs.
    torch.manual_seed(42)

    tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-small')

    train_df = pd.read_csv('/kaggle/input/amazon-ml/train.csv')
    test_df = pd.read_csv('/kaggle/input/amazon-ml/test.csv')

    # Only the first 5000 training rows are used (presumably to bound runtime,
    # since every sample triggers an image download).
    train_df = train_df.head(5000)

    # One number, one whitespace character, then a unit of one or more words.
    # The unit group accepts several words so that units such as "fluid ounce"
    # or "cubic foot" from entity_unit_map are not dropped.
    value_unit_pattern = r'^\d+(?:\.\d+)?\s([a-zA-Z]+(?: [a-zA-Z]+)*)$'

    # Rows that do not match (including missing values) yield NaN here, and
    # isin() then evaluates to False for them.
    units = train_df['entity_value'].str.extract(value_unit_pattern)[0]
    train_data_filtered = train_df[units.isin(all_units)]
    if train_data_filtered.empty:
        raise ValueError("No training rows match the '<number> <known unit>' format.")

    # Fit on all filtered rows before splitting, so a label that lands only in
    # the validation split cannot make transform() fail.
    label_encoder.fit(train_data_filtered['entity_name'])

    train_df, val_df = train_test_split(train_data_filtered, test_size=0.1, random_state=42)

    print("Preparing datasets...")
    train_dataset = ProductImageDataset(train_df, tokenizer, MAX_LEN)
    val_dataset = ProductImageDataset(val_df, tokenizer, MAX_LEN)
    test_dataset = ProductImageDataset(test_df, tokenizer, MAX_LEN, is_test=True)

    print("Creating data loaders...")
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0, pin_memory=True)

    num_labels = len(label_encoder.classes_)
    num_units = len(unit_encoder.classes_)
    model = EntityExtractionModel(num_labels=num_labels, num_units=num_units).to(device)

    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
    # The schedule decays linearly over one scheduler step per training batch.
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    classification_criterion = nn.CrossEntropyLoss()
    regression_criterion = nn.MSELoss()
    unit_criterion = nn.CrossEntropyLoss()

    print("Starting training...")
    train_model(model, train_loader, val_loader, optimizer, scheduler, classification_criterion, regression_criterion, unit_criterion, EPOCHS)

    print("Loading best model for inference...")
    # No checkpoint is written if the validation loss never improves (for
    # example when it is NaN every epoch); keep the in-memory weights then.
    if os.path.exists('best_model.pth'):
        model.load_state_dict(torch.load('best_model.pth', map_location=device))
    else:
        print("No checkpoint found; using the weights from the final epoch.")

    print("Running inference on test set...")
    predictions_df = inference(model, test_loader, tokenizer)
    predictions_df.to_csv('submission.csv', index=False)
    print("Predictions saved to submission.csv")

# Run the whole pipeline (training, then test-set inference) when this cell or
# script is executed directly.
if __name__ == '__main__':
    main()

  check_for_updates()


Libraries Imported!
Using device: cuda


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Preparing datasets...
Creating data loaders...


pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Downloading: "https://download.pytorch.org/models/efficientnet_b0_rwightman-7f5810bc.pth" to /root/.cache/torch/hub/checkpoints/efficientnet_b0_rwightman-7f5810bc.pth
100%|██████████| 20.5M/20.5M [00:00<00:00, 147MB/s]


Starting training...


Epoch 1/10: 100%|██████████| 270/270 [07:27<00:00,  1.66s/it]


Epoch 1/10, Train Loss: 18363979542522044.0000, Val Loss: 1417956.5785
Saved best model.


Epoch 2/10:   7%|▋         | 19/270 [00:32<07:27,  1.78s/it]

In [None]:
import os
import pandas as pd
import torch
from tqdm import tqdm
from torch.cuda.amp import autocast
import random

# The helpers in this cell read `entity_unit_map` and `unit_encoder` as globals.
# Neither is defined in this cell, so a cell that defines them must run first.
# Select the GPU when torch reports one, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def format_value(value, unit, entity_name):
    """Format a prediction as ``"<value with 2 decimals> <unit>"``.

    Args:
        value: Predicted numeric value.
        unit: Predicted unit string.
        entity_name: Entity the prediction is for.

    Returns:
        The formatted string, or ``""`` when ``entity_name`` is unknown, when
        ``unit`` is not allowed for that entity in ``entity_unit_map``, or when
        ``value`` is NaN or infinite.
    """
    import math  # local import keeps this helper usable from any cell

    if unit not in entity_unit_map.get(entity_name, ()):
        return ""
    # A non-finite prediction would otherwise be emitted as "nan <unit>" or
    # "inf <unit>".
    if not math.isfinite(value):
        return ""
    return f"{value:.2f} {unit}".strip()

def predictor(image_link, category_id, entity_name):
    '''
    Placeholder predictor that ignores its arguments.

    Roughly half of the calls return an empty string; the others return a
    random value between 1 and 100 formatted with two decimals and the unit
    "inch". Replace this with actual model predictions.
    '''
    # Guard clause: no prediction unless the draw is above 0.5.
    if not random.random() > 0.5:
        return ""

    random_value = random.uniform(1, 100)
    return f"{random_value:.2f} inch"

def inference(model, test_loader, tokenizer):
    """Predict a formatted ``"<value> <unit>"`` string for every test sample.

    Args:
        model: Network returning ``(logits, values, unit_logits)``.
        test_loader: Loader yielding dicts with ``input_ids``,
            ``attention_mask``, ``image`` and ``entity_name``. It must not
            shuffle, because row ids are assigned by running position.
        tokenizer: Unused; kept so existing call sites keep working.

    Returns:
        DataFrame with one row per sample and the columns ``index`` and
        ``prediction``. ``index`` comes from the ``index`` column of
        ``test_loader.dataset.df`` when that exists, otherwise it is the
        sample's running position across all batches.
    """
    model.eval()
    predictions = []

    # Previously the per-batch enumerate() counter was written as 'index', so
    # ids restarted at 0 for every batch. Use the dataset's own ids instead.
    source_df = getattr(getattr(test_loader, 'dataset', None), 'df', None)
    if source_df is not None and 'index' in getattr(source_df, 'columns', ()):
        row_ids = source_df['index'].tolist()
    else:
        row_ids = None
    position = 0  # running sample position across batches

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Inference"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            images = batch['image'].to(device)
            entity_names = batch['entity_name']

            with autocast():
                # The entity-name logits are not needed: the entity name is an
                # input of every test row.
                _, predicted_values, unit_logits = model(input_ids=input_ids, attention_mask=attention_mask, images=images)

            # Convert the unit logits to unit strings.
            pred_units = torch.argmax(unit_logits, dim=1)
            pred_units = unit_encoder.inverse_transform(pred_units.cpu().numpy())

            for entity_name, value, unit in zip(entity_names, predicted_values.cpu().numpy(), pred_units):
                if row_ids is not None and position < len(row_ids):
                    row_id = row_ids[position]
                else:
                    row_id = position
                position += 1

                predictions.append({
                    'index': row_id,
                    'prediction': format_value(value, unit, entity_name)
                })

    return pd.DataFrame(predictions)

def run_test_prediction(dataset_folder='/kaggle/input/amazon-ml', working_dir='/kaggle/working/'):
    """Run ``predictor`` on every test row and save the result as submission.csv.

    Args:
        dataset_folder: Directory containing ``test.csv``. Defaults to the
            path that was previously hard-coded.
        working_dir: Directory that receives ``submission.csv``; it is created
            when missing. Defaults to the path that was previously hard-coded.

    NOTE(review): with the default ``working_dir`` this writes
    ``/kaggle/working/submission.csv``. If that is also the notebook's current
    directory, it replaces a ``submission.csv`` written there earlier -- confirm
    this is intended.
    """
    # Step 1: Load test data
    test = pd.read_csv(os.path.join(dataset_folder, 'test.csv'))

    # Step 2: Apply predictor function on test data, row by row in file order.
    # A plain loop is used because DataFrame.apply(axis=1) does not return a
    # column-shaped result for an empty frame.
    raw_predictions = [
        predictor(image_link, group_id, entity_name)
        for image_link, group_id, entity_name in zip(
            test['image_link'], test['group_id'], test['entity_name'])
    ]

    # Step 3: Ensure predictions are formatted correctly -- anything that is
    # not a non-empty string becomes "".
    test['prediction'] = [
        pred if isinstance(pred, str) and pred else ""
        for pred in raw_predictions
    ]

    # Step 4: Save the predictions to submission.csv
    os.makedirs(working_dir, exist_ok=True)
    output_filename = os.path.join(working_dir, 'submission.csv')
    test[['index', 'prediction']].to_csv(output_filename, index=False)

    print(f"Submission saved to {output_filename}")


# Write submission.csv from the placeholder `predictor` (random values, not model output).
run_test_prediction()