In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau
from transformers import VisualBertModel, VisualBertConfig, BertTokenizer, BlipProcessor, BlipForConditionalGeneration
from torchvision import transforms, models
from torchvision.models import resnet50
from PIL import Image
from io import BytesIO
import requests
import pandas as pd
from tqdm import tqdm


In [None]:

# Multiplicative factors turning one unit of the named mass unit into metric tons.
conversion_to_tons = {
    'gram': 1e-6,
    'ton': 1,
    'kilogram': 0.001,
    'ounce': 2.8349523125e-5,
    'pound': 0.00045359237,
    'carat': 2e-7,
    'microgram': 1e-12,
    'milligram': 1e-9,
}

def convert_to_tons(row):
    """Convert a row's ``entity_value`` (e.g. ``"500 gram"``) into metric tons.

    Args:
        row: Any mapping-like object (dict, pandas row) exposing an
            ``'entity_value'`` entry of the form ``"<number> <unit>"``.

    Returns:
        The value expressed in tons as a float, or ``None`` when the entry is
        not a string, does not split into a number and a unit, has a
        non-numeric number part, or uses a unit missing from
        ``conversion_to_tons``.
    """
    raw_value = row['entity_value']
    # Missing values arrive as float NaN / None; treat them as unparseable
    # instead of letting ``.lower()`` raise AttributeError.
    if not isinstance(raw_value, str):
        return None

    # Split on any run of whitespace so stray leading/double spaces are tolerated.
    parts = raw_value.strip().lower().split(None, 1)
    if len(parts) != 2:
        return None
    number_text, unit = parts

    try:
        value = float(number_text)
    except ValueError:
        return None

    # An unknown unit must NOT silently be treated as tons (factor 1): that
    # would turn e.g. "0.5 fluid ounce" into a 0.5 ton label.
    conversion_factor = conversion_to_tons.get(unit.strip())
    if conversion_factor is None:
        return None
    return value * conversion_factor

# Unit names an `entity_value` has to mention (case-insensitively) to be kept.
weight_units = ['gram', 'ton', 'kilogram', 'ounce', 'pound', 'carat', 'microgram', 'milligram']

train_df = pd.read_csv('/kaggle/input/pedalanja/student_resource 3/dataset/train.csv')
entity_name = 'item_weight'

# Stage 1 - row selection on the raw strings:
#   * rows of the target entity only, with a non-null value,
#   * minus values containing "[" or the standalone word "to" (or the `e+17` pattern),
#   * restricted to values that mention one of the known weight units.
entity_df = (
    train_df.loc[train_df['entity_name'] == entity_name]
    .copy()
    .dropna(subset=['entity_value'])
    .loc[lambda frame: ~frame['entity_value'].str.contains(r'\[|\bto\b|\be\+17\b', regex=True, na=False)]
    .loc[lambda frame: frame['entity_value'].str.contains('|'.join(weight_units), case=False, na=False)]
)

# Stage 2 - replace the string with its numeric value in tons, keep only values
# strictly between 5e-8 and 1, drop anything that failed to convert, and cap
# the working set at the first 500 rows.
entity_df['entity_value'] = entity_df.apply(convert_to_tons, axis=1)
entity_df = (
    entity_df.loc[lambda frame: (frame['entity_value'] > 5e-8) & (frame['entity_value'] < 1)]
    .dropna(subset=['entity_value'])
    .head(500)
)

# Captioning model and its matching processor, both loaded from the same checkpoint.
blip_checkpoint = "Salesforce/blip-image-captioning-large"
blip_processor = BlipProcessor.from_pretrained(blip_checkpoint)
blip_model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)


In [None]:

class AdaptiveResNet(nn.Module):
    """ResNet-50 trunk followed by global average pooling.

    Produces one flat feature vector per input image, whatever the spatial
    size of the trunk's output feature map.
    """

    def __init__(self):
        super().__init__()
        backbone = resnet50(pretrained=True)
        # Keep every child module except the last two (in torchvision's ResNet
        # these are the built-in average pool and the `fc` classifier).
        trunk_layers = list(backbone.children())[:-2]
        self.features = nn.Sequential(*trunk_layers)
        self.adaptive_pool = nn.AdaptiveAvgPool2d((1, 1))

    def forward(self, x):
        """Return pooled trunk features flattened to ``(batch, channels)``."""
        pooled = self.adaptive_pool(self.features(x))
        batch = pooled.size(0)
        return pooled.view(batch, -1)

# Shared feature extractor, switched to inference mode right away
# (`Module.eval()` returns the module itself).
adaptive_resnet = AdaptiveResNet().eval()

def generate_caption(image_url, timeout=30):
    """Download an image and describe it with the BLIP captioning model.

    Args:
        image_url: HTTP(S) URL of the image to caption.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block forever.

    Returns:
        The decoded caption as a string, with immediately repeated words
        collapsed into one (e.g. ``"a a box"`` -> ``"a box"``).

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            non-2xx HTTP status (raised by ``raise_for_status``).
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail on 4xx/5xx here rather than letting PIL choke on an HTML error page.
    response.raise_for_status()
    image = Image.open(BytesIO(response.content)).convert('RGB')

    inputs = blip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        output = blip_model.generate(**inputs, max_new_tokens=50)
    caption = blip_processor.decode(output[0], skip_special_tokens=True)

    # Drop a word when it is identical to the word right before it.
    words = caption.split()
    cleaned_words = [
        word for position, word in enumerate(words)
        if position == 0 or word != words[position - 1]
    ]
    return ' '.join(cleaned_words)

# Preprocessing is identical for every image, so the pipeline is built once
# instead of on each call: resize to 224x224, convert to a tensor, then
# normalise with the per-channel mean/std given below.
_IMAGE_TRANSFORM = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def extract_visual_features(image_url, timeout=30):
    """Download an image and embed it with the shared ``adaptive_resnet``.

    Args:
        image_url: HTTP(S) URL of the image.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single stalled connection would block forever.

    Returns:
        The feature tensor produced by ``adaptive_resnet`` for a batch of one
        image, computed without gradient tracking.

    Raises:
        requests.RequestException: On timeout, connection failure, or a
            non-2xx HTTP status (raised by ``raise_for_status``).
    """
    response = requests.get(image_url, timeout=timeout)
    # Fail on 4xx/5xx here rather than letting PIL choke on an HTML error page.
    response.raise_for_status()
    image = Image.open(BytesIO(response.content)).convert('RGB')

    # Add the leading batch dimension the network expects.
    pixels = _IMAGE_TRANSFORM(image).unsqueeze(0)
    with torch.no_grad():
        features = adaptive_resnet(pixels)
    return features  # Shape: (1, 2048)

class ItemVolumeDataset(Dataset):
    """Map-style dataset pairing an image caption and image features with a numeric target.

    Each item is built from one dataframe row: the image at ``image_link`` is
    captioned with ``generate_caption`` and embedded with
    ``extract_visual_features``; the caption is tokenized; the row's
    ``entity_value`` becomes the regression target, returned under the key
    ``'volume'``.

    Building an item downloads the image and runs two networks, so finished
    items are kept in an in-memory cache by default and later epochs reuse
    them instead of repeating that work.
    NOTE(review): caching assumes caption generation is deterministic for a
    given image - pass ``cache_items=False`` if that does not hold.

    Args:
        dataframe: Table with ``image_link`` and ``entity_value`` columns.
        tokenizer: Callable tokenizer returning ``input_ids`` and
            ``attention_mask`` tensors with a leading batch dimension of 1.
        max_length: Fixed token length captions are padded/truncated to.
        cache_items: Keep built items in memory, keyed by index.
    """

    def __init__(self, dataframe, tokenizer, max_length=64, cache_items=True):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.cache_items = cache_items
        self._item_cache = {}

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the tensors for row ``idx`` as a dict.

        Keys: ``input_ids``, ``attention_mask``, ``visual_embeds``,
        ``visual_attention_mask`` and ``volume`` (float target).
        """
        if self.cache_items and idx in self._item_cache:
            # Hand out a shallow copy so callers cannot alter the cached dict.
            return dict(self._item_cache[idx])

        row = self.dataframe.iloc[idx]
        image_url = row['image_link']
        volume = row['entity_value']

        caption = generate_caption(image_url)
        inputs = self.tokenizer(caption, padding="max_length", truncation=True, max_length=self.max_length, return_tensors="pt")

        visual_embeds = extract_visual_features(image_url)
        # One mask entry per visual token: every leading dim of the embeddings
        # except the final feature dimension.
        visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)

        # squeeze(0) removes the per-item batch dimension; the DataLoader
        # re-adds a batch dimension when it collates items.
        item = {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'visual_embeds': visual_embeds.squeeze(0),
            'visual_attention_mask': visual_attention_mask.squeeze(0),
            'volume': torch.tensor(volume, dtype=torch.float)
        }
        if self.cache_items:
            self._item_cache[idx] = item
        return dict(item)

class EntityExtractionModel(nn.Module):
    """VisualBERT encoder with a small MLP head that regresses one scalar per sample.

    Args:
        hidden_size: Width of the pooled encoder output consumed by the
            regression head. NOTE(review): this must equal the loaded
            VisualBERT config's hidden size; it is not read from the config.
    """

    def __init__(self, hidden_size=768):
        super(EntityExtractionModel, self).__init__()
        self.config = VisualBertConfig.from_pretrained('uclanlp/visualbert-vqa-coco-pre')
        self.visualbert = VisualBertModel.from_pretrained('uclanlp/visualbert-vqa-coco-pre', config=self.config)

        self.regression_head = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, 1)
        )

    def forward(self, input_ids, attention_mask, visual_embeds, visual_attention_mask):
        """Predict one scalar per sample.

        Args:
            input_ids: ``(batch, seq_len)`` token ids.
            attention_mask: ``(batch, seq_len)`` text attention mask.
            visual_embeds: ``(batch, tokens, dim)``; also accepted are
                ``(batch, dim)`` (one visual token per sample, the shape the
                DataLoader yields for items of shape ``(dim,)``) and a shared
                ``(tokens, dim)`` / ``(1, tokens, dim)`` tensor that is
                broadcast over the batch.
            visual_attention_mask: Mask over the visual tokens, in the shape
                matching ``visual_embeds`` without its last dimension.

        Returns:
            Tensor of shape ``(batch,)``.
        """
        batch_size = input_ids.size(0)

        # A 2-D (batch, dim) tensor must become (batch, 1, dim). Calling
        # ``expand(batch, -1, -1)`` on it directly would PREPEND a dimension
        # and yield (batch, batch, dim): every sample would then attend to the
        # features of all images in the batch instead of its own.
        if visual_embeds.dim() == 2:
            if visual_embeds.size(0) == batch_size:
                visual_embeds = visual_embeds.unsqueeze(1)
            else:
                # Shared (tokens, dim) features: broadcast over the batch below.
                visual_embeds = visual_embeds.unsqueeze(0)

        if visual_attention_mask.dim() == 1:
            one_token_per_sample = (
                visual_embeds.size(1) == 1
                and visual_attention_mask.size(0) == batch_size
            )
            # (batch,) -> (batch, 1) for per-sample masks; (tokens,) -> (1, tokens) otherwise.
            visual_attention_mask = visual_attention_mask.unsqueeze(1 if one_token_per_sample else 0)

        # No-op for per-sample inputs; broadcasts inputs with a batch dim of 1.
        visual_embeds = visual_embeds.expand(batch_size, -1, -1)
        visual_attention_mask = visual_attention_mask.expand(batch_size, -1)

        outputs = self.visualbert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            visual_embeds=visual_embeds,
            visual_attention_mask=visual_attention_mask
        )
        pooled_output = outputs.pooler_output
        numeric_value = self.regression_head(pooled_output)
        # (batch, 1) -> (batch,) so the prediction lines up with the target vector.
        return numeric_value.squeeze(-1)


In [None]:

# --- Training setup ---
# Tokenizer for the caption text; the dataset below calls it on every caption.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = EntityExtractionModel()
# Train on the GPU when one is visible, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Items are built lazily in __getitem__ from the rows of entity_df.
# NOTE(review): shuffle=True and no random seed is set in the visible cells,
# so batch order differs between runs - confirm whether a seed is set elsewhere.
dataset = ItemVolumeDataset(entity_df, tokenizer)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Scale the learning rate by `factor` once the monitored value has stopped
# improving for more than `patience` consecutive scheduler steps.
scheduler = ReduceLROnPlateau(optimizer, patience=2, factor=0.1)

# Mean squared error between the predicted scalar and the 'volume' target.
criterion = nn.MSELoss()

# --- Training loop ---
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    # Sum of per-batch losses, averaged after the epoch.
    total_loss = 0
    
    for batch in tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}', unit='batch'):
        # Move every tensor of the batch to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        visual_embeds = batch['visual_embeds'].to(device)
        visual_attention_mask = batch['visual_attention_mask'].to(device)
        volume = batch['volume'].to(device)
        
        # Standard step: clear old gradients, forward, loss, backward, update.
        optimizer.zero_grad()
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            visual_embeds=visual_embeds,
            visual_attention_mask=visual_attention_mask
        )
        loss = criterion(outputs, volume)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    # Mean of the batch losses (each batch weighted equally, including a smaller last batch).
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.4f}")
    
    # The scheduler monitors the mean *training* loss; no validation split is used in this cell.
    scheduler.step(avg_loss)

# Persist only the learned weights (state_dict), not the module object itself.
torch.save(model.state_dict(), 'visual_bert_volume_model.pth')