In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/corpus2/pre_proc_train.csv
/kaggle/input/corpus2/pre_proc_test_time.csv
/kaggle/input/corpus2/pre_proc_test_company.csv


In [2]:
!pip install torch torchvision transformers pillow



In [3]:
import torch
import torch.nn as nn
from transformers import CLIPModel, CLIPProcessor
from PIL import Image
import requests
from io import BytesIO
import pandas as pd

In [4]:
class CLIPContentPrediction(nn.Module):
    def __init__(self, clip_model, hidden_size, vocab_size, num_heads, num_layers):
        super(CLIPContentPrediction, self).__init__()
        
        self.clip_model = clip_model
        self.transformer = nn.Transformer(d_model=hidden_size, nhead=num_heads, num_encoder_layers=num_layers, num_decoder_layers=num_layers)

#         self.company_embedding = nn.Linear(1,128)
#         self.timestamp_embedding = nn.Linear(1, 64)

#         # Fully connected layer to combine all embeddings
#         self.fc = nn.Linear(512 + 128 + 64, hidden_size)

        # Transformer decoder layer
#         self.transformer_decoder = nn.TransformerDecoder(
#             nn.TransformerDecoderLayer(d_model=hidden_size, nhead=num_heads),
#             num_layers=num_layers
#         )

        # Final output layer to map decoder outputs to vocab size (for text generation)
        self.output_layer = nn.Linear(hidden_size, vocab_size)

    def forward(self, image, timestamp, company_name, target_seq):
#         with torch.no_grad():  # We do not need gradients for the CLIP model
#             clip_outputs = self.clip_model(pixel_values=image)
#         image_embeddings = clip_outputs.image_embeds  # Shape: [batch_size=16, 512]
        
#         timestamp_embeddings = self.timestamp_embedding(timestamp.unsqueeze(1))  # Shape: [batch_size=16, 64]
#         company_embeddings = self.company_embedding(company_name)  # Shape: [batch_size=16, company_emb_size]
#         combined_embeddings = torch.cat((image_embeddings, timestamp_embeddings, company_embeddings), dim=1)  # Shape: [batch_size=16, 512 + 64 + company_emb_size]
#         combined_embeddings = self.fc(combined_embeddings)  # Shape: [batch_size=16, hidden_size]
#         target_seq = target_seq.unsqueeze(1)  # Shape: [batch_size=16, seq_len=1, hidden_size]
#         transformer_output = self.transformer_decoder(target_seq, combined_embeddings.unsqueeze(1))  # Shape: [batch_size, seq_len, hidden_size]
#         predicted_tokens = self.output_layer(transformer_output)  # Shape: [batch_size=16, seq_len, vocab_size]
        clip_outputs = self.clip_model(input_ids=input_ids, pixel_values=pixel_values)
        image_embeds = clip_outputs.image_embeds
        text_embeds = clip_outputs.text_embeds
        combined_embeds = torch.cat((image_embeds, text_embeds), dim=1)
        transformer_output = self.transformer(combined_embeds)
        logits = self.fc_out(transformer_output)
        
        return logits  # Shape: [batch_size=16, vocab_size]


In [5]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import requests
from io import BytesIO
import pandas as pd

class CustomCLIPDataset(Dataset):
    def __init__(self, data_frame, processor):
        self.data_frame = pd.read_csv(data_frame)
        self.processor = processor
#         self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data_frame)

    def __getitem__(self, idx):
        # Extract row data
        row = self.data_frame.iloc[idx]

        img_url = row['full_image_url']
        image = self.load_image(img_url)
        timestamp = row['date']
        company_name = row['inferred company']
        content_text = row['content']
        
        encoding = self.processor(
            text=[f"{timestamp} {company_name}"], images=image, return_tensors="pt", padding=True, truncation=True
        )
        
        target_encoding = self.processor.tokenizer(content_text, return_tensors="pt", padding=True, truncation=True)
        return encoding['input_ids'].squeeze(0), encoding['pixel_values'].squeeze(0), target_encoding['input_ids'].squeeze(0)
    
    def load_image(self, img_url):
        response = requests.get(img_url)
        if response.status_code == 200:
            image = Image.open(BytesIO(response.content)).convert("RGB")
            image = image.resize((224, 224))
            return image
        else:
            return create_random_image(224, 224)
    
def create_random_image(width, height):
    random_image_array = np.random.randint(0, 256, (height, width, 3), dtype=np.uint8)
    random_image = Image.fromarray(random_image_array)
    return random_image


In [6]:
# Initialize your tokenizer (for example, a BERT tokenizer for text tokenization)
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")

dataset = CustomCLIPDataset(data_frame='/kaggle/input/corpus2/pre_proc_train.csv', processor=processor)

dataloader = DataLoader(dataset, batch_size=32, shuffle=True, num_workers=4)

# Hyperparameters
hidden_size = 512  # Size of the hidden layer
vocab_size = 30522  # Vocabulary size for text generation (e.g., from a tokenizer like GPT)
num_heads = 8  # Number of attention heads
num_layers = 4  # Number of transformer layers

model = CLIPContentPrediction(clip_model, hidden_size, vocab_size, num_heads, num_layers)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Set up optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()  # For text token generation

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    for images, timestamps, company_names, target_tokens in dataloader:  # Assumes DataLoader is set up
        # Move data to device
        images = images.to(device)
        timestamps = timestamps.to(device)
        company_names = company_names.to(device)
        target_tokens = target_tokens.to(device)
        
        # Forward pass
        optimizer.zero_grad()
        predicted_tokens = model(input_ids=input_ids, pixel_values=pixel_values)
        
        # Compute loss
        loss = criterion(predicted_tokens.view(-1, vocab_size), target_tokens.view(-1))  # Flatten for CrossEntropyLoss
        
        # Backpropagation
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")


config.json:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/961k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]



RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 309, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch
    return self.collate_fn(data)
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 317, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 174, in collate
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 174, in <listcomp>
    return [collate(samples, collate_fn_map=collate_fn_map) for samples in transposed]  # Backwards compatibility.
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 142, in collate
    return collate_fn_map[elem_type](batch, collate_fn_map=collate_fn_map)
  File "/opt/conda/lib/python3.10/site-packages/torch/utils/data/_utils/collate.py", line 214, in collate_tensor_fn
    return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [21] at entry 0 and [23] at entry 7
