In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel
import torch.nn as nn
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm


In [None]:

class ItemDataset(Dataset):
    """Map-style dataset pairing a tokenized caption with a float target.

    Each row of ``dataframe`` must provide a ``caption`` column (the text fed
    to the tokenizer) and an ``entity_value`` column (the regression label).

    Args:
        dataframe: pandas DataFrame holding the ``caption`` and
            ``entity_value`` columns; rows are addressed positionally.
        tokenizer: object exposing ``encode_plus`` with the Hugging Face
            tokenizer keyword arguments used in ``__getitem__``.
        max_len: length every caption is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len=32):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (the tokenizer's
            tensors with the leading batch axis removed) and ``label`` (a
            0-d float tensor built from ``entity_value``).
        """
        row = self.dataframe.iloc[idx]
        text = row['caption']
        label = row['entity_value']

        # An empty CSV cell is read by pandas as NaN (a float) and a purely
        # numeric caption may be parsed as a number; the tokenizer only
        # accepts strings, so normalise both cases here instead of crashing
        # inside a DataLoader worker.
        text = "" if pd.isna(text) else str(text)

        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch axis of size 1. Remove
        # only that axis: a bare squeeze() would also drop the sequence axis
        # when max_len == 1 and hand the collate function 0-d tensors.
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'label': torch.tensor(label, dtype=torch.float)
        }

class DistilBERTRegressor(nn.Module):
    """DistilBERT encoder followed by a three-layer MLP regression head.

    The token states produced by ``distilbert-base-uncased`` are mean-pooled
    over the non-padding positions and passed through
    ``hidden_size -> 512 -> 256 -> output_dim`` with ReLU between layers.

    Args:
        output_dim: width of the final linear layer (number of regression
            targets per sample).
    """

    def __init__(self, output_dim=1):
        super().__init__()
        self.distilbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
        self.fc = nn.Linear(self.distilbert.config.hidden_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, output_dim)
        self.relu = nn.ReLU()

    def forward(self, input_ids, attention_mask):
        """Encode the batch and regress from the pooled representation.

        Args:
            input_ids: token ids, one row per sample.
            attention_mask: same layout as ``input_ids``; non-zero marks a
                real token and zero marks padding.

        Returns:
            Tensor with one row per sample and ``output_dim`` columns.
        """
        outputs = self.distilbert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        last_hidden_state = outputs[0]

        # Average only over real tokens. A plain mean over the sequence axis
        # also averages the padding positions, so for short inputs padded to
        # a fixed length the pooled vector is dominated by pad-token states.
        mask = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
        summed = (last_hidden_state * mask).sum(dim=1)
        # clamp guards against division by zero for an all-padding row.
        token_counts = mask.sum(dim=1).clamp(min=1.0)
        pooled_output = summed / token_counts

        x = self.relu(self.fc(pooled_output))
        x = self.relu(self.fc2(x))
        return self.fc3(x)

In [None]:

# Pretrained WordPiece tokenizer matching the encoder checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Prefer CUDA when it is available, otherwise run on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Single-target regressor; replicated across GPUs with DataParallel when more
# than one CUDA device is visible, then moved to the selected device.
model = DistilBERTRegressor(output_dim=1)
model = (
    torch.nn.DataParallel(model) if torch.cuda.device_count() > 1 else model
).to(device)


In [None]:
# Report which device the model was placed on.
print("using", device)


In [None]:
# Training table: ItemDataset reads its 'caption' and 'entity_value' columns.
df = pd.read_csv('/kaggle/input/traindata/final_item_w.csv')
dataset = ItemDataset(dataframe=df, tokenizer=tokenizer)
dataloader = DataLoader(
    dataset=dataset,
    batch_size=90,
    shuffle=True,
    num_workers=4,
)

# Mean-squared-error objective for the regression target.
criterion = nn.MSELoss()

# Adam over every model parameter; the plateau scheduler multiplies the
# learning rate by 0.65 once the monitored loss has stopped improving for
# more than one epoch.
learning_rate = 1e-5
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
scheduler = ReduceLROnPlateau(optimizer=optimizer, factor=0.65, patience=1)



In [None]:
# Row count of the training table, then number of batches per epoch.
print(len(df), len(dataloader), sep="\n")

In [None]:
# Train for a fixed number of epochs, letting ReduceLROnPlateau lower the
# learning rate from the epoch's mean batch loss, then write the final weights.
num_epochs = 252
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    # For a per-batch progress bar, use this loop header instead:
    #     for batch in tqdm(dataloader, desc=f'Epoch {epoch+1}/{num_epochs}', unit='batch'):
    for batch in dataloader:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        label = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Drop only the trailing output axis: (batch, 1) -> (batch,). A bare
        # squeeze() would also collapse the batch axis when the last batch
        # holds a single sample, leaving a 0-d prediction against a 1-d label
        # and making MSELoss fall back to broadcasting with a size warning.
        loss = criterion(outputs.squeeze(-1), label)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {avg_loss:.8f}")

    # The scheduler monitors the training loss (no validation split is used).
    scheduler.step(avg_loss)

# When the model was wrapped in DataParallel its state_dict keys carry a
# 'module.' prefix and cannot be loaded into a plain DistilBERTRegressor.
# Save the underlying module so the checkpoint loads on any device setup
# (load it into `model.module` if the target model is itself wrapped).
model_to_save = model.module if isinstance(model, torch.nn.DataParallel) else model
checkpoint = {'model_state_dict': model_to_save.state_dict()}

torch.save(checkpoint, 'distilbert_model_itemw.pth')
print("saved")

In [None]:
# torch.cuda.empty_cache()

In [None]:
# Learning rate currently held by the optimizer's first parameter group.
current_lr = optimizer.param_groups[0]['lr']
print(f"Resuming learning rate: {current_lr}")


In [None]:
# weights="/kaggle/working/distilbert_model_itemw.pth"
# checkpoint = torch.load(weights, map_location=device)
# model.load_state_dict(checkpoint['model_state_dict'])
