In [None]:
!pip install transformers torch



In [None]:
import pandas as pd

# Both splits are read with a tab separator.
train_df = pd.read_csv('/content/train.csv', sep='\t')

# Map a misspelled 'setence1' header (if the dev file has one) to 'sentence1'
# so both frames expose the same column names. rename() ignores missing keys.
dev_df = pd.read_csv('/content/dev.csv', sep='\t').rename(
    columns={'setence1': 'sentence1'}
)

In [None]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch
from transformers import BertTokenizer
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download necessary NLTK data
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load 'punkt_tab' instead of 'punkt';
# without it tokenization fails with a LookupError on a fresh runtime.
# NOTE(review): on older releases that do not know this package the call is
# expected to just report the failure and return False - confirm if pinning.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
def preprocess_text(text):
    """Normalise a sentence before it is handed to the tokenizer.

    Steps, in order: lowercase the text, delete every character listed in
    ``string.punctuation``, split into tokens with ``word_tokenize`` and
    lemmatize each token with a default-configured ``WordNetLemmatizer``.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # A translation table with only a "delete" set strips the punctuation
    # characters without replacing them by anything.
    stripped = text.lower().translate(str.maketrans('', '', string.punctuation))

    wnl = WordNetLemmatizer()
    lemmas = map(wnl.lemmatize, word_tokenize(stripped))
    return ' '.join(lemmas)

In [None]:
class Task1A_Dataset(Dataset):
    """Sentence-pair regression dataset backed by a pandas DataFrame.

    Each item reads the 'sentence1', 'sentence2' and 'score' columns of one
    row, runs both sentences through ``preprocess_text`` and encodes them as
    a single pair with the supplied tokenizer.

    Args:
        dataframe: Frame holding the three columns named above.
        tokenizer: Object exposing ``encode_plus`` (e.g. a BertTokenizer).
        max_length: Length every encoded pair is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows (sentence pairs) in the underlying frame."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the encoded pair at position ``idx`` as a dict of tensors.

        Keys: 'input_ids', 'attention_mask', 'token_type_ids' (each flattened
        to one dimension) and 'labels' (the row's score as a float tensor).
        """
        row = self.dataframe.iloc[idx]

        # str() guards against non-string cell values before preprocessing.
        first, second = (
            preprocess_text(str(row[column]))
            for column in ('sentence1', 'sentence2')
        )
        target = row['score']

        # Encode both sentences together; the tokenizer adds [CLS]/[SEP],
        # pads to max_length and truncates anything longer.
        encoded = self.tokenizer.encode_plus(
            first,
            second,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # return_tensors='pt' yields a leading batch axis; flatten removes it.
        item = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        item['labels'] = torch.tensor(target, dtype=torch.float)
        return item

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap the train and dev frames so items are tokenized on access.
train_dataset = Task1A_Dataset(train_df, tokenizer)
dev_dataset = Task1A_Dataset(dev_df, tokenizer)

# Batch size for both loaders. The previous value of 1024 sequences of 128
# tokens per step is impractical for fine-tuning a full BERT model (memory
# use and very few optimizer steps per epoch); 32 is a commonly recommended
# fine-tuning batch size. Tune here if the hardware allows more.
BATCH_SIZE = 32

# Shuffle only the training split; keep dev order fixed for evaluation.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=BATCH_SIZE, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
from transformers import BertModel
import torch
import torch.nn as nn

class BertForTextSimilarity(nn.Module):
    """BERT encoder with a single-output linear head for similarity regression.

    Args:
        freeze_bert: When True, every BERT parameter has ``requires_grad``
            set to False so only the regression head is trained.
    """

    def __init__(self, freeze_bert=False):
        super().__init__()
        # Load pre-trained BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Linear head mapping the hidden vector to one regression value
        self.regression = nn.Linear(self.bert.config.hidden_size, 1)

        # Option to freeze BERT layers to prevent them from being updated during training
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Predict one score per encoded sequence.

        Args:
            input_ids: Token ids, passed straight to BERT.
            attention_mask: Padding mask, passed straight to BERT.
            token_type_ids: Optional segment ids marking which tokens belong
                to the first vs. second sentence of a pair. Forwarded to BERT
                so it can distinguish the two sentences; when omitted, BERT
                receives ``None`` and falls back to its own default.

        Returns:
            Tensor of shape (batch_size, 1) with the predicted scores.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Position 0 is the [CLS] token; its final hidden state serves as the
        # sequence-level representation. Shape: (batch_size, hidden_size)
        cls_output = outputs.last_hidden_state[:, 0, :]

        # Shape: (batch_size, 1)
        return self.regression(cls_output)


In [None]:
model = BertForTextSimilarity()

# Select the compute device: the GPU when CUDA is visible, otherwise the CPU.
cuda_ok = torch.cuda.is_available()
device = torch.device("cuda" if cuda_ok else "cpu")
if cuda_ok:
    print("Using CUDA:", torch.cuda.get_device_name(0))
else:
    print("CUDA is not available. Using CPU instead.")


CUDA is not available. Using CPU instead.


In [None]:
import torch

# The batches below are moved to `device`, so the model has to live there too.
# Module.to() moves parameters in place and is harmless if already done.
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
mse_loss = torch.nn.MSELoss()

num_epochs = 3  # or however many epochs you plan to train for
LOG_EVERY = 50  # print a progress line every this many training steps


def _batch_loss(batch):
    """Run one batch through the model and return its MSE loss tensor."""
    # Use the device-resident copies for BOTH the forward pass and the loss;
    # mixing CPU labels with GPU predictions would raise a device mismatch.
    inputs = {k: v.to(device) for k, v in batch.items()}
    outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
    # squeeze(-1) drops only the trailing size-1 axis of the (batch, 1)
    # output, so a final batch holding a single example keeps shape (1,)
    # and still lines up with its labels.
    return mse_loss(outputs.squeeze(-1), inputs['labels'])


for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    train_loss = 0
    n_train_steps = len(train_dataloader)
    for step, batch in enumerate(train_dataloader, start=1):
        # Clear gradients left over from the previous step, then update.
        optimizer.zero_grad()
        loss = _batch_loss(batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        if step % LOG_EVERY == 0 or step == n_train_steps:
            print(f'  step {step}/{n_train_steps}, running train loss: {train_loss / step:.4f}')

    # After each epoch, do validation
    model.eval()  # Set the model to evaluation mode
    val_loss = 0
    with torch.no_grad():  # No need to compute gradients during validation
        for batch in dev_dataloader:
            val_loss += _batch_loss(batch).item()

    # Average of the per-batch losses
    avg_train_loss = train_loss / len(train_dataloader)
    avg_val_loss = val_loss / len(dev_dataloader)

    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}')

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


0
