## Task 1 - Text Similarity

## Setup 1A

In [1]:
# Colab only: mount Google Drive at /content/drive so later cells can read
# the data files and saved model archives stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd drive/MyDrive

/content/drive/MyDrive


In [5]:
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch
from transformers import BertTokenizer
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download the NLTK resources used by this notebook: 'punkt' is needed by
# word_tokenize and 'wordnet' by WordNetLemmatizer (both called in preprocess_text).
# NOTE(review): 'stopwords' and 'averaged_perceptron_tagger' are not used by any
# cell visible here — confirm they are still required.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [4]:
# Read the tab-separated dev split and correct the misspelled 'setence1'
# header in a single chained expression (no in-place mutation).
dev_df = (
    pd.read_csv('A3_task1_data_files/dev.csv', sep='\t')
    .rename(columns={'setence1': 'sentence1'})
)

In [6]:
# Report the number of missing values per column of the dev dataframe,
# then keep only the rows that have no missing field.
print('Dev null values:')
print(dev_df.isna().sum())

dev_df = dev_df.dropna()

Dev null values:
score        0
sentence1    0
sentence2    2
dtype: int64


In [7]:
def preprocess_text(text):
    """Normalize a sentence: lowercase it, strip punctuation, and lemmatize each token.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The lemmatized tokens joined back together with single spaces.
    """
    # Lowercase, then delete every character listed in string.punctuation
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(punctuation_table)

    # Tokenize the cleaned text and lemmatize it token by token
    wordnet = WordNetLemmatizer()
    lemmas = [wordnet.lemmatize(word) for word in word_tokenize(cleaned)]
    return ' '.join(lemmas)

In [8]:
class Task1A_Dataset(Dataset):
    """Dataset serving preprocessed, tokenizer-encoded sentence pairs with their scores.

    Items are built on demand from one row of ``dataframe``, which must provide
    'sentence1', 'sentence2' and 'score' columns.

    Args:
        dataframe: Source rows; accessed positionally via ``iloc``.
        tokenizer: Object exposing ``encode_plus`` for sentence pairs.
        max_length: Length every encoded pair is padded/truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.dataframe = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of sentence pairs, i.e. rows in the dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return the encoded pair at position ``idx`` plus its score as a float tensor."""
        row = self.dataframe.iloc[idx]

        # Cast to str before cleaning so non-string cells do not break .lower()
        first, second = (
            preprocess_text(str(row[column])) for column in ('sentence1', 'sentence2')
        )

        # Encode both sentences as a single pair; special tokens ([CLS]/[SEP]) are added
        encoded = self.tokenizer.encode_plus(
            first, second,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # return_tensors='pt' yields a leading batch dimension; flatten it away
        item = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        item['labels'] = torch.tensor(row['score'], dtype=torch.float)
        return item

In [9]:
# Load the tokenizer that matches the 'bert-base-uncased' encoder used by the model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap the cleaned dev dataframe (dev_df) in the sentence-pair dataset
dev_dataset = Task1A_Dataset(dev_df, tokenizer)

# Batch the dev set in its original row order (shuffle=False) for evaluation
dev_dataloader = DataLoader(dev_dataset, batch_size=64, shuffle=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
from transformers import BertModel
import torch
import torch.nn as nn

class BertForTextSimilarity(nn.Module):
    """BERT encoder followed by one linear layer that regresses a similarity score.

    Args:
        freeze_bert: When True, gradients are disabled for every BERT parameter
            so only the regression head is trainable.
    """

    def __init__(self, freeze_bert=False):
        super().__init__()
        # Load pre-trained BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Linear head mapping the hidden representation to a single score
        self.regression = nn.Linear(self.bert.config.hidden_size, 1)

        # Option to freeze BERT layers to prevent them from being updated during training
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Predict one score per encoded sentence pair.

        Args:
            input_ids: Token ids, shape (batch_size, seq_len).
            attention_mask: Mask marking real tokens vs. padding, same shape.
            token_type_ids: Optional segment ids distinguishing the two
                sentences. Defaults to None, which reproduces the previous
                behavior where segment ids were never forwarded to BERT.
                NOTE(review): a checkpoint trained without segment ids should
                keep being called without them.

        Returns:
            Tensor of shape (batch_size, 1) with the predicted scores.
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # Use the hidden state of the first token ([CLS]) as the pair representation
        cls_output = outputs.last_hidden_state[:, 0, :]  # Shape: (batch_size, hidden_size)

        # Pass the [CLS] representation through the regression layer
        return self.regression(cls_output)  # Shape: (batch_size, 1)

In [11]:
# Pick the GPU when one is visible, otherwise fall back to the CPU, and place
# a freshly constructed regression model on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForTextSimilarity().to(device)
print("Using CUDA" if device.type == "cuda" else "CUDA is not available. Using CPU instead.")

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Using CUDA


In [12]:
import zipfile
import os

# Archive holding the fine-tuned Setup 1A checkpoint (stored in Google Drive)
zip_path = '/content/drive/MyDrive/BERT_Setup1A.pt.zip'

# Destination directory for the unpacked files
extract_to_path = '/content/'

# Create the destination directory if it does not exist yet
if not os.path.exists(extract_to_path):
    os.makedirs(extract_to_path)

# Unpack every member of the archive into the destination directory
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_path)

print(f'Extracted to: {extract_to_path}')

Extracted to: /content/


In [15]:
# Restore the fine-tuned Setup 1A weights into `model`, remapping stored tensors to `device`.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted
# source (consider weights_only=True where the installed torch supports it).
model.load_state_dict(torch.load("/content/BERT_Setup1A.pt", map_location=device))

<All keys matched successfully>

### Calculate similarity scores on the validation set

In [16]:
model.eval()  # Set the model to evaluation mode
predictions = []
labels = []

with torch.no_grad():  # No need to compute gradients during validation
    for batch in dev_dataloader:
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        # Drop only the trailing size-1 dimension: a bare squeeze() would also
        # collapse a final batch of one sample to a 0-d tensor, whose tolist()
        # is a plain float that list.extend() cannot iterate over.
        predictions.extend(outputs.squeeze(-1).tolist())
        labels.extend(inputs['labels'].tolist())

# Calculate Pearson correlation between gold scores and model predictions
from scipy.stats import pearsonr

correlation, p_value = pearsonr(labels, predictions)

print(f'Pearson Correlation: {correlation:.4f}, P-Value: {p_value:.4f}')

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


Pearson Correlation: 0.8510, P-Value: 0.0000


### Task 1C

In [None]:
# Install sentence-transformers into the kernel's environment (imported by the Task 1C cells).
# NOTE(review): the version is not pinned, so a re-run may install a different release.
%pip install sentence_transformers

In [20]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from sentence_transformers import InputExample, SentenceTransformer, losses
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sklearn.metrics import mean_squared_error
import numpy as np
from scipy.stats import pearsonr

In [18]:
import zipfile
import os

# Archive holding the fine-tuned sentence-transformers model (stored in Google Drive)
zip_path = '/content/drive/MyDrive/fine-tuned-model_epoch_4.zip'

# Destination directory for the unpacked files
extract_to_path = '/content/'

# Create the destination directory if it does not exist yet
if not os.path.exists(extract_to_path):
    os.makedirs(extract_to_path)

# Unpack every member of the archive into the destination directory
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_to_path)

print(f'Extracted to: {extract_to_path}')

Extracted to: /content/


In [21]:
# Turn every dev row into an InputExample holding the two sentences as strings.
# The gold score is divided by 5.0 (presumably mapping a 0-5 scale onto 0-1 — confirm).
dev_examples = [
    InputExample(texts=[str(first), str(second)], label=float(score) / 5.0)
    for first, second, score in zip(
        dev_df['sentence1'], dev_df['sentence2'], dev_df['score']
    )
]

In [22]:
# Load the fine-tuned sentence-transformers model extracted to /content.
# Note that this rebinds the notebook-level name `model`.
model = SentenceTransformer('/content/fine-tuned-model_epoch_4')

In [23]:
# Function to compute predictions
def compute_predictions(model, examples):
    """Score each example by the cosine similarity of its two sentence embeddings.

    Args:
        model: Object whose ``encode`` method maps a list of texts to a tensor
            of embeddings (called with ``convert_to_tensor=True``).
        examples: Items exposing ``texts`` with the two sentences at index 0 and 1.

    Returns:
        NumPy array with one cosine similarity per example.
    """
    # Embed all first sentences, then all second sentences, each in one call
    first_sentences = [item.texts[0] for item in examples]
    first_embeddings = model.encode(first_sentences, convert_to_tensor=True, show_progress_bar=False)
    second_sentences = [item.texts[1] for item in examples]
    second_embeddings = model.encode(second_sentences, convert_to_tensor=True, show_progress_bar=False)

    # Row-wise cosine similarity, moved to the CPU before converting to NumPy
    similarity = torch.cosine_similarity(first_embeddings, second_embeddings)
    return similarity.cpu().numpy()

# Cosine-similarity predictions for every dev sentence pair
predicted_similarities = compute_predictions(model, dev_examples)

# Gold labels as stored on the examples (i.e. the scores already divided by 5.0)
true_labels = np.array([example.label for example in dev_examples])

# Pearson correlation between predictions and gold labels; the p-value is discarded
pearson_corr, _ = pearsonr(predicted_similarities, true_labels)

print(f"Pearson Correlation Coefficient for Validation Data: {pearson_corr}")

Pearson Correlation Coefficient for Validation Data: 0.8914478394906815


### Inference Pipeline for 1A

In [24]:
# Read the tab-separated sample test file and correct the misspelled
# 'setence1' header in a single chained expression.
test_df_1a = (
    pd.read_csv('A3_task1_data_files/sample_test.csv', sep='\t')
    .rename(columns={'setence1': 'sentence1'})
)

In [37]:
# Pull the sentence pairs and their ids out of the test dataframe as plain lists
sentence_list1 = test_df_1a['sentence1'].tolist()
sentence_list2 = test_df_1a['sentence2'].tolist()
ids = test_df_1a['id'].tolist()

In [36]:
def calculate_similarity_scores(sentence_list1, sentence_list2, model, tokenizer, max_length=128):
    """Predict a similarity score for each aligned pair of sentences.

    Args:
        sentence_list1: First sentences of the pairs.
        sentence_list2: Second sentences, aligned by position with ``sentence_list1``.
        model: ``torch.nn.Module`` called with ``input_ids`` and ``attention_mask``
            that returns a single score per pair.
        tokenizer: Object exposing ``encode_plus`` for sentence pairs.
        max_length: Length every encoded pair is padded/truncated to.

    Returns:
        List of Python floats, one score per pair, in input order.
    """
    model.eval()  # Put the model in evaluation mode

    # Run on whichever device the model's weights live on, so a model that
    # was moved to the GPU does not fail with a device mismatch.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    similarity_scores = []

    with torch.no_grad():
        for sentence1, sentence2 in zip(sentence_list1, sentence_list2):
            # Cast to str first (as Task1A_Dataset does) so missing values,
            # which pandas yields as float NaN, do not crash the text cleaning.
            processed_sentence1 = preprocess_text(str(sentence1))
            processed_sentence2 = preprocess_text(str(sentence2))

            # Tokenize the two sentences as one pair
            encoding = tokenizer.encode_plus(
                processed_sentence1, processed_sentence2,
                add_special_tokens=True,
                max_length=max_length,
                padding='max_length',
                truncation=True,
                return_tensors='pt'
            )

            input_ids = encoding['input_ids']
            attention_mask = encoding['attention_mask']
            if target_device is not None:
                input_ids = input_ids.to(target_device)
                attention_mask = attention_mask.to(target_device)

            # Predict the similarity score
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # The batch holds a single pair, so the output reduces to one scalar
            similarity_scores.append(outputs.squeeze().item())

    return similarity_scores

In [39]:
# Build a separate regression model for inference: the notebook-level `model`
# was rebound to the SentenceTransformer in the Task 1C cells.
# NOTE(review): model_1a is not moved with .to(device) in this cell, so it stays
# on the device it was constructed on.
model_1a = BertForTextSimilarity()
model_1a.load_state_dict(torch.load("/content/BERT_Setup1A.pt", map_location=device))
# Score every test sentence pair with the restored Setup 1A model
similarity_scores = calculate_similarity_scores(sentence_list1, sentence_list2, model_1a, tokenizer)

In [41]:
# Assemble ids, predicted scores and the original (unprocessed) sentences into one table
out_df_1a = pd.DataFrame({
    'ID': ids,
    'Similarity Score': similarity_scores,
    'Sentence 1': sentence_list1,
    'Sentence 2': sentence_list2
})

In [43]:
# Write the Setup 1A predictions to a CSV file in the current working directory
csv_file_path = 'demo_output_1a.csv'  # Output file name (relative to the cwd)
out_df_1a.to_csv(csv_file_path, index=False)  # index=False leaves the row index out of the file

print(f"CSV file has been created at: {csv_file_path}")

CSV file has been created at: demo_output_1a.csv


### Inference Pipeline for 1C

In [44]:
# Read the tab-separated sample test file and correct the misspelled
# 'setence1' header in a single chained expression.
test_df_1c = (
    pd.read_csv('A3_task1_data_files/sample_test.csv', sep='\t')
    .rename(columns={'setence1': 'sentence1'})
)

In [45]:
# Pull the sentence pairs and their ids out of the test dataframe as plain lists
# (this rebinds the list names used by the Setup 1A inference cells)
sentence_list1 = test_df_1c['sentence1'].tolist()
sentence_list2 = test_df_1c['sentence2'].tolist()
ids = test_df_1c['id'].tolist()

In [46]:
# Turn every test row into an unlabeled InputExample holding the two sentences as strings
test_examples_1c = [
    InputExample(texts=[str(first), str(second)])
    for first, second in zip(test_df_1c['sentence1'], test_df_1c['sentence2'])
]

In [47]:
# Load the fine-tuned sentence-transformers model extracted to /content.
# Note that this rebinds the notebook-level name `model`.
model = SentenceTransformer('/content/fine-tuned-model_epoch_4')

In [53]:
# Function to compute predictions
def compute_predictions(model, examples):
    """Score each example by the cosine similarity of its two sentence embeddings.

    Args:
        model: Object whose ``encode`` method maps a list of texts to a tensor
            of embeddings (called with ``convert_to_tensor=True``).
        examples: Items exposing ``texts`` with the two sentences at index 0 and 1.

    Returns:
        NumPy array with one cosine similarity per example.
    """
    # Embed all first sentences, then all second sentences, each in one call
    first_sentences = [item.texts[0] for item in examples]
    first_embeddings = model.encode(first_sentences, convert_to_tensor=True, show_progress_bar=False)
    second_sentences = [item.texts[1] for item in examples]
    second_embeddings = model.encode(second_sentences, convert_to_tensor=True, show_progress_bar=False)

    # Row-wise cosine similarity, moved to the CPU before converting to NumPy
    similarity = torch.cosine_similarity(first_embeddings, second_embeddings)
    return similarity.cpu().numpy()

# Compute predictions for the test dataset
predicted_similarities = compute_predictions(model, test_examples_1c)
# Multiply each cosine similarity by 5 (presumably to report on the original score
# scale, mirroring the /5.0 applied to the dev labels — confirm)
predicted_similarities = [5*x for x in predicted_similarities]

In [54]:
# Assemble ids, rescaled predictions and the original sentences into one table
out_df_1c = pd.DataFrame({
    'ID': ids,
    'Similarity Score': predicted_similarities,
    'Sentence 1': sentence_list1,
    'Sentence 2': sentence_list2
})

In [55]:
# Write the Task 1C predictions to a CSV file in the current working directory
csv_file_path = 'demo_output_1c.csv'  # Output file name (relative to the cwd)
out_df_1c.to_csv(csv_file_path, index=False)  # index=False leaves the row index out of the file

print(f"CSV file has been created at: {csv_file_path}")

CSV file has been created at: demo_output_1c.csv
