In [1]:
import pandas as pd

train_df = pd.read_csv('train.csv', sep='\t')
dev_df = pd.read_csv('dev.csv', sep='\t')

dev_df.rename(columns={'setence1': 'sentence1'}, inplace=True)

In [2]:
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import torch
from transformers import BertTokenizer
import string

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
class Task1B_Dataset(Dataset):
    def __init__(self, dataframe, max_length = 128):
        self.dataframe = dataframe
        self.max_length = max_length

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        text1 = str(self.dataframe.iloc[idx]['sentence1'])
        text2 = str(self.dataframe.iloc[idx]['sentence2'])
        score = self.dataframe.iloc[idx]['score']
        score = score / 5.0
        return text1, text2, score

In [4]:
train_dataset = Task1B_Dataset(train_df)
dev_dataset = Task1B_Dataset(dev_df)

In [5]:
train_dataset[0]

('A plane is taking off.', 'An air plane is taking off.', 1.0)

In [6]:
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
dev_dataloader = DataLoader(dev_dataset, batch_size=64, shuffle=False)

In [8]:
!pip install -U sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-2.6.1-py3-none-any.whl (163 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/163.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/163.3 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m122.9/163.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.3/163.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m45.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.11.0->sentence-trans

In [10]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
model = SentenceTransformer("all-MiniLM-L6-v2")

In [21]:
# Function to compute cosine similarities between pairs of sentences
def compute_cosine_similarities(dataset):
    similarities = []
    model.eval()
    with torch.no_grad():
        for sentence1, sentence2, _ in DataLoader(dataset, batch_size=64, shuffle=False):
            sentence1_embeddings = model.encode(sentence1, convert_to_tensor=True)
            sentence2_embeddings = model.encode(sentence2, convert_to_tensor=True)
            cosine_scores = util.pytorch_cos_sim(sentence1_embeddings, sentence2_embeddings)
            cosine_scores_diagonal = cosine_scores.cpu().numpy().diagonal()
            scaled_scores = (cosine_scores_diagonal + 1) / 2
            final_scores = scaled_scores*5
            similarities.extend(final_scores)

    return np.array(similarities)

In [22]:
train_similarities = compute_cosine_similarities(train_dataset)
dev_similarities = compute_cosine_similarities(dev_dataset)

In [28]:
train_labels = train_df['score'].tolist()
dev_labels = dev_df['score'].tolist()

In [29]:
print(train_labels)

[5.0, 3.8, 3.8, 2.6, 4.25, 4.25, 0.5, 1.6, 2.2, 5.0, 4.2, 4.6, 3.867, 4.667, 1.667, 3.75, 5.0, 0.5, 3.8, 5.0, 3.2, 2.8, 4.6, 3.0, 5.0, 4.8, 5.0, 4.2, 4.2, 4.0, 4.0, 4.909, 3.0, 2.4, 4.2, 3.4, 5.0, 3.75, 2.75, 5.0, 4.0, 3.6, 1.6, 1.75, 5.0, 1.0, 1.0, 2.375, 3.8, 3.2, 3.2, 4.4, 3.75, 4.75, 3.2, 1.556, 3.938, 5.0, 5.0, 4.0, 1.6, 4.75, 3.5, 1.4, 1.4, 4.0, 5.0, 3.833, 0.6, 2.917, 4.2, 2.0, 2.6, 1.6, 2.0, 4.2, 2.0, 4.8, 4.4, 5.0, 3.0, 4.25, 4.25, 3.8, 2.4, 1.6, 2.0, 1.6, 4.0, 2.2, 4.4, 3.6, 3.6, 0.5, 0.8, 0.6, 2.6, 2.0, 2.2, 2.4, 3.6, 2.2, 4.8, 1.643, 1.75, 2.25, 4.0, 4.8, 3.2, 4.0, 4.4, 4.6, 3.8, 4.8, 4.857, 5.0, 2.533, 1.0, 1.0, 2.0, 0.143, 2.0, 1.6, 1.6, 3.4, 4.0, 4.8, 2.5, 1.75, 1.0, 5.0, 1.4, 4.0, 3.8, 4.0, 4.0, 4.8, 0.6, 4.75, 2.2, 3.0, 0.0, 2.2, 0.4, 4.8, 4.8, 3.8, 3.0, 4.0, 5.0, 3.8, 3.0, 4.4, 3.8, 3.0, 0.667, 4.0, 3.75, 4.133, 4.0, 3.6, 1.2, 1.6, 4.0, 4.0, 3.2, 1.0, 1.0, 0.6, 0.4, 3.4, 3.6, 3.0, 4.0, 1.2, 0.6, 1.6, 2.6, 3.6, 3.5, 3.765, 2.75, 2.25, 2.75, 3.8, 4.8, 3.6, 1.2, 0.4, 2.4

In [30]:
print(train_similarities)

[4.8482585 4.7550797 4.7300024 ... 2.5925746 3.2570932 2.4058995]


In [32]:
from scipy.stats import pearsonr
pearson_corr, _ = pearsonr(dev_similarities, dev_labels)
pearson_corr

0.8611497764888689