### Phrase Similarity - Bert Embeddings
This section evaluates the similarity of phrase pairs using BERT embeddings and a logistic regression classifier.

#### Libraries

In [1]:
!pip install transformers datasets scikit-learn numpy

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[

In [13]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

#### Load Dataset
The dataset used here is the "PiC/phrase_similarity" dataset. This dataset is divided into three subsets: train, validation (dev), and test.

In [2]:
from datasets import load_dataset

# Load every split of the PiC phrase-similarity dataset.
dataset = load_dataset("PiC/phrase_similarity")

# Bind each split to its own name; the dev split is stored under the key 'validation'.
train_dataset, dev_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/1.42M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/202k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/403k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7004 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

#### Load Pretrained Model and Tokenizer
A pretrained BERT model and tokenizer are loaded. BERT (Bidirectional Encoder Representations from Transformers) is used to convert phrases into embeddings.

In [4]:
# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

#### Bert Embeddings
A function is defined to get the BERT embeddings for a given text. The tokenizer converts the text to tensors and the model outputs the embeddings.

In [5]:
def get_bert_embedding(text):
    """Return a mean-pooled BERT embedding for ``text``.

    The text is tokenized (truncated to at most 128 tokens), passed through the
    module-level BERT ``model`` without gradient tracking, and the token states
    of the last hidden layer are averaged into one vector.

    The average is weighted by the attention mask, so padding tokens (which
    appear when a list of texts of different lengths is passed) do not dilute
    the result. For a single string nothing is padded and this equals a plain
    mean over all tokens.

    Note: this function reads the globals ``tokenizer`` and ``model``. The
    notebook later rebinds ``model`` to a ``LogisticRegression`` classifier, so
    this function has to be called before that cell runs (or after the BERT
    model has been loaded again).

    Args:
        text: A string (or a list of strings) to embed.

    Returns:
        A NumPy array: one hidden-size vector for a single string, or one row
        per text for a list with more than one element.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden_states = outputs.last_hidden_state
    # Shape the mask as (batch, tokens, 1) so it broadcasts over the hidden dimension.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden_states.dtype)
    token_sum = (hidden_states * mask).sum(dim=1)
    # clamp guards against a division by zero should a row ever contain no real tokens.
    token_count = mask.sum(dim=1).clamp(min=1)
    return (token_sum / token_count).squeeze().numpy()

#### Preprocess Data
The datasets are processed to generate embeddings for both phrases in each pair. This involves mapping the embedding function to the dataset.

In [6]:
def preprocess_data(dataset):
    """Add BERT embedding columns for both phrases of every example.

    Args:
        dataset: An object exposing ``map`` whose examples contain the keys
            'phrase1' and 'phrase2'.

    Returns:
        The result of ``dataset.map`` with the added keys 'phrase1_embedding'
        and 'phrase2_embedding', each produced by ``get_bert_embedding``.
    """
    def embed_pair(example):
        # One embedding per phrase; the two phrases are encoded independently.
        return {
            'phrase1_embedding': get_bert_embedding(example['phrase1']),
            'phrase2_embedding': get_bert_embedding(example['phrase2']),
        }

    return dataset.map(embed_pair)

# Embed the phrases of all three splits; each name is rebound to its processed split.
train_dataset, dev_dataset, test_dataset = (
    preprocess_data(split) for split in (train_dataset, dev_dataset, test_dataset)
)

Map:   0%|          | 0/7004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

#### Combine Embeddings
The embeddings for each pair of phrases are concatenated to form a single feature vector. This combined embedding represents the relationship between the two phrases.

In [8]:
def combine_embeddings(example):
    """Concatenate the two phrase embeddings of one example.

    Args:
        example: Mapping with the keys 'phrase1_embedding' and
            'phrase2_embedding'.

    Returns:
        A dict with the single key 'combined_embedding' holding the first
        embedding followed by the second one.
    """
    pair = (example['phrase1_embedding'], example['phrase2_embedding'])
    return {'combined_embedding': np.concatenate(pair)}

# Add the 'combined_embedding' column to all three splits.
train_dataset, dev_dataset, test_dataset = (
    split.map(combine_embeddings)
    for split in (train_dataset, dev_dataset, test_dataset)
)

Map:   0%|          | 0/7004 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

#### Train a classifier
A logistic regression model is trained using the combined embeddings from the training dataset. The labels indicate whether the phrases are similar or not.

In [9]:
# Stack the per-example feature vectors row-wise and collect the labels as an array.
X_train, y_train = (
    np.vstack(train_dataset['combined_embedding']),
    np.array(train_dataset['label']),
)
X_dev, y_dev = (
    np.vstack(dev_dataset['combined_embedding']),
    np.array(dev_dataset['label']),
)

In [14]:
# NOTE: this rebinds `model`, which until now referred to the BERT encoder read by
# get_bert_embedding; the embedding cells cannot be re-run after this one without
# loading the BERT model again.
# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predicted labels for the dev split.
y_pred_dev = model.predict(X_dev)

In [15]:
# Score the dev predictions; the fourth value returned with average='binary' is unused.
precision_dev, recall_dev, f1_dev, _ = precision_recall_fscore_support(
    y_dev, y_pred_dev, average='binary'
)
accuracy_dev = accuracy_score(y_dev, y_pred_dev)

# One metric per output line, formatted to four decimals.
print(
    f'Validation Accuracy: {accuracy_dev:.4f}',
    f'Validation Precision: {precision_dev:.4f}',
    f'Validation Recall: {recall_dev:.4f}',
    f'Validation F1-Score: {f1_dev:.4f}',
    sep='\n',
)

Validation Accuracy: 0.3000
Validation Precision: 0.3054
Validation Recall: 0.3140
Validation F1-Score: 0.3097


#### Evaluate on Test Set
The trained classifier is now evaluated on the held-out test set using the same metrics as for the validation (dev) set: accuracy, precision, recall, and F1-score.

In [16]:
# Build the test feature matrix and label vector the same way as for train/dev.
X_test, y_test = (
    np.vstack(test_dataset['combined_embedding']),
    np.array(test_dataset['label']),
)

# Predicted labels for the test split, from the classifier fitted above.
y_pred_test = model.predict(X_test)

In [17]:
# Score the test predictions; the fourth value returned with average='binary' is unused.
precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(
    y_test, y_pred_test, average='binary'
)
accuracy_test = accuracy_score(y_test, y_pred_test)

# One metric per output line, formatted to four decimals.
print(
    f'Test Accuracy: {accuracy_test:.4f}',
    f'Test Precision: {precision_test:.4f}',
    f'Test Recall: {recall_test:.4f}',
    f'Test F1-Score: {f1_test:.4f}',
    sep='\n',
)

Test Accuracy: 0.2790
Test Precision: 0.2794
Test Recall: 0.2800
Test F1-Score: 0.2797


### Phrase Similarity - Averaging Word Embeddings

In [18]:
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
# word_tokenize needs the Punkt sentence models. Recent NLTK releases look them up
# under 'punkt_tab' instead of the pickled 'punkt' package, so fetch both; a name
# unknown to the installed NLTK version is reported by the downloader without raising.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [21]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2024-06-08 11:42:59--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-06-08 11:42:59--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-06-08 11:43:00--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [22]:
def load_glove_embeddings(file_path):
    """Load word vectors from a whitespace-separated GloVe text file.

    Every non-blank line is split on whitespace; the first field is the token
    and the remaining fields are parsed as the components of its vector.
    Blank lines (for example a trailing empty line) are skipped instead of
    raising ``IndexError``.

    Args:
        file_path: Path to the text file; it is read as UTF-8.

    Returns:
        dict mapping each token (str) to a 1-D float32 NumPy array. If a token
        occurs more than once, the last occurrence wins.
    """
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Nothing to parse on an empty or whitespace-only line.
                continue
            word, *components = values
            embeddings_index[word] = np.asarray(components, dtype='float32')
    return embeddings_index

# Use the 300-dimensional vectors; this must agree with the default
# embedding_dim=300 of get_average_embedding.
glove_path = 'glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(file_path=glove_path)

In [None]:
def get_average_embedding(phrase, embeddings_index, embedding_dim=300):
    """Average the word vectors of all known words in ``phrase``.

    The phrase is lower-cased and tokenized with ``word_tokenize``; tokens
    that are missing from ``embeddings_index`` are ignored.

    Args:
        phrase: Text to embed.
        embeddings_index: Mapping from token to its vector.
        embedding_dim: Length of the zero vector returned when no token of the
            phrase is found in ``embeddings_index``.

    Returns:
        The element-wise mean of the matching vectors, or a float32 zero vector
        of length ``embedding_dim`` if there is no matching token. float32 is
        used for the fallback because ``load_glove_embeddings`` stores vectors
        as float32, which keeps the dtype uniform across all examples.
    """
    words = word_tokenize(phrase.lower())
    vectors = [embeddings_index[word] for word in words if word in embeddings_index]
    if not vectors:
        return np.zeros(embedding_dim, dtype=np.float32)
    return np.mean(np.array(vectors), axis=0)

In [None]:
def preprocess_dataset(dataset, embeddings_index):
    """Add averaged word-vector columns for both phrases of every example.

    Args:
        dataset: An object exposing ``map`` whose examples contain the keys
            'phrase1' and 'phrase2'.
        embeddings_index: Mapping from token to vector, forwarded to
            ``get_average_embedding``.

    Returns:
        The result of ``dataset.map`` with the keys 'phrase1_embedding' and
        'phrase2_embedding' set to the averaged embeddings.
    """
    def embed_pair(example):
        # Each phrase is embedded on its own with the default embedding_dim.
        return {
            'phrase1_embedding': get_average_embedding(example['phrase1'], embeddings_index),
            'phrase2_embedding': get_average_embedding(example['phrase2'], embeddings_index),
        }

    return dataset.map(embed_pair)

# Replace the phrase embedding columns of all three splits with averaged GloVe vectors.
train_dataset, dev_dataset, test_dataset = (
    preprocess_dataset(split, glove_embeddings)
    for split in (train_dataset, dev_dataset, test_dataset)
)

#### Combine Embedding

In [None]:
def combine_embeddings(example):
    """Join the two phrase embeddings of an example into one feature vector.

    This repeats the definition used in the BERT section of the notebook.

    Args:
        example: Mapping with the keys 'phrase1_embedding' and
            'phrase2_embedding'.

    Returns:
        A dict whose only key, 'combined_embedding', holds the first embedding
        followed by the second one.
    """
    first, second = (example[key] for key in ('phrase1_embedding', 'phrase2_embedding'))
    return {'combined_embedding': np.concatenate((first, second))}

In [None]:
# Recompute 'combined_embedding' for all three splits from the current phrase embeddings.
train_dataset, dev_dataset, test_dataset = (
    split.map(combine_embeddings)
    for split in (train_dataset, dev_dataset, test_dataset)
)

# Stack the per-example feature vectors row-wise and collect the labels as arrays.
X_train, y_train = (
    np.vstack(train_dataset['combined_embedding']),
    np.array(train_dataset['label']),
)
X_dev, y_dev = (
    np.vstack(dev_dataset['combined_embedding']),
    np.array(dev_dataset['label']),
)
X_test, y_test = (
    np.vstack(test_dataset['combined_embedding']),
    np.array(test_dataset['label']),
)

#### Model Train

In [None]:
# fit() returns the estimator itself, so construction and training can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict on the dev split and score the predictions;
# the fourth value returned with average='binary' is unused.
y_pred_dev = model.predict(X_dev)
precision_dev, recall_dev, f1_dev, _ = precision_recall_fscore_support(
    y_dev, y_pred_dev, average='binary'
)
accuracy_dev = accuracy_score(y_dev, y_pred_dev)

# One metric per output line, formatted to four decimals.
print(
    f'Validation Accuracy: {accuracy_dev:.4f}',
    f'Validation Precision: {precision_dev:.4f}',
    f'Validation Recall: {recall_dev:.4f}',
    f'Validation F1-Score: {f1_dev:.4f}',
    sep='\n',
)

#### Test

In [None]:
# Predict on the test split and score the predictions;
# the fourth value returned with average='binary' is unused.
y_pred_test = model.predict(X_test)
precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(
    y_test, y_pred_test, average='binary'
)
accuracy_test = accuracy_score(y_test, y_pred_test)

# One metric per output line, formatted to four decimals.
print(
    f'Test Accuracy: {accuracy_test:.4f}',
    f'Test Precision: {precision_test:.4f}',
    f'Test Recall: {recall_test:.4f}',
    f'Test F1-Score: {f1_test:.4f}',
    sep='\n',
)