#### Import Libraries

In [2]:
!pip install datasets transformers scikit-learn torch

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K

In [3]:
from datasets import load_dataset
from transformers import BertTokenizer, BertModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import numpy as np

#### Load Dataset

In [4]:
# Fetch the "labeled_final" configuration of PAWS, then bind its three splits
# to the names used by the rest of the notebook.
dataset = load_dataset("google-research-datasets/paws", "labeled_final")
train_dataset, dev_dataset, test_dataset = (
    dataset[split_name] for split_name in ('train', 'validation', 'test')
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/9.79k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/8.43M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/49401 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/8000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8000 [00:00<?, ? examples/s]

#### Tokeniser and Model

In [5]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint name.
PRETRAINED_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertModel.from_pretrained(PRETRAINED_NAME)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

#### Sentence Embedding

In [6]:
def get_bert_embedding(sentence):
    """Embed text as the average of BERT's last-layer token vectors.

    The text is tokenized with the module-level ``tokenizer`` (truncated to at
    most 128 tokens) and run through the module-level ``model`` without
    gradient tracking.

    Args:
        sentence: The text to embed. Whatever the tokenizer accepts is passed
            through, so a list of strings is embedded as one padded batch.

    Returns:
        A NumPy array holding the mean of ``last_hidden_state`` over the
        token axis (dim 1), with size-1 dimensions squeezed away.
    """
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    # Average only over real tokens: when several sentences are padded to a
    # common length, a plain mean over dim 1 would also count the padding
    # positions. For a single sentence the mask is all ones, so this equals
    # the plain mean (up to floating-point rounding).
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    return (token_sum / token_count).squeeze().numpy()

#### Preprocess Data

In [None]:
def preprocess_data(dataset):
    """Attach a BERT embedding for each sentence of every example.

    Args:
        dataset: Object exposing ``map``; each example must provide the
            'sentence1' and 'sentence2' fields.

    Returns:
        The result of ``dataset.map`` with the fields 'sentence1_embedding'
        and 'sentence2_embedding' produced by ``get_bert_embedding``.
    """
    def add_embeddings(row):
        # One encoder call per sentence, in the same order as the output keys.
        return {
            'sentence1_embedding': get_bert_embedding(row['sentence1']),
            'sentence2_embedding': get_bert_embedding(row['sentence2']),
        }

    return dataset.map(add_embeddings)

# Embed every split (train, then validation, then test) and rebind the names
# to the embedded versions.
train_dataset, dev_dataset, test_dataset = [
    preprocess_data(split) for split in (train_dataset, dev_dataset, test_dataset)
]

Map:   0%|          | 0/49401 [00:00<?, ? examples/s]

#### Combine Embeddings

In [None]:
def combine_embeddings(example):
    """Join the two sentence embeddings of one example end to end.

    Args:
        example: Mapping with 'sentence1_embedding' and 'sentence2_embedding'.

    Returns:
        Dict with a single key, 'combined_embedding', holding the first
        embedding followed by the second as one NumPy array.
    """
    pair = tuple(example[key] for key in ('sentence1_embedding', 'sentence2_embedding'))
    return {'combined_embedding': np.concatenate(pair)}

# Add the concatenated pair vector to every split (train, validation, test).
train_dataset, dev_dataset, test_dataset = [
    split.map(combine_embeddings) for split in (train_dataset, dev_dataset, test_dataset)
]

#### Training

In [None]:
def _features_and_labels(split):
    """Return (vstacked 'combined_embedding' rows, 'label' array) for one split."""
    features = np.vstack(split['combined_embedding'])
    labels = np.array(split['label'])
    return features, labels


X_train, y_train = _features_and_labels(train_dataset)
X_dev, y_dev = _features_and_labels(dev_dataset)
X_test, y_test = _features_and_labels(test_dataset)

In [None]:
# Fit a logistic-regression classifier on the concatenated sentence embeddings.
# NOTE(review): this rebinds `model`, the name get_bert_embedding reads as the
# BERT encoder. Re-running the embedding cells after this one will fail until
# the tokenizer/model cell is run again.
model = LogisticRegression(
    max_iter=1000,
)
model.fit(X=X_train, y=y_train)

#### Test and Validate

In [None]:
# Score the fitted classifier on the validation split.
y_pred_dev = model.predict(X_dev)
accuracy_dev = accuracy_score(y_dev, y_pred_dev)
dev_scores = precision_recall_fscore_support(y_dev, y_pred_dev, average='binary')
precision_dev, recall_dev, f1_dev, _ = dev_scores  # last item (support) is not reported

print(
    f'Validation Accuracy: {accuracy_dev:.4f}',
    f'Validation Precision: {precision_dev:.4f}',
    f'Validation Recall: {recall_dev:.4f}',
    f'Validation F1-Score: {f1_dev:.4f}',
    sep='\n',
)

In [None]:
# Score the fitted classifier on the held-out test split.
y_pred_test = model.predict(X_test)
accuracy_test = accuracy_score(y_test, y_pred_test)
test_scores = precision_recall_fscore_support(y_test, y_pred_test, average='binary')
precision_test, recall_test, f1_test, _ = test_scores  # last item (support) is not reported

print(
    f'Test Accuracy: {accuracy_test:.4f}',
    f'Test Precision: {precision_test:.4f}',
    f'Test Recall: {recall_test:.4f}',
    f'Test F1-Score: {f1_test:.4f}',
    sep='\n',
)