In [1]:
!pip install -qU sentence_transformers datasets==2.14.5 pyarrow

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 MB[0m [31m46.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import pandas as pd

### Load Data

In [3]:
df = pd.read_csv('extractedNews.csv')

In [3]:
df

Unnamed: 0,content,view,url
0,Historic Supreme Court confirmation comes at a...,left,https://www.cnn.com/2022/04/07/politics/lgbtq-...
1,South Korea warns it can send arms to Ukraine ...,center,https://ny1.com/nyc/queens/politics/2024/10/22...
2,CNN Site Map (Articles) for March - 2023 CNN S...,left,https://www.cnn.com/intl_planning/article/site...
3,More than 200 medical professionals from nearl...,left,https://www.cnn.com/2020/02/07/health/doctors-...
4,"Bud Light wanted to market to all. Now, it’s a...",left,https://www.cnn.com/2023/05/01/business/bud-li...
...,...,...,...
383,Missouri voters approve abortion rights in sta...,left,https://apnews.com/article/abortion-missouri-a...
384,Abortion-rights ballot measures pass in 7 stat...,left,https://www.nbcnews.com/politics/2024-election...
385,Abortion rights advocates say they need more m...,left,https://japantoday.com/category/features/opini...
386,Associated Press Sues Trump Administration Ove...,left,https://deadline.com/2025/02/trump-associated-...


#### convert view label to numerical value

In [4]:
label_map = {'left': 0, 'center': 1, 'right': 2}
df['label_numeric'] = df['view'].map(label_map)


In [6]:
df

Unnamed: 0,content,view,url,label_numeric
0,Historic Supreme Court confirmation comes at a...,left,https://www.cnn.com/2022/04/07/politics/lgbtq-...,0
1,South Korea warns it can send arms to Ukraine ...,center,https://ny1.com/nyc/queens/politics/2024/10/22...,1
2,CNN Site Map (Articles) for March - 2023 CNN S...,left,https://www.cnn.com/intl_planning/article/site...,0
3,More than 200 medical professionals from nearl...,left,https://www.cnn.com/2020/02/07/health/doctors-...,0
4,"Bud Light wanted to market to all. Now, it’s a...",left,https://www.cnn.com/2023/05/01/business/bud-li...,0
...,...,...,...,...
383,Missouri voters approve abortion rights in sta...,left,https://apnews.com/article/abortion-missouri-a...,0
384,Abortion-rights ballot measures pass in 7 stat...,left,https://www.nbcnews.com/politics/2024-election...,0
385,Abortion rights advocates say they need more m...,left,https://japantoday.com/category/features/opini...,0
386,Associated Press Sues Trump Administration Ove...,left,https://deadline.com/2025/02/trump-associated-...,0


#### Split the dataset into train,test and validation

In [5]:
from sentence_transformers import SentenceTransformer, InputExample, losses, util, evaluation

In [6]:
training_split_percentage = 24
val_split_percentage = 12
test_split_percentage = 64

train_split_len = int(len(df) * training_split_percentage / 100)
val_split_len = int(len(df) * val_split_percentage / 100)
test_split_len = len(df) - train_split_len - val_split_len

In [7]:
training_split_documents = df[:train_split_len]
test_split_documents = df[train_split_len:train_split_len+test_split_len]
val_split_documents = df[train_split_len+test_split_len:]

In [8]:
train_text = training_split_documents['content'].tolist()
train_labels = training_split_documents['label_numeric'].tolist()
val_text = val_split_documents['content'].tolist()
val_labels = val_split_documents['label_numeric'].tolist()
test_text = test_split_documents['content'].tolist()
test_labels = test_split_documents['label_numeric'].tolist()

#### Create pairs for contrastive learning

##### Train examples

In [9]:
train_examples = []
for i in range(len(training_split_documents)):
    for j in range(i + 1, len(training_split_documents)):
      if train_labels[i] == train_labels[j]:
        train_examples.append(InputExample(texts=[train_text[i], train_text[j]], label=1.0)) ## Similar pair
      else:
        train_examples.append(InputExample(texts=[train_text[i], train_text[j]], label=0.0)) ## Dissimilar pair

##### Validation examples

In [10]:
val_examples = []
for i in range(len(val_split_documents)):
    for j in range(i + 1, len(val_split_documents)):
      if val_labels[i] == val_labels[j]:
        val_examples.append(InputExample(texts=[val_text[i], val_text[j]], label=1.0)) ## Similar pair
      else:
        val_examples.append(InputExample(texts=[val_text[i], val_text[j]], label=0.0)) ## Dissimilar pair

### Finetuning `all-mpnet-base-v`

#### Verify GPU

In [11]:
import torch

In [12]:
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("GPU not available, using CPU.")

GPU available: NVIDIA A100-SXM4-40GB


#### Load model into GPU for learning

In [13]:
model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [14]:
from torch.utils.data import DataLoader

While training a model, we typically want to pass samples in “minibatches”, reshuffle the data at every epoch to reduce model overfitting, and use Python’s multiprocessing to speed up data retrieval.
DataLoader is an iterable that abstracts this complexity for us in an easy API.

In [15]:
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

#### Loss function
Contrastive loss. Expects as input two texts and a label of either 0 or 1. If the label == 1, then the distance between the two embeddings is reduced. If the label == 0, then the distance between the embeddings is increased.

In [16]:
train_loss = losses.ContrastiveLoss(model=model)

#### Validation setup

In [17]:
# Validation setup
val_evaluator = evaluation.EmbeddingSimilarityEvaluator(
    sentences1=[example.texts[0] for example in val_examples],
    sentences2=[example.texts[1] for example in val_examples],
    scores=[example.label for example in val_examples],
    batch_size=16
)

#### Training

In [18]:
import wandb
wandb.init(mode="disabled")

Log into Huggingface to save/upload the model

In [19]:
from huggingface_hub import notebook_login

notebook_login()

hf_username = "ashwinpatti"

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
EPOCHS = 5
warmup_steps = int(len(train_dataloader) * EPOCHS * 0.1)
name = f"{model_name}_political_view_ft"
#move model to gpu for fine tuning
model.to(device)
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=EPOCHS,
    warmup_steps=warmup_steps,
    output_path=model_name,
    show_progress_bar=True,
    evaluator=val_evaluator,
    evaluation_steps=50
)

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



Step,Training Loss,Validation Loss,Pearson Cosine,Spearman Cosine
50,No log,No log,0.251823,0.22894
100,No log,No log,0.648337,0.65698
150,No log,No log,0.960044,0.824942
200,No log,No log,0.943991,0.824942
250,No log,No log,0.952018,0.824942
268,No log,No log,0.961736,0.824942
300,No log,No log,0.971607,0.824942
350,No log,No log,0.971053,0.824942
400,No log,No log,0.96998,0.824942
450,No log,No log,0.966639,0.824942


In [21]:
#move model back to cpu
model.to("cpu")

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

##### Save the model to hugging face

In [22]:
model.push_to_hub(f"{hf_username}/{name}-legal-ft-v0")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

'https://huggingface.co/ashwinpatti/all-mpnet-base-v2_political_view_ft-legal-ft-v0/commit/a7b3b8c57c49e37a017ecf3f3e8b997fdea80e2f'

#### Model evaluation

##### Test examples and evaluator

In [26]:
test_examples = []
for i in range(len(test_split_documents)):
    for j in range(i + 1, len(test_split_documents)):
      if test_labels[i] == test_labels[j]:
        test_examples.append(InputExample(texts=[test_text[i], test_text[j]], label=1.0)) ## Similar pair
      else:
        test_examples.append(InputExample(texts=[test_text[i], test_text[j]], label=0.0)) ## Dissimilar pair

In [27]:
test_evaluator = evaluation.EmbeddingSimilarityEvaluator(
    sentences1=[example.texts[0] for example in test_examples],
    sentences2=[example.texts[1] for example in test_examples],
    scores=[example.label for example in test_examples],
    batch_size=16
)

##### Evaluation using validation data

In [25]:
model.to(device)
val_evaluator(model)


{'pearson_cosine': 0.9799696092976358, 'spearman_cosine': 0.8249423833511556}

##### Evaluation using test data

In [28]:
test_evaluator(model)

{'pearson_cosine': 0.9007315394216624, 'spearman_cosine': 0.7997436966943793}

In [29]:
model.to("cpu")

SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)