In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

To authenticate with the Hugging Face Hub, you need to create a token in your settings tab (https://huggingface.co/settings/tokens). Then, add it to the secrets manager in Colab under the "🔑" in the left panel. Give it the name `HF_TOKEN`.

After successfully logging in, you can retry loading the dataset.

In [2]:
!pip install datasets scikit-learn pandas requests pillow nltk transformers torch openpyxl tqdm evaluate accelerate

Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.5


In [3]:
from datasets import load_dataset
import pandas as pd

# Specify the 'train' split for ds2
ds2 = load_dataset("mediabiasgroup/BABE")
df2 = pd.DataFrame(ds2['train'])
df2.to_csv("BABE.csv", index=False)

print(ds2.column_names)

README.md:   0%|          | 0.00/770 [00:00<?, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/712k [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/233k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3121 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'train': ['text', 'outlet', 'label', 'topic', 'news_link', 'biased_words', 'uuid', 'type', 'label_opinion'], 'test': ['text', 'outlet', 'label', 'topic', 'news_link', 'biased_words', 'uuid', 'type', 'label_opinion']}


In [4]:
df2.columns

Index(['text', 'outlet', 'label', 'topic', 'news_link', 'biased_words', 'uuid',
       'type', 'label_opinion'],
      dtype='object')

In [5]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'<[^>]+>', ' ', text)
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words and len(w) > 2]
    return ' '.join(words)

df2['clean_text'] = df2['text'].apply(clean_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [6]:
import pandas as pd
import torch
import random
import nltk
from nltk.corpus import wordnet
from transformers import (
    BertTokenizerFast,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback,
    get_scheduler,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.nn import CrossEntropyLoss
from torch.utils.data import Dataset

In [7]:
# Synonym replacement augmentation
def synonym_replacement(text, n=1):
    words = text.split()
    new_words = words.copy()
    random_word_list = list(set([word for word in words if wordnet.synsets(word)]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = wordnet.synsets(random_word)
        if not synonyms:
            continue
        synonym = synonyms[0].lemmas()[0].name().replace("_", " ")
        new_words = [synonym if word == random_word else word for word in new_words]
        num_replaced += 1
        if num_replaced >= n:
            break
    return ' '.join(new_words)

# Apply augmentation to BABE (uses "text" and "label")
augmented_texts = [synonym_replacement(t) for t in df2['text']]
augmented_df2 = pd.DataFrame({'text': augmented_texts, 'label': df2['label']})

# Concatenate original + augmented dataset
df2 = pd.concat([df2, augmented_df2]).reset_index(drop=True)

print("Original size:", len(df2) - len(augmented_df2))
print("Augmented size:", len(df2))
print(df2.head())

Original size: 3121
Augmented size: 6242
                                                text      outlet  label  \
0  NYPD Commissioner Dermot Shea on Monday expres...   Breitbart      0   
1  School systems across the country are adopting...  Federalist      1   
2  And then along came President Barry Obama, who...   Breitbart      1   
3  The curfews, which have never before occurred ...    Alternet      1   
4  Rather than help be a part of the solution, Tr...    Alternet      1   

               topic                                          news_link  \
0  marriage-equality  http://feedproxy.google.com/~r/breitbart/~3/F5...   
1              islam  https://thefederalist.com/2020/07/08/black-liv...   
2  marriage-equality  http://feedproxy.google.com/~r/breitbart/~3/ks...   
3     elections-2020  https://www.alternet.org/2020/06/we-just-got-a...   
4     elections-2020  https://www.alternet.org/2020/06/trump-thought...   

                                biased_words             

In [8]:
# 3. Train/validation split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df2['text'].tolist(), df2['label'].tolist(), test_size=0.2, random_state=42
)

# 4. Tokenization
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
import torch
from torch.utils.data import Dataset

# 5. Dataset class
class BiasDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.labels[idx])
        }

# Create train & test datasets for BABE
train_dataset = BiasDataset(train_encodings, train_labels)
test_dataset = BiasDataset(test_encodings, test_labels)

In [10]:
# 6. Compute class weights (for BABE)
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(train_labels),
    y=train_labels
)
class_weights = torch.tensor(class_weights, dtype=torch.float)

# 7. Custom model with weighted loss
from transformers import BertPreTrainedModel, BertModel
import torch.nn as nn
from torch.nn import CrossEntropyLoss

class CustomBERTClassifier(BertPreTrainedModel):
    def __init__(self, config, class_weights):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        self.class_weights = class_weights
        self.loss_fct = CrossEntropyLoss(weight=self.class_weights)

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss = self.loss_fct(logits, labels)
        return {'loss': loss, 'logits': logits}

# Instantiate model for BABE
model = CustomBERTClassifier.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    class_weights=class_weights
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of CustomBERTClassifier were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'loss_fct.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# 8. Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    warmup_steps=100,
    weight_decay=0.01,
    learning_rate=2e-5,
    logging_dir='./logs',
    logging_steps=100
)

# 9. Define metrics
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        'accuracy': accuracy_score(labels, preds),
        'f1': f1_score(labels, preds, average='weighted')
    }

# 10. Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # Still needed for final evaluation
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# 11. Train the model
trainer.train()

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mi-am-the-arw[0m ([33mi-am-the-arw-indian-institute-of-information-technology-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
100,0.64
200,0.4762
300,0.3825
400,0.2521
500,0.2317
600,0.1877
700,0.0944
800,0.0811
900,0.0885
1000,0.0431


TrainOutput(global_step=1252, training_loss=0.20422544932593933, metrics={'train_runtime': 603.038, 'train_samples_per_second': 33.119, 'train_steps_per_second': 2.076, 'total_flos': 1211079632270880.0, 'train_loss': 0.20422544932593933, 'epoch': 4.0})

In [12]:
# 12. Evaluate
preds_output = trainer.predict(test_dataset)
y_pred = np.argmax(preds_output.predictions, axis=1)
print("\nClassification Report:")
print(classification_report(test_labels, y_pred, target_names=['Biased', 'Non-biased']))


Classification Report:
              precision    recall  f1-score   support

      Biased       0.95      0.97      0.96       580
  Non-biased       0.97      0.95      0.96       669

    accuracy                           0.96      1249
   macro avg       0.96      0.96      0.96      1249
weighted avg       0.96      0.96      0.96      1249



In [15]:
# Save model
model.save_pretrained("sentence_model")
tokenizer.save_pretrained("sentence_model")

('sentence_model/tokenizer_config.json',
 'sentence_model/special_tokens_map.json',
 'sentence_model/vocab.txt',
 'sentence_model/added_tokens.json',
 'sentence_model/tokenizer.json')

In [None]:
!zip -r sentence_model.zip NLP_Axios/
from google.colab import files
files.download("sentence_model.zip")