In [1]:
# 1️⃣ Uninstall any existing Transformers version
# %pip (rather than !pip) runs pip for the interpreter backing this kernel,
# so the packages land in the environment the notebook actually imports from.
%pip uninstall -y transformers tokenizers

# Install the pinned 4.44.2 release together with the other libraries used below.
# --upgrade moves the unpinned packages to their newest available releases.
%pip install transformers==4.44.2 datasets evaluate sentencepiece accelerate --upgrade

# Verify the version. If transformers had already been imported in this
# session, restart the runtime first: an already-loaded module is not replaced
# by reinstalling the package.
import transformers
print("Transformers version:", transformers.__version__)  # Should print 4.44.2


Found existing installation: transformers 4.57.1
Uninstalling transformers-4.57.1:
  Successfully uninstalled transformers-4.57.1
Found existing installation: tokenizers 0.22.1
Uninstalling tokenizers-0.22.1:
  Successfully uninstalled tokenizers-0.22.1
Collecting transformers==4.44.2
  Downloading transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.44.2)
  Downloading tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading transformers-4.44.2-py3-none-any.whl (9.5 MB)
[

In [2]:
# 4️⃣ Import libraries
import pandas as pd
import transformers  # Needed for version check
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import numpy as np
import evaluate
import torch

# Confirm the imports succeeded and report the active Transformers release.
print(
    "Setup complete ✅",
    f"Transformers version: {transformers.__version__}",
    sep="\n",
)

Setup complete ✅
Transformers version: 4.44.2


In [3]:
# 5️⃣ Load dataset (example: AG News, 3-class mapping)
# Hub identifier of the dataset, plus the names of its text and label columns.
DATASET, TEXT_COL, LABEL_COL = "ag_news", "text", "label"

# Download (or reuse the cached copy of) the dataset with all of its splits.
ds = load_dataset(path=DATASET)

# Map labels to 0,1,2
def map_labels(example):
    """Fold the example's class id into the range 0..2 using modulo 3.

    The example is modified in place and the same object is returned.

    NOTE(review): any class id >= 3 wraps around onto an existing class
    (3 -> 0, 4 -> 1, ...), so distinct source classes can end up merged.
    Confirm this is the intended 3-class mapping for the chosen dataset.
    """
    folded = example["label"] % 3
    example["label"] = folded
    return example

# Apply the label folding to every example of every split, then show the splits.
ds = ds.map(function=map_labels)
print(ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [4]:
 #6️⃣ Tokenization
# Checkpoint name used for the tokenizer (and reused when the model is loaded).
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=MODEL_NAME)

def tokenize_fn(batch):
    """Tokenize the text column of a batch.

    Inputs longer than the tokenizer's limit are truncated. No padding is
    added at this stage (``padding=False``).

    Args:
        batch: Mapping that holds the raw texts under ``TEXT_COL``.

    Returns:
        The tokenizer's output for the batch's texts.
    """
    texts = batch[TEXT_COL]
    encoded = tokenizer(texts, truncation=True, padding=False)
    return encoded

# Tokenize every split, feeding the tokenizer whole batches at a time.
tokenized = ds.map(function=tokenize_fn, batched=True)

# Collator built from the same tokenizer; it pads the examples of each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the pretrained checkpoint with a 3-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# 8️⃣ TrainingArguments with proper epochs and evaluation
training_args = TrainingArguments(
    output_dir="distilbert_bias_model",   # Checkpoints are written under this directory
    num_train_epochs=3,                 # Number of passes over the training split
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,                   # Log the training loss every 50 steps
    eval_strategy="epoch",              # Evaluate after each epoch (`evaluation_strategy` is the deprecated name)
    save_strategy="epoch",              # Save a checkpoint after each epoch
    save_total_limit=2,                 # Keep only the 2 most recent checkpoints
)

# Accuracy metric from the `evaluate` library, used when scoring predictions.
metric = evaluate.load("accuracy")




Downloading builder script: 0.00B [00:00, ?B/s]

In [6]:
def compute_metrics(p):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        p: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference labels).

    Returns:
        The result of ``metric.compute`` for the predicted vs. reference labels.
    """
    # The predicted class is the column holding the highest score in each row.
    predicted_labels = np.argmax(p.predictions, axis=1)
    reference_labels = p.label_ids
    return metric.compute(predictions=predicted_labels, references=reference_labels)

# Evaluate on the "test" split when it exists; otherwise reuse the training split.
eval_split = tokenized["test"] if "test" in tokenized else tokenized["train"]

# Bundle the model, data, collator and metric function into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized["train"],
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)


In [7]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install wandb

import wandb

# Authenticate with Weights & Biases; prompts for an API key when none is stored.
wandb.login()
# Start the run that will receive this training session's logs.
wandb.init(project="media-bias-distilbert")

# Train the model
trainer.train()




  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33martigupta-gupta68[0m ([33martigupta-gupta68-shree-bala-ji-agenecy[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [8]:
# 🔟 Save model for Streamlit
# Write the model and the tokenizer into the same directory so that the
# directory can be loaded as a single unit later on.
export_dir = "distilbert_bias_model"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)
print(f"Model saved to {export_dir}/ ✅")

Model saved to distilbert_bias_model/ ✅


In [9]:
# 1️⃣1️⃣ Test single headline inference
from transformers import pipeline

# Build a classification pipeline from the directory the model was saved to.
# top_k=None requests the score of every label; it replaces the deprecated
# return_all_scores=True flag, which triggers a deprecation warning.
pipe = pipeline("text-classification", model="distilbert_bias_model", top_k=None)

test_headline = "Government announces new tax reductions for corporations."
print(pipe(test_headline))

[[{'label': 'LABEL_0', 'score': 0.14497864246368408}, {'label': 'LABEL_1', 'score': 0.02821577712893486}, {'label': 'LABEL_2', 'score': 0.8268055319786072}]]




In [12]:
import os
import zipfile

# Create the app folder and, inside it, a folder for the model files.
# exist_ok=True makes re-running this cell harmless.
os.makedirs(
    os.path.join("media-bias-streamlit", "distilbert_bias_model"),
    exist_ok=True,
)

# Source of the Streamlit app, written verbatim to media-bias-streamlit/app.py.
# The app loads the model and tokenizer from a "distilbert_bias_model" path,
# caches the resulting pipeline with st.cache_resource, and shows the scores
# for the entered headline when the "Predict" button is pressed.
# The pipeline is created with top_k=None (scores for every label) instead of
# the deprecated return_all_scores=True flag.
app_code = """
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

st.set_page_config(page_title="Media Bias Detection", layout="wide")
st.title("Media Bias Detection in News Headlines")

headline = st.text_area("Enter a news headline:")

@st.cache_resource
def load_model():
    model = AutoModelForSequenceClassification.from_pretrained("distilbert_bias_model")
    tokenizer = AutoTokenizer.from_pretrained("distilbert_bias_model")
    return pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=None)

nlp_pipe = load_model()

if st.button("Predict"):
    if headline.strip() == "":
        st.warning("Please enter a headline.")
    else:
        result = nlp_pipe(headline)
        st.write("Prediction Scores:")
        st.json(result)
"""
# An explicit encoding keeps the written file independent of the platform default.
with open("media-bias-streamlit/app.py", "w", encoding="utf-8") as f:
    f.write(app_code)

# Dependency list for the Streamlit app, one package per line.
# Only transformers carries a version pin.
requirements = """
streamlit
transformers==4.44.2
datasets
sentencepiece
scikit-learn
pandas
joblib
evaluate
torch
"""
# Store the list next to app.py.
with open("media-bias-streamlit/requirements.txt", "w") as req_file:
    req_file.write(requirements)



