In [2]:
# 1Ô∏è‚É£ Uninstall any existing Transformers version
!pip uninstall -y transformers tokenizers

# 2Ô∏è‚É£ Install stable 4.44.2 version
!pip install transformers==4.44.2 datasets evaluate sentencepiece accelerate --upgrade

# 3Ô∏è‚É£ Verify version
import transformers
print("Transformers version:", transformers.__version__)  # Should print 4.44.2

!pip install sentencepiece

Found existing installation: transformers 4.44.2
Uninstalling transformers-4.44.2:
  Successfully uninstalled transformers-4.44.2
Found existing installation: tokenizers 0.19.1
Uninstalling tokenizers-0.19.1:
  Successfully uninstalled tokenizers-0.19.1
Collecting transformers==4.44.2
  Using cached transformers-4.44.2-py3-none-any.whl.metadata (43 kB)
Collecting tokenizers<0.20,>=0.19 (from transformers==4.44.2)
  Using cached tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Using cached transformers-4.44.2-py3-none-any.whl (9.5 MB)
Using cached tokenizers-0.19.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.19.1 transformers-4.44.2


Transformers version: 4.44.2


In [1]:
# Step 4: import the libraries used by the rest of the notebook.
import evaluate
import numpy as np
import pandas as pd
import torch
import transformers  # imported as a module so its version can be printed below
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

print("Setup complete ‚úÖ")
print("Transformers version:", transformers.__version__)

Setup complete ‚úÖ
Transformers version: 4.44.2


In [3]:
# Step 5: load dataset (example: AG News, 3-class mapping)
# Dataset identifier and the names of its text / label columns.
DATASET, TEXT_COL, LABEL_COL = "ag_news", "text", "label"

# Load the dataset by name; the result is remapped and tokenized in later steps.
ds = load_dataset(DATASET)

# Map labels to 0,1,2
def map_labels(example, label_col="label", num_classes=3):
    """Fold an example's integer label into the range ``0 .. num_classes - 1``.

    The label is reduced modulo ``num_classes``, so with the default of 3 a label
    of 3 becomes 0 (it is merged into class 0, not given a class of its own).
    NOTE(review): confirm that merging is the intended 3-class mapping for the
    dataset in use.

    Args:
        example: mutable mapping for one row; must contain ``label_col``.
        label_col: key holding the integer label. Defaults to ``"label"``.
        num_classes: number of target classes. Defaults to 3.

    Returns:
        The same ``example`` object, with ``example[label_col]`` updated in place.

    Raises:
        KeyError: if ``label_col`` is missing from ``example``.
    """
    example[label_col] = example[label_col] % num_classes
    return example

# Run map_labels over the loaded dataset and rebind `ds` to the remapped result.
ds = ds.map(map_labels)
# Print the dataset summary as a quick sanity check of the result.
print(ds)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [4]:
 #6Ô∏è‚É£ Tokenization
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Padding is left to the collator, which is why the tokenizer call below does not pad.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def tokenize_fn(batch):
    """Tokenize the text column of a batch with truncation enabled and padding disabled."""
    texts = batch[TEXT_COL]
    return tokenizer(texts, padding=False, truncation=True)


tokenized = ds.map(tokenize_fn, batched=True)

# Step 7: load the pretrained checkpoint with a 3-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=3)

# ==



Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Step 8: training configuration — three epochs, evaluating and checkpointing once per epoch.
training_args = TrainingArguments(
    output_dir="distilbert_bias_model",
    num_train_epochs=3,                 # number of passes over the training split
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=50,
    eval_strategy="epoch",              # evaluate after each epoch (replaces the deprecated `evaluation_strategy`)
    save_strategy="epoch",              # write a checkpoint after each epoch
    save_total_limit=2,                 # optional: keep only 2 checkpoints
)

# Metric object used by compute_metrics during evaluation.
metric = evaluate.load("accuracy")




In [6]:
def compute_metrics(p):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        p: object exposing ``predictions`` (per-class scores, classes on axis 1)
            and ``label_ids`` (reference labels).

    Returns:
        Whatever ``metric.compute`` returns for the arg-max predictions.
    """
    predicted_classes = np.asarray(p.predictions).argmax(axis=1)
    return metric.compute(predictions=predicted_classes, references=p.label_ids)

# Wire the model, data, and metric callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized["train"],
    # Evaluate on the "test" split when it exists; otherwise reuse the training split.
    eval_dataset=tokenized.get("test", tokenized["train"]),
)


In [None]:
!pip install wandb

import wandb
wandb.login()  # It will prompt you for your API key
wandb.init(project="media-bias-distilbert")
# Train the model
trainer.train()


In [8]:
# üîü Save model for Streamlit
# Persist the trained model and its tokenizer into the same folder so the folder
# can later be loaded by path.
trainer.save_model("distilbert_bias_model")
tokenizer.save_pretrained("distilbert_bias_model")
print("Model saved to distilbert_bias_model/ ‚úÖ")

Model saved to distilbert_bias_model/ ‚úÖ


In [9]:
# Step 11: test single-headline inference
from transformers import pipeline
# Build a text-classification pipeline from the saved model folder.
# return_all_scores=True makes it report a score for every label, not just the top one.
pipe = pipeline("text-classification", model="distilbert_bias_model", return_all_scores=True)

# Smoke test on one hand-written headline; the printed value is a nested list of
# {'label', 'score'} dicts.
test_headline = "Government announces new tax reductions for corporations."
print(pipe(test_headline))

[[{'label': 'LABEL_0', 'score': 0.35182759165763855}, {'label': 'LABEL_1', 'score': 0.29249635338783264}, {'label': 'LABEL_2', 'score': 0.3556760549545288}]]




In [16]:
!pip install streamlit
import os
import zipfile
import pickle
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
import pickle

# Create folder structure
# NOTE(review): this only creates an empty model sub-folder; nothing in this cell
# copies model files into it.
os.makedirs("media-bias-streamlit/distilbert_bias_model", exist_ok=True)

# Source of the Streamlit app that this cell writes to disk.
# Differences from a naive script:
#   * `except Exception` instead of a bare `except`, so KeyboardInterrupt / SystemExit
#     are not swallowed while loading the model;
#   * the pickled fallback is wrapped so it can be called like the transformers
#     pipeline (a scikit-learn pipeline object is not callable by itself).
app_code = """
import pickle

import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

st.set_page_config(page_title="Media Bias Detection", layout="wide")
st.title("Media Bias Detection in News Headlines")

headline = st.text_area("Enter a news headline:")


def _wrap_fallback(clf):
    # Adapt the pickled fallback to the call style of the transformers pipeline.
    # Assumes a scikit-learn style estimator (predict_proba / classes_) unless the
    # object is already callable.
    if callable(clf):
        return clf

    def predict(text):
        probs = clf.predict_proba([text])[0]
        return [[{"label": str(label), "score": float(prob)} for label, prob in zip(clf.classes_, probs)]]

    return predict


# Load DistilBERT model
@st.cache_resource
def load_model():
    try:
        model = AutoModelForSequenceClassification.from_pretrained("distilbert_bias_model")
        tokenizer = AutoTokenizer.from_pretrained("distilbert_bias_model")
        return pipeline("text-classification", model=model, tokenizer=tokenizer, return_all_scores=True)
    except Exception:
        # Fallback to small TF-IDF model
        # SECURITY: pickle.load can execute arbitrary code; only ship a file you created yourself.
        with open("model_demo.pkl", "rb") as f:
            return _wrap_fallback(pickle.load(f))

nlp_pipe = load_model()

if st.button("Predict"):
    if headline.strip() == "":
        st.warning("Please enter a headline.")
    else:
        result = nlp_pipe(headline)
        st.write("Prediction Scores:")
        st.json(result)
"""
# Write the generated app source to disk. UTF-8 is requested explicitly so the
# result does not depend on the platform's default encoding.
# (TextClassificationPipeline is already imported at the top of this cell, so no
# further import is needed here.)
with open("media-bias-streamlit/app.py", "w", encoding="utf-8") as f:
    f.write(app_code)

def load_model():
    """Return a text classifier, preferring the fine-tuned transformer.

    Returns:
        A ``TextClassificationPipeline`` built from the ``distilbert_bias_model``
        folder (configured to return scores for all labels), or — if building it
        fails for any reason — the object unpickled from ``model_demo.pkl``.

    Raises:
        OSError: if the transformer cannot be loaded and ``model_demo.pkl``
            cannot be opened (e.g. ``FileNotFoundError``).
    """
    try:
        model = AutoModelForSequenceClassification.from_pretrained("distilbert_bias_model")
        tokenizer = AutoTokenizer.from_pretrained("distilbert_bias_model")
        return TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True)
    except Exception:
        # `except Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit still propagate instead of silently triggering the fallback.
        # SECURITY: pickle.load can execute arbitrary code; only use a trusted file.
        with open("model_demo.pkl", "rb") as f:
            return pickle.load(f)





In [41]:
# Print the path of every .pkl file found under /content.
# The loop variables are deliberately not named `files`: this notebook binds that
# name to the google.colab download helper, and a module-level loop variable with
# the same name would overwrite it.
for folder, _subdirs, filenames in os.walk("/content", topdown=True):
    for filename in filenames:
        if filename.endswith(".pkl"):
            print(os.path.join(folder, filename))

In [35]:
# 4. ADD PICKLE FILE
# -----------------------------
# Self-contained on purpose: this cell brings `shutil` into scope and makes sure the
# project folder exists, so it also works on a fresh kernel run top to bottom.
import shutil

project_dir = "media_bias_streamlit"  # same folder name the packaging cell uses
os.makedirs(project_dir, exist_ok=True)

pickle_file = "bias_classifier.pkl"  # change if yours has a different name
if os.path.exists(pickle_file):
    # Copy the pickle next to the app inside the project folder.
    shutil.copy(pickle_file, f"{project_dir}/{pickle_file}")
else:
    print("‚ö†Ô∏è Pickle file not found!")

‚ö†Ô∏è Pickle file not found!


In [47]:
import os
import shutil
from google.colab import files

# -----------------------------
# 1. CREATE PROJECT FOLDER
# -----------------------------
project_dir = "media_bias_streamlit"
os.makedirs(project_dir, exist_ok=True)

# -----------------------------
# 2. SAVE requirements.txt
# -----------------------------
# One requirement per line, with a trailing newline at the end of the file.
requirements = "\n".join(
    [
        "streamlit",
        "transformers==4.44.2",
        "torch",
        "scikit-learn",
        "numpy",
        "pandas",
        "protobuf",
    ]
) + "\n"

with open(f"{project_dir}/requirements.txt", "w") as requirements_file:
    requirements_file.write(requirements)

    # 3. COPY TRAINED MODEL FOLDER
# -----------------------------
# Replace YOUR_MODEL_FOLDER with actual path if different
model_src = "distilbert_bias_model"

if os.path.exists(model_src):
    # The source folder is also the Trainer's output directory, so besides the final
    # saved model it can contain per-epoch `checkpoint-*` sub-folders. Those are not
    # needed to serve the model, so they are left out of the bundle.
    shutil.copytree(
        model_src,
        f"{project_dir}/model",
        dirs_exist_ok=True,
        ignore=shutil.ignore_patterns("checkpoint-*"),
    )
else:
    print("‚ö†Ô∏è Model folder not found! Check the name.")

    # 4. ADD PICKLE FILE
# -----------------------------
pickle_file = "bias_classifier.pkl"  # change if yours has a different name
# Report a missing pickle; otherwise copy it into the project folder under the same name.
if not os.path.exists(pickle_file):
    print("‚ö†Ô∏è Pickle file not found!")
else:
    shutil.copy(pickle_file, f"{project_dir}/{pickle_file}")
# 5. ADD app.py
# -----------------------------
# Source of the Streamlit app bundled with the project. The generated app:
#   * loads the model inside an @st.cache_resource function, so it is not reloaded
#     every time Streamlit reruns the script;
#   * treats bias_classifier.pkl as optional, so a bundle without the pickle still starts;
#   * ignores surrounding whitespace when checking that enough text was entered.
app_code = """
import os
import pickle

import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline

st.title("üì∞ Media Bias Classifier")

MODEL_PATH = "model"
FALLBACK_PATH = "bias_classifier.pkl"


@st.cache_resource
def load_pipeline():
    # Load the fine-tuned model once and reuse it across reruns.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
    return TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=False)


@st.cache_resource
def load_fallback():
    # Load Logistic Regression fallback if it was bundled; return None otherwise.
    if not os.path.exists(FALLBACK_PATH):
        return None
    # SECURITY: pickle.load can execute arbitrary code; only ship a file you created yourself.
    with open(FALLBACK_PATH, "rb") as f:
        return pickle.load(f)


nlp = load_pipeline()
clf = load_fallback()

user_text = st.text_area("Enter news text:", "")

if st.button("Predict Bias"):
    if len(user_text.strip()) < 5:
        st.warning("Please enter more text.")
    else:
        result = nlp(user_text)[0]['label']
        st.success(f"Predicted Bias: {result}")
"""

# Write the app source into the project folder. The source contains non-ASCII text,
# so the encoding is set explicitly instead of relying on the platform default.
with open(f"{project_dir}/app.py", "w", encoding="utf-8") as f:
    f.write(app_code)


In [48]:
# 6. ZIP EVERYTHING
# -----------------------------
# Name of the archive that the download step refers to.
zip_path = "media_bias_streamlit.zip"
# Archive the contents of the project folder; the call's return value (the archive
# path) is the cell's displayed result.
shutil.make_archive(base_name="media_bias_streamlit", format="zip", root_dir=project_dir)


'/content/media_bias_streamlit.zip'

In [49]:
# 7. DOWNLOAD ZIP + INDIVIDUAL FILES
# -----------------------------
# Always offered: the archive itself, then the two generated text files.
for _download_target in (zip_path, f"{project_dir}/requirements.txt", f"{project_dir}/app.py"):
    files.download(_download_target)

# The pickle is only offered when it was actually copied into the project folder.
_bundled_pickle = f"{project_dir}/{pickle_file}"
if os.path.exists(_bundled_pickle):
    files.download(_bundled_pickle)

print("‚úÖ DONE ‚Äî All files are ready for Streamlit deployment!")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

‚úÖ DONE ‚Äî All files are ready for Streamlit deployment!
