<a href="https://colab.research.google.com/github/arkosarker07/Political-Biasness-identifier/blob/main/BiasDetection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment setup: fetch IndicTrans2 and IndicTransToolkit and install their dependencies.
# NOTE(review): this cell is not idempotent — the recorded run shows both `git clone`
# commands failing with "destination path ... already exists" when the cell is re-run.
!git clone https://github.com/AI4Bharat/IndicTrans2.git
# Absolute path: assumes the notebook's working directory is /content (Colab default) — TODO confirm.
%cd /content/IndicTrans2/huggingface_interface
# NOTE(review): the recorded run reports that requirements.txt could not be opened in
# this directory — confirm the file's location in the cloned repository.
!pip install -r requirements.txt
# Extra runtime packages, installed unpinned.
!pip install sentencepiece bitsandbytes scipy accelerate
# The toolkit is cloned inside huggingface_interface and installed in editable mode.
!git clone https://github.com/VarunGumma/IndicTransToolkit.git
%cd IndicTransToolkit
!pip install --editable ./
# Step back out of the toolkit directory (to huggingface_interface).
%cd ..

fatal: destination path 'IndicTrans2' already exists and is not an empty directory.
/content/IndicTrans2/huggingface_interface
[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
fatal: destination path 'IndicTransToolkit' already exists and is not an empty directory.
/content/IndicTrans2/huggingface_interface/IndicTransToolkit
Obtaining file:///content/IndicTrans2/huggingface_interface/IndicTransToolkit
  Installing build dependencies ... [?25l[?25hdone
  Checking if build backend supports build_editable ... [?25l[?25hdone
  Getting requirements to build editable ... [?25l[?25hdone
  Preparing editable metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: indictranstoolkit
  Building editable for indictranstoolkit (pyproject.toml) ... [?25l[?25hdone
  Created wheel for indictranstoolkit: filename=indictranstoolkit-1.1.1-0.editable-cp311-cp311-linux_x86_64.whl size=6523 sha256=19d78e3c8b

In [2]:
import logging
import re

import gradio as gr
import torch
from IndicTransToolkit.processor import IndicProcessor
from transformers import pipeline, AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig

# Runtime configuration
# Run on the GPU when torch can see one, otherwise stay on the CPU.
DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"
# Number of text chunks translated per forward pass (kept small for memory safety).
BATCH_SIZE = 2
# Source and target language tags handed to the IndicProcessor.
src_lang = "ben_Beng"
tgt_lang = "eng_Latn"

# Initialize model + tokenizer, optionally quantized through bitsandbytes
def initialize_model_and_tokenizer(ckpt_dir, quantization="8-bit"):
    """Load a seq2seq checkpoint and its tokenizer, optionally quantized.

    Args:
        ckpt_dir: Model id or local path forwarded to ``from_pretrained``.
        quantization: ``"4-bit"`` or ``"8-bit"`` to load the weights through a
            ``BitsAndBytesConfig``; any other value loads the model
            unquantized.

    Returns:
        A ``(tokenizer, model)`` tuple. The model is in eval mode. When no
        quantization is requested it is moved to ``DEVICE`` and, on CUDA,
        converted to half precision.
    """
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        # Double quantization and compute dtype are 4-bit-only settings;
        # BitsAndBytesConfig has no `bnb_8bit_*` options, so passing them only
        # produced ignored, unused keyword arguments.
        qconfig = BitsAndBytesConfig(load_in_8bit=True)
    else:
        qconfig = None

    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )

    # Only the unquantized model is moved and cast by hand here.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model = model.half()

    model.eval()
    return tokenizer, model

# Smart chunking
def _sentence_fragments(sentence, max_chars):
    """Return `sentence` whole if it fits, else word-level pieces of at most `max_chars`."""
    if len(sentence) <= max_chars:
        return [sentence]
    fragments = []
    for word in sentence.split():
        # A single word longer than the limit is hard-sliced so no piece overflows.
        fragments.extend(
            word[start:start + max_chars] for start in range(0, len(word), max_chars)
        )
    return fragments


def split_into_chunks(text, max_chars=300):
    """Split text into chunks of at most `max_chars` characters.

    The text is first split after the sentence terminators ``।``, ``!`` and
    ``?`` (when followed by whitespace). Sentences are then packed greedily,
    joined by a single space, without exceeding `max_chars`. A sentence that
    is itself longer than `max_chars` is broken on whitespace (and, for a
    single over-long word, sliced) instead of being emitted as one oversized
    chunk.

    Args:
        text: Input text.
        max_chars: Maximum length of each returned chunk; must be >= 1.

    Returns:
        A list of non-empty chunks; ``[]`` for empty or whitespace-only text.

    Raises:
        ValueError: If `max_chars` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    sentences = re.split(r'(?<=[।!?])\s+', text.strip())
    chunks = []
    current = ""
    for sentence in sentences:
        if not sentence:
            # re.split yields [""] for empty input; never emit empty chunks.
            continue
        for fragment in _sentence_fragments(sentence, max_chars):
            if not current:
                current = fragment
            elif len(current) + 1 + len(fragment) <= max_chars:
                # +1 accounts for the joining space.
                current = f"{current} {fragment}"
            else:
                chunks.append(current)
                current = fragment
    if current:
        chunks.append(current)
    return chunks

# Batch translation
def batch_translate(input_sentences, model, tokenizer, ip, mode="Fast"):
    """Translate text chunks in batches of ``BATCH_SIZE``.

    Args:
        input_sentences: List of source-language strings.
        model: Seq2seq model exposing ``generate``.
        tokenizer: Tokenizer matching `model`.
        ip: Processor providing ``preprocess_batch`` and ``postprocess_batch``.
        mode: ``"Fast"`` uses 3 beams and ``max_length=200``; any other value
            uses 5 beams and ``max_length=256``.

    Returns:
        One output string per input string, in order. If generation, decoding
        or post-processing of a batch raises, every item of that batch is
        replaced by the placeholder ``"[Translation Failed]"`` and the error is
        logged with its traceback.
    """
    translations = []
    fast = mode == "Fast"
    beam_size = 3 if fast else 5
    max_length = 200 if fast else 256

    for i in range(0, len(input_sentences), BATCH_SIZE):
        raw_batch = input_sentences[i: i + BATCH_SIZE]
        batch = ip.preprocess_batch(raw_batch, src_lang=src_lang, tgt_lang=tgt_lang)

        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        try:
            with torch.no_grad():
                generated_tokens = model.generate(
                    **inputs,
                    use_cache=True,
                    min_length=0,
                    max_length=max_length,
                    num_beams=beam_size,
                    num_return_sequences=1,
                )

            decoded = tokenizer.batch_decode(
                generated_tokens,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

            translations += ip.postprocess_batch(decoded, lang=tgt_lang)

        except Exception:
            # Stay best-effort, but leave a trace: the failure used to be
            # swallowed without any record of what went wrong.
            logging.getLogger(__name__).exception(
                "Translation failed for batch starting at index %d", i
            )
            translations += ["[Translation Failed]"] * len(raw_batch)

        finally:
            # Release the batch tensors before the next iteration.
            del inputs
            if DEVICE == "cuda":
                torch.cuda.empty_cache()

    return translations

# Load model with 8-bit quantization
# These module-level objects are created once when the cell runs and are reused
# by translate_and_classify for every request.
# NOTE(review): quantization is fixed to "8-bit" regardless of DEVICE — confirm
# this is intended when no GPU is available.
quantization = "8-bit"
indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-1B"  # checkpoint id forwarded to from_pretrained
indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, quantization)
# Processor handed to batch_translate, which calls its preprocess_batch / postprocess_batch.
ip = IndicProcessor(inference=True)

# Bias classification model
# Text-classification pipeline; note that the model weights and the tokenizer
# are requested from two different repositories.
bias_pipe = pipeline(
    "text-classification",
    model="matous-volf/political-leaning-deberta-large",
    tokenizer="microsoft/deberta-v3-large",
)

# Display names for the pipeline's raw labels. translate_and_classify looks
# labels up with label_map.get(label, label), so unlisted labels are shown as-is.
# NOTE(review): the LABEL_n -> Left/Center/Right order cannot be verified from
# this notebook — confirm against the model card.
label_map = {
    "LABEL_0": "Left",
    "LABEL_1": "Center",
    "LABEL_2": "Right"
}

# Main function
def translate_and_classify(bn_text, mode):
    """Translate Bangla text to English and classify its political leaning.

    Args:
        bn_text: Bangla news text; ``None``, empty or whitespace-only input is
            rejected with a prompt message.
        mode: Translation mode forwarded to ``batch_translate``
            (``"Fast"`` or anything else for the slower setting).

    Returns:
        A 3-tuple of strings ``(bias, confidence, english_text)``:
        - on missing input: a prompt message and two empty strings;
        - if any chunk failed to translate: an error message, an empty
          confidence and the partial translation;
        - otherwise: the mapped label, the score formatted as a percentage,
          and the full English translation.
    """
    # Guard against None as well as blank text before calling .strip().
    if not bn_text or not bn_text.strip():
        return "Please enter Bangla news content.", "", ""

    bn_sents = split_into_chunks(bn_text.strip(), max_chars=300)
    translations = batch_translate(bn_sents, indic_en_model, indic_en_tokenizer, ip, mode)
    english_text = " ".join(translations)

    # Do not classify text that contains failure placeholders.
    if "[Translation Failed]" in translations:
        return "Translation failed for some parts.", "", english_text

    # A whole article can be far longer than the classifier's input window;
    # truncate explicitly instead of feeding an unbounded sequence.
    result = bias_pipe(english_text, truncation=True, max_length=512)[0]
    label = label_map.get(result["label"], result["label"])
    score = result["score"]

    return f"{label}", f"{score:.2%}", english_text

# Gradio UI
# Two-column layout: inputs (article text, translation mode, submit button) on
# the left, read-only results (bias label, confidence, translation) on the right.
with gr.Blocks() as demo:
    gr.Markdown("## Bangla & English News Bias Detector")

    with gr.Row():
        with gr.Column():
            bangla_input = gr.Textbox(label="Paste News Article", lines=10, placeholder="Here...")
            # The selected value is passed as `mode` to translate_and_classify.
            mode_selector = gr.Radio(["Fast", "Accurate"], value="Fast", label="Translation Mode")
            submit_btn = gr.Button("Analyze Bias")

        with gr.Column():
            bias_output = gr.Textbox(label="Bias", interactive=False)
            score_output = gr.Textbox(label="Confidence", interactive=False)
            eng_translation = gr.Textbox(label="English Translation", lines=10, interactive=False)

    # The three outputs are filled, in order, from the 3-tuple returned by translate_and_classify.
    submit_btn.click(translate_and_classify, inputs=[bangla_input, mode_selector], outputs=[bias_output, score_output, eng_translation])

# Starts the app; the recorded Colab run shows Gradio enabling share=True
# automatically and serving the demo on a public URL.
demo.launch()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://989c5564a59335dc66.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


