Install packages

In [None]:
!pip install transformers torch scikit-learn pandas numpy tqdm streamlit pyngrok



Upload Files

In [None]:
# Colab-only: upload News_dataset.csv from the local machine into the runtime's
# working directory, where the next cell reads it with pd.read_csv.
from google.colab import files
uploaded = files.upload()


Saving News_dataset.csv to News_dataset.csv


Read csv file

In [None]:
import pandas as pd
import numpy as np

# Read the raw CSV, keep only rows that have both a title and a description,
# and join the two fields (single space) into one text column for the encoder.
df = (
    pd.read_csv("News_dataset.csv")
    .dropna(subset=["Title", "Description"])
    .assign(full_text=lambda frame: frame["Title"] + " " + frame["Description"])
)

print("Dataset shape:", df.shape)
df.head()


Dataset shape: (216885, 3)


Unnamed: 0,Title,Description,full_text
0,Obama Lays Wreath at Arlington National Cemetery,Obama Lays Wreath at Arlington National Cemete...,Obama Lays Wreath at Arlington National Cemete...
1,A Look at the Health of the Chinese Economy,"Tim Haywood, investment director business-unit...",A Look at the Health of the Chinese Economy Ti...
2,Nouriel Roubini: Global Economy Not Back to 2008,"Nouriel Roubini, NYU professor and chairman at...",Nouriel Roubini: Global Economy Not Back to 20...
3,Finland GDP Expands In Q4,Finland's economy expanded marginally in the t...,Finland GDP Expands In Q4 Finland's economy ex...
4,"Tourism, govt spending buoys Thai economy in J...",Tourism and public spending continued to boost...,"Tourism, govt spending buoys Thai economy in J..."


Create Proxy Popularity Score

In [None]:
def text_length_score(text):
    """Return the whitespace-separated word count of ``text`` divided by 100."""
    word_count = len(text.split())
    return word_count / 100

def title_length_score(title):
    """Return the whitespace-separated word count of ``title`` divided by 10."""
    word_count = len(title.split())
    return word_count / 10

# Proxy target: (words in full text) / 100 + (words in title) / 10.
body_component = df["full_text"].apply(text_length_score)
title_component = df["Title"].apply(title_length_score)
df["popularity_score"] = body_component + title_component

# Report the range of the proxy score as a quick sanity check.
for label, value in (
    ("Min:", df["popularity_score"].min()),
    ("Max:", df["popularity_score"].max()),
):
    print(label, value)


Min: 0.18
Max: 3.3


Load Hugging Face Tokenizer and Model

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Start on the CPU; the embedding cell further down re-selects the device
# (CUDA when available) before running the full encoding pass.
device = torch.device("cpu")

model_name = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# eval() puts the module in inference mode; eval() and to() both return the
# module itself, so the calls can be chained.
model = AutoModel.from_pretrained(model_name).eval().to(device)

print("Using device:", device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertModel LOAD REPORT from: distilbert-base-uncased
Key                     | Status     |  | 
------------------------+------------+--+-
vocab_layer_norm.bias   | UNEXPECTED |  | 
vocab_transform.bias    | UNEXPECTED |  | 
vocab_layer_norm.weight | UNEXPECTED |  | 
vocab_projector.bias    | UNEXPECTED |  | 
vocab_transform.weight  | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Using device: cpu


Embedding

In [None]:
from tqdm import tqdm
import torch
import numpy as np

def get_cls_embeddings_batch(text_list, batch_size=64, max_length=128):
    """Encode texts in batches and return the first-token ([CLS]) hidden states.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        text_list: List of strings to embed.
        batch_size: Number of texts tokenized and encoded per forward pass.
        max_length: Token limit per text; longer inputs are truncated.

    Returns:
        A NumPy array with one row per input text, taken from position 0 of the
        model's ``last_hidden_state``. An empty ``text_list`` yields an array of
        shape ``(0, hidden_size)`` instead of raising.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # torch.cat rejects an empty list, so answer the empty case up front.
    if len(text_list) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    model.eval()  # inference mode
    embeddings = []

    for i in tqdm(range(0, len(text_list), batch_size)):
        batch_texts = text_list[i:i+batch_size]

        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors="pt"
        )

        # Move the tokenized batch to whichever device the model lives on
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Hidden state of the first token of every sequence in the batch
        cls_embeddings = outputs.last_hidden_state[:, 0, :]

        # Keep results on the CPU so accelerator memory is not accumulated
        embeddings.append(cls_embeddings.cpu())

    # Concatenate once at the end rather than growing a tensor per batch
    return torch.cat(embeddings, dim=0).numpy()


# Prefer CUDA when it is available, otherwise keep running on the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)

# Encode every combined title + description string in batches of 64.
texts = df["full_text"].tolist()
embeddings = get_cls_embeddings_batch(texts, 64)

print("Embedding shape:", embeddings.shape)



100%|██████████| 3389/3389 [11:32<00:00,  4.89it/s]


Embedding shape: (216885, 768)


Saving embedding file

In [None]:
# Persist the embedding matrix to disk so the slow encoding pass above does not
# have to be repeated.
np.save("news_embeddings.npy", embeddings)
print("Saved embeddings")


Saved embeddings


In [None]:
# Colab-only: send the saved embeddings file from the runtime to the browser.
from google.colab import files
files.download("news_embeddings.npy")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Train Popularity Regression Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Features are the CLS embeddings; the target is the length-based proxy score.
X, y = embeddings, df["popularity_score"].values

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# fit() returns the estimator, so construction and training are chained.
model_reg = LinearRegression().fit(X_train, y_train)

# Mean squared error on the held-out rows.
preds = model_reg.predict(X_test)
mse = mean_squared_error(y_test, preds)
print("MSE:", mse)


MSE: 0.042055128304799134


Save popularity model

In [None]:
import pickle

# Serialize the fitted regressor; app.py opens this same filename
# ("popularity_model.pkl") when it loads its models.
with open("popularity_model.pkl", "wb") as f:
    pickle.dump(model_reg, f)

print("Model saved")


Model saved


In [None]:
# Colab-only: download the pickled regressor; relies on `files` imported from
# google.colab in an earlier cell.
files.download("popularity_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Predict popularity Score

In [None]:
def predict_popularity(title, description):
    """Predict the proxy popularity score for one news item.

    The title and description are joined with a single space, encoded with the
    notebook-level ``tokenizer`` and ``model``, and the first-token hidden state
    is passed to ``model_reg``.

    Args:
        title: News headline text.
        description: News body or summary text.

    Returns:
        The first (and only) element of the regressor's prediction array.
    """
    combined = " ".join((title, description))

    encoded = tokenizer(
        combined,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Position 0 of each sequence, moved to the CPU for scikit-learn.
    first_token_vector = hidden_states[:, 0, :].cpu().numpy()

    return model_reg.predict(first_token_vector)[0]

# Smoke test: score one hand-written headline/description pair.
example_title = "India wins cricket world cup"
example_description = "India defeated Australia in thrilling final match."
score = predict_popularity(title=example_title, description=example_description)

print("Predicted Popularity Score:", score)


Predicted Popularity Score: 0.48383725


Requirements

In [None]:
%%writefile requirements.txt
# NOTE(review): versions are unpinned. popularity_model.pkl is a pickled
# scikit-learn estimator, so consider pinning scikit-learn (and the other
# packages) to the versions used when the model was trained - TODO confirm.
streamlit
torch
transformers
scikit-learn
numpy


Writing requirements.txt


Streamlit Deployment

In [None]:
%%writefile app.py
import streamlit as st
import torch
import pickle
from transformers import AutoTokenizer, AutoModel

# ----------------------------------
# Page Configuration
# ----------------------------------
# Sets the page title, the page icon and a centered layout for the app.
st.set_page_config(
    page_title="News Popularity Intelligence System",
    page_icon="📰",
    layout="centered"
)

# ----------------------------------
# Load Models
# ----------------------------------
@st.cache_resource
def load_models():
    """Load the tokenizer, the text encoder and the pickled regression model.

    The encoder checkpoint must be the one whose first-token embeddings the
    regressor was fitted on, which is "distilbert-base-uncased". A different
    checkpoint with the same embedding width (such as "bert-base-uncased")
    would run without error but give the regressor vectors from another
    embedding space, so the scores would be meaningless.

    Returns:
        tuple: ``(tokenizer, bert_model, reg_model)``.
    """
    encoder_name = "distilbert-base-uncased"

    tokenizer = AutoTokenizer.from_pretrained(encoder_name)
    bert_model = AutoModel.from_pretrained(encoder_name)
    bert_model.eval()  # inference only

    # SECURITY: pickle.load can execute arbitrary code from the file. Only
    # deploy a popularity_model.pkl produced by the trusted training notebook.
    with open("popularity_model.pkl", "rb") as f:
        reg_model = pickle.load(f)

    return tokenizer, bert_model, reg_model

# Unpack the three objects returned by load_models (decorated with
# st.cache_resource above).
tokenizer, bert_model, reg_model = load_models()

# Use CUDA when available, otherwise the CPU, and place the encoder there.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
bert_model.to(device)

# ----------------------------------
# Predict Popularity Score
# ----------------------------------
def predict_score(title, description):
    """Return the regression score for one title/description pair as a float.

    The two strings are joined with a single space, tokenized (truncated to
    128 tokens), encoded by ``bert_model``, and the first-token hidden state is
    passed to ``reg_model``.

    Args:
        title: News headline text.
        description: News description text.

    Returns:
        float: The first element of the regressor's prediction.
    """
    combined = " ".join((title, description))

    encoded = tokenizer(
        combined,
        max_length=128,
        truncation=True,
        padding=True,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        hidden_states = bert_model(**encoded).last_hidden_state

    # Position 0 of each sequence, moved to the CPU for scikit-learn.
    first_token_vector = hidden_states[:, 0, :].cpu().numpy()
    prediction = reg_model.predict(first_token_vector)

    return float(prediction[0])

# ----------------------------------
# Predict Priority
# ----------------------------------
def predict_priority(title, description):
    """Classify a news item as HIGH, LOW or MEDIUM priority.

    Keyword rules are checked in the order HIGH, LOW, MEDIUM against the
    lower-cased title + description (plain substring match); the first rule
    that matches wins. If none match, the regression score picks the level
    using the 0.9 and 1.2 cut-offs.

    Args:
        title: News headline text.
        description: News description text.

    Returns:
        tuple: ``(score, level, emoji, confidence)`` where ``confidence`` is a
        fixed percentage string attached to the rule that fired.
    """
    text = (title + " " + description).lower()
    score = predict_score(title, description)

    # Ordered rule table: (keywords, (level, emoji, confidence)).
    keyword_rules = (
        (
            (
                "world cup", "historic", "record",
                "scientist", "scientists", "ai system",
                "cancer", "breakthrough",
                "earthquake", "flood", "cyclone",
                "prime minister", "national",
                "global", "international",
                "space mission", "satellite", "disaster",
            ),
            ("HIGH", "🟢", "90%"),
        ),
        (
            (
                "college", "hostel", "department",
                "library", "school", "apartment",
                "association", "monthly meeting",
                "parent-teacher", "local",
                "routine", "internal", "residential",
                "community center", "yoga session",
            ),
            ("LOW", "🔴", "75%"),
        ),
        (
            (
                "government", "state government",
                "scholarship", "education policy",
                "training program", "skill development",
                "employment scheme", "startup",
                "business", "funding", "expands",
                "digital portal", "policy", "scheme",
            ),
            ("MEDIUM", "🟡", "80%"),
        ),
    )

    for keywords, (level, emoji, confidence) in keyword_rules:
        if any(keyword in text for keyword in keywords):
            return score, level, emoji, confidence

    # No keyword matched: fall back to the regression score.
    if score < 0.9:
        return score, "LOW", "🔴", "70%"
    if score < 1.2:
        return score, "MEDIUM", "🟡", "80%"
    return score, "HIGH", "🟢", "85%"

# ----------------------------------
# Sidebar Navigation
# ----------------------------------
# The selected label is stored in `page` and compared against the same three
# strings by the if/elif page dispatch below.
page = st.sidebar.radio(
    "📌 Navigation",
    ["🏠 Home", "📰 News Intelligence", "🧠 Model Reasoning"]
)

# Exactly one of the three branches below runs, selected by the sidebar value
# held in `page`.
# =================================================
# PAGE 1: HOME
# =================================================
if page == "🏠 Home":

    st.title("📰 News Popularity Intelligence System")

    st.subheader("🔍 Problem Overview")
    st.write(
        "Predicting the popularity of news articles is challenging because "
        "real engagement metrics such as likes, shares, and views are often unavailable."
    )

    st.subheader("❓ Why Popularity Labels Are Unavailable")
    st.write(
        "- Social media platforms restrict engagement data\n"
        "- News datasets rarely contain popularity labels\n"
        "- Popularity varies across platforms and time"
    )

    # Static text diagram of the pipeline, shown in a code block.
    st.subheader("🏗 System Architecture")
    st.code(
        """
        Title + Description
                ↓
           BERT Encoder
                ↓
         CLS Embedding
                ↓
       Regression Model
                ↓
     Popularity Score
                ↓
      Priority Level
        """
    )

# =================================================
# PAGE 2: NEWS INTELLIGENCE
# =================================================
elif page == "📰 News Intelligence":

    st.title("📰 News Intelligence")

    title = st.text_input("Enter News Title")
    description = st.text_area("Enter News Description")

    if st.button("🚀 Analyze News"):

        # Both fields are required; an empty string is falsy and falls through
        # to the warning in the else branch.
        if title and description:

            with st.spinner("Analyzing news content..."):

                # `confidence` is returned by predict_priority but is not
                # displayed anywhere on this page.
                score, level, color, confidence = predict_priority(title, description)
                score = round(score, 2)

                st.subheader("📊 Output")
                st.write(f"**Popularity Score:** {score}")
                st.markdown(f"### {color} Priority Level: **{level}**")

                st.subheader("🧠 Key Explanatory Highlights")

                # Fixed explanation text per level; it does not depend on which
                # rule or keyword actually produced the level.
                if level == "HIGH":
                    st.write(
                        "- Contains national or global impact keywords\n"
                        "- High semantic importance\n"
                        "- Strong engagement potential"
                    )
                elif level == "MEDIUM":
                    st.write(
                        "- Government, policy, or business related\n"
                        "- Moderate public relevance\n"
                        "- Informative content"
                    )
                else:
                    st.write(
                        "- Institutional or local updates\n"
                        "- Limited audience reach\n"
                        "- Routine information"
                    )

        else:
            st.warning("Please enter both title and description.")

# =================================================
# PAGE 3: MODEL REASONING
# =================================================
# Static explanatory page; no model call happens in this branch.
elif page == "🧠 Model Reasoning":

    st.title("🧠 Model Reasoning")

    st.subheader("⚙ Scoring Logic")
    st.write(
        "- Text is converted into embeddings using BERT\n"
        "- CLS token captures overall meaning\n"
        "- Regression model predicts popularity score"
    )

    st.subheader("📊 Example Comparisons")
    st.write(
        "**College Library Extends Working Hours** → LOW\n\n"
        "**Government Launches Skill Program** → MEDIUM\n\n"
        "**India Wins Cricket World Cup** → HIGH"
    )

    st.subheader("🔎 Model Interpretation")
    st.write(
        "- Model measures textual richness\n"
        "- Does not use real-time social media data\n"
        "- Rule-based logic improves clarity"
    )

    st.subheader("⚠ Limitations")
    st.write(
        "- No real engagement labels\n"
        "- Topic popularity may vary over time\n"
        "- Can be improved with labeled datasets"
    )


Writing app.py


In [None]:
import os
from getpass import getpass

from pyngrok import ngrok

# SECURITY: never hard-code the ngrok auth token in the notebook, because it is
# saved and shared with the file. Read it from the NGROK_AUTH_TOKEN environment
# variable, or prompt for it without echoing. Any token that was previously
# committed here should be revoked in the ngrok dashboard.
ngrok_auth_token = os.environ.get("NGROK_AUTH_TOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(ngrok_auth_token)




In [None]:
# Start the Streamlit app in the background (trailing &) and discard its
# stdout/stderr; the next cell tunnels to port 8501.
!streamlit run app.py &>/dev/null &


In [None]:
# Open an ngrok tunnel to local port 8501 and print the tunnel, which includes
# the public URL.
from pyngrok import ngrok
public_url = ngrok.connect(8501)
print(public_url)


NgrokTunnel: "https://unreducible-twanda-unconsumptive.ngrok-free.dev" -> "http://localhost:8501"


README.md

In [None]:
%%writefile README.md
# 📰 News Popularity Intelligence System

A Streamlit-based application that predicts the popularity priority of news articles
as Low, Medium, or High using a hybrid Machine Learning and Rule-Based approach.

## Problem Statement
Predicting news popularity is difficult because real engagement metrics such as
likes, shares, and views are often unavailable in public datasets.

## Features
- BERT-based text analysis
- Popularity score prediction
- Priority classification (Low / Medium / High)
- Multi-page Streamlit interface

## How to Run
```bash
pip install -r requirements.txt
streamlit run app.py
```


Writing README.md
