## Step 1: Installing Dependencies

In [None]:
!pip install feedparser beautifulsoup4 scikit-learn requests python-dateutil --quiet

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone


## Step 2: Imports Dependencies

In [None]:
import re, requests, feedparser
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from datetime import datetime, timezone
from dateutil import parser as dtparse
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Step 3: Load RSS Feed data

In [None]:
RSS_URL = "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx1YlY4U0FtVnVHZ0pWVXlnQVAB?hl=bn&gl=BD&ceid=BD:bn"
rss = feedparser.parse(RSS_URL)
entries = rss.entries
print("Entries:", len(entries))
print("Sample title:", entries[0].title if entries else "No entries")

Entries: 37
Sample title: গাজা নগরীর দুর্ভিক্ষ নিয়ে বিভিন্ন পক্ষের ক্ষোভ, জাতিসংঘ মহাসচিব বললেন—‘মানবতার ব্যর্থতা’ - প্রথম আলো


## Step 4: Creating Helpers Functions (Normalize Text, Bangla Text, Fetching title from URL and Parse Time)

In [None]:
def norm_text(s: str) -> str:
    s = (s or "").strip()
    s = re.sub(r"\s+", " ", s)
    return s

def bn_norm(s: str) -> str:
    s = (s or "").strip()
    s = re.sub(r"[^\w\s]", " ", s)  #Regex for removing Punchuations
    s = re.sub(r"\s+", " ", s)
    return s.lower()

def fetch_title_from_url(url: str) -> str:
    try:
        html = requests.get(url, timeout=8).text
        soup = BeautifulSoup(html, "html.parser")
        t = soup.title.string if soup.title and soup.title.string else ""
        return norm_text(t)
    except Exception:
        return ""

def parse_time(e):
    for key in ("published", "updated"):
        if hasattr(e, key):
            try:
                return dtparse.parse(getattr(e, key))
            except Exception:
                pass
    return None

## Step 5: Prepare Feed Items

In [None]:
feed_items = []
for e in entries:
    title = norm_text(getattr(e, "title", "")).strip()
    link  = getattr(e, "link", "")
    dom   = urlparse(link).netloc.lower()
    ts    = parse_time(e)
    feed_items.append({"title": title, "link": link, "domain": dom, "time": ts})
print("Prepared feed items:", len(feed_items))

Prepared feed items: 37


## Step 6: TF IDF fit on Feed Titles

In [None]:
feed_titles = [i["title"] for i in feed_items if i["title"]]
vectorizer = TfidfVectorizer(max_features=5000, stop_words=None)
_ = vectorizer.fit(feed_titles or ["খবর"])
print("Vectorizer vocabulary size:", len(vectorizer.vocabulary_))

Vectorizer vocabulary size: 179


## Step 7: Ask user input

In [None]:
user_input = input("Enter a news TITLE or URL: ").strip()
is_url = user_input.startswith("http://") or user_input.startswith("https://")

## Step 8: Rsolve input to title or URL

In [None]:
if is_url:
    user_title = fetch_title_from_url(user_input) or norm_text(urlparse(user_input).path.replace("/", " "))
    user_domain = urlparse(user_input).netloc.lower()
else:
    user_title = norm_text(user_input)
    user_domain = ""
print("Resolved title:", user_title[:120] + ("..." if len(user_title)>120 else ""))

## Step 9: Find best match in feed by cosine similarity

In [None]:
def best_match(title: str):
    if not feed_items or not title:
        return None, 0.0
    qv = vectorizer.transform([title])
    cv = vectorizer.transform([i["title"] for i in feed_items])
    sims = cosine_similarity(qv, cv).ravel()
    idx = sims.argmax()
    return feed_items[idx], float(sims[idx])

match_item, sim = best_match(user_title)
print("Best sim:", round(sim, 4))
if match_item:
    print("Matched:", match_item["title"])


## Step 10: Extra strong title/subphrase signal (partial-title fix)

In [None]:
norm_user  = bn_norm(user_title)
norm_match = bn_norm(match_item["title"]) if match_item else ""
strong_title_match = False
if norm_user and norm_match:
    toks = norm_user.split()
    strong_title_match = (norm_user in norm_match) or all(t in norm_match for t in toks)
print("Strong title match:", strong_title_match)

## Step 11: Presence flags and recency

In [None]:
def hours_ago(t):
    if not t:
        return None
    now = datetime.now(timezone.utc)
    if t.tzinfo is None:
        t = t.replace(tzinfo=timezone.utc)
    return max(0.0, (now - t).total_seconds()/3600.0)

# Title presence: decent similarity OR strong subphrase match
in_feed_by_title = (sim >= 0.50) or strong_title_match

# URL presence: only for URL input (exact link match)
in_feed_by_url = False
if is_url:
    in_feed_by_url = any(user_input.strip() == i["link"] for i in feed_items)

# Domain presence:
if is_url:
    domain_in_feed = any(i["domain"] == user_domain for i in feed_items)
else:
    domain_in_feed = bool(match_item)

recency_hours = hours_ago(match_item["time"]) if match_item else None
print("Flags → in_title:", in_feed_by_title, "| in_url:", in_feed_by_url, "| in_domain:", domain_in_feed, "| recency(h):", None if recency_hours is None else round(recency_hours,1))


## Step 12: Weight calculation (0..1) before DT decision

In [None]:
w_sim      = sim
w_presence = (0.5 if in_feed_by_title else 0.0) + (0.3 if in_feed_by_url else 0.0) + (0.2 if domain_in_feed else 0.0)
if strong_title_match:
    w_presence += 0.2  # bonus for subphrase containment
w_recency  = 0.0 if recency_hours is None else max(0.0, 1.0 - min(recency_hours, 48)/48.0)  # fresh→1, >48h→0
raw_weight = 0.6*w_sim + 0.3*w_presence + 0.1*w_recency
weight     = max(0.0, min(1.0, raw_weight))
print({"sim": round(w_sim,3), "presence": round(w_presence,3), "recency": round(w_recency,3), "weight": round(weight,3)})


## Step 13: Decision Tree (rules) → Fake / Neutral / Well Sourced

In [None]:
def dt_decide(sim, in_title, in_url, in_domain, recency_h, weight, strong_title_match):
    # Node 1: strong textual match OR strong subphrase match (partial-title)
    if (in_title and sim >= 0.70) or strong_title_match:
        if in_url or (recency_h is not None and recency_h <= 72):
            return "Well Sourced"
        return "Neutral"
    # Node 2: moderate similarity + domain present (feed-backed)
    if sim >= 0.50 and in_domain:
        return "Well Sourced" if weight >= 0.60 else "Neutral"
    # Node 3: weak similarity, no presence anywhere
    if sim < 0.25 and (not in_title) and (not in_domain):
        return "Fake"
    # Default
    return "Neutral"

final_label = dt_decide(sim, in_feed_by_title, in_feed_by_url, domain_in_feed, recency_hours, weight, strong_title_match)
print("Label:", final_label)


## Step 14: Output Summery

In [1]:
print("\n=== RESULT ===")
print("Label:", final_label)
print("Weight (0..1):", round(weight, 3))
if match_item:
    print("Matched Feed Title:", match_item["title"])
    print("Matched Feed Link :", match_item["link"])
    print("Similarity       :", round(sim, 3))
    print("Recency (hours)  :", None if recency_hours is None else round(recency_hours, 1))
print("Signals → in_title:", in_feed_by_title, "| in_url:", in_feed_by_url, "| domain_in_feed:", domain_in_feed)


=== RESULT ===


NameError: name 'final_label' is not defined