In [7]:
import pandas as pd
import numpy as np

import re
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [9]:
# Substrings looked for in the lowercased source name by trust_score();
# any hit raises the trust score.
OFFICIAL_KEYWORDS = [
    "gov",
    "government",
    "ndrf",
    "imd",
    "police",
    "fire",
    "disaster",
    "relief",
    "district",
    "collector",
]

# Substrings looked for in the lowercased source name by trust_score();
# any hit lowers the trust score.
LOW_TRUST_KEYWORDS = [
    "whatsapp",
    "forwarded",
    "rumor",
    "unverified",
    "old blog",
]

In [10]:
# Load the document table; the path is relative to the notebook's working directory.
# Later cells read the columns: id, text, source, source_type, location, timestamp.
df = pd.read_csv("truth_clean.csv")

In [18]:
# Normalise the text-like columns to plain Python strings.
# Missing values are replaced with "" *before* the cast: a bare astype(str)
# would either turn them into the literal token "nan" (which TF-IDF would index
# and trust_score() would inspect) or, on pandas versions that preserve missing
# values through astype(str), leave NaN behind and break the later .lower() calls.
for _text_col in ("text", "source", "source_type", "location"):
    df[_text_col] = df[_text_col].fillna("").astype(str)

# Parse timestamps as timezone-aware UTC; unparseable values become NaT.
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", utc=True)

In [11]:
# TF-IDF model over the document text: lowercased, English stop words removed,
# unigrams and bigrams, no minimum document frequency cut-off.
vectorizer = TfidfVectorizer(lowercase=True, stop_words="english", ngram_range=(1, 2), min_df=1)

# Document-term matrix, one row per row of df (same order); used by search().
X = vectorizer.fit_transform(df["text"])

In [12]:
def trust_score(row):
    """Heuristic trust score in [0, 1] for one document row.

    Starts from a base of 0.35, applies at most one adjustment based on the
    ``source_type`` value, then a bonus and/or penalty based on keywords found
    in the ``source`` value, and finally clips the result to [0, 1].

    Parameters
    ----------
    row : mapping with string values under "source_type" and "source"
        (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    float
        The clipped trust score.
    """
    source_type = row["source_type"].lower()
    source_name = row["source"].lower()

    # (substring, adjustment) pairs, checked in order; only the first match
    # applies (edit based on your real categories).
    type_adjustments = (
        ("official", 0.45),
        ("news", 0.25),
        ("social", -0.10),
        ("unknown", -0.05),
    )

    score = 0.35  # base
    for marker, delta in type_adjustments:
        if marker in source_type:
            score += delta
            break

    # Source-name keywords: both checks are independent, so a name can receive
    # the bonus and the penalty at the same time.
    has_official_hint = any(k in source_name for k in OFFICIAL_KEYWORDS)
    if has_official_hint:
        score += 0.20
    has_low_trust_hint = any(k in source_name for k in LOW_TRUST_KEYWORDS)
    if has_low_trust_hint:
        score -= 0.20

    return float(np.clip(score, 0.0, 1.0))

# Score every document once up front; search() reads the resulting "trust" column.
df["trust"] = df.apply(trust_score, axis="columns")

In [19]:
def freshness_score(ts, now, tau_hours):
    """Exponential-decay freshness of a document, in [0, 1].

    Parameters
    ----------
    ts : timestamp of the document. May be missing (``NaT`` / ``None`` / NaN),
        e.g. when parsing with ``errors="coerce"`` failed.
    now : reference time; must support ``now - ts`` yielding an object with
        ``total_seconds()`` (so both must agree on timezone-awareness).
    tau_hours : decay constant in hours; the score falls to 1/e after
        ``tau_hours`` hours.

    Returns
    -------
    float
        ``exp(-age_hours / tau_hours)``. Timestamps at or after ``now`` are
        treated as age 0 (score 1.0). A missing timestamp scores 0.0.
    """
    # A missing timestamp would otherwise propagate as NaN through max()/exp()
    # and make every score derived from it NaN; treat "unknown age" as stale.
    if pd.isna(ts):
        return 0.0

    # Clamp future timestamps to age 0 so the score never exceeds 1.
    age_hours = max((now - ts).total_seconds() / 3600.0, 0.0)
    return float(np.exp(-age_hours / tau_hours))

# ---------- Step 5: emergency trigger ----------
# Single lowercase tokens; a query containing any one of them switches
# search() into emergency mode (see is_emergency_query).
EMERGENCY_WORDS = {
    "evacuation", "evacuate", "shelter", "helpline", "warning", "alert",
    "flood", "earthquake", "wildfire", "cyclone", "landslide",
    "road", "closure", "closed", "bridge", "collapse",
    "aftershock", "rescue", "hospital", "missing",
}

def is_emergency_query(q: str) -> bool:
    """Return True if the query contains at least one emergency keyword.

    The query is lowercased and split into runs of ASCII letters; each run is
    looked up in ``EMERGENCY_WORDS`` as a whole token (no stemming, no
    substring matching).
    """
    words = re.findall(r"[a-zA-Z]+", q.lower())
    for word in words:
        if word in EMERGENCY_WORDS:
            return True
    return False

# ---------- Step 7: pogo memory ----------
# In-memory bounce counter: doc_id -> number of quick-bounce clicks recorded by
# log_click(). Re-running this cell resets the memory.
bad_clicks = dict()

def pogo_penalty(doc_id: int) -> float:
    """Ranking penalty for a document based on its recorded quick bounces.

    Each bounce stored in ``bad_clicks`` costs 0.05, capped at 0.30. Documents
    with no recorded bounces get a penalty of 0.0.
    """
    bounce_count = bad_clicks.get(doc_id, 0)
    penalty = 0.05 * bounce_count
    # Cap the penalty so a heavily bounced document is demoted, not buried.
    return penalty if penalty < 0.30 else 0.30

def log_click(doc_id: int, dwell_seconds: float):
    """Record a click on ``doc_id``; dwell times under 10 seconds count as a bounce.

    Bounces are accumulated in the module-level ``bad_clicks`` dict and later
    turned into a ranking penalty by ``pogo_penalty``. Longer visits are ignored.
    """
    # Written as "not <" (rather than ">=") so that a NaN dwell time is ignored.
    if not dwell_seconds < 10:
        return

    # User bounced fast: treat as pogo-sticking.
    previous = bad_clicks.get(doc_id, 0)
    bad_clicks[doc_id] = previous + 1

# ---------- Search function ----------
def search(query: str, top_k=10, candidate_n=50):
    """Rank documents for ``query`` by relevance, trust and freshness.

    Pipeline:
      1. Relevance: cosine similarity between the TF-IDF vector of the query
         and every row of ``X``; the ``candidate_n`` most similar documents
         become candidates.
      2. Emergency detection via ``is_emergency_query``.
      3. Freshness: exponential decay with tau = 12 h in emergency mode and
         72 h otherwise. Documents whose timestamp is missing get freshness 0.0.
      4. Pogo penalty from ``bad_clicks`` (via ``pogo_penalty``).
      5. Final score:
         - emergency: 0.40*relevance + 0.40*trust + 0.20*freshness - pogo,
           minus 0.35 when trust < 0.30, minus a further 0.4 when
           freshness < 0.05;
         - otherwise: 0.70*relevance + 0.20*trust + 0.10*freshness - pogo.

    Parameters
    ----------
    query : free-text query.
    top_k : number of results to return.
    candidate_n : number of top-relevance documents to re-rank.

    Returns
    -------
    (bool, pandas.DataFrame)
        The emergency flag, and the ``top_k`` candidates sorted by ``final``
        descending with the columns id, final, relevance, trust, freshness,
        source, source_type, location, timestamp, text.

    Notes
    -----
    Reads the notebook-level ``vectorizer``, ``X`` and ``df`` (which must
    already carry the ``trust`` column).
    """
    now = pd.Timestamp.now(tz="UTC")

    # Retrieve candidates by relevance (cosine similarity, in [0, 1]-ish).
    qv = vectorizer.transform([query])
    sims = cosine_similarity(qv, X).ravel()

    # Keep only the candidate_n most relevant documents for re-ranking.
    cand_idx = np.argsort(-sims)[:candidate_n]
    cands = df.iloc[cand_idx].copy()
    cands["relevance"] = sims[cand_idx]

    emergency = is_emergency_query(query)

    # Freshness: emergency information goes stale faster, hence the shorter tau.
    tau = 12 if emergency else 72
    cands["freshness"] = (
        cands["timestamp"]
        .apply(lambda ts: freshness_score(ts, now, tau))
        # A missing timestamp (NaT) yields NaN freshness, which would make the
        # final score NaN and push the row to the bottom regardless of its
        # relevance or trust. Treat unknown age as "not fresh" instead.
        .fillna(0.0)
    )

    # Pogo penalty: demote documents users bounced from quickly.
    cands["pogo"] = cands["id"].apply(pogo_penalty)

    # Truth filter (simple): in emergency mode, heavily penalize very low trust.
    if emergency:
        cands["truth_penalty"] = np.where(cands["trust"] < 0.30, 0.35, 0.0)
    else:
        cands["truth_penalty"] = 0.0

    # Final score.
    if emergency:
        cands["final"] = (
            0.40 * cands["relevance"]
            + 0.40 * cands["trust"]
            + 0.20 * cands["freshness"]
            - cands["pogo"]
            - cands["truth_penalty"]
        )
        # Stale documents are actively harmful during an emergency.
        cands.loc[cands["freshness"] < 0.05, "final"] -= 0.4
    else:
        cands["final"] = (
            0.70 * cands["relevance"]
            + 0.20 * cands["trust"]
            + 0.10 * cands["freshness"]
            - cands["pogo"]
        )

    cands = cands.sort_values("final", ascending=False).head(top_k)

    # Return a clean view.
    cols = ["id", "final", "relevance", "trust", "freshness", "source", "source_type", "location", "timestamp", "text"]
    return emergency, cands[cols]

# Example:
# emergency, results = search("bihar flood evacuation route", top_k=10)
# print("EMERGENCY MODE:", emergency)
# results

In [28]:
# Plain location query with no emergency keyword, so the default ranking applies.
emergency, results = search(query="kerala")
results

Unnamed: 0,id,final,relevance,trust,freshness,source,source_type,location,timestamp,text
509,510,0.393332,0.322837,0.6,0.473463,Indian Express,news,Kerala,2026-01-15 06:31:00+00:00,Report on flooding in Kerala and the response ...
364,365,0.391928,0.322837,0.6,0.459427,BBC India,news,Kerala,2026-01-15 04:21:00+00:00,Report on flooding in Kerala and the response ...
361,362,0.391918,0.322837,0.6,0.459321,Indian Express,news,Kerala,2026-01-15 04:20:00+00:00,Report on flooding in Kerala and the response ...
317,318,0.391473,0.322837,0.6,0.454877,Indian Express,news,Kerala,2026-01-15 03:38:00+00:00,Report on flooding in Kerala and the response ...
301,302,0.391337,0.322837,0.6,0.45351,The Hindu,news,Kerala,2026-01-15 03:25:00+00:00,Report on flooding in Kerala and the response ...
292,293,0.391242,0.322837,0.6,0.452566,The Hindu,news,Kerala,2026-01-15 03:16:00+00:00,Report on flooding in Kerala and the response ...
265,266,0.390867,0.322837,0.6,0.448811,BBC India,news,Kerala,2026-01-15 02:40:00+00:00,Report on flooding in Kerala and the response ...
256,257,0.390742,0.322837,0.6,0.447566,The Hindu,news,Kerala,2026-01-15 02:28:00+00:00,Report on flooding in Kerala and the response ...
235,236,0.390299,0.322837,0.6,0.443133,Indian Express,news,Kerala,2026-01-15 01:45:00+00:00,Report on flooding in Kerala and the response ...
217,218,0.389982,0.322837,0.6,0.439964,The Hindu,news,Kerala,2026-01-15 01:14:00+00:00,Report on flooding in Kerala and the response ...
