<a href="https://colab.research.google.com/github/britbrat0/cs676/blob/main/project1_deliverable1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%pip install tldextract

Collecting tldextract
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.3.0-py3-none-any.whl (107 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 107.4/107.4 kB 2.7 MB/s eta 0:00:00
Downloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.3.0


In [4]:
import json
import logging
import os
from urllib.parse import urljoin, urlparse

import joblib
import numpy as np
import requests
import tldextract
from bs4 import BeautifulSoup

# --------------------------------------------------------
# FEATURE EXTRACTION
# --------------------------------------------------------
def _is_external_link(href: str, page_url: str, site_domain: str) -> bool:
    """Return True if *href* is an http(s) link that leaves *site_domain*.

    Relative links are resolved against *page_url* first, so same-site paths,
    fragments, and non-web schemes (``mailto:``, ``javascript:``, ...) are not
    counted as external references.
    """
    try:
        target = urlparse(urljoin(page_url, href))
        host = (target.hostname or "").lower()
    except ValueError:
        # Malformed hrefs (e.g. a broken IPv6 literal) cannot be classified.
        return False
    if target.scheme not in ("http", "https") or not host:
        return False
    site = site_domain.lower()
    # Same registered domain or any of its subdomains counts as internal.
    return host != site and not host.endswith("." + site)


def extract_features(url: str = None, text_input: str = None):
    """
    Extract credibility-related features from a web page or from raw text.

    Args:
        url: Page to download and analyse. When given, ``text_input`` is ignored.
        text_input: Raw text to analyse when no URL is given.

    Returns:
        A dict with two keys:
          * ``"scores"``: integer scores for ``source_authority``,
            ``publication_quality``, ``citation_patterns`` and
            ``content_accuracy`` (each within 0-10).
          * ``"details"``: supporting values (``domain``, ``word_count``,
            ``external_links_count``, ``suspicious_keywords_found``). On
            failure it contains only ``"error"`` and all scores stay 0.
    """
    features = {
        "scores": {
            "source_authority": 0,
            "publication_quality": 0,
            "citation_patterns": 0,
            "content_accuracy": 0
        },
        "details": {}
    }

    text = ""
    soup = None

    # Fetch content if URL is provided
    if url:
        try:
            response = requests.get(url, timeout=10, headers={"User-Agent": "Mozilla/5.0"})
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            text = soup.get_text(separator=" ", strip=True)
        except Exception as e:
            # Deliberate boundary: any fetch/parse failure is reported to the
            # caller through the "error" detail instead of being raised.
            features["details"]["error"] = f"Failed to fetch URL: {str(e)}"
            return features
    else:
        text = text_input or ""
        if not text.strip():
            features["details"]["error"] = "No content provided."
            return features

    # --- 1. Source Authority ---
    authority_score = 5
    domain = ""
    if url:
        domain_info = tldextract.extract(url)
        # Skip empty parts so suffix-less hosts (IP addresses, "localhost")
        # do not end up with a trailing dot.
        domain = ".".join(part for part in (domain_info.domain, domain_info.suffix) if part)
        features["details"]["domain"] = domain

        # Compare per label so multi-label public suffixes such as "gov.uk"
        # or "edu.au" are recognised as well as plain "gov"/"edu".
        suffix_labels = set(domain_info.suffix.lower().split("."))
        if suffix_labels & {"gov", "edu"}:
            authority_score = 9
        elif "org" in suffix_labels:
            authority_score = 7

    features["scores"]["source_authority"] = authority_score

    # --- 2. Publication Quality ---
    word_count = len(text.split())
    # Heuristic: embedded frames and scripts are used as a proxy for ad clutter.
    ads = 0 if soup is None else (len(soup.find_all("iframe")) + len(soup.find_all("script")))

    quality_score = 5
    if word_count > 800:
        quality_score += 2
    if ads > 10:
        quality_score -= 2
    quality_score = max(0, min(10, quality_score))

    features["scores"]["publication_quality"] = quality_score
    features["details"]["word_count"] = word_count

    # --- 3. Citation Patterns ---
    if soup is not None:
        external_links = [
            anchor["href"]
            for anchor in soup.find_all("a", href=True)
            if _is_external_link(anchor["href"], url, domain)
        ]
        citation_score = min(10, len(external_links))
        features["details"]["external_links_count"] = len(external_links)
    else:
        # Raw text carries no link structure; use a fixed baseline.
        citation_score = 3

    features["scores"]["citation_patterns"] = citation_score

    # --- 4. Content Accuracy (heuristics) ---
    # Lowercase once and fold typographic apostrophes to ASCII so that both
    # "won't" and "won’t" spellings match the keyword list.
    normalized_text = text.lower().replace("\u2019", "'").replace("\u2018", "'")
    suspicious_keywords = ("miracle cure", "shocking secret", "click here", "you won't believe")
    suspicious_hits = sum(normalized_text.count(kw) for kw in suspicious_keywords)

    accuracy_score = max(0, min(10, 8 - suspicious_hits))

    features["scores"]["content_accuracy"] = accuracy_score
    features["details"]["suspicious_keywords_found"] = suspicious_hits

    return features


# --------------------------------------------------------
# RULE-BASED SCORING
# --------------------------------------------------------
def compute_rule_based_score(features):
    """
    Combine the four feature scores into one weighted credibility score.

    Each score in ``features["scores"]`` is multiplied by its fixed weight
    (the weights sum to 1.0), the products are added up, and the total is
    divided by 10 and rounded to two decimals.
    """
    scores = features["scores"]
    weighted_total = 0
    for name, weight in (
        ("source_authority", 0.3),
        ("publication_quality", 0.25),
        ("citation_patterns", 0.25),
        ("content_accuracy", 0.2),
    ):
        weighted_total += scores[name] * weight
    # Feature scores are on a 0-10 scale; dividing by 10 maps the result to 0-1.
    return round(weighted_total / 10, 2)


# --------------------------------------------------------
# HYBRID CREDIBILITY ASSESSOR
# --------------------------------------------------------
def assess_credibility(url: str = None, text: str = None, model_path="credibility_model.pkl"):
    """
    Hybrid credibility assessment returning simplified JSON:
    { "score": float, "explanation": string }

    Args:
        url: Page to fetch and assess. Takes precedence over ``text``.
        text: Raw text to assess when no URL is given.
        model_path: Optional joblib-serialised model. If the file exists and
            yields a finite prediction, the result is blended as
            0.4 * rule score + 0.6 * model score; otherwise only the
            rule-based score is used.

    Returns:
        A JSON string (indent=2) with ``score`` clamped to the 0-1 range and
        a human-readable ``explanation``. If feature extraction fails, the
        score is 0.0 and the explanation carries the error message.
    """
    features = extract_features(url=url, text_input=text)

    # If error fetching data
    if "error" in features["details"]:
        return json.dumps({
            "score": 0.0,
            "explanation": f"Analysis failed: {features['details']['error']}"
        }, indent=2)

    scores = features["scores"]
    rule_score = compute_rule_based_score(features)
    ml_score = None

    # Try ML model
    if os.path.exists(model_path):
        try:
            # SECURITY: joblib.load unpickles the file and can execute
            # arbitrary code. Only point model_path at trusted files.
            model = joblib.load(model_path)
            X = np.array([[
                scores["source_authority"],
                scores["publication_quality"],
                scores["citation_patterns"],
                scores["content_accuracy"]
            ]])
            predicted = float(model.predict(X)[0])
            if not np.isfinite(predicted):
                # NaN/inf would poison the blend and serialise as invalid JSON.
                raise ValueError(f"model returned non-finite score {predicted!r}")
            ml_score = predicted
        except Exception as exc:
            # Best effort: the model is optional, so fall back to the
            # rule-based score, but leave a trace instead of failing silently.
            logging.getLogger(__name__).warning(
                "Ignoring ML model at %s; using rule-based score only: %s",
                model_path, exc,
            )

    # Hybrid combination
    if ml_score is not None:
        final_score = 0.4 * rule_score + 0.6 * ml_score
    else:
        final_score = rule_score
    # The model's output scale is not known here; keep the reported score
    # inside the documented 0-1 range.
    final_score = max(0.0, min(1.0, final_score))

    # --- Build Explanation ---
    # Positive and negative signals are kept apart so a low accuracy score is
    # never reported as a reason for the source being credible.
    strengths = []
    concerns = []
    if scores["source_authority"] >= 7:
        strengths.append("source has strong domain authority")
    if scores["citation_patterns"] > 5:
        strengths.append("it provides external references")
    if scores["publication_quality"] > 6:
        strengths.append("article is detailed and well-structured")
    if scores["content_accuracy"] < 5:
        concerns.append("content shows potential bias or suspicious claims")

    sentences = []
    if strengths:
        sentences.append("This source is considered credible because " + ", and ".join(strengths) + ".")
    if concerns:
        sentences.append("Credibility is reduced because " + ", and ".join(concerns) + ".")
    explanation = " ".join(sentences) or "Credibility is assessed based on available features."

    return json.dumps({
        "score": round(final_score, 2),
        "explanation": explanation
    }, indent=2)


# --------------------------------------------------------
# DEMO
# --------------------------------------------------------
if __name__ == "__main__":
    # Raw-text example used for the second demo call.
    sample_text = "Scientists discovered a shocking secret cure that the government doesn’t want you to know!"

    # Demo inputs, run in order: a URL, the raw text above, then a second URL.
    demo_calls = (
        {"url": "https://www.bbc.com/news/science-environment-123456"},
        {"text": sample_text},
        {"url": "https://www.bbc.com/news/world-us-canada-55568621"},
    )
    for call_kwargs in demo_calls:
        print(assess_credibility(**call_kwargs))

{
  "score": 0.0,
  "explanation": "Analysis failed: Failed to fetch URL: 404 Client Error: Not Found for url: https://www.bbc.com/news/science-environment-123456"
}
{
  "score": 0.49,
  "explanation": "Credibility is assessed based on available features."
}
{
  "score": 0.68,
  "explanation": "This source is considered credible because it provides external references."
}
