# Deliverable 1
Student name = Aditya Bhavsar


In [2]:
#Required installation of the following library
!pip install tldextract

Collecting tldextract
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.1.3-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.9/104.9 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.1.3


In [17]:
import requests
import tldextract
from datetime import datetime
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import openai  # For AI-based fact-checking

# === SERPAPI & OpenAI API Keys ===
SERP_API_KEY = "Your api key here" # Replace with your OpenAI API Key
OPENAI_API_KEY = "Your api key here"  # Replace with your OpenAI API Key

openai.api_key = OPENAI_API_KEY

# === 1st metric - Domain Trust ===
def get_domain_trust(url):
    """Estimates domain trust using search ranking, WHOIS data, and Tranco list."""
    domain = tldextract.extract(url).registered_domain

    # === Check Tranco List ===
    try:
        tranco_response = requests.get("https://tranco-list.eu/top-1m.csv").text
        if domain in tranco_response:
            tranco_rank = tranco_response.split(domain)[0].strip().split("\n")[-1]
            tranco_score = max(100 - (int(tranco_rank) / 10000), 50)
        else:
            tranco_score = 40
    except:
        tranco_score = 40

    # === WHOIS Lookup for Domain Age ===
    try:
        whois_response = requests.get(f"https://api.ip2whois.com/v2?key=demo&domain={domain}").json()
        creation_date = whois_response.get("created_date", "2000-01-01")
        domain_age = (datetime.now() - datetime.strptime(creation_date, "%Y-%m-%d")).days // 365
        age_score = min(domain_age * 5, 100)
    except:
        age_score = 50

    # === Extract Backlink Score (Using SerpAPI) ===
    try:
        search = requests.get(f"https://serpapi.com/search.json?q={domain}&api_key={SERP_API_KEY}").json()
        backlink_count = len(search.get("organic_results", []))
        backlink_score = min(backlink_count * 10, 100)
    except:
        backlink_score = 50

    # === Final Domain Trust Calculation ===
    domain_trust = (0.4 * tranco_score) + (0.3 * age_score) + (0.3 * backlink_score)
    return round(domain_trust, 2)

# === 2nd metric - AI Based fact check score ===
import requests
from sentence_transformers import SentenceTransformer, util

def get_fact_check_score(text):
    """Evaluates the factual accuracy of a given claim using multiple layers of verification."""

    # === Step 1: Verify with Search Engine Results ===
    try:
        serpapi_key = "78d578eb49c83a63f5945da59e8ae9e37f865cc14da2c714ccb9e990a195d787"
        params = {
            "q": f"fact check {text}",
            "engine": "google",
            "api_key": serpapi_key
        }
        response = requests.get("https://serpapi.com/search", params=params)
        data = response.json()

        # Check for fact-checking websites in top results
        trusted_sources = ["snopes.com", "politifact.com", "factcheck.org", "bbc.com", "reuters.com"]
        source_mentions = sum(1 for result in data.get("organic_results", []) if any(domain in result.get("link", "") for domain in trusted_sources))

        fact_check_score = min(source_mentions * 20, 100)  # Normalize (max 5 sources = 100)
    except Exception:
        fact_check_score = 50  # Default neutral score

    # === Step 2: Cross-Check with Wikipedia ===
    try:
        wiki_response = requests.get(f"https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch={text}")
        wiki_data = wiki_response.json()
        wiki_matches = len(wiki_data.get("query", {}).get("search", []))

        fact_check_score += min(wiki_matches * 10, 30)  # Add extra points if Wikipedia has related articles
    except Exception:
        pass  # Ignore errors, keep previous score

    # === Step 3: NLP Semantic Similarity ===
    try:
        model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        trusted_texts = " ".join([result["title"] + result["snippet"] for result in data.get("organic_results", []) if "title" in result and "snippet" in result])

        similarity_score = util.pytorch_cos_sim(model.encode(text), model.encode(trusted_texts)).item() * 100
        fact_check_score += min(similarity_score / 2, 30)  # Weight this factor to prevent over-scaling
    except Exception:
        pass  # Ignore errors, keep previous score

    # === Final Normalization ===
    return max(0, min(fact_check_score, 100))  # Ensure within range

# === 3rd metric - Bias Score ===
def get_bias_score(text, domain):
    """Calculates bias score based on sentiment analysis and media bias ratings."""
    sentiment_pipeline = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment")
    sentiment_result = sentiment_pipeline(text[:512])[0]

    # Map sentiment to bias score
    sentiment_bias = {
        "1 star": 30,  # Strongly Negative (Potentially biased)
        "2 stars": 50,  # Slightly Negative (Neutral to biased)
        "3 stars": 70,  # Neutral
        "4 stars": 80,  # Slightly Positive (Neutral to trusted)
        "5 stars": 100  # Strongly Positive (Trusted)
    }
    bias_score = sentiment_bias.get(sentiment_result["label"], 50)

    # === Fetch Bias Rating from AllSides API ===
    try:
        bias_response = requests.get(f"https://api.allsides.com/bias/{domain}").json()
        media_bias = bias_response.get("bias", "center")
        bias_adjustment = {
            "left": -20,
            "lean left": -10,
            "center": 0,
            "lean right": 10,
            "right": 20
        }.get(media_bias.lower(), 0)
        bias_score = max(0, min(bias_score + bias_adjustment, 100))
    except:
        pass  # If API fails, use sentiment score only

    return round(bias_score, 2)

# === 4th metric - Content Relevance Score ===
def compute_content_relevance(user_query, page_text):
    """Computes content relevance using semantic similarity."""
    model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
    similarity_score = util.pytorch_cos_sim(model.encode(user_query), model.encode(page_text)).item() * 100
    return round(similarity_score, 2)

# === 5th metric - Citation Score ===
def check_google_scholar(url):
    """
    Computes Citation Score by:
    1) Checking backlinks using Google Search (SerpAPI).
    2) Searching for academic mentions (PDFs, research papers).
    3) Normalizing results to a 0-100 scale.
    """
    try:
        # === Step 1: Count Backlinks from Google Search ===
        backlink_params = {
            "q": f"link:{url}",
            "engine": "google",
            "api_key": SERP_API_KEY
        }
        backlink_response = requests.get("https://serpapi.com/search", params=backlink_params).json()
        backlink_count = len(backlink_response.get("organic_results", []))

        # === Step 2: Count Academic Mentions (PDFs & Research Papers) ===
        academic_params = {
            "q": f"\"{url}\" filetype:pdf OR site:researchgate.net OR site:arxiv.org OR site:semanticscholar.org",
            "engine": "google",
            "api_key": SERP_API_KEY
        }
        academic_response = requests.get("https://serpapi.com/search", params=academic_params).json()
        academic_count = len(academic_response.get("organic_results", []))

        # === Step 3: Normalize Score to 0-100 ===
        citation_score = min((backlink_count * 5) + (academic_count * 15), 100)  # Adjust weights if needed

        return round(citation_score, 2)

    except Exception as e:
        print(f"Error fetching citation data: {e}")
        return 0  # Default if API fails

# === Final Step - Final score evaluation method ===
def rate_url_validity(user_query, url):
    """
    Evaluates the validity of a given URL by computing multiple credibility metrics.
    Returns a final credibility score (0-100).
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        page_text = " ".join([p.text for p in soup.find_all("p")])  # Extract paragraph text
    except Exception as e:
        return {"error": f"Failed to fetch content: {str(e)}"}

    domain = tldextract.extract(url).registered_domain

    # Compute all credibility scores
    domain_trust = get_domain_trust(url)
    content_relevance = compute_content_relevance(user_query, page_text)
    fact_check_score = get_fact_check_score(page_text)
    bias_score = get_bias_score(page_text, domain)
    citation_score = check_google_scholar(url)

    # === Compute Final Validity Score ===
    final_score = (
        (0.10 * domain_trust) +
        (0.60 * content_relevance) +
        (0.15 * fact_check_score) +
        (0.15 * bias_score) +
        (0.10 * citation_score)
    )

    return {
        "Domain Trust": domain_trust,
        "Content Relevance": content_relevance,
        "Fact-Check Score": fact_check_score,
        "Bias Score": bias_score,
        "Citation Score": citation_score,
        "Final Validity Score": round(final_score, 2)
    }


In [18]:
# === TESTING ===
test_urls = [
    "https://www.bbc.com/news",
    "https://www.nasa.gov",
    "https://www.factcheck.org"
]

for url in test_urls:
    print(f"\nResults for {url}:")
    print(rate_url_validity("was moon landing by NASA fake!", url))


Results for https://www.bbc.com/news:


Device set to use cpu


{'Domain Trust': 61.0, 'Content Relevance': 3.29, 'Fact-Check Score': 6.9031402468681335, 'Bias Score': 80, 'Citation Score': 100, 'Final Validity Score': 31.11}

Results for https://www.nasa.gov:


Device set to use cpu


{'Domain Trust': 58.0, 'Content Relevance': 27.55, 'Fact-Check Score': 30, 'Bias Score': 80, 'Citation Score': 100, 'Final Validity Score': 48.83}

Results for https://www.factcheck.org:


Device set to use cpu


{'Domain Trust': 64.0, 'Content Relevance': 17.99, 'Fact-Check Score': 5.511181801557541, 'Bias Score': 30, 'Citation Score': 0, 'Final Validity Score': 22.52}


In [20]:
# === TESTING ===
test_urls_2 = [
    "https://www.pcgamer.com/hardware/graphics-cards/is-the-new-rtx-5070-really-as-fast-as-nvidias-previous-flagship-rtx-4090-gpu-turns-out-the-answer-is-yes-kinda/",
    "https://www.fool.com/investing/2025/02/09/is-nvidia-still-a-millionaire-maker-stock/",
    "https://www.tomshardware.com/pc-components/gpus/nvidias-geforce-rtx-5070-at-usd549-how-does-it-stack-up-to-the-previous-generation-rtx-4070"
]

for url in test_urls_2:
    print(f"\nResults for {url}:")
    print(rate_url_validity("Nvidia's new RTX 5070 is it really good ?", url))


Results for https://www.pcgamer.com/hardware/graphics-cards/is-the-new-rtx-5070-really-as-fast-as-nvidias-previous-flagship-rtx-4090-gpu-turns-out-the-answer-is-yes-kinda/:


Device set to use cpu


{'Domain Trust': 46.0, 'Content Relevance': 65.21, 'Fact-Check Score': 5.508620664477348, 'Bias Score': 50, 'Citation Score': 0, 'Final Validity Score': 52.05}

Results for https://www.fool.com/investing/2025/02/09/is-nvidia-still-a-millionaire-maker-stock/:


Device set to use cpu


{'Domain Trust': 46.0, 'Content Relevance': 30.44, 'Fact-Check Score': 5.225852131843567, 'Bias Score': 100, 'Citation Score': 0, 'Final Validity Score': 38.65}

Results for https://www.tomshardware.com/pc-components/gpus/nvidias-geforce-rtx-5070-at-usd549-how-does-it-stack-up-to-the-previous-generation-rtx-4070:


Device set to use cpu


{'Domain Trust': 46.0, 'Content Relevance': 65.95, 'Fact-Check Score': 0, 'Bias Score': 70, 'Citation Score': 0, 'Final Validity Score': 54.67}


In [19]:
# === TESTING ===
test_urls_3 = [
    "https://www.rpgsite.net/review/16825-kingdom-come-deliverance-ii-review",
    "https://www.pcgamer.com/games/rpg/20-hours-in-kingdom-come-deliverance-2-is-a-mad-systems-driven-sandbox-that-captures-some-of-the-best-parts-of-games-like-stalker/",
    "https://www.kingdomcomehome.shop/"
]

for url in test_urls_3:
    print(f"\nResults for {url}:")
    print(rate_url_validity("kingdome come deliverance 2 got good rating compare to one", url))


Results for https://www.rpgsite.net/review/16825-kingdom-come-deliverance-ii-review:


Device set to use cpu


{'Domain Trust': 46.0, 'Content Relevance': 59.95, 'Fact-Check Score': 50, 'Bias Score': 70, 'Citation Score': 0, 'Final Validity Score': 58.57}

Results for https://www.pcgamer.com/games/rpg/20-hours-in-kingdom-come-deliverance-2-is-a-mad-systems-driven-sandbox-that-captures-some-of-the-best-parts-of-games-like-stalker/:


Device set to use cpu


{'Domain Trust': 46.0, 'Content Relevance': 41.16, 'Fact-Check Score': 8.470191061496735, 'Bias Score': 100, 'Citation Score': 0, 'Final Validity Score': 45.57}

Results for https://www.kingdomcomehome.shop/:


Device set to use cpu


{'Domain Trust': 46.0, 'Content Relevance': 14.35, 'Fact-Check Score': 40.282234847545624, 'Bias Score': 100, 'Citation Score': 0, 'Final Validity Score': 34.25}
