#Deliverable-2
Student - Aditya Bhavsar



In [None]:
#Required installation of the following library
!pip install tldextract

Collecting tldextract
  Downloading tldextract-5.1.3-py3-none-any.whl.metadata (11 kB)
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Downloading tldextract-5.1.3-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.9/104.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.1.3


In [None]:
!pip install readability

Collecting readability
  Downloading readability-0.3.2.tar.gz (36 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: readability
  Building wheel for readability (setup.py) ... [?25l[?25hdone
  Created wheel for readability: filename=readability-0.3.2-py3-none-any.whl size=36384 sha256=c8d141a08c943eef470e0af2043a70e5a9705364f7c04f8caf8b7bfd7304acdb
  Stored in directory: /root/.cache/pip/wheels/6a/a8/01/0b6587e224d9731dae317fdad11b081f0e8b7be7d8367fc6eb
Successfully built readability
Installing collected packages: readability
Successfully installed readability-0.3.2


In [None]:
!pip install textstat



In [None]:
import os
os.environ["HF_TOKEN"] = "Your Hf Token"

In [None]:
from huggingface_hub import login
login(token=os.getenv("HF_TOKEN"))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [23]:
import requests
import tldextract
from datetime import datetime
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline
import time
from textstat import flesch_reading_ease
import os

class CredibilityScorer:
    def __init__(self, hf_token, serp_api_key):
        self.hf_token = hf_token
        self.serp_api_key = serp_api_key
        os.environ["HF_TOKEN"] = hf_token
        self.model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
        self.sentiment_pipeline = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment")

    def get_domain_trust(self, url):
        domain = tldextract.extract(url).registered_domain
        try:
            tranco_response = requests.get("https://tranco-list.eu/top-1m.csv").text
            if domain in tranco_response:
                tranco_rank = tranco_response.split(domain)[0].strip().split("\n")[-1]
                tranco_score = max(100 - (int(tranco_rank) / 10000), 50)
            else:
                tranco_score = 40
        except:
            tranco_score = 40

        try:
            whois_response = requests.get(f"https://api.ip2whois.com/v2?key=demo&domain={domain}").json()
            creation_date = whois_response.get("created_date", "2000-01-01")
            domain_age = (datetime.now() - datetime.strptime(creation_date, "%Y-%m-%d")).days // 365
            age_score = min(domain_age * 5, 100)
        except:
            age_score = 50

        try:
            search = requests.get(f"https://serpapi.com/search.json?q={domain}&api_key={self.serp_api_key}").json()
            backlink_count = len(search.get("organic_results", []))
            backlink_score = min(backlink_count * 10, 100)
        except:
            backlink_score = 50

        domain_trust = (0.4 * tranco_score) + (0.3 * age_score) + (0.3 * backlink_score)
        return round(domain_trust, 2)

    def get_fact_check_score(self, text):
        try:
            params = {
                "q": f"fact check {text}",
                "engine": "google",
                "api_key": self.serp_api_key
            }
            response = requests.get("https://serpapi.com/search", params=params)
            data = response.json()
            trusted_sources = ["snopes.com", "politifact.com", "factcheck.org", "bbc.com", "reuters.com"]
            source_mentions = sum(1 for result in data.get("organic_results", [])
                                  if any(domain in result.get("link", "") for domain in trusted_sources))
            fact_check_score = min(source_mentions * 20, 100)
        except Exception:
            fact_check_score = 50

        try:
            wiki_response = requests.get(
                f"https://en.wikipedia.org/w/api.php?action=query&format=json&list=search&srsearch={text}"
            )
            wiki_data = wiki_response.json()
            wiki_matches = len(wiki_data.get("query", {}).get("search", []))
            fact_check_score += min(wiki_matches * 10, 30)
        except Exception:
            pass

        try:
            trusted_texts = " ".join([
                result["title"] + result["snippet"]
                for result in data.get("organic_results", [])
                if "title" in result and "snippet" in result
            ])
            similarity_score = util.pytorch_cos_sim(
                self.model.encode(text), self.model.encode(trusted_texts)
            ).item() * 100
            fact_check_score += min(similarity_score / 2, 30)
        except Exception:
            pass

        return max(0, min(fact_check_score, 100))

    def get_bias_score(self, text, domain):
        sentiment_result = self.sentiment_pipeline(text[:512])[0]
        sentiment_bias = {
            "1 star": 30,
            "2 stars": 50,
            "3 stars": 70,
            "4 stars": 80,
            "5 stars": 100
        }
        bias_score = sentiment_bias.get(sentiment_result["label"], 50)
        try:
            bias_response = requests.get(f"https://api.allsides.com/bias/{domain}").json()
            media_bias = bias_response.get("bias", "center")
            bias_adjustment = {
                "left": -20,
                "lean left": -10,
                "center": 0,
                "lean right": 10,
                "right": 20
            }.get(media_bias.lower(), 0)
            bias_score = max(0, min(bias_score + bias_adjustment, 100))
        except:
            pass
        return round(bias_score, 2)

    def compute_content_relevance(self, user_query, page_text):
        similarity_score = util.pytorch_cos_sim(
            self.model.encode(user_query), self.model.encode(page_text)
        ).item() * 100
        return round(similarity_score, 2)

    def check_google_scholar(self, url):
        try:
            backlink_params = {
                "q": f"link:{url}",
                "engine": "google",
                "api_key": self.serp_api_key
            }
            backlink_response = requests.get("https://serpapi.com/search", params=backlink_params).json()
            backlink_count = len(backlink_response.get("organic_results", []))

            academic_params = {
                "q": f'"{url}" filetype:pdf OR site:researchgate.net OR site:arxiv.org OR site:semanticscholar.org',
                "engine": "google",
                "api_key": self.serp_api_key
            }
            academic_response = requests.get("https://serpapi.com/search", params=academic_params).json()
            academic_count = len(academic_response.get("organic_results", []))

            citation_score = min((backlink_count * 5) + (academic_count * 15), 100)
            return round(citation_score, 2)
        except Exception as e:
            print(f"Error fetching citation data: {e}")
            return 0

    def get_page_load_speed(self, url):
        start_time = time.time()
        try:
            response = requests.get(url, timeout=10)
            load_time = time.time() - start_time
            return max(0, min(100 - (load_time * 10), 100))
        except:
            return 50

    def check_plagiarism(self, text):
        try:
            search_query = f'"{text[:100]}"'
            response = requests.get(f"https://serpapi.com/search?q={search_query}&api_key={self.serp_api_key}")
            duplicate_results = len(response.json().get("organic_results", []))
            return max(0, 100 - (duplicate_results * 20))
        except:
            return 50

    def get_readability_score(self, text):
        try:
            score = flesch_reading_ease(text)
            return max(0, min(score, 100))
        except:
            return 50

    def check_ssl_security(self, url):
        try:
            domain = tldextract.extract(url).registered_domain
            response = requests.get(f"https://{domain}", timeout=5)
            return 100 if response.url.startswith("https") else 0
        except:
            return 0

    def check_language_complexity(self, text):
        try:
            coherence_score = util.pytorch_cos_sim(self.model.encode(text[:500]), self.model.encode("High-quality journalistic content.")).item() * 100
            return round(coherence_score, 2)
        except:
            return 50

    def get_user_engagement(self, url):
        try:
            response = requests.get(f"https://serpapi.com/search?q=site:{url}&api_key={self.serp_api_key}")
            social_mentions = len(response.json().get("organic_results", []))
            return min(social_mentions * 10, 100)
        except:
            return 50

    def get_star_rating(self, score: float) -> tuple:
        stars = max(1, min(5, round(score / 20)))
        return stars, "⭐" * stars

    def generate_explanation(self, metrics: dict, final_score: float) -> str:
        reasons = []
        if metrics.get("Domain Trust", 0) < 50:
            reasons.append("The source has low domain authority.")
        if metrics.get("Content Relevance", 0) < 50:
            reasons.append("The content is not highly relevant to your query.")
        if metrics.get("Fact-Check Score", 0) < 50:
            reasons.append("Limited fact-checking verification found.")
        if metrics.get("Bias Score", 0) < 50:
            reasons.append("Potential bias detected in the content.")
        if metrics.get("Citation Score", 0) < 30:
            reasons.append("Few citations found for this content.")
        if metrics.get("Page Load Speed Score", 0) < 50:
            reasons.append("The page load speed is slow.")
        if metrics.get("Plagiarism Score", 0) < 50:
            reasons.append("High similarity to other sources detected.")
        if metrics.get("Readability Score", 0) < 50:
            reasons.append("The content is difficult to read.")
        if metrics.get("SSL Security Score", 0) < 50:
            reasons.append("The website may not be secure.")
        if metrics.get("Language Coherence Score", 0) < 50:
            reasons.append("The language coherence is below expectations.")
        if metrics.get("User Engagement Score", 0) < 50:
            reasons.append("Low user engagement detected.")

        if not reasons:
            reasons.append("This source is highly credible and relevant.")

        explanation = " ".join(reasons) + f" Overall credibility score: {round(final_score, 2)}."
        return explanation

    def rate_url_validity(self, user_query: str, url: str) -> dict:
        try:
            response = requests.get(url, timeout=10)
            soup = BeautifulSoup(response.text, "html.parser")
            page_text = " ".join([p.text for p in soup.find_all("p")])
        except Exception:
            return {"error": "Failed to fetch content."}

        domain = tldextract.extract(url).registered_domain

        # Compute each metric
        domain_trust       = self.get_domain_trust(url)
        content_relevance  = self.compute_content_relevance(user_query, page_text)
        fact_check_score   = self.get_fact_check_score(page_text)
        bias_score         = self.get_bias_score(page_text, domain)
        citation_score     = self.check_google_scholar(url)
        page_load_speed    = self.get_page_load_speed(url)
        plagiarism_score   = self.check_plagiarism(page_text)
        readability_score  = self.get_readability_score(page_text)
        ssl_security       = self.check_ssl_security(url)
        language_coherence = self.check_language_complexity(page_text)
        user_engagement    = self.get_user_engagement(url)

        # Detailed final score calculation using all metrics and weights
        final_score_detailed = (
            (0.10 * domain_trust) +
            (0.50 * content_relevance) +
            (0.05 * fact_check_score) +
            (0.05 * bias_score) +
            (0.10 * citation_score) +
            (0.05 * page_load_speed) +
            (0.05 * plagiarism_score) +
            (0.025 * readability_score) +
            (0.03 * ssl_security) +
            (0.02 * language_coherence) +
            (0.025 * user_engagement)
        )

        # Gather metrics into a dictionary
        metrics = {
            "Domain Trust": domain_trust,
            "Content Relevance": content_relevance,
            "Fact-Check Score": fact_check_score,
            "Bias Score": bias_score,
            "Citation Score": citation_score,
            "Page Load Speed Score": page_load_speed,
            "Plagiarism Score": plagiarism_score,
            "Readability Score": readability_score,
            "SSL Security Score": ssl_security,
            "Language Coherence Score": language_coherence,
            "User Engagement Score": user_engagement
        }

        # Use detailed score for star rating and explanation
        stars, star_icon = self.get_star_rating(final_score_detailed)
        explanation = self.generate_explanation(metrics, final_score_detailed)

        return {
            "final_score_detailed": round(final_score_detailed, 2),
            "stars": {
                "score": stars,
                "icon": star_icon
            },
            "explanation": explanation
        }


In [27]:
if __name__ == "__main__":
    hf_token = "your_huggingface_token"
    serp_api_key = "your_serpapi_key"

    scorer = CredibilityScorer(hf_token, serp_api_key)

    user_query = "Nvidias rtx 5070 new gpu is it really good?"
    url = "https://www.fool.com/investing/2025/02/09/is-nvidia-still-a-millionaire-maker-stock/"

    result = scorer.rate_url_validity(user_query, url)
    print(result)


Device set to use cpu


{'final_score_detailed': 41.82, 'stars': {'score': 2, 'icon': '⭐⭐'}, 'explanation': 'The content is not highly relevant to your query. Limited fact-checking verification found. High similarity to other sources detected. The content is difficult to read. The language coherence is below expectations. Low user engagement detected. Overall credibility score: 41.82.'}


In [26]:
if __name__ == "__main__":
    hf_token = "your_huggingface_token"
    serp_api_key = "your_serpapi_key"

    scorer = CredibilityScorer(hf_token, serp_api_key)

    user_query = "Nvidias rtx 5070 new gpu is it really good?"
    url = "https://www.tomshardware.com/pc-components/gpus/nvidias-geforce-rtx-5070-at-usd549-how-does-it-stack-up-to-the-previous-generation-rtx-4070"

    result = scorer.rate_url_validity(user_query, url)
    print(result)


Device set to use cpu


{'final_score_detailed': 63.2, 'stars': {'score': 3, 'icon': '⭐⭐⭐'}, 'explanation': 'Limited fact-checking verification found. The language coherence is below expectations. Low user engagement detected. Overall credibility score: 63.2.'}


In [28]:
if __name__ == "__main__":
    hf_token = "your_huggingface_token"
    serp_api_key = "your_serpapi_key"

    scorer = CredibilityScorer(hf_token, serp_api_key)

    user_query = "kingdome come deliverance 2 got good rating compare to one give me a proper review about this game"
    url = "	https://www.rpgsite.net/review/16825-kingdom-come-deliverance-ii-review"

    result = scorer.rate_url_validity(user_query, url)
    print(result)


Device set to use cpu


{'final_score_detailed': 56.28, 'stars': {'score': 3, 'icon': '⭐⭐⭐'}, 'explanation': 'The source has low domain authority. High similarity to other sources detected. The content is difficult to read. The website may not be secure. The language coherence is below expectations. Overall credibility score: 56.28.'}


In [29]:
if __name__ == "__main__":
    hf_token = "your_huggingface_token"
    serp_api_key = "your_serpapi_key"

    scorer = CredibilityScorer(hf_token, serp_api_key)

    user_query = "kingdome come deliverance 2 got good rating compare to one give me a proper review about this game"
    url = "https://www.kingdomcomehome.shop/"

    result = scorer.rate_url_validity(user_query, url)
    print(result)


Device set to use cpu


{'final_score_detailed': 36.11, 'stars': {'score': 2, 'icon': '⭐⭐'}, 'explanation': 'The content is not highly relevant to your query. Limited fact-checking verification found. High similarity to other sources detected. The content is difficult to read. The language coherence is below expectations. Overall credibility score: 36.11.'}


In [30]:
if __name__ == "__main__":
    hf_token = "your_huggingface_token"
    serp_api_key = "your_serpapi_key"

    scorer = CredibilityScorer(hf_token, serp_api_key)

    user_query = "delta airlines planes crashed in toronto airport give me some relevant news regarding it"
    url = "https://www.usatoday.com/story/travel/news/2025/02/17/delta-regional-flight-crashes-toronto-airport/78983808007/"

    result = scorer.rate_url_validity(user_query, url)
    print(result)


Device set to use cpu


{'final_score_detailed': 56.31, 'stars': {'score': 3, 'icon': '⭐⭐⭐'}, 'explanation': 'Limited fact-checking verification found. Potential bias detected in the content. High similarity to other sources detected. The content is difficult to read. The language coherence is below expectations. Low user engagement detected. Overall credibility score: 56.31.'}


In [32]:
if __name__ == "__main__":
    hf_token = "your_huggingface_token"
    serp_api_key = "your_serpapi_key"

    scorer = CredibilityScorer(hf_token, serp_api_key)

    user_query = "deep learning sleep algorithm details and its working"
    url = "https://pmc.ncbi.nlm.nih.gov/articles/PMC10140398/"

    result = scorer.rate_url_validity(user_query, url)
    print(result)


Device set to use cpu


{'final_score_detailed': 56.24, 'stars': {'score': 3, 'icon': '⭐⭐⭐'}, 'explanation': 'Limited fact-checking verification found. Potential bias detected in the content. High similarity to other sources detected. The content is difficult to read. The language coherence is below expectations. Low user engagement detected. Overall credibility score: 56.24.'}


In [33]:
if __name__ == "__main__":
    hf_token = "your_huggingface_token"
    serp_api_key = "your_serpapi_key"

    scorer = CredibilityScorer(hf_token, serp_api_key)

    user_query = "was moon landing by NASA fake!"
    url = "https://www.bbc.com/news"

    result = scorer.rate_url_validity(user_query, url)
    print(result)

Device set to use cpu


{'final_score_detailed': 45.02, 'stars': {'score': 2, 'icon': '⭐⭐'}, 'explanation': 'The content is not highly relevant to your query. Limited fact-checking verification found. The language coherence is below expectations. Overall credibility score: 45.02.'}


In [34]:
if __name__ == "__main__":
    hf_token = "your_huggingface_token"
    serp_api_key = "your_serpapi_key"

    scorer = CredibilityScorer(hf_token, serp_api_key)

    user_query = "does whey protein deteriorate kidney function"
    url = "https://pubmed.ncbi.nlm.nih.gov/32702243/"

    result = scorer.rate_url_validity(user_query, url)
    print(result)

Device set to use cpu


{'final_score_detailed': 64.43, 'stars': {'score': 3, 'icon': '⭐⭐⭐'}, 'explanation': 'Limited fact-checking verification found. The content is difficult to read. The language coherence is below expectations. Low user engagement detected. Overall credibility score: 64.43.'}


In [35]:
if __name__ == "__main__":
    hf_token = "your_huggingface_token"
    serp_api_key = "your_serpapi_key"

    scorer = CredibilityScorer(hf_token, serp_api_key)

    user_query = "will amd FSR get better than nvidia dlss 4"
    url = "https://www.techradar.com/computing/gpu/nvidias-dlss-4-is-amazing-heres-what-amds-fsr-4-needs-to-do-to-take-it-on"

    result = scorer.rate_url_validity(user_query, url)
    print(result)

Device set to use cpu


{'final_score_detailed': 71.16, 'stars': {'score': 4, 'icon': '⭐⭐⭐⭐'}, 'explanation': 'Limited fact-checking verification found. The content is difficult to read. The language coherence is below expectations. Low user engagement detected. Overall credibility score: 71.16.'}


In [36]:
if __name__ == "__main__":
    hf_token = "your_huggingface_token"
    serp_api_key = "your_serpapi_key"

    scorer = CredibilityScorer(hf_token, serp_api_key)

    user_query = "will amd FSR get better than nvidia dlss 4"
    url = "https://steamcommunity.com/discussions/forum/11/3543798390532636155/"

    result = scorer.rate_url_validity(user_query, url)
    print(result)

Device set to use cpu


{'final_score_detailed': 31.42, 'stars': {'score': 2, 'icon': '⭐⭐'}, 'explanation': 'The content is not highly relevant to your query. Few citations found for this content. The language coherence is below expectations. Overall credibility score: 31.42.'}
