In [2]:
import wikipediaapi
import json
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from fuzzywuzzy import fuzz
from serpapi import GoogleSearch
from openai import OpenAI
from scholarly import scholarly
import os
import re

In [3]:
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
SERPAPI_KEY = os.getenv("SERPAPI_KEY")

# Utils functions for Wiki scraping

In [4]:
def fetch_wikipedia_page(title, lang="en"):
    ''' Fetch wiki page using Wikipedia API
    '''
    wiki_wiki = wikipediaapi.Wikipedia(user_agent='your-user-agent', language=lang)
    page = wiki_wiki.page(title)
    if not page.exists():
        raise ValueError(f"Page '{title}' does not exist.")
    return page

def extract_sections(page, level=0):
    '''Extract section from scrap
    '''
    sections = {}
    for section in page.sections:
        sections[section.title] = {
            "level": level,
            "text": section.text,
            "subsections": extract_sections(section, level + 1)
        }
    return sections

def extract_references(title):
    ''' Extract reference only
    '''
    url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    
    references = []
    for ref in soup.select("ol.references li cite"):
        ref_text = ref.get_text(separator=" ", strip=True)
        references.append(ref_text)

    return references

def scrape_wikipedia(title, lang="en"):
    ''' Wrapper to call all helper methods
    '''
    page = fetch_wikipedia_page(title, lang)
    data = {
        "title": page.title,
        "summary": page.summary,
        "sections": extract_sections(page),
        "references": extract_references(title),
        "url": f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"
    }
    return data

In [7]:
title = "Nuclear fission"

In [8]:
# Save to JSON
data = scrape_wikipedia(title)
with open(f"{title.replace(' ', '_')}.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4)

In [9]:
# Extract references
references = extract_references(title)

# Classify references

In [10]:
# ---- STEP 1: SEARCH GOOGLE USING SERPAPI ----
def search_google(query):
    """
    Search Google for the given query using SerpAPI.
    """
    params = {
        "engine": "google",
        "q": query,
        "api_key": SERPAPI_KEY
    }

    try:
        search = GoogleSearch(params)
        results = search.get_dict()
        top_results = results.get("organic_results", [])
        return top_results[:5]  # Return top 5 results
    except Exception as e:
        print(f"SerpAPI Error: {e}")
        return None

# ---- STEP 2: VERIFY REFERENCE WITH OPENAI ----
def verify_reference_with_ai(reference):
    """
    Uses OpenAI to check if a Wikipedia reference is valid, credible, and formatted correctly.
    """
    search_results = search_google(reference)

    prompt = f"""
    Given the search results below, check if this reference is:
    1. Real (Does it exist online?)
    2. Reliable (Academic, News, Official Source, Blog, or Forum?)
    3. Correctly formatted (Does it have Title, Author, Year, Source, and URL?)
    
    **Reference**: "{reference}"
    
    **Search Results**:
    {search_results}
    
    Provide a response in this format:
    Exists: Yes/No
    Category: Academic Paper, News Article, Book, Official Documentation, Blog, Forum, Unknown
    Trustworthiness (1-5): (1 = Low, 5 = High)
    Suggested Citation Format: <Formatted APA/MLA Citation>
    """

    try:
        client = OpenAI(api_key=OPENAI_API_KEY)

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{
                "role": "user",
                "content": prompt
            }],
        )

        return response.choices[0].message.content
    except Exception as e:
        print(f"OpenAI Error: {e}")
        return None

# ---- STEP 3: PROCESS REFERENCES ----
verified_references = []

for ref in references:
    time.sleep(5)  # Avoid rate limits
    verification_result = verify_reference_with_ai(ref)

    # Extract structured data from OpenAI response
    if verification_result:
        lines = verification_result.split("\n")
        extracted_data = { "Reference": ref }
        
        for line in lines:
            if ":" in line:
                key, value = line.split(":", 1)
                extracted_data[key.strip()] = value.strip()

        verified_references.append(extracted_data)
    else:
        verified_references.append({"Reference": ref, "Exists": "Unknown", "Category": "Unknown", "Trustworthiness": "Unknown", "Suggested Citation": "Unknown"})

# Convert to DataFrame
df_verified = pd.DataFrame(verified_references)

In [11]:
df_verified

Unnamed: 0,Reference,Exists,Category,Trustworthiness (1-5),Suggested Citation Format,(Note
0,M. G. Arora & M. Singh (1994). Nuclear Chemist...,No,Unknown,1,"Arora, M. G., & Singh, M. (1994). *Nuclear Che...",
1,Gopal B. Saha (1 November 2010). Fundamentals ...,Yes,Book,5,"Saha, G. B. (2010). *Fundamentals of Nuclear P...",
2,"Петржак, Константин (1989). ""Как было открыто ...",No,Unknown,1,"Петржак, К. (1989). Как было открыто спонтанно...",
3,"Younes, Walid; Loveland, Walter (2021). An Int...",Yes,Book,5,"Younes, W., & Loveland, W. (2021). *An introdu...",
4,"Rhodes, Richard (1986). The Making of the Atom...",Yes,Book,5,"Rhodes, R. (1986). *The Making of the Atomic B...",
5,"Dempster, A.J. (1938). ""The Atomic Masses of t...",Yes,Academic Paper,5,"Dempster, A.J. (1938). The atomic masses of th...",
6,"Feenberg, eugene (1939). ""On the Shape and Sta...",Yes,Academic Paper,5,"Feenberg, E. (1939). On the shape and stabilit...",
7,"Lilley, John (2001). Nuclear Physics: Principl...",Yes,Book,5,"Lilley, J. (2001). *Nuclear physics: Principle...",
8,"Bohr, N. (1939). ""Resonance in Uranium and Tho...",Yes,Academic Paper,5,"Bohr, N. (1939). Resonance in uranium and thor...",
9,"""Essential cross sections"" . LibreTexts Librar...",No,Unknown,1,Not applicable due to lack of existence.,


# Compare references to Google Scholar

In [25]:
# Utils to extract author
def normalize_author_name(author):
    """
    Cleans and normalizes an author name: 
    - Removes punctuation
    - Converts to lowercase
    - Trims spaces
    """
    return re.sub(r"[^\w\s]", "", author).strip().lower()

def author_match_fct(wiki_authors, scholar_authors):
    """
    Compares authors using normalized names and fuzzy matching.
    Returns True if at least one Wikipedia author is in Scholar authors.
    """
    if not wiki_authors or not scholar_authors:
        return False

    wiki_authors = [normalize_author_name(author) for author in wiki_authors]
    scholar_authors = [normalize_author_name(author) for author in scholar_authors]

    for wiki_author in wiki_authors:
        for scholar_author in scholar_authors:
            # Exact match OR Partial match (First or Last name appears)
            if wiki_author == scholar_author or fuzz.partial_ratio(wiki_author, scholar_author) > 85:
                return True

    return False

In [31]:
# ---- STEP 1: EXTRACT AUTHORS FROM SUGGESTED CITATION FORMAT ----
def extract_authors(citation):
    """
    Extracts author names from Wikipedia's Suggested Citation Format.
    Assumes names appear before the first parenthesis '('.
    """
    match = re.match(r"^(.*?)(\(|\.)", citation)
    if match:
        authors = match.group(1).strip()
        return authors
    return "Unknown"

df_verified["Extracted Authors"] = df_verified["Suggested Citation Format"].apply(extract_authors)

# ---- STEP 2: FETCH TOP GOOGLE SCHOLAR RESULTS ----
def fetch_google_scholar_results(query, num_results):
    """
    Fetch top results from Google Scholar for a given query.
    """
    search_results = scholarly.search_pubs(query)
    papers = []

    for _ in range(num_results):
        try:
            paper = next(search_results)
            papers.append({
                "Title": paper["bib"]["title"].lower(),
                "Authors": ", ".join(paper["bib"].get("author", ["Unknown"])),
                "Year": paper["bib"].get("pub_year", "Unknown"),
                "Citations": paper.get("num_citations", 0),
                "URL": paper.get("pub_url", "N/A"),
            })
        except StopIteration:
            break  # Stop if no more results

    return papers

# ---- STEP 3: COMPARE WIKIPEDIA REFERENCES WITH GOOGLE SCHOLAR ----
def compare_references(wiki_df, scholar_df):
    """
    Compare Wikipedia references with Google Scholar results using:
    1. Title similarity (Fuzzy Matching)
    2. Author matching (Check if at least one author is common)
    """

    matches = []
    for _, wiki_row in wiki_df.iterrows():
        best_match = None
        best_score = 0

        for _, scholar_row in scholar_df.iterrows():
            title_similarity = fuzz.partial_ratio(wiki_row["Reference"].lower(), scholar_row["Title"].lower())

            # Extract Wikipedia authors and Google Scholar authors
            wiki_authors = wiki_row["Extracted Authors"].split(", ") if "Extracted Authors" in wiki_row else []
            scholar_authors = scholar_row["Authors"].split(", ") if "Authors" in scholar_row else []

            # Check if at least one author matches
            author_match = author_match_fct(wiki_authors, scholar_authors)

            # Define confidence level based on matching criteria
            confidence = "Low"
            if title_similarity > 80 and author_match:
                confidence = "High"
            elif title_similarity > 70:
                confidence = "Medium"

            if title_similarity > best_score:
                best_score = title_similarity
                best_match = {
                    "Wikipedia Reference": wiki_row["Reference"],
                    "Matching Scholar Title": scholar_row["Title"],
                    "Category": wiki_row["Category"],
                    "Trustworthiness (1-5)": wiki_row["Trustworthiness (1-5)"],
                    "Extracted Wikipedia Authors": wiki_row["Extracted Authors"],
                    "Scholar Authors": scholar_row["Authors"],
                    "Title Similarity (%)": title_similarity,
                    "Author Match": "Yes" if author_match else "No",
                    "Match Confidence": confidence,
                    "Scholar URL": scholar_row["URL"],
                    "Citations": scholar_row["Citations"],
                }

        if best_match:
            matches.append(best_match)

    return pd.DataFrame(matches)

# ---- STEP 4: PROCESS COMPARISON ----
scholar_results = fetch_google_scholar_results(title, num_results=30)

# Convert to DataFrame
df_scholar = pd.DataFrame(scholar_results)

# Perform comparison
df_comparison = compare_references(df_verified, df_scholar)

In [30]:
df_verified.to_csv(f"{title.replace(" ", "_").lower()}_reference.csv")
df_scholar.to_csv(f"{title.replace(" ", "_").lower()}_scholar.csv")

df_comparison.to_csv(f"{title.replace(" ", "_").lower()}_comparison.csv")