# Import Libraries

In [None]:
from __future__ import annotations

import json
import logging
from pathlib import Path
from typing import Dict, Any

import argparse
import requests
import urllib.parse

from dotenv import load_dotenv
import os

# Specify File Paths

## Current file's absolute path

In [2]:

try:
    # Running as a plain script/module: anchor on this file's own location.
    current_file = Path(__file__).resolve()

    # parents[0] is the file's folder, so parents[2] is two levels above that folder.
    PROJECT_DIR = current_file.parents[2]

except (NameError, IndexError):
    # NameError  -> `__file__` is not defined inside a notebook kernel.
    # IndexError -> the file sits too close to the filesystem root for parents[2].
    # Anything else (e.g. KeyboardInterrupt) is deliberately allowed to propagate.
    PROJECT_DIR = Path.cwd().parent

# print(PROJECT_DIR)



In [3]:
# On-disk JSON file that stores previously fetched search results
CACHE_FILE = Path(PROJECT_DIR, "web_search_cache_results", "search_cache.json")

# Emit INFO-and-above records in a compact "[LEVEL] message" layout
logging.basicConfig(format="[%(levelname)s] %(message)s", level=logging.INFO)

# Load Search API Key

In [4]:
# Load environment variables from a .env file so that the SEARCH_API_KEY read
# later via os.environ does not have to be hard-coded in the notebook.
load_dotenv()  # reads variables from a .env file and sets them in os.environ

True

In [5]:
# # Wikipedia Search API
# # Send a search query → Wikipedia returns ranked article titles → pick the top one
# def find_closest_wikipedia_article(topic):
#     url = "https://en.wikipedia.org/w/api.php"
#     params = {
#         "action": "query",
#         "list": "search",
#         "srsearch": topic,
#         "format": "json"
#     }

#     headers = {
#         "User-Agent": "WikiSearchBot/1.0 (contact: your_email@example.com)"
#     }

#     response = requests.get(url, params=params, headers=headers)
#     response.raise_for_status()

#     results = response.json()["query"]["search"]
#     if not results:
#         return None

#     top_result = results[0]
#     title = top_result["title"]
#     page_url = f"https://en.wikipedia.org/wiki/{title.replace(' ', '_')}"

#     return {
#         "title": title,
#         "url": page_url
#     }

In [6]:
# # Example
# topic = "transformer neural networks"
# result = find_closest_wikipedia_article(topic)

# print(result)

# Search API

In [7]:
"""Utility functions for caching Google SERP results from SearchAPI.io.

If the cache file already contains results for a given query, those are
returned immediately. Otherwise, the SearchAPI.io endpoint is called and the
results are persisted for future runs.
"""

def _save_cache(cache: Dict[str, Any]) -> None:
    """Persist the whole cache mapping to ``CACHE_FILE`` as pretty-printed JSON.

    The data is first written to a sibling ``*.tmp`` file and then moved over
    the real cache file. If serialisation fails or the process is interrupted
    mid-write, the previous cache file is left intact instead of being
    truncated (a truncated file would be discarded by ``_load_cache``).

    Args:
        cache: Mapping to serialise; must be JSON-serialisable.
    """
    CACHE_FILE.parent.mkdir(parents=True, exist_ok=True)

    # Write next to the target so the final rename stays on one filesystem.
    tmp_file = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
    with tmp_file.open("w", encoding="utf-8") as fp:
        json.dump(cache, fp, ensure_ascii=False, indent=2)

    # Path.replace overwrites the destination in a single rename operation.
    tmp_file.replace(CACHE_FILE)


def _load_cache() -> Dict[str, Any]:
    """Read the cache mapping from ``CACHE_FILE``.

    Returns:
        The cached mapping, or an empty dict when the file does not exist,
        does not contain valid JSON, or contains JSON that is not an object.
        Unusable cache contents are reported with a warning so that the loss
        of cached results is visible in the log.
    """
    if not CACHE_FILE.exists():
        return {}

    try:
        with CACHE_FILE.open("r", encoding="utf-8") as fp:
            cached = json.load(fp)
    except json.JSONDecodeError:
        # Corrupted cache → start fresh
        logging.warning("Cache file %s is not valid JSON; starting with an empty cache", CACHE_FILE)
        return {}

    # Callers index the cache by query string and assign into it, so anything
    # other than a JSON object (list, string, null, ...) cannot be used.
    if not isinstance(cached, dict):
        logging.warning("Cache file %s does not hold a JSON object; starting with an empty cache", CACHE_FILE)
        return {}

    return cached

In [8]:
def _fetch_from_api(query: str, api_key: str, engine: str = "google") -> Dict[str, Any]:
    """Call SearchAPI.io and return the parsed JSON response.

    Args:
        query: Search query string, sent as the ``q`` parameter.
        api_key: SearchAPI.io API key.
        engine: Search engine identifier, sent as the ``engine`` parameter.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: If the API answers with an error status.
        requests.exceptions.RequestException: On connection problems or when
            the 10-second timeout is exceeded.
    """
    url = "https://www.searchapi.io/api/v1/search"
    params = {"engine": engine, "q": query}

    # Send the key as a Bearer token rather than a query parameter: the error
    # raised by raise_for_status() embeds the full request URL, so a key in the
    # query string would end up in tracebacks and saved notebook output.
    headers = {"Authorization": f"Bearer {api_key}"}

    response = requests.get(url, params=params, headers=headers, timeout=10)
    logging.info("Response: %s", response)
    response.raise_for_status()
    return response.json()


def get_search_results(query: str, api_key: str, engine: str = "google") -> Dict[str, Any]:
    """Return SERP results for ``query``, using the local cache when possible.

    Args:
        query: Search query string.
        api_key: SearchAPI.io API key, only used on a cache miss.
        engine: Search engine identifier passed through to the API.

    Returns:
        The cached or freshly fetched JSON response for the query.
    """
    cache = _load_cache()

    # Results differ per engine, so the engine must be part of the cache key.
    # The default engine keeps the bare query as its key so that entries
    # written before this distinction existed are still found.
    cache_key = query if engine == "google" else f"{engine}::{query}"

    if cache_key in cache:
        logging.info("Using cached SERP results for query: '%s'", query)
        return cache[cache_key]

    logging.info("Fetching SERP results from API for query: '%s'", query)
    fresh_results = _fetch_from_api(query, api_key, engine)
    logging.info("Fetched the results")
    cache[cache_key] = fresh_results
    _save_cache(cache)
    logging.info("Saved results to cache for query: '%s'", query)
    return fresh_results

# Sample Google Search

In [9]:
# Natural-language question to look up
question = "What are some of the famous places I can visit in India?"

# The `site:` prefix limits the search to wikipedia.org pages
query = "site:wikipedia.org " + question

top_results = get_search_results(query, api_key=os.environ["SEARCH_API_KEY"])

[INFO] Using cached SERP results for query: 'site:wikipedia.org What are some of the famous places I can visit in India?'


In [10]:
# Pick the first (highest-ranked) organic result whose link points at Wikipedia.
# Using next(..., None) guarantees `top_ranked_link` is always bound: it is None
# when the response has no organic results or none of them is a Wikipedia link,
# instead of leaving the name undefined for later cells.
top_ranked_link = next(
    (
        result['link']
        for result in top_results.get('organic_results', [])
        if 'wikipedia.org' in result.get('link', '')
    ),
    None,
)

In [11]:
# class WikipediaAPI:
#     """Clean Wikipedia API wrapper - no scraping needed!"""
    
#     BASE_URL = "https://en.wikipedia.org/w/api.php"
    
#     def __init__(self):
#         self.session = requests.Session()
    
#     def get_page_content(self, url_or_title):
#         """
#         Get Wikipedia page content using the official API.
        
#         Args:
#             url_or_title (str): Wikipedia URL or page title
#                               e.g., "https://en.wikipedia.org/wiki/Python_(programming_language)"
#                               or "Python (programming language)"
            
#         Returns:
#             dict: Page content and metadata
#         """
#         # Extract title from URL if URL is provided
#         if url_or_title.startswith("http"):
#             # Parse the title from URL
#             # e.g., https://en.wikipedia.org/wiki/Python_(programming_language)
#             title = url_or_title.split("/wiki/")[-1]
#             # URL decode the title
#             import urllib.parse
#             title = urllib.parse.unquote(title)
#         else:
#             title = url_or_title
        
#         params = {
#             "action": "query",
#             "format": "json",
#             "titles": title,
#             "prop": "extracts|info|pageprops",
#             "explaintext": True,  # Plain text, no HTML
#             "inprop": "url",
#         }
        
#         response = self.session.get(self.BASE_URL, params=params)
#         data = response.json()
        
#         # Extract page data
#         pages = data["query"]["pages"]
#         page = next(iter(pages.values()))
        
#         if "missing" in page:
#             return {"error": f"Page '{title}' not found"}
        
#         return {
#             "title": page.get("title"),
#             "page_id": page.get("pageid"),
#             "url": page.get("fullurl"),
#             "content": page.get("extract", ""),
#             "word_count": len(page.get("extract", "").split())
#         }
    
#     def search(self, query, limit=10):
#         """
#         Search Wikipedia articles.
        
#         Args:
#             query (str): Search query
#             limit (int): Number of results
            
#         Returns:
#             list: Search results
#         """
#         params = {
#             "action": "opensearch",
#             "format": "json",
#             "search": query,
#             "limit": limit
#         }
        
#         response = self.session.get(self.BASE_URL, params=params)
#         data = response.json()
        
#         # Format results
#         results = []
#         for i in range(len(data[1])):
#             results.append({
#                 "title": data[1][i],
#                 "description": data[2][i],
#                 "url": data[3][i]
#             })
        
#         return results

In [12]:
# url = "https://en.wikipedia.org/wiki/Tourism_in_India_by_state"
# wiki = WikipediaAPI()

# content = wiki.get_page_content(url)
# if "error" not in content:
#     print(f"Title: {content['title']}")
#     print(f"URL: {content['url']}")
#     print(f"Word count: {content['word_count']:,}")
#     print(f"First 200 chars: {content['content'][:200]}...\n")
        
#     # Save to file
#     # save_to_file(content, "wikipedia_content.json")

In [13]:
# Your Wikipedia URL
# url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
url = "https://en.wikipedia.org/wiki/Tourism_in_India_by_state"

# Step 1: Extract title from URL
title = url.split("/wiki/")[-1]
title = urllib.parse.unquote(title)

print(f"Extracted title: {title}")

# Step 2: Call Wikipedia API
api_url = "https://en.wikipedia.org/w/api.php"
params = {
    "action": "query",
    "format": "json",
    "titles": title,
    "prop": "extracts",
    # MediaWiki treats a boolean parameter as true whenever it is present,
    # whatever its value, so the previous `False` was still sent as
    # "explaintext=False" and produced plain text. State that explicitly.
    # To receive HTML instead, remove this key entirely.
    "explaintext": 1,
}

# Add headers to avoid 403 error
headers = {
    "User-Agent": "WikipediaBot/1.0 (Educational purposes; your_email@example.com)"
}

# Step 3: Get the data
# The timeout keeps the cell from hanging forever; raise_for_status surfaces
# HTTP errors here rather than as a confusing JSON decoding failure below.
response = requests.get(api_url, params=params, headers=headers, timeout=10)
response.raise_for_status()
data = response.json()


Extracted title: Tourism_in_India_by_state


In [14]:
# Display the Response object to check the HTTP status of the API call
response



<Response [200]>

In [15]:
# Parse the JSON body of the response (repeats the parse already done in the request cell)
data = response.json()

In [16]:
# Display the full parsed API payload
data

{'batchcomplete': '',
 'query': {'normalized': [{'from': 'Tourism_in_India_by_state',
    'to': 'Tourism in India by state'}],
  'pages': {'52709358': {'pageid': 52709358,
    'ns': 0,
    'title': 'Tourism in India by state',
    'extract': 'Tourism plays a significant role in the Indian economy. According to the Ministry of Tourism, the sector contributed to 5.22% of India\'s GDP in 2023-24 (provisional estimates), recovering from pandemic lows of 1.50% in 2020-21 to pre-pandemic levels of around 5%. The World Travel and Tourism Council reported that India\'s travel and tourism sector contributed nearly Rs 21 trillion to the economy in 2024, supporting approximately 46.5 million jobs (9.1% of total employment).\n\nStates and Union territories of India with their names.\n\n\n== Andaman and Nicobar Islands ==\n\nThe Andaman and Nicobar Islands, first settled around 26,000 years ago, form an archipelago of 572 tropical islands, of which only 37 are inhabited. Tourism plays a major econo

In [17]:
# Step 4: Extract the content
# The payload nests the page records under query -> pages, keyed by page id.
pages = data["query"]["pages"]


In [18]:
# List the page ids present in the response
pages.keys()

dict_keys(['52709358'])

In [19]:
# Only one title was requested, so take the first (and only) page id
page_id = list(pages.keys())[0]

In [20]:
# Page record for that id; its 'title' and 'extract' fields are used below
page = pages[page_id]

In [21]:
# Step 5: Show the page title followed by the first 500 characters of its extract
article_title = page['title']
print(f"\nTitle: {article_title}")
print("\nContent preview:")
extract_preview = page['extract'][:500]
print(extract_preview)
print("\n...")


Title: Tourism in India by state

Content preview:
Tourism plays a significant role in the Indian economy. According to the Ministry of Tourism, the sector contributed to 5.22% of India's GDP in 2023-24 (provisional estimates), recovering from pandemic lows of 1.50% in 2020-21 to pre-pandemic levels of around 5%. The World Travel and Tourism Council reported that India's travel and tourism sector contributed nearly Rs 21 trillion to the economy in 2024, supporting approximately 46.5 million jobs (9.1% of total employment).

States and Union terr

...


In [22]:
# Locate the marker sentence instead of relying on a hard-coded character
# offset, which silently points at the wrong text as soon as the article changes.
marker = "Beach destination on the Bay of Bengal"
marker_start = page['extract'].find(marker)

# Show up to 1116 characters from the marker onwards (the same window width as
# the former 35884:37000 slice); empty string when the marker is absent.
page['extract'][marker_start:marker_start + 1116] if marker_start != -1 else ''

'Beach destination on the Bay of Bengal coast noted by the state tourism authority.\n\n\n== References ==\n\n\n== External links ==\n India travel guide from Wikivoyage\nMinistry of Tourism, Govt of India\nOfficial India Tourism website'

# Wikipedia - Extract Page Content

In [23]:
class WikipediaAPI:
    """Small wrapper around the English Wikipedia Action API - no scraping needed.

    A single ``requests.Session`` is kept on the instance and used for every
    call so that connections are reused.
    """

    BASE_URL = "https://en.wikipedia.org/w/api.php"

    # Add headers to avoid 403 error
    HEADERS = {
        "User-Agent": "WikipediaBot/1.0 (Educational purposes; your_email@example.com)"
    }

    # Seconds to wait for the API before giving up instead of hanging forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        self.session = requests.Session()

    def get_page_content(self, url_or_title):
        """
        Get Wikipedia page content as plain text using the official API.

        Args:
            url_or_title (str): Wikipedia URL or page title
                              e.g., "https://en.wikipedia.org/wiki/Python_(programming_language)"
                              or "Python (programming language)"

        Returns:
            dict: On success, a dict with the keys ``title``, ``page_id``,
            ``url``, ``content`` and ``word_count``. When the API returns no
            usable page (missing or invalid title, or an error payload), a
            dict with the single key ``error``.

        Raises:
            requests.exceptions.HTTPError: If the API answers with an HTTP
                error status.
        """
        # Step 1: Extract title from URL if URL is provided
        if url_or_title.startswith(("http://", "https://")):
            # Work on the URL path only, so a "#section" fragment or a
            # "?query" suffix never becomes part of the title.
            # e.g., https://en.wikipedia.org/wiki/Python_(programming_language)
            url_path = urllib.parse.urlparse(url_or_title).path
            # URL decode the title
            title = urllib.parse.unquote(url_path.split("/wiki/")[-1])
        else:
            title = url_or_title

        print(f"Extracted title: {title}")

        # Step 2: Call Wikipedia API
        params = {
            "action": "query",
            "format": "json",
            "titles": title,
            # "info" together with inprop=url is what makes the API include
            # "fullurl"; without it the returned "url" field was always None.
            "prop": "extracts|info",
            "inprop": "url",
            # MediaWiki treats a boolean parameter as true whenever it is
            # present, so the previous `False` still yielded plain text.
            # State that explicitly; remove the key to receive HTML instead.
            "explaintext": 1,
        }

        # Use the instance session (previously created but never used).
        response = self.session.get(
            self.BASE_URL,
            params=params,
            headers=self.HEADERS,
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()

        # Step 3: Get the data
        data = response.json()

        # Step 4: Extract the content
        # An API-level error payload carries no "query"/"pages" section.
        pages = data.get("query", {}).get("pages", {})
        if not pages:
            return {"error": f"No page data returned for '{title}'"}

        # Only one title is requested, so the mapping holds a single record.
        page = next(iter(pages.values()))

        if "missing" in page or "invalid" in page:
            return {"error": f"Page '{title}' not found"}

        content = page.get("extract", "")
        return {
            "title": page.get("title"),
            "page_id": page.get("pageid"),
            "url": page.get("fullurl"),
            "content": content,
            "word_count": len(content.split())
        }


In [24]:
# Create the API wrapper instance used by the cells below
wiki = WikipediaAPI()

Extracted title: Tourism_in_India_by_state


In [26]:
# Your Wikipedia URL
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
# url = "https://en.wikipedia.org/wiki/Tourism_in_India_by_state"

parsed_webpage = wiki.get_page_content(url)

# get_page_content reports failure with an {"error": ...} dict; stop here with
# a clear message instead of hitting a KeyError on 'title' below.
if "error" in parsed_webpage:
    raise RuntimeError(parsed_webpage["error"])

# Build a file name from the title; "/" is replaced as well because it would
# otherwise be interpreted as a directory separator (e.g. "AC/DC").
title = parsed_webpage['title']
title = title.replace(" ", "_").replace("/", "_")

# Make sure the target folder exists before opening the file for writing.
output_dir = Path(PROJECT_DIR) / "wikipedia_pages"
output_dir.mkdir(parents=True, exist_ok=True)
output_file = output_dir / f"{title}_wikipedia.txt"

# Open file to write
with open(output_file, 'w', encoding='utf-8') as f:
    f.write(f"{parsed_webpage['title']}\n\n")
    f.write(parsed_webpage['content'])

print(f"\nSaved to {output_file}")

Extracted title: Python_(programming_language)

Saved to /Users/abhishek/Desktop/This PC/D drive/After BU/IndicConversations/data_collection/Python_(programming_language)_wikipedia.txt


# BeautifulSoup

In [None]:
from bs4 import BeautifulSoup

In [92]:
# Page to fetch as raw HTML in the scraping cell below
url = "https://en.wikipedia.org/wiki/Python_(programming_language)"
# url = "https://en.wikipedia.org/wiki/Tourism_in_India_by_state"

In [None]:
# Fetch the webpage
# A timeout (same 10 s as the search API call) keeps the cell from hanging
# indefinitely on an unresponsive server.
response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
response.raise_for_status()

# Parse with BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Remove elements that carry no visible text
for tag in soup(['script', 'style', 'meta', 'noscript']):
    tag.decompose()

# Get text in document order
text = soup.get_text(separator='\n', strip=True)

# Clean up excessive whitespace: trim every line and drop the empty ones
lines = [line.strip() for line in text.splitlines()]
lines = [line for line in lines if line]

text_ = '\n'.join(lines)

In [108]:
# Number of characters in the cleaned page text
len(text_)

9946

In [110]:
# Display the parsed BeautifulSoup document
soup

<!DOCTYPE html>

<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 skin-theme-clientpref-day vector-sticky-header-enabled vector-toc-available" dir="ltr" lang="en">
<head>

<title>Python (programming language) - Wikipedia</title>


<link href="/w/load.php?lang=en&amp;modules=ext.cite.styles%7Cext.pygments%2CwikimediaBadges%7Cext.uls.interlanguage%7Cext.visualEditor.desktopArticleTarget.noscript%7Cext.wikimediamessages.styles%7Cjquery.makeCollapsible.styles%7Cskins.vector.icons%2Cstyles%7Cskins.vector.search.codex.styles%7Cwikibase.client.init&amp;only=styles&amp;skin=vector-2022" rel="stylesheet"/>


<link href="/w/load.ph

In [115]:
# Print the cleaned text from character 9000 to the end
print(text_[9000:])

rded phrases
Articles with specifically marked weasel-worded phrases from August 2025
All articles with unsourced statements
Articles with unsourced statements from August 2025
Pages using Sister project links with wikidata namespace mismatch
Pages using Sister project links with hidden wikidata
Articles with example Python (programming language) code
This page was last edited on 25 January 2026, at 00:11
(UTC)
.
Text is available under the
Creative Commons Attribution-ShareAlike 4.0 License
;
additional terms may apply. By using this site, you agree to the
Terms of Use
and
Privacy Policy
. Wikipedia® is a registered trademark of the
Wikimedia Foundation, Inc.
, a non-profit organization.
Privacy policy
About Wikipedia
Disclaimers
Contact Wikipedia
Legal & safety contacts
Code of Conduct
Developers
Statistics
Cookie statement
Mobile view
Search
Search
Toggle the table of contents
Python (programming language)
117 languages
Add topic


In [None]:
def get_website_text(url, timeout=10):
    """
    Extract all visible text from a website in top-to-bottom order.

    Args:
        url: The website URL to scrape
        timeout: Seconds to wait for the server before giving up. Without a
            timeout the request could block indefinitely.

    Returns:
        String containing all text from the page, one non-empty line per
        text fragment, with <script>, <style>, <meta> and <noscript>
        elements removed.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            the server answers with an HTTP error status, or the timeout is
            exceeded.
    """
    # Fetch the webpage
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
    response.raise_for_status()

    # Parse with BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    # Remove elements that carry no visible text
    for tag in soup(['script', 'style', 'meta', 'noscript']):
        tag.decompose()

    # Get text in document order
    text = soup.get_text(separator='\n', strip=True)

    # Clean up excessive whitespace: trim every line and drop the empty ones
    stripped_lines = (line.strip() for line in text.splitlines())
    return '\n'.join(line for line in stripped_lines if line)


# Example usage: prompt for a URL and print the text extracted from it
if __name__ == '__main__':
    url = input("Enter URL: ")
    banner = "=" * 50

    try:
        text = get_website_text(url)
        # Frame the output with a separator line above and below the heading
        print("\n" + banner)
        print("EXTRACTED TEXT:")
        print(banner + "\n")
        print(text)

    except requests.exceptions.RequestException as fetch_error:
        # Network or HTTP-level failure while downloading the page
        print(f"Error fetching URL: {fetch_error}")
    except Exception as other_error:
        # Any other failure (parsing, printing, ...)
        print(f"Error: {other_error}")