In [1]:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
URLS = [
    "https://www.epassport.gov.bd/instructions/five-step-to-your-epassport",
    "https://www.epassport.gov.bd/instructions/urgent-applications",
    "https://www.epassport.gov.bd/instructions/passport-fees",
]

In [3]:
def setup_driver():
    """Initializes the Selenium WebDriver."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")  
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage") 
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36")

    service = ChromeService(executable_path=ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=chrome_options)
    return driver


In [4]:
def scrape_page_content(url, driver):
    """Fetches page content using Selenium and extracts text using BeautifulSoup."""
    try:
        driver.get(url)

        time.sleep(2)

        html_content = driver.page_source

    except Exception as e:
        print(f"Error loading page {url} with Selenium: {e}")
        return None

    soup = BeautifulSoup(html_content, 'html.parser')

    content_area = soup.find('div', class_='wrapper wrapper--margined') # Adjust based on site inspection

    if not content_area:
        content_area = soup.find('body')

    text_blocks = []
    for element in content_area.find_all(['p', 'li', 'h2', 'h3', 'h4']):
        text = element.get_text(strip=True)
        if text:
            text_blocks.append(text)

    page_title = soup.title.string if soup.title else "Untitled"
    return {"content": "\n".join(text_blocks), "source_url": url, "title": page_title}


In [5]:
def get_all_data():
    """Initializes driver and scrapes all target URLs."""
    driver = setup_driver()
    all_documents = []
    for url in URLS:
        print(f"Scraping {url}...")
        data = scrape_page_content(url, driver)
        if data:
            all_documents.append(data)
    driver.quit()
    return all_documents

In [6]:
get_all_data()

Scraping https://www.epassport.gov.bd/instructions/five-step-to-your-epassport...
Scraping https://www.epassport.gov.bd/instructions/urgent-applications...
Scraping https://www.epassport.gov.bd/instructions/passport-fees...


[{'content': "Home\n5 steps to e‑Passport\nLast updated:5 May 2025\nStep 1: Check if the new e-Passport is already available in your area\nList of functional e-Passport Offices\nStep 2: Fill in your e-Passport application online\nFor Online Application ClickHere\nStep 3: Pay passport fees\nFor Passport Fees and Bank List ClickHere\nStep 4: Visit your Passport Office for biometric enrolment\nMake sure you have allrequired documentswith you when you visit the passport office.\nStep 5: Collect your e-Passport at the passport office\nDelivery slip you received during passport enrolment\nAuthorized representatives(has to bring his/her NID card) can collect the applicant's new passport.",
  'source_url': 'https://www.epassport.gov.bd/instructions/five-step-to-your-epassport',
  'title': 'E‑Passport Online Registration Portal'},
 {'content': 'Home\nUrgent applications\nLast updated:1 June 2025\nWhat is Super Express passport delivery service?\nWho can apply for Super Express delivery?\nWhere 