In [12]:
import csv
import re
import time

from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [14]:
# Collect article links from the LiveMint economy listing page.
# Result: `links`, a list of unique hrefs in the order they appear on the page.

# Setup Chrome options
options = Options()
# options.add_argument("--headless=new")  # uncomment if you want headless mode
options.add_argument("--disable-gpu")
options.add_argument("--window-size=1920,1080")

# Initialize driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

try:
    # Configure timeouts inside the try block so the browser is still closed
    # by the finally clause if either call fails.
    driver.set_page_load_timeout(40)
    driver.set_script_timeout(30)

    print("Opening page...")
    driver.get("https://www.livemint.com/economy")
    WebDriverWait(driver, 10).until(
        lambda d: d.execute_script("return document.readyState") == "complete"
    )

    print("Scrolling to load more content...")
    scroll_pause = 5   # seconds to wait after each scroll
    max_scrolls = 3    # upper bound on scroll-to-bottom attempts
    last_height = driver.execute_script("return document.body.scrollHeight")

    for i in range(max_scrolls):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)

        # Wait for page to load more content
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )

        # An unchanged document height means the scroll loaded nothing new.
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            print("No more content to load.")
            break
        last_height = new_height
        print(f"Scrolled {i+1}/{max_scrolls} times")

    print("Scrolling done!")

    # Parse the fully loaded page
    page_source = driver.page_source
    parsed_page = soup(page_source, "lxml")

    # Take the first link inside each story card's headline section.
    links = []
    for story_div in parsed_page.select('div.listtostory.clearfix'):
        content_div = story_div.find('div', class_='headlineSec')
        if content_div:
            a_tag = content_div.find('a', href=True)
            if a_tag:
                links.append(a_tag['href'])

    # De-duplicate links while keeping first-seen (page) order. A set would
    # scramble the order differently on every kernel restart, making the
    # downstream crawl and its CSV row order non-reproducible.
    links = list(dict.fromkeys(links))

    print(f"\n✅ Found {len(links)} main article links:")
    for link in links:
        print(link)

finally:
    # Always release the browser, even when loading or parsing fails.
    driver.quit()


Opening page...
Scrolling to load more content...
Scrolled 1/3 times
Scrolled 2/3 times
Scrolled 3/3 times
Scrolling done!

✅ Found 459 main article links:
/economy/more-than-5-trillion-in-investment-and-rising-donald-trump-dubs-tariffs-an-economic-revolution-eyeing-a-win-for-us-11743859920653.html
/economy/china-kicks-off-1st-sovereign-green-bond-sale-aims-to-raise-826-million-11743557635127.html
/economy/ecb-rate-cuts-are-neither-finished-nor-automatic-villeroy-says-11742924689409.html
/economy/reeves-says-trump-s-tariffs-will-hurt-uk-even-if-britain-secures-us-deal-11743608628092.html
/economy/eu-eyes-emergency-plans-to-shield-economy-from-trump-tariffs-11743608693656.html
/economy/kuwait-cabinet-approves-debt-law-paving-way-for-bond-sales-11741912270517.html
/economy/rba-sees-no-tension-in-being-forward-looking-data-dependent-11742255592714.html
/economy/a-fire-a-mushroom-and-kashmirs-vanishing-spring-11743834647562.html
/economy/rbi-said-to-ask-lenders-to-report-more-offshore-swap

In [None]:
# Visit every URL in `links`, extract the title and paragraph text, and write
# India-related, non-premium articles to a CSV file.
# Requires `links` to be defined (list of absolute or site-relative URLs).

# Setup Chrome
options = Options()
options.add_argument("--window-size=1920,1080")
# options.add_argument("--headless=new")  # Uncomment to enable headless mode

# Resolve the chromedriver binary once; every browser (re)start reuses this
# path instead of calling ChromeDriverManager().install() again.
driver_path = ChromeDriverManager().install()


def _start_driver():
    """Launch a Chrome session; the initial start and all restarts share this setup."""
    new_driver = webdriver.Chrome(service=Service(driver_path), options=options)
    new_driver.maximize_window()
    new_driver.set_page_load_timeout(30)  # Timeout if page takes too long to load
    return new_driver


def _stop_driver(active_driver):
    """Quit a Chrome session without letting an already-dead browser abort the crawl."""
    try:
        active_driver.quit()
    except Exception as e:
        print(f"⚠️ Ignoring error while closing browser: {e}")


# Define the base URL
news_url = 'https://www.livemint.com'

csv_filename = "articles_livemint_final.csv"

indian_terms = [
    "India", "Indian", "Delhi", "Rupee", "RBI", "Modi", "Nirmala Sitharaman",
    "Indian government", "Indian Economy", "NITI Aayog", "India's"
]
# Case-insensitive whole-word match (optionally followed by a plural "s").
# Plain substring search also fires on unrelated words such as
# "commodities" (contains "modi") or "orbit" (contains "rbi").
indian_pattern = re.compile(
    r"\b(?:" + "|".join(re.escape(term) for term in indian_terms) + r")s?\b",
    re.IGNORECASE,
)

driver = _start_driver()
try:
    with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Title", "URL", "Content"])  # Writing header

        for i, link in enumerate(links):
            # Site-relative hrefs are resolved against the base URL.
            full_url = news_url + link if link.startswith("/") else link

            try:
                driver.get(full_url)
                time.sleep(2)
            except Exception as e:
                # A failed load can leave the session unusable, so start fresh
                # and move on to the next link.
                print(f"❌ Skipping {full_url} due to error: {e}")
                _stop_driver(driver)
                driver = _start_driver()
                print("🔄 Restarted browser session due to Connection Error!")
                continue

            psoup = soup(driver.page_source, 'html.parser')

            # Extract title
            title_tag = psoup.find("h1")
            title_text = title_tag.get_text(strip=True) if title_tag else "No title found"

            # Extract all <p> tags as content, skipping empty paragraphs
            paragraph_texts = (p.get_text(strip=True) for p in psoup.find_all('p'))
            content = "\n".join(text for text in paragraph_texts if text)
            content = content if content else "Content not found"

            # Keep articles that mention an India-related term and are not
            # marked as "mint premium".
            is_premium = "mint premium" in content.lower()
            if indian_pattern.search(content) and not is_premium:
                writer.writerow([title_text, full_url, content])
                print(f"✅ Saved: {full_url}")
            else:
                print(f"⏭ Skipped (no match): {full_url}")

            # Restart browser every 10 pages to prevent crashes
            if (i + 1) % 10 == 0:
                _stop_driver(driver)
                driver = _start_driver()
                print("🔄 Restarted browser session")
finally:
    # Close the browser even if the crawl is interrupted or raises.
    _stop_driver(driver)

print(f"\n✅ Scraping completed. Data saved in '{csv_filename}'.")


⏭ Skipped (no match): https://www.livemint.com/economy/more-than-5-trillion-in-investment-and-rising-donald-trump-dubs-tariffs-an-economic-revolution-eyeing-a-win-for-us-11743859920653.html
⏭ Skipped (no match): https://www.livemint.com/economy/china-kicks-off-1st-sovereign-green-bond-sale-aims-to-raise-826-million-11743557635127.html
⏭ Skipped (no match): https://www.livemint.com/economy/ecb-rate-cuts-are-neither-finished-nor-automatic-villeroy-says-11742924689409.html
✅ Saved: https://www.livemint.com/economy/reeves-says-trump-s-tariffs-will-hurt-uk-even-if-britain-secures-us-deal-11743608628092.html
⏭ Skipped (no match): https://www.livemint.com/economy/eu-eyes-emergency-plans-to-shield-economy-from-trump-tariffs-11743608693656.html
⏭ Skipped (no match): https://www.livemint.com/economy/kuwait-cabinet-approves-debt-law-paving-way-for-bond-sales-11741912270517.html
⏭ Skipped (no match): https://www.livemint.com/economy/rba-sees-no-tension-in-being-forward-looking-data-dependent-11742