In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from time import sleep

# Multipliers that convert each recognised unit into millions of riders
# (1 crore = 10 million, 1 lakh = 0.1 million).
_UNIT_TO_MILLIONS = {"crore": 10, "lakh": 0.1, "million": 1, "mn": 1}

# A number followed by a unit word. The optional plural "s" plus the trailing
# word boundary stop "mn" from matching inside longer words ("5 mnemonic ...").
_RIDERSHIP_PATTERN = re.compile(
    r"(\d+\.?\d*)\s*(crore|lakh|million|mn)s?\b", re.IGNORECASE
)


def _tag_text(item, name: str) -> str:
    """Return the text of child tag ``name`` inside ``item``, or "" if absent."""
    node = item.find(name)
    return node.text if node is not None else ""


def _ridership_in_millions(text: str):
    """Return the first "<number> <unit>" figure in ``text`` in millions, or None."""
    match = _RIDERSHIP_PATTERN.search(text)
    if match is None:
        return None
    return float(match.group(1)) * _UNIT_TO_MILLIONS[match.group(2).lower()]


def get_ridership_from_google_news(city: str, month: str, year: str):
    """Search the Google News RSS feed for a metro ridership figure.

    The query ``"<city> Metro ridership <month> <year>"`` is sent to the RSS
    search endpoint and the feed items are scanned in order. The first item
    whose title or description contains a number followed by crore, lakh,
    million or mn is used; no further check is made that the figure actually
    refers to the requested city or month.

    Args:
        city: City name used in the query and echoed in the result.
        month: Month name used in the query and in the ``"Month"`` field.
        year: Year used in the query and in the ``"Month"`` field.

    Returns:
        A dict with the keys ``"City"``, ``"Month"`` (``"<year>-<month>"``),
        ``"Ridership (Million)"`` (rounded to 2 decimals), ``"Source"`` (first
        100 characters of the item title followed by ``"..."``) and ``"URL"``.
        ``None`` if the request fails or no item contains a matching figure.
    """
    query = f"{city} Metro ridership {month} {year}"

    try:
        # Passing the query through ``params`` lets requests URL-encode it, so
        # characters such as "&" or "#" in a city name cannot break the URL.
        response = requests.get(
            "https://news.google.com/rss/search",
            params={"q": query},
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best effort: report the failure and let the caller treat it as "not found".
        print(f"⚠️ Request failed for {query!r}: {exc}")
        return None

    soup = BeautifulSoup(response.content, "xml")

    for item in soup.find_all("item"):
        # Items may lack a tag; a missing one contributes an empty string
        # instead of aborting the scan of the remaining items.
        title = _tag_text(item, "title")
        description = _tag_text(item, "description")

        value = _ridership_in_millions(title + " " + description)
        if value is None:
            continue

        return {
            "City": city,
            "Month": f"{year}-{month}",
            "Ridership (Million)": round(value, 2),
            "Source": title[:100] + "...",
            "URL": _tag_text(item, "link"),
        }

    return None

# Month names in calendar order; used both in the search query and in the output rows.
months_full = [
    "January", "February", "March", "April",
    "May", "June", "July", "August",
    "September", "October", "November", "December",
]
# The 18 (year, month) pairs covered: all of 2023 followed by January-June 2024.
year_month_pairs = (
    [(2023, name) for name in months_full]
    + [(2024, name) for name in months_full[:6]]
)
cities = ["Delhi", "Mumbai", "Bengaluru"]

# Query every city/month combination and keep each row the scraper returns.
all_data = []
for city in cities:
    for year, month in year_month_pairs:
        result = get_ridership_from_google_news(city, month, str(year))
        if not result:
            print(f"❌ Not found: {city}, {month} {year}")
        else:
            all_data.append(result)
            print(f"✅ Found: {city}, {month} {year} - {result['Ridership (Million)']} M")
        sleep(1.5)  # pause between requests to Google News

# Write the collected rows to CSV.
df = pd.DataFrame(all_data)
df.to_csv("metro_ridership_2023_2024.csv", index=False)
print("✅ Saved to metro_ridership_2023_2024.csv")


✅ Found: Delhi, January 2023 - 355.3 M
✅ Found: Delhi, February 2023 - 7.2 M
✅ Found: Delhi, March 2023 - 9.0 M
✅ Found: Delhi, April 2023 - 7.24 M
✅ Found: Delhi, May 2023 - 4.0 M
✅ Found: Delhi, June 2023 - 7.24 M
✅ Found: Delhi, July 2023 - 7.24 M
✅ Found: Delhi, August 2023 - 7.24 M
✅ Found: Delhi, September 2023 - 7.75 M
✅ Found: Delhi, October 2023 - 4.0 M
✅ Found: Delhi, November 2023 - 7.87 M
✅ Found: Delhi, December 2023 - 355.3 M
✅ Found: Delhi, January 2024 - 6.7 M
✅ Found: Delhi, February 2024 - 7.24 M
✅ Found: Delhi, March 2024 - 6.0 M
✅ Found: Delhi, April 2024 - 0.29 M
✅ Found: Delhi, May 2024 - 4.0 M
✅ Found: Delhi, June 2024 - 0.29 M
✅ Found: Mumbai, January 2023 - 0.29 M
✅ Found: Mumbai, February 2023 - 9.0 M
✅ Found: Mumbai, March 2023 - 0.48 M
✅ Found: Mumbai, April 2023 - 0.29 M
✅ Found: Mumbai, May 2023 - 0.47 M
✅ Found: Mumbai, June 2023 - 0.29 M
✅ Found: Mumbai, July 2023 - 0.29 M
✅ Found: Mumbai, August 2023 - 0.47 M
✅ Found: Mumbai, September 2023 - 0.29 M
✅ F

In [2]:
from IPython.display import FileLink
# Build a FileLink for the CSV written by the collection cell; as the cell's
# last expression, the object is what the notebook displays as the cell output.
FileLink('metro_ridership_2023_2024.csv')


In [3]:
import pandas as pd

# Load the file written by the collection cell.
df = pd.read_csv("metro_ridership_2023_2024.csv")

# Convert 'Ridership (Million)' to numeric; values that cannot be parsed become NaN.
df['Ridership (Million)'] = pd.to_numeric(df['Ridership (Million)'], errors='coerce')

# Drop rows where the ridership value could not be parsed.
df_cleaned = df.dropna(subset=['Ridership (Million)'])

# Sort by city, then chronologically. "Month" holds strings such as
# "2023-January", so sorting the raw text would order months alphabetically
# (April, August, December, ...). Sort on a parsed date instead; values that do
# not parse become NaT and are placed last within their city.
# NOTE: "%B" matches month names in the current locale (English names assumed).
month_start = pd.to_datetime(df_cleaned["Month"], format="%Y-%B", errors="coerce")
df_cleaned = (
    df_cleaned
    .assign(_month_start=month_start)
    .sort_values(by=["City", "_month_start"])
    .drop(columns="_month_start")
)

# Save the cleaned file.
df_cleaned.to_csv("metro_ridership_cleaned.csv", index=False)

print("✅ Cleaned file saved as metro_ridership_cleaned.csv")


✅ Cleaned file saved as metro_ridership_cleaned.csv
