# 📰 NPR

## 📌 Instructions

1. Enter your **search term** by changing the `query` variable (e.g., `"economy"`, `"inflation"`, `"interest rates"`).  
2. Set the **start date** using `start_date` in `"YYYY-MM-DD"` format  
   - Example: `"2020-03-01"`.  
3. Define the **number of pages** to scrape using `max_pages`.  
4. The script retrieves:  
   - Title  
   - Date  
   - Link  
   - Full article content
5. The results are stored in a **pandas DataFrame** and can be exported to CSV:

```python
npr_df.to_csv("data_npr_df.csv", index=False)
```

In [None]:
import logging
import time
from datetime import datetime, timezone
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# User-defined inputs (edit these before running the cell)
query = "economy"  # Search term; it is URL-encoded when the search URL is built
start_date = "2020-03-01"  # Lower bound for the search, in "YYYY-MM-DD" format
max_pages = 1  # Number of pages to scrape

# Function to convert a date (YYYY-MM-DD) to Unix timestamp
def date_to_unix(date_str):
    """Convert a ``YYYY-MM-DD`` date string to a Unix timestamp.

    The date is interpreted as midnight UTC. A naive ``datetime`` would be
    interpreted in the local time zone of the machine running the code, so
    the same input would yield different timestamps on different machines.

    Args:
        date_str: Calendar date formatted as ``"YYYY-MM-DD"``.

    Returns:
        Whole seconds since the Unix epoch, as an ``int``.

    Raises:
        ValueError: If ``date_str`` does not match ``"%Y-%m-%d"``.
    """
    dt = datetime.strptime(date_str, "%Y-%m-%d").replace(tzinfo=timezone.utc)
    return int(dt.timestamp())

# Function to build the NPR search URL dynamically
def build_npr_url(query, page, start_date):
    """Assemble the NPR search URL for one page of results.

    Args:
        query: Search term; percent-encoded before being placed in the URL.
        page: Page number of the result listing.
        start_date: ``"YYYY-MM-DD"`` date used as the minimum
            ``lastModifiedDate`` after conversion by ``date_to_unix``.

    Returns:
        The complete search URL as a string, sorted by ascending date.
    """
    endpoint = "https://www.npr.org/search/"
    parameters = (
        f"query={quote(query)}",
        f"page={page}",
        f"range%5BlastModifiedDate%5D%5Bmin%5D={date_to_unix(start_date)}",
        "sortType=byDateAsc",
    )
    return endpoint + "?" + "&".join(parameters)

# Set up headless Chrome
options = Options()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
driver = webdriver.Chrome(options=options)

# Initialize containers (parallel lists: one entry per kept article)
titles, dates, links = [], [], []

# The try/finally guarantees the browser is shut down even when a page load
# or a parsing step raises; without it the Chrome process would be left
# running after a failure.
try:
    # Loop through pages
    for page in range(1, max_pages + 1):
        url = build_npr_url(query, page, start_date)
        driver.get(url)
        time.sleep(5)  # Wait for JavaScript to load content

        # Get full rendered HTML
        html = driver.page_source

        # Parse with BeautifulSoup
        soup = BeautifulSoup(html, "html.parser")
        articles = soup.select("article")

        for article in articles:
            # Title and link both come from the <a> inside <h2 class="title">
            h2_tag = article.find("h2", class_="title")
            a_tag = h2_tag.find("a") if h2_tag else None
            title = a_tag.get_text(strip=True) if a_tag else None
            link = a_tag["href"] if a_tag and a_tag.has_attr("href") else None

            # Keep only the part of the datetime attribute before the "T"
            time_tag = article.find("time", datetime=True)
            date = time_tag["datetime"].split("T")[0] if time_tag else None

            # Entries without a title or a link are skipped; a missing date
            # is stored as None.
            if title and link:
                titles.append(title)
                dates.append(date)
                links.append(link)
finally:
    driver.quit()

# Combine the three parallel lists into one table, one row per article.
npr_df = pd.DataFrame(data=dict(Title=titles, Date=dates, Link=links))

def extract_npr_content(url):
    """Download an article page and return the text of its story body.

    Extraction is best-effort: any failure (network error, HTTP error
    status, malformed URL, ...) is logged as a warning and reported to the
    caller as ``None`` instead of being raised.

    Args:
        url: Address of the article page to fetch.

    Returns:
        The text of every ``<p>`` inside ``<div id="storytext">`` joined
        with single spaces, or ``None`` when the page has no such ``div``
        or the article could not be retrieved.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Find the main content block
        story_div = soup.find("div", {"id": "storytext"})
        if story_div is None:
            return None

        # Extract all paragraphs
        paragraphs = story_div.find_all("p")
        return " ".join(p.get_text(strip=True) for p in paragraphs)

    except Exception as exc:
        # Deliberately broad: one bad article must not abort processing of
        # the remaining links. The failure is logged rather than silently
        # discarded so missing content can be diagnosed.
        logging.getLogger(__name__).warning(
            "Could not extract content from %r: %s", url, exc
        )
        return None

# Fetch the story text for every link and store it in a new "Content" column.
npr_df["Content"] = npr_df["Link"].apply(extract_npr_content)
npr_df.head()

Unnamed: 0,Title,Date,Link,Content
0,"Oil Prices Plummet As Coronavirus Outbreak, Qu...",2020-03-01,https://www.npr.org/2020/03/01/810873469/oil-p...,"China is the world's largest importer of oil, ..."
1,Trump Says 'Markets Will Take Care Of Themselv...,2020-03-01,https://www.npr.org/2020/03/01/810797303/trump...,President Trump takes questions at a press con...
2,Democrats Must Assess How To Campaign In Oil A...,2020-03-01,https://www.npr.org/2020/03/01/810873490/democ...,Climate change has become a key issue in the D...
3,Jeff Sessions Embraces President Trump In Come...,2020-03-01,https://www.npr.org/2020/03/01/810458266/jeff-...,Former U.S. Attorney General Jeff Sessions cam...
4,What Black Women Want To See In Candidates' Po...,2020-03-01,https://www.npr.org/2020/03/01/810873406/what-...,NPR's Leila Fadel asks Higher Heights of Amer...


In [None]:
# Write the scraped articles to a CSV file; index=False omits the row index.
npr_df.to_csv("data_npr_df.csv", index=False)