## Web scraping The New York Times' homepage
https://www.nytimes.com/

### Requests: Fetching a Web Page

In [51]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://www.nytimes.com/"
# Bound the wait: without a timeout, requests can block indefinitely if the
# server accepts the connection but never answers.
response = requests.get(url, timeout=30)
# Stop here on a 4xx/5xx status instead of letting later cells parse an error page.
response.raise_for_status()
response

<Response [200]>

In [52]:
# Raw response body as bytes (note the b'...' prefix); the whole page is echoed.
response.content

b'<!DOCTYPE html>\n<html lang="en" class=" nytapp-vi-homepage "  data-nyt-compute-assignment="fallback" xmlns:og="http://opengraphprotocol.org/schema/">\n  <head>\n    \n    \n    <meta charset="utf-8" />\n    <title data-rh="true">The New York Times - Breaking News, US News, World News and Videos</title>\n    <meta data-rh="true" name="description" content="Live news, investigations, opinion, photos and video by the journalists of The New York Times from more than 150 countries around the world. Subscribe for coverage of U.S. and international news, politics, business, technology, science, health, arts, sports and more."/><meta data-rh="true" property="og:url" content="https://www.nytimes.com"/><meta data-rh="true" property="og:type" content="website"/><meta data-rh="true" property="og:title" content="The New York Times - Breaking News, US News, World News and Videos"/><meta data-rh="true" property="og:description" content="Live news, investigations, opinion, photos and video by the j

In [53]:
# HTTP headers the server sent back with the page.
response.headers

{'Connection': 'close', 'Content-Length': '231980', 'x-b3-traceid': '7e0ffd6896b6c4d4', 'x-nyt-data-last-modified': 'Sat, 06 Sep 2025 13:30:31 GMT', 'last-modified': 'Sat, 06 Sep 2025 13:30:31 GMT', 'build-timestamp': '1757107707000', 'x-pagetype': 'vi-homepage', 'x-xss-protection': '1; mode=block', 'x-content-type-options': 'nosniff', 'content-type': 'text/html; charset=utf-8', 'x-envoy-upstream-service-time': '352', 'server': 'envoy', 'x-envoy-decorator-operation': 'vi.nyt.net:443/*', 'content-encoding': 'gzip', 'cache-control': 's-maxage=30,no-cache', 'x-nyt-route': 'homepage', 'X-Origin-Time': '2025-09-06 13:30:33 UTC', 'Accept-Ranges': 'bytes', 'Date': 'Sat, 06 Sep 2025 13:32:52 GMT', 'Age': '17', 'X-Served-By': 'cache-lga21961-LGA, cache-fra-etou8220131-FRA', 'X-Cache': 'HIT, HIT', 'X-Cache-Hits': '1, 1', 'X-Timer': 'S1757165572.135936,VS0,VE5', 'Vary': 'Accept-Encoding, Fastly-SSL', 'Set-Cookie': 'nyt-a=9GRTahZalPCdRkIDnli0jW; Expires=Sun, 06 Sep 2026 13:32:52 GMT; Path=/; Domai

In [54]:
# Header names are matched case-insensitively: the mapping shown above lists
# 'content-type', yet the 'Content-Type' spelling resolves to the same value.
print(response.headers['Content-Type'])

text/html; charset=utf-8


### Parsing HTML with Beautiful Soup


In [55]:
# Parse the fetched bytes into a searchable tree using the "html.parser" backend.
soup = BeautifulSoup(response.content, "html.parser")

In [56]:
# Check which class the parser handed back.
type(soup)

bs4.BeautifulSoup

In [57]:
# Print the parsed document re-indented by nesting depth (the entire page is printed).
print(soup.prettify())

<!DOCTYPE html>
<html class="nytapp-vi-homepage" data-nyt-compute-assignment="fallback" lang="en" xmlns:og="http://opengraphprotocol.org/schema/">
 <head>
  <meta charset="utf-8"/>
  <title data-rh="true">
   The New York Times - Breaking News, US News, World News and Videos
  </title>
  <meta content="Live news, investigations, opinion, photos and video by the journalists of The New York Times from more than 150 countries around the world. Subscribe for coverage of U.S. and international news, politics, business, technology, science, health, arts, sports and more." data-rh="true" name="description"/>
  <meta content="https://www.nytimes.com" data-rh="true" property="og:url"/>
  <meta content="website" data-rh="true" property="og:type"/>
  <meta content="The New York Times - Breaking News, US News, World News and Videos" data-rh="true" property="og:title"/>
  <meta content="Live news, investigations, opinion, photos and video by the journalists of The New York Times from more than 150 

#### Extracting Data

In [58]:
# Gather the text of every headline <div>, leaving out any that come back empty.
# NOTE(review): the last 20 matches are sliced off with no stated reason —
# presumably they are not front-page headlines; confirm against the live markup.
headlines = soup.find_all('div', class_='css-cfnhvx')[:-20]

headline_texts = (tag.get_text() for tag in headlines)
todays_headlines = [text for text in headline_texts if text != '']

# One row per headline; show the first few as the cell output.
df = pd.DataFrame(todays_headlines, columns=['headline'])
df.head()


Unnamed: 0,headline
0,"Many Cities Say Yes to Federal Police Help, bu..."
1,Grand Juries in D.C. Reject Wave of Charges Un...
2,Chicago Braces for a Surge of Federal Agents
3,How Trump’s Blunt-Force Diplomacy Is Pushing H...
4,Trump Says U.S. Military Has ‘Never Fought to ...


### Today's New York Times articles. Each article block can contain up to 3 headlines and paragraphs, as well as an article link and an image link

In [60]:
# One dict per homepage article block; turned into a DataFrame in a later cell.
data = []

# Adjust these class names if necessary based on current NYTimes HTML.
# A multi-word class_ string is compared against the exact text of the class
# attribute, so these only match while the site emits the classes in this order.
article_blocks = soup.find_all('div', class_='css-eyfo4n e1yccyp20')

for block in article_blocks:
    # Image for the block as a whole (first <img> found, if it carries a src).
    image = block.find('img')
    img_url = image.get('src') if image is not None else None

    # Up to three sub-articles, each stored as (headline, paragraph, link).
    slots = []
    for story in block.find_all('div', class_='tpl-lb css-0')[:3]:
        anchor = story.find('a', class_='tpl-lbl css-5mgoji')
        title_tag = story.find('p', class_='indicate-hover')
        summary_tag = story.find('p', class_='summary-class')
        slots.append((
            title_tag.get_text(strip=True) if title_tag is not None else None,
            summary_tag.get_text(strip=True) if summary_tag is not None else None,
            anchor.get('href') if anchor is not None else None,
        ))

    # Fill the unused slots so every record has all three positions.
    slots += [(None, None, None)] * (3 - len(slots))
    titles, summaries, hrefs = zip(*slots)

    data.append({
        'headline_1': titles[0],
        'paragraph_1': summaries[0],
        'headline_2': titles[1],
        'paragraph_2': summaries[1],
        'headline_3': titles[2],
        'paragraph_3': summaries[2],
        # The first non-empty sub-article link stands in for the whole block.
        'article_link': next((href for href in hrefs if href), None),
        'img_link': img_url
    })

### Extracted article blocks saved as a DataFrame and exported as a .csv file

In [61]:
# Column layout of the exported table. Passing it explicitly keeps the schema
# fixed even when the scrape matched no article blocks: an empty `data` list
# would otherwise give a column-less frame and drop_duplicates(subset=...)
# would raise KeyError.
article_columns = [
    'headline_1', 'paragraph_1',
    'headline_2', 'paragraph_2',
    'headline_3', 'paragraph_3',
    'article_link', 'img_link',
]

# Create DataFrame and drop rows whose three headlines repeat an earlier row
df = pd.DataFrame(data, columns=article_columns)
df = df.drop_duplicates(subset=['headline_1', 'headline_2', 'headline_3']).reset_index(drop=True)

# Save to CSV (row index omitted)
df.to_csv('ny_times_articles.csv', index=False)

# Print sample
print(df.head())

                                          headline_1  \
0  Many Cities Say Yes to Federal Police Help, bu...   
1  How Trump’s Blunt-Force Diplomacy Is Pushing H...   
2  Settlement Talks Stall Between Harvard and the...   
3  Thrust Into the Line of Fire, Iranians Worry A...   
4  Israel Targets More Buildings in Gaza City and...   

                                         paragraph_1  \
0  Some officials said they would welcome more tr...   
1  Some of President Trump’s pressure tactics app...   
2  One major reason is said to be an emerging div...   
3  We visited Tehran and talked to Iranians livin...   
4                                               None   

                                          headline_2  \
0  Grand Juries in D.C. Reject Wave of Charges Un...   
1  Trump Says U.S. Military Has ‘Never Fought to ...   
2  At George Mason University, Trump Has Found an...   
3                                               None   
4                                             

In [62]:
# Export to CSV file
# NOTE(review): this is the same to_csv call (same path and arguments) that the
# previous cell already runs, so it rewrites that file — likely redundant; confirm.
df.to_csv('ny_times_articles.csv', index=False)