In [3]:
# Project two part 1

# Scrape the content of The Globe and Mail's trending section and save it as a CSV.

# We want: titles, subhead, article URL, byline, article type, image URL.

# Bonus, if you want to get fancy:

#     Make the CSV file auto-updating. Use this tutorial (https://www.youtube.com/watch?v=QNKxzkNpsko) (https://jonathansoma.com/everything/git/auto-updating-scaper-viz/) 


In [4]:
#Columns: title, subhead, article URL, whether it's premium or not, byline, article type, image URL.

In [5]:
import html
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://www.theglobeandmail.com"

# Identify as a regular desktop browser instead of the default
# python-requests client. Note: this does NOT make the request anonymous.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
}

# The timeout keeps this cell from hanging indefinitely if the server
# never answers.
response = requests.get(url, headers=headers, timeout=30)

# Fail loudly on an HTTP error (4xx/5xx). Merely printing a message would
# leave `doc` undefined, and the following cells would then die with a
# confusing NameError instead of the real cause.
response.raise_for_status()

# Parsed homepage; later cells query it with doc.find() / doc.find_all().
doc = BeautifulSoup(response.text, 'html.parser')


In [6]:
# Collect the card links for the trending box. The selector matches <a> tags
# whose class attribute is exactly "c-card__grid c-card__link"; the recorded
# run found 8 of them.
article = doc.find_all(name='a', attrs={'class': 'c-card__grid c-card__link'})
print(f"There are {len(article)} articles in the trending box")

There are 8 articles in the trending box


In [7]:

# Keep a copy of the raw homepage on disk so the markup can be inspected
# without re-fetching it.
with open("globe_homepage.html", mode="w", encoding="utf-8") as html_file:
    html_file.write(response.text)

print("HTML saved to globe_homepage.html")

HTML saved to globe_homepage.html


In [8]:
# Sample markup for one card in the list of articles being parsed:
# <div class="c-card"><a
#                   href="/business/commentary/article-what-was-mark-carney-thinking-when-he-walked-back-the-digital-services/"
#                   class="c-card__grid c-card__link"
#                   data-sophi-label="What was Mark Carney thinking when he walked back the digital services tax?"
#                   data-content-id="D6G7CUTGPZHD3FTL7QTQSU6LHI" data-lt-pos="sec:homepage:trending:recommend-tgam">
#                   <div class="c-card__content"><span
#                       class="StoryLabel__StyledStoryLabel-sc-19mm1gg-0 jZNZzC tgam-label tgam-label--story-default-light text-gml-3"
#                       mode="light">Opinion</span>
#                     <div class="c-card__hed">
#                       <h3 class="c-card__hed-text text-pb-9">What was Mark Carney thinking when he walked back the
#                         digital services tax?</h3>
#                     </div>
#                     <div class="c-card__meta"><span class="c-card__meta__authors"><span
#                           class="c-card__author text-gmr-5">Taylor C. Noakes</span></span></div>
#                   </div>
#                 </a></div>

In [9]:
# Preview pass: walk the scraped card links and print the fields pulled from
# each one, so the selectors can be checked by eye before building the table.
for rank, link in enumerate(article, start=1):
    title_tag = link.find('h3', class_='c-card__hed-text')
    author_tag = link.find('span', class_='c-card__author')
    label_tag = link.find('span', class_='tgam-label')

    # Cards may omit the author and/or label, so fall back to a placeholder.
    title = title_tag.get_text(strip=True) if title_tag else 'No title'
    author = author_tag.get_text(strip=True) if author_tag else 'No author'
    label = label_tag.get_text(strip=True) if label_tag else 'No label'

    href = link.get('href', '')
    # urljoin resolves site-relative ("/canada/...") and protocol-relative
    # ("//host/...") links against the homepage URL and leaves absolute links
    # untouched. A missing href stays an empty string.
    full_url = urljoin(url, href) if href else ''

    print("Rank:", rank)
    print("Title:", title)
    print("Author:", author)
    print("Label:", label)
    print("URL:", full_url)
    print("---")

Rank: 1
Title: Monday’s analyst upgrades and downgrades
Author: No author
Label: No label
URL: https://www.theglobeandmail.com/investing/markets/inside-the-market/article-mondays-analyst-upgrades-and-downgrades-232/
---
Rank: 2
Title: Workers exposed to elevated levels of airborne fentanyl at Vancouver supportive housing offices
Author: No author
Label: No label
URL: https://www.theglobeandmail.com/canada/article-workers-exposed-to-elevated-levels-of-airborne-fentanyl-at-vancouver/
---
Rank: 3
Title: Vehicle found in river north of Montreal could be linked to man who went missing in 1988
Author: No author
Label: No label
URL: https://www.theglobeandmail.com/canada/article-vehicle-river-montreal-missing-laval-man/
---
Rank: 4
Title: Russia launches major attack on Kyiv hours before NATO talks on Ukraine weapons
Author: No author
Label: No label
URL: https://www.theglobeandmail.com/world/article-russia-launches-major-attack-on-kyiv-hours-before-nato-talks-on/
---
Rank: 5
Title: CNN’s Jak

In [10]:
# Site root used to turn relative article links into absolute URLs.
# Defined once here because it is the same for every card.
base_url = "https://www.theglobeandmail.com"

rows = []  # One dict per article card, in page order

for rank, item in enumerate(article, start=1):
    title_tag = item.find('h3', class_='c-card__hed-text')
    author_tag = item.find('span', class_='c-card__author')  # byline
    label_tag = item.find('span', class_='tgam-label')       # e.g. "Opinion"

    href = item.get('href', '')

    # Key order here is the column order of the resulting table/CSV.
    rows.append({
        'Rank': rank,
        'Article title': title_tag.get_text(strip=True) if title_tag else "No title found",
        # urljoin handles site-relative, protocol-relative and absolute
        # links alike; a missing href stays an empty string.
        'url': urljoin(base_url, href) if href else '',
        # Cards may omit byline and/or label; record a placeholder instead.
        'Byline': author_tag.get_text(strip=True) if author_tag else "No byline found",
        'Article type': label_tag.get_text(strip=True) if label_tag else "No article type found",
    })

print(f"Total articles scraped: {len(rows)}")


Total articles scraped: 8


In [11]:
# Turn the list of per-article dicts into a table: one row per article,
# one column per dict key.
df = pd.json_normalize(data=rows)

In [12]:
# Show up to the first ten rows as a quick sanity check of the scrape.
df.head(n=10)

Unnamed: 0,Rank,Article title,url,Byline,Article type
0,1,Monday’s analyst upgrades and downgrades,https://www.theglobeandmail.com/investing/mark...,No byline found,No article type found
1,2,Workers exposed to elevated levels of airborne...,https://www.theglobeandmail.com/canada/article...,No byline found,No article type found
2,3,Vehicle found in river north of Montreal could...,https://www.theglobeandmail.com/canada/article...,No byline found,No article type found
3,4,Russia launches major attack on Kyiv hours bef...,https://www.theglobeandmail.com/world/article-...,No byline found,No article type found
4,5,CNN’s Jake Tapper reflects on what Biden’s dec...,https://www.theglobeandmail.com/world/us-polit...,No byline found,No article type found
5,6,The bitter truth is that cheaper housing means...,https://www.theglobeandmail.com/business/comme...,John Turley-Ewart,Opinion
6,7,Count on Scottie Scheffler lifting many more t...,https://www.theglobeandmail.com/sports/article...,Cathal Kelly,Opinion
7,8,Home prices expected to remain under pressure,https://www.theglobeandmail.com/investing/mark...,No byline found,No article type found


In [14]:
# Build an HTML anchor for each article. Titles and URLs come from scraped
# markup, so they are escaped before being placed into HTML: a title
# containing "&" or "<", or a URL containing a double quote, would otherwise
# yield broken (or injectable) markup. Quotes only need escaping inside the
# href attribute, not in the link text.
df["title_with_link"] = [
    f'<a href="{html.escape(link, quote=True)}">{html.escape(title, quote=False)}</a>'
    for link, title in zip(df["url"], df["Article title"])
]


In [15]:
# Write the scrape to disk. index=False drops the DataFrame's positional
# index, which would otherwise be written as an extra unnamed first column;
# the "Rank" column already numbers the rows.
df.to_csv("GlobeAndMailScrape.csv", index=False)