In [1]:
# Project two part 1

# Scrape the content of Global News's trending section and save it as a CSV.

# We want: titles, subhead, article URL, byline, article type, image URL.

# Bonus, if you want to get fancy:

#     Make the CSV file auto-updating. Use this tutorial (https://www.youtube.com/watch?v=QNKxzkNpsko) (https://jonathansoma.com/everything/git/auto-updating-scaper-viz/) 


In [2]:
#Columns: title, subhead, article URL, whether it's premium or not, byline, article type, image URL.

In [3]:
import html
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Fetch the Global News homepage and parse it into `doc` for the cells below.
url = "https://globalnews.ca/"

# Browser-like User-Agent header sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
}

# A timeout keeps this cell from hanging indefinitely if the site never answers.
response = requests.get(url, headers=headers, timeout=30)

# Stop here on anything other than 200: the later cells need `doc`, and would
# otherwise fail with a NameError (or save an error page as the HTML snapshot).
if response.status_code != 200:
    raise RuntimeError(f"Request failed with status code: {response.status_code}")

# Parsed page; use doc.find(), doc.find_all(), etc. on it.
doc = BeautifulSoup(response.text, 'html.parser')


In [4]:
# Keep a copy of the raw homepage HTML on disk so the markup can be inspected.
snapshot_path = "GlobalNews_homepage.html"
with open(snapshot_path, mode="w", encoding="utf-8") as snapshot:
    snapshot.write(response.text)

print("HTML saved to GlobalNews_homepage.html")

HTML saved to GlobalNews_homepage.html


In [5]:
# Locate the <div id="home-trendingPosts"> element that the trending articles
# are nested in.
trending_container = doc.find("div", id="home-trendingPosts")

# find() returns None when nothing matches; fail here with a clear message
# instead of an AttributeError when the next cell calls .find_all() on it.
if trending_container is None:
    raise ValueError('Could not find <div id="home-trendingPosts"> on the page')

In [6]:
# Collect every <li class="c-posts__item"> inside the trending container and
# report how many were found (5 on the run recorded below).
articles = trending_container.find_all("li", class_="c-posts__item")
article_count = len(articles)
print(article_count)


5


In [7]:
# Sample HTML for two items from the list of articles that I'm trying to parse:
# <li class="c-posts__item" data-post-id="11291049">
# 				<a href="https://globalnews.ca/news/11291049/canadians-renewing-mortgages-payments-go-up/" class="c-posts__inner" >
# 					<div class="c-posts__rank">
# 							<svg class="c-icon  c-posts__rankIcon" focusable="false">
# 		<use xlink:href="https://globalnews.ca/wp-content/themes/shaw-globalnews/assets/dist/icons/out/symbol/svg/sprite.symbol.svg?v=cv3j60zbx#2"></use>
# 	</svg>
# 						</div>
					
# 		<div class="c-posts__media  c-imageContainer"  >
# 						<img class="c-posts__thumbnail"
# 				data-src=https://globalnews.ca/wp-content/uploads/2025/07/Bank-of-Canada-mortgages.jpg?quality=65&#038;strip=all&#038;w=720				data-width=720 data-height=489 data-ratio=0.67									loading="lazy"
# 								width="720" height="489" alt="" 
# 												srcset='' sizes=''
# 								data-srcset="https://globalnews.ca/wp-content/uploads/2025/07/Bank-of-Canada-mortgages.jpg?quality=65&#038;strip=all&#038;w=336 336w,https://globalnews.ca/wp-content/uploads/2025/07/Bank-of-Canada-mortgages.jpg?quality=65&#038;strip=all&#038;w=560 560w,https://globalnews.ca/wp-content/uploads/2025/07/Bank-of-Canada-mortgages.jpg?quality=65&#038;strip=all&#038;w=720 720w,"				data-sizes="auto"						/>
# 													</div>
# 							<div class="c-posts__details">
# 									<div class="c-posts__headline ">
# 					<span class="c-posts__headlineText" title="60% of Canadians renewing mortgages could see payments go up by 2026" data-title>60% of Canadians renewing mortgages could see payments go up by 2026</span>
# 			</div>
# 													<div class="c-posts__about">
# 							<span class="c-posts__info c-posts__info--highlight">20,570</span>
# 							<span class="c-posts__info">Read</span>
# 						</div>
												
# 											</div>
# 				</a>
# 			</li>
# 								<li class="c-posts__item" data-post-id="11291824">
# 				<a href="https://globalnews.ca/news/11291824/suspect-charged-hijacking-security-incident-yvr/" class="c-posts__inner" >
# 					<div class="c-posts__rank">
# 							<svg class="c-icon  c-posts__rankIcon" focusable="false">
# 		<use xlink:href="https://globalnews.ca/wp-content/themes/shaw-globalnews/assets/dist/icons/out/symbol/svg/sprite.symbol.svg?v=cv3j60zbx#3"></use>
# 	</svg>
# 						</div>
					
# 		<div class="c-posts__media  c-imageContainer"  >
# 						<img class="c-posts__thumbnail"
# 				data-src=https://globalnews.ca/wp-content/uploads/2025/07/cassim-2.jpg?quality=65&#038;strip=all&#038;w=345				data-width=345 data-height=357 data-ratio=0.67									loading="lazy"
# 								width="345" height="357" alt="" 
# 												srcset='' sizes=''
# 								data-srcset="https://globalnews.ca/wp-content/uploads/2025/07/cassim-2.jpg?quality=65&#038;strip=all&#038;w=336 336w,https://globalnews.ca/wp-content/uploads/2025/07/cassim-2.jpg?quality=65&#038;strip=all&#038;w=345 345w,https://globalnews.ca/wp-content/uploads/2025/07/cassim-2.jpg?quality=65&#038;strip=all&#038;w=345 345w,"				data-sizes="auto"						/>
# 													</div>
# 							<div class="c-posts__details">
# 									<div class="c-posts__headline ">
# 					<span class="c-posts__headlineText" title="Suspect charged with hijacking in connection with security incident at YVR" data-title>Suspect charged with hijacking in connection with security incident at YVR</span>
# 			</div>
# 													<div class="c-posts__about">
# 							<span class="c-posts__info c-posts__info--highlight">19,137</span>
# 							<span class="c-posts__info">Read</span>
# 						</div>
												
# 											</div>
# 				</a>
# 			</li>

In [8]:
# The page draws its numbered ranks as SVG sprite icons rather than text, so the
# rank is taken from each item's position in the list via enumerate().

# Preview: loop through the trending items and print rank, title and URL.
for rank, article in enumerate(articles, start=1):
    title_tag = article.find('span', class_='c-posts__headlineText')
    link_tag = article.find('a')

    title = title_tag.get_text(strip=True) if title_tag else 'No title'

    # .get('href') returns None for an <a> without an href attribute; fall back
    # to '' so the URL handling below cannot raise.
    href = (link_tag.get('href') or '') if link_tag else ''

    # urljoin keeps absolute URLs as they are and resolves relative ones against
    # the page URL (an empty href resolves to the page URL itself).
    full_url = urljoin(url, href)

    print("Rank:", rank)
    print("Title:", title)
    print("URL:", full_url)
    print("---")



Rank: 1
Title: Malcolm-Jamal Warner, ‘Cosby Show’ star, dead at 54
URL: https://globalnews.ca/news/11296876/malcolm-jamal-warner-dead-cosby/
---
Rank: 2
Title: Montreal dad facing murder charge after 9-year-old daughter found dead in New York
URL: https://globalnews.ca/news/11296404/missing-canadian-girl-death-new-york-state/
---
Rank: 3
Title: Astronomer CEO Andy Byron resigns after Coldplay ‘kiss cam’ video
URL: https://globalnews.ca/news/11295498/astronomer-ceo-resigns-coldplay-video/
---
Rank: 4
Title: Olympian Penny Oleksiak faces doping allegations, multi-year ban
URL: https://globalnews.ca/news/11296493/penny-oleksiak-doping-allegations-ban/
---
Rank: 5
Title: ‘We miss you,’ U.S. senators tell Canada as Lutnick vows tariffs will stay
URL: https://globalnews.ca/news/11296450/donald-trump-tariffs-mark-carney-us-senators/
---


In [9]:
# Build one dict per trending article; `rows` feeds the DataFrame further down.
rows = []

for rank, article in enumerate(articles, start=1):
    row = {}

    # Rank: position in the list. The page draws its numbered ranks as SVG
    # sprite icons rather than text, so enumerate() is used instead.
    row['Rank'] = rank

    # Title (required)
    title_tag = article.find('span', class_='c-posts__headlineText')
    row['Article title'] = title_tag.get_text(strip=True) if title_tag else "No title found"

    # URL (required)
    link_tag = article.find('a')
    # .get('href') returns None for an <a> without an href attribute; fall back
    # to '' so the join below cannot raise.
    href = (link_tag.get('href') or '') if link_tag else ''
    # urljoin keeps absolute URLs as they are and resolves relative ones against
    # the page URL (an empty href resolves to the page URL itself).
    row['url'] = urljoin(url, href)

    rows.append(row)

print(f"Total articles scraped: {len(rows)}")


Total articles scraped: 5


In [10]:
# Turn the list of per-article dicts into a table (one row per article).
df = pd.json_normalize(data=rows)

In [11]:
# Show up to the first ten scraped rows.
df.head(n=10)

Unnamed: 0,Rank,Article title,url
0,1,"Malcolm-Jamal Warner, ‘Cosby Show’ star, dead ...",https://globalnews.ca/news/11296876/malcolm-ja...
1,2,Montreal dad facing murder charge after 9-year...,https://globalnews.ca/news/11296404/missing-ca...
2,3,Astronomer CEO Andy Byron resigns after Coldpl...,https://globalnews.ca/news/11295498/astronomer...
3,4,Olympian Penny Oleksiak faces doping allegatio...,https://globalnews.ca/news/11296493/penny-olek...
4,5,"‘We miss you,’ U.S. senators tell Canada as Lu...",https://globalnews.ca/news/11296450/donald-tru...


In [12]:
# Add a column with the title wrapped in an <a> tag pointing at the article URL,
# for Datawrapper.
# Both values are scraped text, so they are HTML-escaped before being placed in
# the markup: a title containing & or <, or a URL containing a double quote,
# would otherwise produce broken (or injected) HTML.
df["title_with_link"] = [
    f'<a href="{html.escape(link, quote=True)}">{html.escape(title, quote=False)}</a>'
    for link, title in zip(df["url"], df["Article title"])
]

In [13]:
# Write the scraped table to disk without the DataFrame index column.
csv_path = "GlobalNewsScrape.csv"
df.to_csv(csv_path, index=False)