In [28]:
# Project two part 1

# Scrape the content of CTV's trending section and save it as a CSV.

# We want: titles, subhead, article URL, byline, article type.

# Bonus, if you want to get fancy:

#     Make the CSV file auto-updating. Use this tutorial (https://www.youtube.com/watch?v=QNKxzkNpsko) (https://jonathansoma.com/everything/git/auto-updating-scaper-viz/) 


In [29]:
#Columns: title, subhead, article URL, whether it's premium or not, byline, article type, image URL.

In [60]:
import os
import random
import time

from playwright.async_api import async_playwright, expect

In [61]:
async def open_browser(headless=False):
    """
    Starts the automated browser and opens a new window
    """
    # Start playwright
    playwright = await async_playwright().start()

    # Assign a random user agent (optional, shown below)
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"

    # Open firefox browser, can use chromium or webkit
    browser = await playwright.firefox.launch(headless=headless)

    # Create a new browser context (for setting user agent, etc.)
    context = await browser.new_context(user_agent=user_agent)

    # Create a new browser window (tab)
    page = await context.new_page()

    return browser, page

In [62]:
url = "https://www.ctvnews.ca/"

In [63]:
async def open_browser(headless=False):
    playwright = await async_playwright().start()
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36"
    browser = await playwright.firefox.launch(headless=headless)
    context = await browser.new_context(user_agent=user_agent)
    page = await context.new_page()
    return browser, page

In [65]:
async def save_html_from_url(url, filename="CTV_homepage.html"):
    browser, page = await open_browser()
    await page.goto(url, timeout=60000)

    # Scroll down to trigger any lazy-loaded content
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(3000)

    # Wait for the Top Videos section (or whatever dynamic content you're targeting)
    await page.wait_for_selector('div[class*="b-standard-list-custom--content-api-collections"]', timeout=15000)

    # Now save the HTML
    html = await page.content()
    with open(filename, "w", encoding="utf-8") as f:
        f.write(html)

    await browser.close()


In [66]:
#save the html file for parsing
await save_html_from_url(url)

In [67]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [68]:
# Load the saved HTML
with open("CTV_homepage.html", "r", encoding="utf-8") as f:
    html = f.read()

doc = BeautifulSoup(html, "html.parser")

In [69]:
# Find the Top Videos heading, since it's proving elusive
heading = doc.find("h2", string="Top Videos")

In [70]:
# Walk up to its section container
section = heading.find_parent("div", class_="c-stack")

In [71]:
# Get the <article> tags within this section
articles = section.find_all("article")

In [73]:
# check work there should be eight videos, nine would also make sense
print(f"Found {len(articles)} top videos.")

Found 8 top videos.


In [76]:
# Loop through the eight elements in the list, pulling the information I want 
for rank, article in enumerate(articles, start=1):
    title_tag = article.find("h3")
    link_tag = title_tag.find("a") if title_tag else None

    title = link_tag.get_text(strip=True) if link_tag else 'No title'
    href = link_tag.get("href", "") if link_tag else ""
    full_url = url + href if href.startswith("/") else href

    print("Rank:", rank)
    print("Title:", title)
    print("URL:", full_url)
    print("---")

Rank: 1
Title: ‘The world needs what Canada has’: Ford on new energy deal with Alberta, Saskatchewan
URL: https://www.ctvnews.ca//toronto/video/2025/07/22/the-world-needs-what-canada-has-ford-on-new-energy-deal-with-alberta-saskatchewan/
---
Rank: 2
Title: Canadian families spent more on taxes in 2024 than food, housing, and clothing combined
URL: https://www.ctvnews.ca//video/2025/07/22/new-report-finds-taxes-remain-the-biggest-expense-for-canadian-families/
---
Rank: 3
Title: 9-year-old missing Montreal girl found dead, father charged with murder
URL: https://www.ctvnews.ca//video/2025/07/21/ctv-national-news-montreal-father-charged-in-death-of-9-year-old-girl/
---
Rank: 4
Title: Police break driver’s window, punch him while dragging him out of car for resisting arrest
URL: https://www.ctvnews.ca//video/2025/07/22/police-break-drivers-window-punch-him-while-dragging-him-out-of-car-for-resisting-arrest/
---
Rank: 5
Title: Tesla exiting ferry falls into water with driver inside
URL: ht

In [78]:
#Loop and extract data
rows = []
for rank, article in enumerate(articles, start=1):
    row = {}
    # Rank
    row["rank"] = rank
    
    # Title
    title_tag = article.find("h3")
    row["Article title"] = title_tag.get_text(strip=True) if title_tag else "No title"
    
    # URL 
    link_tag = title_tag.find("a") if title_tag else None
    href = link_tag.get("href", "") if link_tag else ""
    row["url"] = href if href.startswith("http") else f"https://www.ctvnews.ca{href}" if href else "No URL"

    

    rows.append(row)

In [79]:
import pandas as pd

In [80]:
df = pd.json_normalize(rows)

In [81]:
df.head(10)

Unnamed: 0,rank,Article title,url
0,1,‘The world needs what Canada has’: Ford on new...,https://www.ctvnews.ca/toronto/video/2025/07/2...
1,2,Canadian families spent more on taxes in 2024 ...,https://www.ctvnews.ca/video/2025/07/22/new-re...
2,3,"9-year-old missing Montreal girl found dead, f...",https://www.ctvnews.ca/video/2025/07/21/ctv-na...
3,4,"Police break driver’s window, punch him while ...",https://www.ctvnews.ca/video/2025/07/22/police...
4,5,Tesla exiting ferry falls into water with driv...,https://www.ctvnews.ca/video/2025/07/21/ctv-na...
5,6,Police deadlift car off of motorcyclist pinned...,https://www.ctvnews.ca/video/2025/07/22/police...
6,7,Customers wait hours in line to check out Elon...,https://www.ctvnews.ca/video/2025/07/22/custom...
7,8,Six-year-old girl calls 911 and saves her moth...,https://www.ctvnews.ca/video/2025/07/22/six-ye...


In [82]:
df['url']


0    https://www.ctvnews.ca/toronto/video/2025/07/2...
1    https://www.ctvnews.ca/video/2025/07/22/new-re...
2    https://www.ctvnews.ca/video/2025/07/21/ctv-na...
3    https://www.ctvnews.ca/video/2025/07/22/police...
4    https://www.ctvnews.ca/video/2025/07/21/ctv-na...
5    https://www.ctvnews.ca/video/2025/07/22/police...
6    https://www.ctvnews.ca/video/2025/07/22/custom...
7    https://www.ctvnews.ca/video/2025/07/22/six-ye...
Name: url, dtype: object

In [83]:
#adding column with the href hyperlinked url to the title, for datawrapper
df["title_with_link"] = df.apply(lambda row: f'<a href="{row["url"]}">{row["Article title"]}</a>', axis=1)

In [84]:
df.to_csv("CTVScrape.csv", index=False)