In [46]:
# Project two, part 3

# Scrape the content of CTV's trending section and save it as a CSV.

# We want: titles, subhead, article URL, byline, article type.

# Bonus, if you want to get fancy:

#     Make the CSV file auto-updating. Use this tutorial (https://www.youtube.com/watch?v=QNKxzkNpsko) (https://jonathansoma.com/everything/git/auto-updating-scaper-viz/) 


In [47]:
#Columns: title, subhead, article URL, whether it's premium or not, byline, article type, image URL.

In [4]:
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

url = "https://www.ctvnews.ca/"

# Send a browser-like User-Agent so the request looks like it comes from a
# regular browser. Note: this does not make the request anonymous.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
}

# The timeout keeps this cell from hanging indefinitely if the server never answers.
response = requests.get(url, headers=headers, timeout=30)

# Stop here on anything other than 200 so the following cells never save or
# parse an error page (previously the failure was only printed and `doc`
# was left undefined).
if response.status_code != 200:
    raise RuntimeError(f"Request failed with status code: {response.status_code}")

# Parsed homepage; query it with doc.find(), doc.find_all(), etc.
doc = BeautifulSoup(response.text, 'html.parser')


In [12]:
# Keep a copy of the downloaded homepage on disk so the parsing cells can
# work from the file.
with open("CTV_homepage.html", mode="w", encoding="utf-8") as html_file:
    html_file.write(response.text)

print("HTML saved to CTV_homepage.html")

HTML saved to CTV_homepage.html


In [13]:
# Read the saved homepage back in and parse it with the built-in HTML parser.
with open("CTV_homepage.html", encoding="utf-8") as saved_page:
    html = saved_page.read()

soup = BeautifulSoup(html, features="html.parser")

In [6]:
# Grab the <div> that wraps the Top Videos list (first match in the page)
top_videos_container = soup.find(
    "div",
    attrs={"class": "c-grid b-standard-list-custom__items"},
)


In [7]:
#Find all article items inside
# find() yields None when the container is missing (e.g. the page layout
# changed), so fail with a readable message instead of an AttributeError.
if top_videos_container is None:
    raise ValueError(
        "Could not find the article list container in the page; "
        "the homepage markup may have changed"
    )
video_articles = top_videos_container.find_all("article", class_="c-stack b-standard-list-custom__item")


In [8]:
# Sanity check on the number of items found: 8 are expected, although 9 seem
# to show up occasionally.
article_count = len(video_articles)
print(f"There are {article_count} articles in the trending box")

There are 8 articles in the trending box


In [9]:
# Preview: walk through the articles in the list and print the rank, headline
# and link of each one.
for rank, link in enumerate(video_articles, start=1):
    title_tag = link.find("h3")
    link_tag = title_tag.find("a") if title_tag else None

    # The headline text contains line breaks and indentation in the middle of
    # the string; split()/join collapses every whitespace run to one space.
    title = " ".join(title_tag.get_text().split()) if title_tag else 'No title'
    href = link_tag.get('href', '') if link_tag else ''
    # urljoin keeps absolute links as they are and resolves relative ones
    # against the homepage URL without producing a doubled slash.
    full_url = urljoin(url, href) if href else ''

    print("Rank:", rank)
    print("Title:", title)
    print("URL:", full_url)
    print("---")

Rank: 1
Title: Caught
                    on camera: Tomorrowland music festival stage goes up in flames in Belgium
URL: https://www.ctvnews.ca/video/2025/07/16/caught-on-camera-tomorrowland-music-festival-stage-goes-up-in-flames-in-belgium/
---
Rank: 2
Title: ‘Shifting
                    from reliance to resilience’: PM Carney announces restrictions on foreign steel imports
URL: https://www.ctvnews.ca/video/2025/07/16/shifting-from-reliance-to-resilience-pm-carney-announces-restrictions-on-foreign-steel-imports/
---
Rank: 3
Title: Algoma
                    Steel president seeks access, trade certainty, and support after PM Carney announcement
URL: https://www.ctvnews.ca/video/2025/07/16/algoma-steel-president-seeks-access-trade-certainty-and-support-after-pm-carney-announcement/
---
Rank: 4
Title: Supreme
                    Steel president: ‘PM Carney is making a bet on Canadian innovation’
URL: https://www.ctvnews.ca/video/2025/07/16/supreme-steel-president-pm-carney-is-making-a-b

In [10]:
# Build one record (rank, title, url) per article in the list.
rows = []
for rank, item in enumerate(video_articles, start=1):
    headline_tag = item.find("h3")
    link_tag = headline_tag.find("a") if headline_tag else None
    href = link_tag.get("href", "") if link_tag else ""

    row = {
        # Position of the article within the list, starting at 1
        "rank": rank,
        # Collapse internal newlines/indentation so the title is a single
        # line; otherwise the CSV ends up with multi-line title cells.
        "title": " ".join(headline_tag.get_text().split()) if headline_tag else "No title",
        # urljoin leaves absolute links untouched and correctly resolves
        # root-relative, plain relative and protocol-relative links.
        "url": urljoin("https://www.ctvnews.ca/", href) if href else "No URL",
    }

    rows.append(row)

In [11]:
import pandas as pd

In [39]:
# The records are flat dicts, so the plain DataFrame constructor is enough.
# Naming the columns explicitly keeps them present (and in this order) even
# when the scrape returned no rows.
df = pd.DataFrame(rows, columns=["rank", "title", "url"])

In [40]:
# Preview the scraped rows (at most ten) to eyeball the result
df.head(n=10)

Unnamed: 0,rank,title,url
0,1,Caught on camera: Tomorrowland music festival ...,https://www.ctvnews.ca/video/2025/07/16/caught...
1,2,‘Shifting from reliance to resilience’: PM Car...,https://www.ctvnews.ca/video/2025/07/16/shifti...
2,3,"Algoma Steel president seeks access, trade cer...",https://www.ctvnews.ca/video/2025/07/16/algoma...
3,4,Supreme Steel president: ‘PM Carney is making ...,https://www.ctvnews.ca/video/2025/07/16/suprem...
4,5,Watch the moment a car flips onto its roof on ...,https://www.ctvnews.ca/vancouver/video/2025/07...
5,6,Trump slams MAGA supporters calling for the re...,https://www.ctvnews.ca/video/2025/07/16/trump-...
6,7,Here’s what we know about the new restrictions...,https://www.ctvnews.ca/video/2025/07/16/heres-...
7,8,‘A lot of political conflation’ ahead of Indig...,https://www.ctvnews.ca/video/2025/07/16/a-lot-...


In [41]:
# Show the link column on its own
df.loc[:, "url"]


0    https://www.ctvnews.ca/video/2025/07/16/caught...
1    https://www.ctvnews.ca/video/2025/07/16/shifti...
2    https://www.ctvnews.ca/video/2025/07/16/algoma...
3    https://www.ctvnews.ca/video/2025/07/16/suprem...
4    https://www.ctvnews.ca/vancouver/video/2025/07...
5    https://www.ctvnews.ca/video/2025/07/16/trump-...
6    https://www.ctvnews.ca/video/2025/07/16/heres-...
7    https://www.ctvnews.ca/video/2025/07/16/a-lot-...
Name: url, dtype: object

In [84]:
# Write the result to disk. index=False drops the automatic row index, which
# would otherwise appear as an unnamed first column and only duplicates the
# information already carried by "rank".
df.to_csv("CTVScrape.csv", index=False)