In [1]:
# Project two part 1

# Scrape the content of CTV's trending section and save it as a CSV.

# We want: titles, subhead, article URL, byline, article type.

# Bonus, if you want to get fancy:

#     Make the CSV file auto-updating. Use this tutorial (https://www.youtube.com/watch?v=QNKxzkNpsko) (https://jonathansoma.com/everything/git/auto-updating-scaper-viz/) 


In [3]:
#Columns: title, subhead, article URL, whether it's premium or not, byline, article type, image URL.

In [4]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

url = "https://www.ctvnews.ca/"

#for anonymnity
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0 Safari/537.36'
}

response = requests.get(url, headers=headers)

# Check if the request was successful
if response.status_code == 200:
    doc = BeautifulSoup(response.text, 'html.parser')
    # Now you can use soup.find(), soup.find_all(), etc.
else:
    print(f"Request failed with status code: {response.status_code}")


In [5]:
with open("CTV_homepage.html", "w", encoding="utf-8") as f:
    f.write(response.text)

print("HTML saved to CTV_homepage.html")

HTML saved to CTV_homepage.html


In [6]:
with open("CTV_homepage.html", "r", encoding="utf-8") as f:
    html = f.read()

soup = BeautifulSoup(html, "html.parser")

In [7]:
#Locate the Top Videos section container
top_videos_container = soup.find("div", class_="c-grid b-standard-list-custom__items")


In [8]:
#Find all article items inside
video_articles = top_videos_container.find_all("article", class_="c-stack b-standard-list-custom__item")


In [9]:
#Check how many, there should be 8, though sometimes there could be 9 it looks like 
print(f"There are {len(video_articles)} articles in the trending box")

There are 3 articles in the trending box


In [10]:
# Loop through the eight elements in the list, pulling the information I want 
for rank, link in enumerate(video_articles, start=1):
    title_tag = link.find("h3")
    link_tag = title_tag.find("a") if title_tag else None
    label_tag = link.find('span', class_='tgam-label')

    title = title_tag.get_text(strip=True) if title_tag else 'No title'
    href = link_tag.get('href', '') if link_tag else ''
    full_url = url + href if href.startswith('/') else href

    print("Rank:", rank)
    print("Title:", title)
    print("URL:", full_url)
    print("---")

Rank: 1
Title: Pet owners warned of ticks transmitting Rocky Mountain Spotted FeverOpens in new window
URL: https://www.ctvnews.ca//kitchener/article/pet-owners-warned-of-ticks-transmitting-rocky-mountain-spotted-fever/
---
Rank: 2
Title: ‘We’re going to have to rebuild our life’: Family’s home burns after cancelling home insuranceOpens in new window
URL: https://www.ctvnews.ca//canada/article/were-going-to-have-to-rebuild-our-life-familys-home-burns-after-cancelling-home-insurance/
---
Rank: 3
Title: ‘Japanese walking’: Does the TikTok trend live up to the hype?Opens in new window
URL: https://www.ctvnews.ca//health/article/japanese-walking-does-the-tiktok-trend-live-up-to-the-hype/
---


In [18]:
#Loop and extract data
rows = []
for rank, item in enumerate(video_articles, start=1):
    row = {}
    # Rank
    row["rank"] = rank
    
    # Title
    headline_tag = item.find("h3")
    row["Article title"] = headline_tag.get_text(strip=True) if headline_tag else "No title"
    
    # URL 
    link_tag = headline_tag.find("a") if headline_tag else None
    href = link_tag.get("href", "") if link_tag else ""
    row["url"] = href if href.startswith("http") else f"https://www.ctvnews.ca{href}" if href else "No URL"

    

    rows.append(row)

In [19]:
import pandas as pd

In [20]:
df = pd.json_normalize(rows)

In [24]:
df.head(10)

Unnamed: 0,rank,Article title,url,title_with_link
0,1,Pet owners warned of ticks transmitting Rocky ...,https://www.ctvnews.ca/kitchener/article/pet-o...,"<a href=""https://www.ctvnews.ca/kitchener/arti..."
1,2,‘We’re going to have to rebuild our life’: Fam...,https://www.ctvnews.ca/canada/article/were-goi...,"<a href=""https://www.ctvnews.ca/canada/article..."
2,3,‘Japanese walking’: Does the TikTok trend live...,https://www.ctvnews.ca/health/article/japanese...,"<a href=""https://www.ctvnews.ca/health/article..."


In [25]:
df['url']


0    https://www.ctvnews.ca/kitchener/article/pet-o...
1    https://www.ctvnews.ca/canada/article/were-goi...
2    https://www.ctvnews.ca/health/article/japanese...
Name: url, dtype: object

In [26]:
#adding column with the href hyperlinked url to the title, for datawrapper
df["title_with_link"] = df.apply(lambda row: f'<a href="{row["url"]}">{row["Article title"]}</a>', axis=1)

In [27]:
df.to_csv("CTVScrape.csv", index=False)