## Gathering ressources & data 

#### Scraping all topics first 

In [2]:
import requests
from bs4 import BeautifulSoup
import re

def scrape_ted_topics_updated():
    """
    Scrapes all topics from the TED topics page using a more robust selector.
    """
    # The URL of the page to scrape
    url = "https://www.ted.com/topics"

    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url)
        # Raise an exception for bad status codes (4xx or 5xx)
        response.raise_for_status()

        # Parse the HTML content of the page with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all <a> tags whose 'href' attribute starts with "/topics/"
        # This is more stable than relying on a CSS class.
        topic_elements = soup.select('a[href^="/topics/"]')

        # Extract the text from each element. We filter out any empty strings.
        topics = [element.get_text(strip=True) for element in topic_elements if element.get_text(strip=True)]

        # Print all the scraped topics
        if topics:
            print("Successfully scraped TED topics:")
            for topic in sorted(topics): # Sorting the list alphabetically
                print(f"- {topic}")
        else:
            print("No topics found. The page structure might have changed again.")

    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the HTTP request: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

if __name__ == "__main__":
    scrape_ted_topics_updated()

Successfully scraped TED topics:
- 3D printing
- AI
- AIDS
- Activism
- Adaptation
- Addiction
- Africa
- Aging
- Agriculture
- Algorithm
- Aliens
- Alzheimer's
- Ancient world
- Animals
- Animation
- Antarctica
- Anthropocene
- Anthropology
- Archaeology
- Architecture
- Art
- Artificial intelligence
- Asia
- Asteroid
- Astrobiology
- Astronomy
- Atheism
- Augmented reality
- Autism spectrum disorder
- Bacteria
- Beauty
- Bees
- Behavioral economics
- Best of the Web
- Big Bang
- Biodiversity
- Bioethics
- Biology
- Biomimicry
- Bionics
- Biosphere
- Biotech
- Birds
- Blindness
- Blockchain
- Body language
- Books
- Botany
- Brain
- Brazil
- Buddhism
- Bullying
- Business
- CRISPR
- Cancer
- Capitalism
- Career
- Chemistry
- China
- Christianity
- Cities
- Climate change
- Code
- Cognitive science
- Collaboration
- Comedy
- Communication
- Community
- Compassion
- Competition
- Computers
- Conducting
- Confidence
- Consciousness
- Conservation
- Consumerism
- Coral reefs
- Coronavirus

In [3]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_and_save_ted_topics():
    """
    Scrapes all topics from the TED topics page and saves them to a CSV file.
    """
    url = "https://www.ted.com/topics"
    # Define the name of the output CSV file
    filename = "ted_topics.csv"

    try:
        # 1. Scrape the webpage
        response = requests.get(url)
        response.raise_for_status()  # Check for request errors

        soup = BeautifulSoup(response.text, 'html.parser')
        topic_elements = soup.select('a[href^="/topics/"]')
        topics = [element.get_text(strip=True) for element in topic_elements if element.get_text(strip=True)]

        if not topics:
            print("No topics found. The page structure might have changed.")
            return

        # 2. Write the data to a CSV file
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            # Create a CSV writer object
            writer = csv.writer(csvfile)

            # Write the header row
            writer.writerow(['Topic'])

            # Write the topic data, one topic per row
            for topic in sorted(topics):
                writer.writerow([topic])

        print(f"✅ Success! Scraped {len(topics)} topics and saved them to '{filename}'.")

    except requests.exceptions.RequestException as e:
        print(f"❌ An error occurred during the HTTP request: {e}")
    except IOError as e:
        print(f"❌ An error occurred while writing the file: {e}")
    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")

if __name__ == "__main__":
    scrape_and_save_ted_topics()

✅ Success! Scraped 373 topics and saved them to 'ted_topics.csv'.


In [None]:
import re
from html import unescape
import pandas as pd

#### For each raw .rtf file, read them and use unescape to match urls and necessary information and get rid of the rest or the raw html data 

In [6]:
# Extract video information using regex patterns
videos = []

# Pattern to find video links with titles, speakers, and duration
pattern = r'href="(/talks/[^"]+)"[^>]*>.*?alt="([^"]*)".*?dir="ltr">(\d+:\d+)</span>.*?dir="ltr">([^<]+)</span>.*?dir="ltr">([^<]+)</p>'

matches = re.findall(pattern, content, re.DOTALL)

In [32]:
with open('12-18.rtf', 'r', encoding='utf-8') as f:
    content = f.read()

In [33]:
# Extract video information using regex patterns
videos = []

# Pattern to find video links with titles, speakers, and duration
pattern = r'href="(/talks/[^"]+)"[^>]*>.*?alt="([^"]*)".*?dir="ltr">(\d+:\d+)</span>.*?dir="ltr">([^<]+)</span>.*?dir="ltr">([^<]+)</p>'

matches = re.findall(pattern, content, re.DOTALL)

for match in matches:
    url_path = match[0]
    alt_text = unescape(match[1])
    duration = match[2]
    title = unescape(match[3])
    speaker = unescape(match[4])
    
    video_data = {
        'title': title.strip(),
        'speaker': speaker.strip(),
        'duration': duration,
        'url': f'https://ted.com{url_path}'
    }
    videos.append(video_data)

#### Create a dataframe for each and load into a unique .csv file

In [None]:

df = pd.DataFrame(videos)

print(f"Found {len(df)} TED Talk videos\n")
print("DataFrame Shape:", df.shape)
print("\nFirst 5 rows:")
df.head()

print("\nDataFrame Info:")
df.info()

df.to_csv('ted_talks12-18.csv', index=False, encoding='utf-8')
print("\n✓ Data saved to ted_talks12-18.csv")


Found 2265 TED Talk videos

DataFrame Shape: (2265, 4)

First 5 rows:

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2265 entries, 0 to 2264
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     2265 non-null   object
 1   speaker   2265 non-null   object
 2   duration  2265 non-null   object
 3   url       2265 non-null   object
dtypes: object(4)
memory usage: 70.9+ KB

✓ Data saved to ted_talks12-18.csv


In [35]:
df

Unnamed: 0,title,speaker,duration,url
0,A pastry chef works his chocolatier magic \'97...,Amaury Guichon,12:33,https://ted.com/talks/amaury_guichon_a_pastry_...
1,The flourishing future of women's sports,Kate Johnson,12:48,https://ted.com/talks/kate_johnson_the_flouris...
2,"How we\'92re turning pollution into toys, toot...",Xu Hao,12:53,https://ted.com/talks/xu_hao_how_we_re_turning...
3,The best thing that could happen to the energy...,Matt Tilleard,12:41,https://ted.com/talks/matt_tilleard_the_best_t...
4,3 simple ways to build stronger relationships ...,Alyssa Birnbaum,14:59,https://ted.com/talks/alyssa_birnbaum_3_simple...
...,...,...,...,...
2260,Meet the founder of the blog revolution,Mena Trott,16:29,https://ted.com/talks/mena_trott_meet_the_foun...
2261,Simple designs to save a life,Amy Smith,14:42,https://ted.com/talks/amy_smith_simple_designs...
2262,One Laptop per Child,Nicholas Negroponte,17:20,https://ted.com/talks/nicholas_negroponte_one_...
2263,Letting go of God,Julia Sweeney,16:15,https://ted.com/talks/julia_sweeney_letting_go...


In [None]:
import glob
import os

#### Process data: combine all different collected videos into one big db

In [41]:
csv_files = glob.glob('ted_talks*.csv')

In [42]:
csv_files

['ted_talks12-18.csv',
 'ted_talks6-12.csv',
 'ted_talks0-6.csv',
 'ted_talks18+.csv']

In [53]:
# Read and combine all CSV files
dfs = [pd.read_csv(f) for f in csv_files]
combined_df = pd.concat(dfs, ignore_index=True)

# Save the combined dataframe as a new CSV file
combined_df.to_csv(os.path.join("ted_talks_all.csv"), index=False)

# Display the shape and first few rows to confirm
print(f"Combined {len(csv_files)} files.")
print(combined_df.shape)
combined_df.head()

Combined 4 files.
(7267, 4)


Unnamed: 0,title,speaker,duration,url
0,A pastry chef works his chocolatier magic \'97...,Amaury Guichon,12:33,https://ted.com/talks/amaury_guichon_a_pastry_...
1,The flourishing future of women's sports,Kate Johnson,12:48,https://ted.com/talks/kate_johnson_the_flouris...
2,"How we\'92re turning pollution into toys, toot...",Xu Hao,12:53,https://ted.com/talks/xu_hao_how_we_re_turning...
3,The best thing that could happen to the energy...,Matt Tilleard,12:41,https://ted.com/talks/matt_tilleard_the_best_t...
4,3 simple ways to build stronger relationships ...,Alyssa Birnbaum,14:59,https://ted.com/talks/alyssa_birnbaum_3_simple...


#### Clean data (briefly): drop and list existing duplicates. Since TED.com classes videos into 4 separate classes by their duration, some being at the curbs will appear twice in both categories  

In [51]:
combined_df = combined_df.drop_duplicates(subset=["url"])  # specific column(s)

combined_df.to_csv("ted_talks_all_NODUP.csv", index=False)

print(f"Combined {len(csv_files)} files into {combined_df.shape[0]} unique rows.")
combined_df.head()

Combined 4 files into 7244 unique rows.


Unnamed: 0,title,speaker,duration,url
0,A pastry chef works his chocolatier magic \'97...,Amaury Guichon,12:33,https://ted.com/talks/amaury_guichon_a_pastry_...
1,The flourishing future of women's sports,Kate Johnson,12:48,https://ted.com/talks/kate_johnson_the_flouris...
2,"How we\'92re turning pollution into toys, toot...",Xu Hao,12:53,https://ted.com/talks/xu_hao_how_we_re_turning...
3,The best thing that could happen to the energy...,Matt Tilleard,12:41,https://ted.com/talks/matt_tilleard_the_best_t...
4,3 simple ways to build stronger relationships ...,Alyssa Birnbaum,14:59,https://ted.com/talks/alyssa_birnbaum_3_simple...


In [54]:
duplicates = combined_df[combined_df.duplicated(subset=["url"], keep=False)]

if not duplicates.empty:
    print(f"\n⚠️ Found {len(duplicates)} duplicate rows based on URL:\n")
    display(duplicates.sort_values(by=["url"]).head(10))  # show first few duplicate groups
else:
    print("\n🎉 No duplicates found based on 'url'!")

# (Optional) Save all duplicates to a separate CSV file for inspection
if not duplicates.empty:
    dupes_path = os.path.join("ted_talks_duplicates.csv")
    duplicates.to_csv(dupes_path, index=False)
    print(f"📁 Duplicate rows saved to: {dupes_path}")

# Drop duplicates based on 'url'
combined_df = combined_df.drop_duplicates(subset=["url"])

print(f"\n✅ Rows after removing duplicates: {len(combined_df)}")

# Save the cleaned DataFrame
output_path = os.path.join("ted_talks_all_clean.csv")
combined_df.to_csv(output_path, index=False)

print(f"\n📁 Cleaned CSV saved to: {output_path}")


⚠️ Found 46 duplicate rows based on URL:



Unnamed: 0,title,speaker,duration,url
356,The humans at the center of the US immigration...,Ali Noorani,12:00,https://ted.com/talks/ali_noorani_the_humans_a...
2755,The humans at the center of the US immigration...,Ali Noorani,12:00,https://ted.com/talks/ali_noorani_the_humans_a...
5302,Einstein's twin paradox explained,Amber Stuver,06:00,https://ted.com/talks/amber_stuver_einstein_s_...
3195,Einstein's twin paradox explained,Amber Stuver,06:00,https://ted.com/talks/amber_stuver_einstein_s_...
784,The enigmatic language of elephants,Beth Mortimer and Tarje Nissen-Meyer,12:00,https://ted.com/talks/beth_mortimer_and_tarje_...
3243,The enigmatic language of elephants,Beth Mortimer and Tarje Nissen-Meyer,12:00,https://ted.com/talks/beth_mortimer_and_tarje_...
627,The real story of Rosa Parks -- and why we nee...,David Ikard,18:00,https://ted.com/talks/david_ikard_the_real_sto...
6701,The real story of Rosa Parks -- and why we nee...,David Ikard,18:00,https://ted.com/talks/david_ikard_the_real_sto...
3365,Let's protect the oceans like national parks,David Lang,12:00,https://ted.com/talks/david_lang_let_s_protect...
934,Let's protect the oceans like national parks,David Lang,12:00,https://ted.com/talks/david_lang_let_s_protect...


📁 Duplicate rows saved to: ted_talks_duplicates.csv

✅ Rows after removing duplicates: 7244

📁 Cleaned CSV saved to: ted_talks_all_clean.csv


#### In total the site contains
* 1935 0-6 min videos
* 2128 6-12 min videos
* 2265 12-18 min videos
* 985 18+ min videos

#### A total of: 7313 videos 
#### We've been able to scrape 99% of the website successfully 
