In [1]:
import requests
from bs4 import BeautifulSoup

def scrape_ted_topics():
    """
    Scrapes all topics from the TED topics page.
    """
    # The URL of the page to scrape
    url = "https://www.ted.com/topics"

    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url)
        # Raise an exception for bad status codes (4xx or 5xx)
        response.raise_for_status()

        # Parse the HTML content of the page with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all the <a> tags that contain the topic links.
        # Based on the site's structure, these links have the class 'ga-topic-link'.
        topic_elements = soup.find_all('a', class_='ga-topic-link')

        # Extract the text from each element and store it in a list
        topics = [element.get_text(strip=True) for element in topic_elements]

        # Print all the scraped topics
        if topics:
            print("Successfully scraped TED topics:")
            for topic in topics:
                print(f"- {topic}")
        else:
            print("No topics found. The page structure might have changed.")

    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the HTTP request: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

if __name__ == "__main__":
    scrape_ted_topics()

No topics found. The page structure might have changed.


In [2]:
import requests
from bs4 import BeautifulSoup
import re

def scrape_ted_topics_updated():
    """
    Scrapes all topics from the TED topics page using a more robust selector.
    """
    # The URL of the page to scrape
    url = "https://www.ted.com/topics"

    try:
        # Send an HTTP GET request to the URL
        response = requests.get(url)
        # Raise an exception for bad status codes (4xx or 5xx)
        response.raise_for_status()

        # Parse the HTML content of the page with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all <a> tags whose 'href' attribute starts with "/topics/"
        # This is more stable than relying on a CSS class.
        topic_elements = soup.select('a[href^="/topics/"]')

        # Extract the text from each element. We filter out any empty strings.
        topics = [element.get_text(strip=True) for element in topic_elements if element.get_text(strip=True)]

        # Print all the scraped topics
        if topics:
            print("Successfully scraped TED topics:")
            for topic in sorted(topics): # Sorting the list alphabetically
                print(f"- {topic}")
        else:
            print("No topics found. The page structure might have changed again.")

    except requests.exceptions.RequestException as e:
        print(f"An error occurred during the HTTP request: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

if __name__ == "__main__":
    scrape_ted_topics_updated()

Successfully scraped TED topics:
- 3D printing
- AI
- AIDS
- Activism
- Adaptation
- Addiction
- Africa
- Aging
- Agriculture
- Algorithm
- Aliens
- Alzheimer's
- Ancient world
- Animals
- Animation
- Antarctica
- Anthropocene
- Anthropology
- Archaeology
- Architecture
- Art
- Artificial intelligence
- Asia
- Asteroid
- Astrobiology
- Astronomy
- Atheism
- Augmented reality
- Autism spectrum disorder
- Bacteria
- Beauty
- Bees
- Behavioral economics
- Best of the Web
- Big Bang
- Biodiversity
- Bioethics
- Biology
- Biomimicry
- Bionics
- Biosphere
- Biotech
- Birds
- Blindness
- Blockchain
- Body language
- Books
- Botany
- Brain
- Brazil
- Buddhism
- Bullying
- Business
- CRISPR
- Cancer
- Capitalism
- Career
- Chemistry
- China
- Christianity
- Cities
- Climate change
- Code
- Cognitive science
- Collaboration
- Comedy
- Communication
- Community
- Compassion
- Competition
- Computers
- Conducting
- Confidence
- Consciousness
- Conservation
- Consumerism
- Coral reefs
- Coronavirus

In [3]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_and_save_ted_topics():
    """
    Scrapes all topics from the TED topics page and saves them to a CSV file.
    """
    url = "https://www.ted.com/topics"
    # Define the name of the output CSV file
    filename = "ted_topics.csv"

    try:
        # 1. Scrape the webpage
        response = requests.get(url)
        response.raise_for_status()  # Check for request errors

        soup = BeautifulSoup(response.text, 'html.parser')
        topic_elements = soup.select('a[href^="/topics/"]')
        topics = [element.get_text(strip=True) for element in topic_elements if element.get_text(strip=True)]

        if not topics:
            print("No topics found. The page structure might have changed.")
            return

        # 2. Write the data to a CSV file
        with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
            # Create a CSV writer object
            writer = csv.writer(csvfile)

            # Write the header row
            writer.writerow(['Topic'])

            # Write the topic data, one topic per row
            for topic in sorted(topics):
                writer.writerow([topic])

        print(f"✅ Success! Scraped {len(topics)} topics and saved them to '{filename}'.")

    except requests.exceptions.RequestException as e:
        print(f"❌ An error occurred during the HTTP request: {e}")
    except IOError as e:
        print(f"❌ An error occurred while writing the file: {e}")
    except Exception as e:
        print(f"❌ An unexpected error occurred: {e}")

if __name__ == "__main__":
    scrape_and_save_ted_topics()

✅ Success! Scraped 373 topics and saved them to 'ted_topics.csv'.


In [4]:
import re
from html import unescape
import pandas as pd

In [5]:
with open('test3.rtf', 'r', encoding='utf-8') as f:
    content = f.read()

In [6]:
# Extract video information using regex patterns
videos = []

# Pattern to find video links with titles, speakers, and duration
pattern = r'href="(/talks/[^"]+)"[^>]*>.*?alt="([^"]*)".*?dir="ltr">(\d+:\d+)</span>.*?dir="ltr">([^<]+)</span>.*?dir="ltr">([^<]+)</p>'

matches = re.findall(pattern, content, re.DOTALL)

In [7]:
for match in matches:
    url_path = match[0]
    alt_text = unescape(match[1])
    duration = match[2]
    title = unescape(match[3])
    speaker = unescape(match[4])
    
    video_data = {
        'title': title.strip(),
        'speaker': speaker.strip(),
        'duration': duration,
        'url': f'https://ted.com{url_path}'
    }
    videos.append(video_data)


In [10]:
# Create DataFrame
df = pd.DataFrame(videos)

# Display DataFrame info
print(f"Found {len(df)} TED Talk videos\n")
print("DataFrame Shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head())

print("\nDataFrame Info:")
print(df.info())

# Optional: Save to CSV
df.to_csv('ted_talks.csv', index=False, encoding='utf-8')
print("\n✓ Data saved to ted_talks.csv")

# Optional: Save to Excel
df.to_excel('ted_talks.xlsx', index=False)
print("✓ Data saved to ted_talks.xlsx")

df


Found 24 TED Talk videos

DataFrame Shape: (24, 4)

First 5 rows:
                                               title  \
0  A pastry chef works his chocolatier magic \'97...   
1                 This TED Talk is full of bad ideas   
2  What will the future of art look like? A visua...   
3  Beyond the Talk: Salome Agbaroji and Samora Pi...   
4         300 years of classical music in 18 minutes   

                                   speaker duration  \
0                           Amaury Guichon    12:33   
1                              Gabe Whaley    11:19   
2             Rob Bredow and Nora Atkinson    24:48   
3  Salome Agbaroji and Samora Pinderhughes    45:20   
4                              Joshua Bell    18:07   

                                                 url  
0  https://ted.com/talks/amaury_guichon_a_pastry_...  
1  https://ted.com/talks/gabe_whaley_this_ted_tal...  
2  https://ted.com/talks/rob_bredow_and_nora_atki...  
3  https://ted.com/talks/salome_agbaroji_and_s

Unnamed: 0,title,speaker,duration,url
0,A pastry chef works his chocolatier magic \'97...,Amaury Guichon,12:33,https://ted.com/talks/amaury_guichon_a_pastry_...
1,This TED Talk is full of bad ideas,Gabe Whaley,11:19,https://ted.com/talks/gabe_whaley_this_ted_tal...
2,What will the future of art look like? A visua...,Rob Bredow and Nora Atkinson,24:48,https://ted.com/talks/rob_bredow_and_nora_atki...
3,Beyond the Talk: Salome Agbaroji and Samora Pi...,Salome Agbaroji and Samora Pinderhughes,45:20,https://ted.com/talks/salome_agbaroji_and_samo...
4,300 years of classical music in 18 minutes,Joshua Bell,18:07,https://ted.com/talks/joshua_bell_300_years_of...
5,A multigenerational musical journey,Yijia Tu,09:38,https://ted.com/talks/yijia_tu_a_multigenerati...
6,Is AI ruining music?,Dustin Ballard,11:15,https://ted.com/talks/dustin_ballard_is_ai_rui...
7,How to turn AI prompts into movie magic,Jason Zada,11:22,https://ted.com/talks/jason_zada_how_to_turn_a...
8,Why meeting in the middle isn\'92t enough for ...,Bill Heck and Stephanie Lepp,11:24,https://ted.com/talks/bill_heck_and_stephanie_...
9,The inside story of Notre-Dame\'92s incredible...,Philippe Villeneuve,12:59,https://ted.com/talks/philippe_villeneuve_the_...


In [32]:
with open('12-18.rtf', 'r', encoding='utf-8') as f:
    content = f.read()

In [33]:
# Extract video information using regex patterns
videos = []

# Pattern to find video links with titles, speakers, and duration
pattern = r'href="(/talks/[^"]+)"[^>]*>.*?alt="([^"]*)".*?dir="ltr">(\d+:\d+)</span>.*?dir="ltr">([^<]+)</span>.*?dir="ltr">([^<]+)</p>'

matches = re.findall(pattern, content, re.DOTALL)

for match in matches:
    url_path = match[0]
    alt_text = unescape(match[1])
    duration = match[2]
    title = unescape(match[3])
    speaker = unescape(match[4])
    
    video_data = {
        'title': title.strip(),
        'speaker': speaker.strip(),
        'duration': duration,
        'url': f'https://ted.com{url_path}'
    }
    videos.append(video_data)

In [34]:
# Create DataFrame
df = pd.DataFrame(videos)

# Display DataFrame info
print(f"Found {len(df)} TED Talk videos\n")
print("DataFrame Shape:", df.shape)
print("\nFirst 5 rows:")
df.head()

print("\nDataFrame Info:")
df.info()

# Optional: Save to CSV
df.to_csv('ted_talks12-18.csv', index=False, encoding='utf-8')
print("\n✓ Data saved to ted_talks12-18.csv")




Found 2265 TED Talk videos

DataFrame Shape: (2265, 4)

First 5 rows:

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2265 entries, 0 to 2264
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     2265 non-null   object
 1   speaker   2265 non-null   object
 2   duration  2265 non-null   object
 3   url       2265 non-null   object
dtypes: object(4)
memory usage: 70.9+ KB

✓ Data saved to ted_talks12-18.csv


In [35]:
df

Unnamed: 0,title,speaker,duration,url
0,A pastry chef works his chocolatier magic \'97...,Amaury Guichon,12:33,https://ted.com/talks/amaury_guichon_a_pastry_...
1,The flourishing future of women's sports,Kate Johnson,12:48,https://ted.com/talks/kate_johnson_the_flouris...
2,"How we\'92re turning pollution into toys, toot...",Xu Hao,12:53,https://ted.com/talks/xu_hao_how_we_re_turning...
3,The best thing that could happen to the energy...,Matt Tilleard,12:41,https://ted.com/talks/matt_tilleard_the_best_t...
4,3 simple ways to build stronger relationships ...,Alyssa Birnbaum,14:59,https://ted.com/talks/alyssa_birnbaum_3_simple...
...,...,...,...,...
2260,Meet the founder of the blog revolution,Mena Trott,16:29,https://ted.com/talks/mena_trott_meet_the_foun...
2261,Simple designs to save a life,Amy Smith,14:42,https://ted.com/talks/amy_smith_simple_designs...
2262,One Laptop per Child,Nicholas Negroponte,17:20,https://ted.com/talks/nicholas_negroponte_one_...
2263,Letting go of God,Julia Sweeney,16:15,https://ted.com/talks/julia_sweeney_letting_go...
