In [2]:
!pip install bs4

Collecting bs4
  Using cached bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting beautifulsoup4 (from bs4)
  Using cached beautifulsoup4-4.12.3-py3-none-any.whl.metadata (3.8 kB)
Collecting soupsieve>1.2 (from beautifulsoup4->bs4)
  Using cached soupsieve-2.6-py3-none-any.whl.metadata (4.6 kB)
Using cached bs4-0.0.2-py2.py3-none-any.whl (1.2 kB)
Using cached beautifulsoup4-4.12.3-py3-none-any.whl (147 kB)
Using cached soupsieve-2.6-py3-none-any.whl (36 kB)
Installing collected packages: soupsieve, beautifulsoup4, bs4
Successfully installed beautifulsoup4-4.12.3 bs4-0.0.2 soupsieve-2.6


In [14]:
import csv
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

# Site endpoints: the talks index page and the site root
MAIN_URL = 'https://2024.za.pycon.org/talks/'
BASE_URL = 'https://2024.za.pycon.org'

# Output folders, relative to the current working directory
TXT_SAVE_DIR = 'Text/data/talks/'
CSV_SAVE_DIR = 'Text/data/'
SPEAKER_SAVE_DIR = 'Text/data/speakers/'
COMBINED_SAVE_DIR = 'Text/data/combined/'

# Ensure every output folder is present before any file is written;
# exist_ok=True makes re-running this cell harmless.
for output_dir in (TXT_SAVE_DIR, CSV_SAVE_DIR, SPEAKER_SAVE_DIR, COMBINED_SAVE_DIR):
    os.makedirs(output_dir, exist_ok=True)

def get_talk_links():
    """Fetch the talks index page and return the URL of every listed talk.

    Looks for the table with class ``table table-striped`` on ``MAIN_URL`` and
    collects the link found in the first cell of each data row.

    Returns:
        list[str]: Absolute talk URLs, in the order the rows appear.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an error status.
        RuntimeError: If the page contains no talks table.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(MAIN_URL, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    table = soup.find('table', {'class': 'table table-striped'})
    if table is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError(f"No talks table found at {MAIN_URL}")

    talks = []
    for row in table.find_all('tr'):
        if row.find('th'):
            # Skip header rows
            continue
        cells = row.find_all('td')
        if len(cells) < 2:
            continue
        # href=True ignores anchors that carry no link target.
        link_tag = cells[0].find('a', href=True)
        if link_tag is None:
            continue
        # Resolve against the page URL so root-relative, relative and
        # already-absolute hrefs all produce a valid URL.
        talks.append(urljoin(MAIN_URL, link_tag['href']))
    return talks

def parse_talk_page(talk_url, talk_id, speaker_id):
    """Fetch a talk page and extract the talk record plus its speaker record.

    Args:
        talk_url: Absolute URL of the talk page.
        talk_id: Identifier stored in both returned records.
        speaker_id: Identifier given to the speaker if one is found.

    Returns:
        tuple: ``(talk_data, speaker_info)``. ``talk_data`` is ``None`` when
        the page has no ``section`` with class ``wafer wafer-talk``;
        ``speaker_info`` is an empty dict when no linked speaker is found.
        A pair is always returned so callers can unpack the result.

    Raises:
        requests.RequestException: If a page request fails or times out.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(talk_url, timeout=30)
    talk_soup = BeautifulSoup(response.text, 'html.parser')

    section = talk_soup.find('section', {'class': 'wafer wafer-talk'})
    if section is None:
        return None, {}

    # Extract title (empty string when the section has no <h1>)
    title_tag = section.find('h1')
    title = title_tag.get_text(strip=True) if title_tag else ''

    speaker_info = {}
    speaker_name = None
    speaker_profile_url = None

    # The "Key: value" paragraphs are read from the first <div> in the section.
    metadata_div = section.find('div')
    metadata_paragraphs = metadata_div.find_all('p') if metadata_div else []
    for p in metadata_paragraphs:
        text = p.get_text(separator=' ', strip=True)
        if ':' not in text:
            continue
        key = text.split(':', 1)[0].strip()
        # Only the 'Speaker' field is used; other fields are ignored.
        if key != 'Speaker':
            continue
        speaker_tag = p.find('a', href=True)
        if speaker_tag is None:
            continue
        speaker_name = speaker_tag.get_text(strip=True)
        # Resolve against the page URL so absolute hrefs are not mangled.
        speaker_profile_url = urljoin(talk_url, speaker_tag['href'])
        # Parse speaker page for speaker metadata
        speaker_info = parse_speaker_page(speaker_profile_url)
        speaker_info['Name'] = speaker_name
        speaker_info['Profile URL'] = speaker_profile_url
        speaker_info['Talk ID'] = talk_id  # Link speaker to the talk
        speaker_info['Speaker ID'] = speaker_id  # Add speaker ID to their data

    # Extract abstract
    abstract_div = section.find('div', {'id': 'abstract'})
    abstract = abstract_div.get_text(separator='\n', strip=True) if abstract_div else ''

    talk_data = {
        'Talk ID': talk_id,
        'Title': title,
        'Abstract': abstract,
        'Speaker Name': speaker_name,
        'Speaker Profile URL': speaker_profile_url,
        'Talk URL': talk_url,
        # Only reference the speaker ID when a speaker record was actually
        # produced; otherwise the talk would point at an ID nobody owns.
        'Speaker ID': speaker_id if speaker_info else None,
    }

    return talk_data, speaker_info

def parse_speaker_page(speaker_url):
    """Fetch a speaker's profile page and extract photo, links and bio.

    Args:
        speaker_url: Absolute URL of the speaker profile page.

    Returns:
        dict: Always contains ``'Photo URL'`` (or the text
        ``'No photo available'``). May contain ``'Twitter'``, ``'GitHub'``,
        ``'Fediverse'`` and ``'Bio Part N'`` entries when the page has a
        ``section`` with class ``wafer-profile-bio``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(speaker_url, timeout=30)
    speaker_soup = BeautifulSoup(response.text, 'html.parser')

    speaker_data = {}

    # Extract speaker profile photo URL
    photo_tag = speaker_soup.find('img', {'class': 'img-circle'})
    photo_src = photo_tag.get('src') if photo_tag else None
    if photo_src:
        # urljoin leaves absolute image URLs untouched and resolves
        # site-relative ones; plain concatenation would corrupt the former.
        speaker_data['Photo URL'] = urljoin(speaker_url, photo_src)
    else:
        speaker_data['Photo URL'] = 'No photo available'

    # Extract social links and other metadata
    bio_section = speaker_soup.find('section', {'class': 'wafer-profile-bio'})
    if bio_section:
        # href=True skips anchors without a target (would raise KeyError).
        for link in bio_section.find_all('a', href=True):
            url = link['href']
            if 'twitter' in url:
                speaker_data['Twitter'] = url
            elif 'github' in url:
                speaker_data['GitHub'] = url
            elif 'fosstodon' in url:
                speaker_data['Fediverse'] = url

        # Split bio into parts and save as separate fields
        bio_paragraphs = bio_section.find_all('p')
        for i, bio_part in enumerate(bio_paragraphs, start=1):
            speaker_data[f'Bio Part {i}'] = bio_part.get_text(strip=True)

    return speaker_data

def save_talk_as_txt(talk_data, talk_id, save_dir=None):
    """Write one talk record to ``<talk_id>.txt``.

    The file starts with the talk ID and speaker ID, followed by every other
    field as ``key: value`` lines, and ends with the abstract.

    Args:
        talk_data: Dict describing the talk.
        talk_id: Identifier used for the file name and the first line.
        save_dir: Target folder; defaults to ``TXT_SAVE_DIR``. Created if
            missing.
    """
    target_dir = TXT_SAVE_DIR if save_dir is None else save_dir
    os.makedirs(target_dir, exist_ok=True)
    file_path = os.path.join(target_dir, f"{talk_id}.txt")

    # These keys get dedicated lines, so the generic loop must not repeat them.
    handled_keys = {'Talk ID', 'Speaker ID', 'Abstract'}

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(f"Talk ID: {talk_id}\n")
        file.write(f"Speaker ID: {talk_data.get('Speaker ID')}\n")
        for key, value in talk_data.items():
            if key not in handled_keys:
                file.write(f"{key}: {value}\n")
        file.write("\nAbstract:\n")
        # `or` also covers an empty abstract, not just a missing key.
        file.write(talk_data.get('Abstract') or 'No abstract available')

def save_speaker_as_txt(speaker_data, speaker_id, save_dir=None):
    """Write one speaker record to ``<speaker_id>.txt``.

    The file starts with the speaker ID and talk ID, followed by every other
    field as ``key: value`` lines.

    Args:
        speaker_data: Dict describing the speaker.
        speaker_id: Identifier used for the file name and the first line.
        save_dir: Target folder; defaults to ``SPEAKER_SAVE_DIR``. Created if
            missing.
    """
    target_dir = SPEAKER_SAVE_DIR if save_dir is None else save_dir
    os.makedirs(target_dir, exist_ok=True)
    file_path = os.path.join(target_dir, f"{speaker_id}.txt")

    # Both IDs get dedicated lines, so the generic loop must not repeat them.
    handled_keys = {'Speaker ID', 'Talk ID'}

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(f"Speaker ID: {speaker_id}\n")
        file.write(f"Talk ID: {speaker_data.get('Talk ID')}\n")
        for key, value in speaker_data.items():
            if key not in handled_keys:
                file.write(f"{key}: {value}\n")

def save_combined_as_txt(talk_data, speaker_data, talk_id, save_dir=None):
    """Write a talk record and its speaker record to ``combined_<talk_id>.txt``.

    Layout: talk ID and speaker ID, a talk section with the remaining talk
    fields and the abstract, then a speaker section with the remaining
    speaker fields.

    Args:
        talk_data: Dict describing the talk.
        speaker_data: Dict describing the speaker.
        talk_id: Identifier used for the file name and the first line.
        save_dir: Target folder; defaults to ``COMBINED_SAVE_DIR``. Created
            if missing.
    """
    target_dir = COMBINED_SAVE_DIR if save_dir is None else save_dir
    os.makedirs(target_dir, exist_ok=True)
    file_path = os.path.join(target_dir, f"combined_{talk_id}.txt")

    # The IDs are written once at the top and the abstract has its own
    # section, so none of them is repeated in the field listings.
    talk_handled_keys = {'Talk ID', 'Speaker ID', 'Abstract'}
    speaker_handled_keys = {'Talk ID', 'Speaker ID'}

    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(f"Talk ID: {talk_id}\n")
        file.write(f"Speaker ID: {talk_data.get('Speaker ID')}\n")

        file.write("\n--- Talk Data ---\n")
        for key, value in talk_data.items():
            if key not in talk_handled_keys:
                file.write(f"{key}: {value}\n")
        file.write("\nAbstract:\n")
        # `or` also covers an empty abstract, not just a missing key.
        file.write(talk_data.get('Abstract') or 'No abstract available')

        file.write("\n--- Speaker Data ---\n")
        for key, value in speaker_data.items():
            if key not in speaker_handled_keys:
                file.write(f"{key}: {value}\n")

def save_combined_as_csv(talk_data, speaker_data):
    """Merge a talk record and a speaker record into one flat row.

    Nothing is written to disk here; the merged dict is returned for the
    caller to collect. When a key exists in both inputs, the speaker's value
    is kept. Neither input is modified.
    """
    return {**talk_data, **speaker_data}

# Scrape talks and speakers
talk_links = get_talk_links()
all_talks = []
all_speakers = []
combined_data_list = []  # Stores the combined data for all talks and speakers
speaker_id_counter = 1  # Next unused speaker ID; advanced once per talk that yields a speaker

for talk_id, link in enumerate(talk_links, start=1):
    print(f"Processing talk {talk_id}/{len(talk_links)}: {link}")

    # The parser can come back with a bare None when the page has no talk
    # section; normalise to a pair so the unpacking below cannot raise.
    parsed = parse_talk_page(link, talk_id, speaker_id_counter)
    talk_data, speaker_data = parsed if parsed else (None, {})

    if not talk_data:
        print(f"Skipping talk {talk_id}: no talk section found")
        continue

    all_talks.append(talk_data)
    save_talk_as_txt(talk_data, talk_id)  # Save each talk with ID as filename

    if speaker_data:
        speaker_data['Speaker ID'] = speaker_id_counter  # Add speaker ID
        all_speakers.append(speaker_data)
        save_speaker_as_txt(speaker_data, speaker_id_counter)  # Save speaker data with their ID as filename

        # Combine data for CSV and txt files
        save_combined_as_txt(talk_data, speaker_data, talk_id)
        combined_data_list.append(save_combined_as_csv(talk_data, speaker_data))

        speaker_id_counter += 1

# Define CSV columns for talk and speaker data.
# NOTE: only the first three bio paragraphs have a column; any further
# 'Bio Part N' entries on a speaker are not exported.
speaker_fieldnames = ['Speaker ID', 'Name', 'Profile URL', 'Photo URL', 'Twitter', 'GitHub', 'Fediverse', 'Bio Part 1', 'Bio Part 2', 'Bio Part 3']
talk_fieldnames = ['Talk ID', 'Title', 'Abstract', 'Speaker Name', 'Speaker Profile URL', 'Talk URL', 'Speaker ID']
# 'Speaker ID' appears in both lists; dict.fromkeys drops the repeat while
# keeping column order, so combined.csv does not get a duplicated column.
combined_fieldnames = list(dict.fromkeys(talk_fieldnames + speaker_fieldnames))


def write_rows_to_csv(filename, fieldnames, rows):
    """Write ``rows`` (dicts) to ``CSV_SAVE_DIR/filename`` and return the path.

    Only the columns in ``fieldnames`` are written: keys outside that list
    are ignored and missing keys become empty cells.
    """
    csv_path = os.path.join(CSV_SAVE_DIR, filename)
    with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=fieldnames,
            restval='',
            extrasaction='ignore',
        )
        writer.writeheader()
        writer.writerows(rows)
    return csv_path


# Write speaker data to speakers.csv
speaker_csv_path = write_rows_to_csv('speakers.csv', speaker_fieldnames, all_speakers)

# Write talk data to talks.csv
talk_csv_path = write_rows_to_csv('talks.csv', talk_fieldnames, all_talks)

# Write combined data to combined.csv
combined_csv_path = write_rows_to_csv('combined.csv', combined_fieldnames, combined_data_list)


Processing talk 1/36: https://2024.za.pycon.org/talks/11-applying-ai-with-python/
Processing talk 2/36: https://2024.za.pycon.org/talks/13-harnessing-the-power-of-community-lessons-from-speedrunning-for-the-python-ecosystem-and-beyond/
Processing talk 3/36: https://2024.za.pycon.org/talks/19-its-about-time-time-series-forecasting-with-darts/
Processing talk 4/36: https://2024.za.pycon.org/talks/20-monitoring-and-evaluating-llm-apps-with-langfuse/
Processing talk 5/36: https://2024.za.pycon.org/talks/21-leveraging-the-nltk-library-for-translation-a-case-study-of-dyula-french-translation/
Processing talk 6/36: https://2024.za.pycon.org/talks/22-creating-personalised-images-with-pythons-stable-diffusion/
Processing talk 7/36: https://2024.za.pycon.org/talks/27-mental-illness-and-vulnerability-in-tech/
Processing talk 8/36: https://2024.za.pycon.org/talks/32-bridging-language-barriers-making-programming-education-accessible-to-all/
Processing talk 9/36: https://2024.za.pycon.org/talks/33-l