# APEC 2025 Korea Website Scraping

This notebook will scrape content from the APEC 2025 Korea website.

In [2]:
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin, urlparse
import os
import json

In [None]:
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
})

In [38]:
def get_soup(url, timeout=10):

    print(f"Fetching: {url}")
    response = session.get(url, timeout=timeout)
    response.raise_for_status()
    
    if response.encoding == 'ISO-8859-1':
        response.encoding = response.apparent_encoding
    
    soup = BeautifulSoup(response.content, 'html.parser')
    return soup


In [39]:
url = "https://apec2025.kr/?menuno=1"
soup = get_soup(url)

Fetching: https://apec2025.kr/?menuno=1


In [41]:
if soup:
    title = soup.title.string if soup.title else "No title"
    print(f"Title: {title}")
    
    print(f"\nPage structure:")
    print(f"Links: {len(soup.find_all('a'))}")
    print(f"Images: {len(soup.find_all('img'))}")
    print(f"Paragraphs: {len(soup.find_all('p'))}")
    print(f"Headings: {len(soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']))}")

Title: Main > APEC 2025 KOREA

Page structure:
Links: 82
Images: 26
Paragraphs: 21
Headings: 15


In [None]:
if soup:
    links = soup.find_all('a', href=True)
    print(f"Found {len(links)} links:")
    
    base_url = "https://apec2025.kr"
    all_links = []
    
    for link in links[:-1]:  
        href = link['href']
        text = link.get_text(strip=True)
        
        full_url = urljoin(base_url, href)
        all_links.append({'url': full_url, 'text': text})
        
        print(f"- {text[:50]}... -> {full_url}")
    


Found 82 links:
- Go to main content... -> https://apec2025.kr#contents
- youtube... -> https://www.youtube.com/@APEC2025KOREA/videos
- instagram... -> https://www.instagram.com/apec2025korea/
- facebook... -> https://www.facebook.com/apec2025korea1
- flickr... -> https://www.flickr.com/photos/apec2025
- KOR... -> https://apec2025.kr/kor/
- About APEC 2025 KOREA... -> https://apec2025.kr/
- Mobile menu... -> https://apec2025.kr#none
- About APEC 2025 KOREA... -> https://apec2025.kr?menuno=2
- APEC... -> https://apec2025.kr?menuno=89
- APEC 2025 KOREA... -> https://apec2025.kr?menuno=90
- Meetings... -> https://apec2025.kr?menuno=93
- Side Event... -> https://apec2025.kr?menuno=94
- Media... -> https://apec2025.kr?menuno=14
- Notices... -> https://apec2025.kr?menuno=15
- Press Releases... -> https://apec2025.kr?menuno=16
- Resources... -> https://apec2025.kr?menuno=17
- Social Media... -> https://apec2025.kr?menuno=98
- Partners... -> https://apec2025.kr?menuno=100
- Sponsorship... -> h

In [42]:
def process_table(table):
    """Convert table to structured text format"""
    table_text = ["\n[TABLE START]\n"]
    
    headers = table.find_all('th')
    if headers:
        header_texts = [th.get_text(strip=True) for th in headers]
        table_text.append("HEADERS: " + " | ".join(header_texts) + "\n")
        table_text.append("-" * 50 + "\n")
    
    rows = table.find_all('tr')
    for i, row in enumerate(rows, 1):
        cells = row.find_all(['td', 'th'])
        if cells:
            cell_texts = [cell.get_text(strip=True) for cell in cells]
            cell_texts = [text for text in cell_texts if text]
            if cell_texts:
                table_text.append(f"ROW {i}: " + " | ".join(cell_texts) + "\n")
    
    table_text.append("[TABLE END]\n\n")
    return ''.join(table_text)


In [None]:
def extract_text_content_dedup(soup):
    content_div = soup.find('div', class_='contents')

    
    # Remove script and style elements
    for script in content_div(["script", "style", "nav", "header", "footer", "noscript"]):
        script.decompose()
    

    # Track seen content to avoid duplicates
    seen_content = set()
    structured_text = []
    processed_elements = set()
    
    # Process all elements in order
    for element in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'li', 'table']):
        if any(element in processed for processed in processed_elements):
            continue
            
        if element.name == 'table':
            table_content = process_table(element)
            if table_content not in seen_content:
                structured_text.append(table_content)
                seen_content.add(table_content)
            processed_elements.add(element)
            for desc in element.find_all():
                processed_elements.add(desc)
            continue
        
        text = element.get_text(strip=True)
        if not text or len(text) < 3:  # Skip very short text
            continue
            
        normalized_text = ' '.join(text.split()).lower()
        
        if element.name == 'h2':
            formatted_text = f"\n\n=== {text.upper()} ===\n"
            if normalized_text not in seen_content:
                structured_text.append(formatted_text)
                seen_content.add(normalized_text)
        elif element.name == 'h3':
            formatted_text = f"\n--- {text} ---\n"
            if normalized_text not in seen_content:
                structured_text.append(formatted_text)
                seen_content.add(normalized_text)
        elif element.name in ['h1', 'h4', 'h5', 'h6']:
            formatted_text = f"\n{text}\n"
            if normalized_text not in seen_content:
                structured_text.append(formatted_text)
                seen_content.add(normalized_text)
        else:
            if not element.find_parent('table') and normalized_text not in seen_content:
                structured_text.append(text + " ")
                seen_content.add(normalized_text)
    
    # Join and clean up
    final_text = ''.join(structured_text)
    
    # Clean up whitespace
    lines = final_text.split('\n')
    cleaned_lines = []
    for line in lines:
        cleaned_line = ' '.join(line.split())
        cleaned_lines.append(cleaned_line)
    
    # Remove excessive empty lines
    result_lines = []
    empty_count = 0
    for line in cleaned_lines:
        if line.strip() == '':
            empty_count += 1
            if empty_count <= 2:
                result_lines.append(line)
        else:
            empty_count = 0
            result_lines.append(line)
    
    return '\n'.join(result_lines).strip()

In [45]:
def scrape_page_content(url, max_retries=3):
    try:
        soup = get_soup(url)

        title = soup.title.string if soup.title else "No title"
        
        content = extract_text_content_dedup(soup)
        
        return {
            'url': url,
            'title': title.strip(),
            'content': content,
            'word_count': len(content.split()),
            'status': 'success'
        }
    except Exception as e:
        print(f"Error scraping {url}: {e}")


In [46]:
# Filter links to only include APEC 2025 Korea pages 
apec_links = []
for link_data in all_links:
    url = link_data['url']
    # Only include apec2025.kr links and exclude javascript links
    if 'apec2025.kr' in url and not url.startswith('javascript:') and '?menuno=' in url:
        apec_links.append(link_data)

# Remove duplicates
unique_urls = set()
unique_apec_links = []
for link in apec_links:
    if link['url'] not in unique_urls:
        unique_urls.add(link['url'])
        unique_apec_links.append(link)

print(f"Found {len(unique_apec_links)} unique APEC 2025 Korea pages to scrape:")
for i, link in enumerate(unique_apec_links[:-1], 1):
    print(f"{i}. {link['text'][:60]}-> {link['url']}")


Found 24 unique APEC 2025 Korea pages to scrape:
1. About APEC 2025 KOREA-> https://apec2025.kr?menuno=2
2. APEC-> https://apec2025.kr?menuno=89
3. APEC 2025 KOREA-> https://apec2025.kr?menuno=90
4. Meetings-> https://apec2025.kr?menuno=93
5. Side Event-> https://apec2025.kr?menuno=94
6. Media-> https://apec2025.kr?menuno=14
7. Notices-> https://apec2025.kr?menuno=15
8. Press Releases-> https://apec2025.kr?menuno=16
9. Resources-> https://apec2025.kr?menuno=17
10. Social Media-> https://apec2025.kr?menuno=98
11. Partners-> https://apec2025.kr?menuno=100
12. Visit Korea-> https://apec2025.kr?menuno=18
13. K-story-> https://apec2025.kr?menuno=19
14. Gyeongju-> https://apec2025.kr?menuno=102
15. Jeju-> https://apec2025.kr?menuno=103
16. Incheon-> https://apec2025.kr?menuno=104
17. Busan-> https://apec2025.kr?menuno=106
18. Seoul-> https://apec2025.kr?menuno=24
19. -> https://apec2025.kr/?menuno=93
20. 재생버튼-> https://apec2025.kr/?menuno=16&act=view&ztag=rO0ABXQAUTxjYWxsIHR5cGU9ImJvYXJkIiBu

In [47]:
scraped_data = []
total_pages = len(unique_apec_links)


for i, link_data in enumerate(unique_apec_links, 1):
    url = link_data['url']
    print(f"[{i}/{total_pages}] Scraping: {link_data['text'][:50]}...")
    
    page_data = scrape_page_content(url)
    page_data['link_text'] = link_data['text']
    scraped_data.append(page_data)
    
    time.sleep(1)
    
    # Show progress
    if page_data['status'] == 'success':
        print(f"Success: {page_data['word_count']} words")
    else:
        print(f"{page_data['status'].title()}")
    print()



[1/24] Scraping: About APEC 2025 KOREA...
Fetching: https://apec2025.kr?menuno=2
Success: 843 words

[2/24] Scraping: APEC...
Fetching: https://apec2025.kr?menuno=89
Success: 843 words

[3/24] Scraping: APEC 2025 KOREA...
Fetching: https://apec2025.kr?menuno=90
Success: 937 words

[4/24] Scraping: Meetings...
Fetching: https://apec2025.kr?menuno=93
Success: 725 words

[5/24] Scraping: Side Event...
Fetching: https://apec2025.kr?menuno=94
Success: 227 words

[6/24] Scraping: Media...
Fetching: https://apec2025.kr?menuno=14
Success: 98 words

[7/24] Scraping: Notices...
Fetching: https://apec2025.kr?menuno=15
Success: 98 words

[8/24] Scraping: Press Releases...
Fetching: https://apec2025.kr?menuno=16
Success: 14392 words

[9/24] Scraping: Resources...
Fetching: https://apec2025.kr?menuno=17
Success: 91 words

[10/24] Scraping: Social Media...
Fetching: https://apec2025.kr?menuno=98
Success: 51 words

[11/24] Scraping: Partners...
Fetching: https://apec2025.kr?menuno=100
Success: 12 word

In [None]:
import json

filename = f"apec2025_scraped_data.json"

with open(filename, 'w', encoding='utf-8') as f:
    json.dump(scraped_data, f, ensure_ascii=False, indent=2)

print(f"Scraped data saved to: {filename}")


Scraped data saved to: apec2025_scraped_data.json


In [None]:
os.makedirs('scraped_pages', exist_ok=True)

for i, page in enumerate(successful_scrapes, 1):
    safe_title = "".join(c for c in page['title'] if c.isalnum() or c in (' ', '-', '_')).rstrip()
    safe_title = safe_title[:50]  # Limit length
    filename = f"scraped_pages/{i:02d}_{safe_title}.txt"
    
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(f"Title: {page['title']}\n")
        f.write(f"URL: {page['url']}\n")
        f.write(f"Word Count: {page['word_count']}\n")
        f.write("=" * 80 + "\n\n")
        f.write(page['content'])

