In [1]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
from tqdm.asyncio import tqdm
import json

In [2]:
async def fetch(session, url):
    try:
        async with session.get(url) as response:
            response.raise_for_status()
            return await response.text()
    except aiohttp.ClientError as e:
        print(f"Error fetching {url}: {e}")
        return None

In [3]:
async def scrape_content(session, url):
    html_content = await fetch(session, url)
    if html_content:
        soup = BeautifulSoup(html_content, 'html.parser')

        title_tag = soup.find('h1', class_='page-title')
        title = title_tag.get_text(strip=True) if title_tag else "No title found"

        about_me_div = soup.find('div', id='rs-about-me')
        body = about_me_div.get_text(strip=True) if about_me_div else "No content found in 'rs-about-me' div."

        return {'url': url, 'title': title, 'body': body}
    return {'url': url, 'title': 'Failed to fetch', 'body': 'Failed to fetch'}

In [4]:
async def main():
    results = []

    with open('links-umexpert.txt', 'r') as file:
        urls = [url.strip() for url in file.readlines()]

    async with aiohttp.ClientSession() as session:
        tasks = [scrape_content(session, url) for url in urls]
        for content in tqdm(asyncio.as_completed(tasks), desc='Scraping URLs', total=len(urls)):
            result = await content
            results.append(result)

    with open('fsktm-scraped2.json', 'w') as outfile:
        json.dump(results, outfile, indent=2)

In [7]:
if __name__ == '__main__':
    asyncio.run(main())

RuntimeError: asyncio.run() cannot be called from a running event loop