In [5]:
from bs4 import BeautifulSoup
import urllib3
import re
import json
import pandas as pd
import time
from tqdm.notebook import tqdm

In [6]:
# One shared pool for the whole notebook so keep-alive connections are reused
# instead of opening (and never closing) a fresh pool on every request.
# maxsize is raised above the default so that many worker threads hitting the
# same host can each keep a connection.
_HTTP = urllib3.PoolManager(maxsize=32)


def get_html(url):
    """Fetch ``url`` with HTTP GET and parse the response body as HTML.

    Parameters
    ----------
    url : str
        Address to request.

    Returns
    -------
    tuple
        ``(soup, status)``: the parsed ``BeautifulSoup`` document and the
        HTTP status code of the response. The body is parsed regardless of
        the status, so callers should check ``status`` before using ``soup``.
    """
    response = _HTTP.request('GET', url, headers={'User-agent': 'your bot 0.1'})
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(response.data, 'html.parser')
    return soup, response.status

In [7]:
def get_last_page_num(link):
    """Return the highest page number advertised on the page at ``link``.

    The page is fetched with ``get_html`` and every ``<a>`` element carrying
    a ``data-page`` attribute is inspected. The largest attribute value is
    returned as an ``int``; ``None`` is returned when no such anchor exists.
    The HTTP status of the fetch is not checked.
    """
    document, _status = get_html(link)
    numbers = []
    for anchor in document.find_all('a', {'data-page': True}):
        numbers.append(int(anchor['data-page']))
    if not numbers:
        return None
    return max(numbers)

In [8]:
# Crawl configuration: the archive month/year to harvest and the sections
# to walk. link_dict maps each section to the article URLs found for it.
month = 1
topics = ['politik', 'wirtschaft', 'sport', 'wissen','kultur']
link_dict = {key: [] for key in topics}
year = 2023

for topic in tqdm(topics):
    baseUrl = f'https://www.sueddeutsche.de/archiv/{topic}/{year}/{month}'
    # get_last_page_num returns None when the archive page exposes no
    # pagination links; fall back to a single page instead of failing on
    # `None + 1` below.
    last_page = get_last_page_num(baseUrl) or 1
    print(topic,'->',last_page)
    for page in tqdm(range(1,last_page+1)):
        url = f"{baseUrl}/page/{page}"
        soup,status = get_html(url)
        if status != 200:
            # Report which page failed so it can be retried by hand.
            print('failed', status, url)
            continue
        for anchor in soup.find_all('a', class_='entrylist__link'):
            href = anchor.get('href')
            # Anchors without an href would later be passed to get_html as None.
            if href:
                link_dict[topic].append(href)

  0%|          | 0/5 [00:00<?, ?it/s]

politik -> 50


  0%|          | 0/50 [00:00<?, ?it/s]

wirtschaft -> 52


  0%|          | 0/52 [00:00<?, ?it/s]

sport -> 69


  0%|          | 0/69 [00:00<?, ?it/s]

wissen -> 11


  0%|          | 0/11 [00:00<?, ?it/s]

kultur -> 14


  0%|          | 0/14 [00:00<?, ?it/s]

In [15]:
# Sequential scrape: visit every harvested link and keep one row per article
# that exposes the free-article metadata block.
article_rows = []

for topic,all_links in tqdm(link_dict.items()):
    for link in tqdm(all_links):
        soup, status = get_html(link)
        if status != 200:
            print('Failed->',status,link)
            continue

        # Only articles carrying this metadata <div> are kept; pages without
        # it are skipped silently (the original code treated them as paid news).
        element = soup.find('div', {'id': 'taboola-feed-below-article', 'data-paycategory': 'free'})
        if not element:
            continue

        # Article body: every manual paragraph, one per line.
        paragraphs = soup.find_all('p', {'data-manual': 'paragraph'})
        news = '\n'.join(paragraph.get_text(strip=True) for paragraph in paragraphs).strip()

        article_rows.append([
            link,
            topic,
            element.get('data-publishdate'),
            element.get('data-authors'),
            element.get('data-title'),
            element.get('data-teaser'),
            news,
        ])

# Build the frame once at the end instead of enlarging it row by row.
df = pd.DataFrame(article_rows, columns=['Url','Topic','Publish Date','Author','Title','Teaser','News'])

# Parallel processing

In [11]:
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Empty result table that fixes the column layout for the scraped articles.
df = pd.DataFrame(
    data=[],
    columns=['Url', 'Topic', 'Publish Date', 'Author', 'Title', 'Teaser', 'News'],
)

def process_link(topic, link):
    """Scrape a single article page and return its row, or ``None``.

    Parameters
    ----------
    topic : str
        Section label stored alongside the article.
    link : str
        Article URL passed to ``get_html``.

    Returns
    -------
    tuple or None
        ``(link, topic, publish date, authors, title, teaser, news)`` when the
        page answers with status 200 and contains the free-article metadata
        ``<div>``. ``None`` otherwise; a non-200 status is also printed.
    """
    soup, status = get_html(link)
    if status != 200:
        print('Failed->', status, link)
        return None

    # Article body: text of every manual paragraph, joined one per line.
    texts = [node.get_text(strip=True) for node in soup.find_all('p', {'data-manual': 'paragraph'})]
    news = '\n'.join(texts).strip()

    meta = soup.find('div', {'id': 'taboola-feed-below-article', 'data-paycategory': 'free'})
    if not meta:
        return None

    return (
        link,
        topic,
        meta.get('data-publishdate'),
        meta.get('data-authors'),
        meta.get('data-title'),
        meta.get('data-teaser'),
        news,
    )

# Use ThreadPoolExecutor for parallel processing
scraped_rows = []
with ThreadPoolExecutor() as executor:
    # Remember which link each future belongs to so failures can be reported.
    future_to_link = {
        executor.submit(process_link, topic, link): link
        for topic, all_links in link_dict.items()
        for link in all_links
    }

    for future in tqdm(as_completed(future_to_link), total=len(future_to_link)):
        try:
            result = future.result()
        except Exception as exc:
            # future.result() re-raises whatever the worker raised; one bad
            # URL or network error must not discard the rows already collected.
            print('Failed->', exc, future_to_link[future])
            continue
        if result:
            scraped_rows.append(result)

# Build the frame once, reusing the column layout of the empty `df` created
# above, instead of enlarging the DataFrame one row at a time.
df = pd.DataFrame(scraped_rows, columns=df.columns)

 88%|██████████████████████████████████████████████████████████████████▎        | 8413/9517 [14:28<02:33,  7.17it/s]

Failed-> 404 https://sz-magazin.de/gletscher


100%|███████████████████████████████████████████████████████████████████████████| 9517/9517 [16:28<00:00,  9.63it/s]


In [12]:
# Show the (rows, columns) shape of the scraped frame as the cell output.
df.shape

(7425, 7)

In [14]:
# Write the scraped frame to all_news.csv (relative path), omitting the index column.
df.to_csv('all_news.csv',index=False)