This notebook will read/load the newspaper htmls, transform them and prepare the simple dimensions, which are prepared to match the following analyse. The analyse will be about the word "klima" and its changing use over time by newspaper. See Readme for more.

In [1]:
import os
import sys
import logging
import glob
sys.path.append(os.path.abspath("pylib"))

import pandas as pd
from handle_sqlite import save_dataframe_to_db, read_table_as_dataframe
from handle_data_processing import process_newspaper_with_context


# Load all the newspapers
Here we will load the csv for one day.

In [2]:
newspapers_test = [
    {"file_name": "data_input/data_lake/2021-04-02-54books.html", "encoding": "utf-8", "name": "54books", "date": "2021-04-02"},
    {"file_name": "data_input/data_lake/2021-04-02-abendblatt.html", "encoding": "utf-8", "name": "abendblatt", "date": "2021-04-02"},
    {"file_name": "data_input/data_lake/2025-01-14-vice-de.html", "encoding": "utf-8", "name": "vice", "date": "2025-01-24"},
    {"file_name": "data_input/data_lake/2021-04-24-tagesschau.html", "encoding": "utf-8", "name": "tagesschaus", "date": "2021-04-24"},
    {"file_name": "data_input/data_lake/2025-01-24-tagesschau.html", "encoding": "utf-8", "name": "tagesschaus", "date": "2025-01-24"},
]

In [3]:
# Use glob to list all CSV files in the specified directory with date format in their names
csv_files = glob.glob('data_input/data_lake/*-*.csv')

In [4]:
# Initialize an empty list to store newspaper data
newspapers = []

# Process each CSV file
for csv_file in csv_files:
    # Load CSV
    df = pd.read_csv(csv_file)

    # Filter by status 200 and select necessary columns
    valid_newspapers = df[df['status'] == 200][['name', 'date', 'file_name', 'encoding']]

    # Convert valid rows to dictionaries and add to newspapers list
    newspapers.extend(valid_newspapers.to_dict(orient='records'))

In [5]:
newspapers

[{'name': 'sz',
  'date': '2025-01-14',
  'file_name': 'data-lake/2025-01-14-sz.html',
  'encoding': 'utf-8'},
 {'name': 'zeit',
  'date': '2025-01-14',
  'file_name': 'data-lake/2025-01-14-zeit.html',
  'encoding': 'UTF-8'},
 {'name': 'faz',
  'date': '2025-01-14',
  'file_name': 'data-lake/2025-01-14-faz.html',
  'encoding': 'utf-8'},
 {'name': 'heise',
  'date': '2025-01-14',
  'file_name': 'data-lake/2025-01-14-heise.html',
  'encoding': 'utf-8'},
 {'name': 'golem',
  'date': '2025-01-14',
  'file_name': 'data-lake/2025-01-14-golem.html',
  'encoding': 'UTF-8'},
 {'name': 'tagesspiegel',
  'date': '2025-01-14',
  'file_name': 'data-lake/2025-01-14-tagesspiegel.html',
  'encoding': 'utf-8'},
 {'name': 'taz',
  'date': '2025-01-14',
  'file_name': 'data-lake/2025-01-14-taz.html',
  'encoding': 'utf-8'},
 {'name': 'abendblatt',
  'date': '2025-01-14',
  'file_name': 'data-lake/2025-01-14-abendblatt.html',
  'encoding': 'utf-8'},
 {'name': 'berliner',
  'date': '2025-01-14',
  'file_na

In [6]:
for newspaper in newspapers:
    print(newspaper)
    break

{'name': 'sz', 'date': '2025-01-14', 'file_name': 'data-lake/2025-01-14-sz.html', 'encoding': 'utf-8'}


In [22]:
metadata_collection = []
context_collection = []

for newspaper in newspapers:
    try:        
        metadata, context_data = process_newspaper_with_context(
            name = newspaper['name'],
            date = newspaper['date'],
            file_path = 'data_input/' + newspaper['file_name'],
            encoding = newspaper['encoding'])
          
        logging.info(f"Processing done for {newspaper['name']} ({newspaper['date']}).")
        
        # Add a unique ID for each newspaper in the metadata and add to context
        newspaper_id = len(metadata_collection) + 1  # This can be a simple counter for unique IDs (or use UUID)
        metadata["newspaper_id"] = newspaper_id  # Add newspaper_id to metadata
        
        # Append the metadata to its respective collection
        metadata_collection.append(metadata)
        
        # Append the context data with id to its respective collection if 'klima' was found at least once
        if metadata['klima_mentions_count'] > 0:
            # First add the same newspaper_id to each context data
            for context in context_data:
                context["newspaper_id"] = newspaper_id
            context_collection.extend(context_data) # Using extend here because context_data is already a list of dicts

    except Exception as e:
        logging.error(f"Error processing {newspaper['name']} for {newspaper['date']}: {e}")

2025-02-24 09:28:29,184 - INFO - Processing newspaper: sz (2025-01-14)
2025-02-24 09:28:29,186 - ERROR - Error processing sz: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-14-sz.html'
2025-02-24 09:28:29,187 - ERROR - Error processing sz for 2025-01-14: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-14-sz.html'
2025-02-24 09:28:29,187 - INFO - Processing newspaper: zeit (2025-01-14)
2025-02-24 09:28:29,620 - INFO - 8 'klima' mentions in zeit for 2025-01-14.
2025-02-24 09:28:29,621 - INFO - Processing done for zeit (2025-01-14).
2025-02-24 09:28:29,621 - INFO - Processing newspaper: faz (2025-01-14)
2025-02-24 09:28:29,623 - ERROR - Error processing faz: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-14-faz.html'
2025-02-24 09:28:29,623 - ERROR - Error processing faz for 2025-01-14: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-14-faz.html'
2025-02-24 09:28:29,624 - INFO - Processing newspaper: heise (2025

2025-02-24 09:28:33,132 - INFO - Processing newspaper: spiegel (2025-01-14)
2025-02-24 09:28:33,908 - INFO - 5 'klima' mentions in spiegel for 2025-01-14.
2025-02-24 09:28:33,909 - INFO - Processing done for spiegel (2025-01-14).
2025-02-24 09:28:33,910 - INFO - Processing newspaper: cnn (2025-01-14)
2025-02-24 09:28:33,911 - ERROR - Error processing cnn: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-14-cnn.html'
2025-02-24 09:28:33,912 - ERROR - Error processing cnn for 2025-01-14: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-14-cnn.html'
2025-02-24 09:28:33,912 - INFO - Processing newspaper: bbc (2025-01-14)
2025-02-24 09:28:34,003 - INFO - No 'klima' mentions found in bbc for 2025-01-14.
2025-02-24 09:28:34,004 - INFO - Processing done for bbc (2025-01-14).
2025-02-24 09:28:34,004 - INFO - Processing newspaper: dhv (2025-01-14)
2025-02-24 09:28:34,189 - INFO - No 'klima' mentions found in dhv for 2025-01-14.
2025-02-24 09:28:34,189 - INFO -

2025-02-24 09:28:38,333 - ERROR - Error processing berliner: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-02-berliner.html'
2025-02-24 09:28:38,334 - ERROR - Error processing berliner for 2025-01-02: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-02-berliner.html'
2025-02-24 09:28:38,335 - INFO - Processing newspaper: welt (2025-01-02)
2025-02-24 09:28:38,537 - INFO - 3 'klima' mentions in welt for 2025-01-02.
2025-02-24 09:28:38,538 - INFO - Processing done for welt (2025-01-02).
2025-02-24 09:28:38,538 - INFO - Processing newspaper: medium (2025-01-02)
2025-02-24 09:28:38,548 - INFO - No 'klima' mentions found in medium for 2025-01-02.
2025-02-24 09:28:38,549 - INFO - Processing done for medium (2025-01-02).
2025-02-24 09:28:38,549 - INFO - Processing newspaper: esslinger (2025-01-02)
2025-02-24 09:28:38,722 - INFO - 1 'klima' mentions in esslinger for 2025-01-02.
2025-02-24 09:28:38,723 - INFO - Processing done for esslinger (2025-01-02).
20

2025-02-24 09:28:42,898 - INFO - Processing newspaper: wiwo (2025-01-02)
2025-02-24 09:28:42,899 - ERROR - Error processing wiwo: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-02-wiwo.html'
2025-02-24 09:28:42,899 - ERROR - Error processing wiwo for 2025-01-02: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-02-wiwo.html'
2025-02-24 09:28:42,901 - INFO - Processing newspaper: dw-en (2025-01-02)
2025-02-24 09:28:42,901 - ERROR - Error processing dw-en: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-02-dw-en.html'
2025-02-24 09:28:42,902 - ERROR - Error processing dw-en for 2025-01-02: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-02-dw-en.html'
2025-02-24 09:28:42,903 - INFO - Processing newspaper: zwanzig (2025-01-02)
2025-02-24 09:28:42,996 - INFO - No 'klima' mentions found in zwanzig for 2025-01-02.
2025-02-24 09:28:42,997 - INFO - Processing done for zwanzig (2025-01-02).
2025-02-24 09:28:42,997 - INFO

2025-02-24 09:28:46,570 - INFO - Processing done for ntv (2021-04-02).
2025-02-24 09:28:46,571 - INFO - Processing newspaper: danielmiessler (2021-04-02)
2025-02-24 09:28:46,585 - INFO - No 'klima' mentions found in danielmiessler for 2021-04-02.
2025-02-24 09:28:46,586 - INFO - Processing done for danielmiessler (2021-04-02).
2025-02-24 09:28:46,586 - INFO - Processing newspaper: pioneer (2021-04-02)
2025-02-24 09:28:46,587 - ERROR - Error processing pioneer: [Errno 2] No such file or directory: 'data_input/data-lake/2021-04-02-pioneer.html'
2025-02-24 09:28:46,588 - ERROR - Error processing pioneer for 2021-04-02: [Errno 2] No such file or directory: 'data_input/data-lake/2021-04-02-pioneer.html'
2025-02-24 09:28:46,589 - INFO - Processing newspaper: suedwest (2021-04-02)
2025-02-24 09:28:46,818 - INFO - No 'klima' mentions found in suedwest for 2021-04-02.
2025-02-24 09:28:46,819 - INFO - Processing done for suedwest (2021-04-02).
2025-02-24 09:28:46,820 - INFO - Processing newspape

2025-02-24 09:28:50,262 - ERROR - Error processing sz for 2023-07-14: [Errno 2] No such file or directory: 'data_input/data-lake/2023-07-14-sz.html'
2025-02-24 09:28:50,263 - INFO - Processing newspaper: zeit (2023-07-14)
2025-02-24 09:28:50,642 - INFO - 32 'klima' mentions in zeit for 2023-07-14.
2025-02-24 09:28:50,643 - INFO - Processing done for zeit (2023-07-14).
2025-02-24 09:28:50,644 - INFO - Processing newspaper: faz (2023-07-14)
2025-02-24 09:28:50,645 - ERROR - Error processing faz: [Errno 2] No such file or directory: 'data_input/data-lake/2023-07-14-faz.html'
2025-02-24 09:28:50,646 - ERROR - Error processing faz for 2023-07-14: [Errno 2] No such file or directory: 'data_input/data-lake/2023-07-14-faz.html'
2025-02-24 09:28:50,646 - INFO - Processing newspaper: heise (2023-07-14)
2025-02-24 09:28:50,857 - INFO - 6 'klima' mentions in heise for 2023-07-14.
2025-02-24 09:28:50,858 - INFO - Processing done for heise (2023-07-14).
2025-02-24 09:28:50,859 - INFO - Processing ne

2025-02-24 09:28:54,407 - INFO - Processing done for dw-de (2023-07-14).
2025-02-24 09:28:54,408 - INFO - Processing newspaper: spiegel (2023-07-14)
2025-02-24 09:28:54,867 - INFO - 6 'klima' mentions in spiegel for 2023-07-14.
2025-02-24 09:28:54,868 - INFO - Processing done for spiegel (2023-07-14).
2025-02-24 09:28:54,869 - INFO - Processing newspaper: cnn (2023-07-14)
2025-02-24 09:28:54,870 - ERROR - Error processing cnn: [Errno 2] No such file or directory: 'data_input/data-lake/2023-07-14-cnn.html'
2025-02-24 09:28:54,870 - ERROR - Error processing cnn for 2023-07-14: [Errno 2] No such file or directory: 'data_input/data-lake/2023-07-14-cnn.html'
2025-02-24 09:28:54,871 - INFO - Processing newspaper: bbc (2023-07-14)
2025-02-24 09:28:55,071 - INFO - No 'klima' mentions found in bbc for 2023-07-14.
2025-02-24 09:28:55,072 - INFO - Processing done for bbc (2023-07-14).
2025-02-24 09:28:55,072 - INFO - Processing newspaper: dhv (2023-07-14)
2025-02-24 09:28:55,132 - INFO - No 'klim

2025-02-24 09:28:59,120 - INFO - No 'klima' mentions found in abendblatt for 2023-07-24.
2025-02-24 09:28:59,120 - INFO - Processing done for abendblatt (2023-07-24).
2025-02-24 09:28:59,121 - INFO - Processing newspaper: berliner (2023-07-24)
2025-02-24 09:28:59,123 - ERROR - Error processing berliner: [Errno 2] No such file or directory: 'data_input/data-lake/2023-07-24-berliner.html'
2025-02-24 09:28:59,123 - ERROR - Error processing berliner for 2023-07-24: [Errno 2] No such file or directory: 'data_input/data-lake/2023-07-24-berliner.html'
2025-02-24 09:28:59,124 - INFO - Processing newspaper: welt (2023-07-24)
2025-02-24 09:28:59,350 - INFO - 10 'klima' mentions in welt for 2023-07-24.
2025-02-24 09:28:59,350 - INFO - Processing done for welt (2023-07-24).
2025-02-24 09:28:59,351 - INFO - Processing newspaper: medium (2023-07-24)
2025-02-24 09:28:59,388 - INFO - No 'klima' mentions found in medium for 2023-07-24.
2025-02-24 09:28:59,389 - INFO - Processing done for medium (2023-0

2025-02-24 09:29:03,715 - INFO - Processing done for tagesschau (2023-07-24).
2025-02-24 09:29:03,716 - INFO - Processing newspaper: zvw (2023-07-24)
2025-02-24 09:29:03,786 - INFO - No 'klima' mentions found in zvw for 2023-07-24.
2025-02-24 09:29:03,788 - INFO - Processing done for zvw (2023-07-24).
2025-02-24 09:29:03,789 - INFO - Processing newspaper: ieee (2023-07-24)
2025-02-24 09:29:03,898 - INFO - No 'klima' mentions found in ieee for 2023-07-24.
2025-02-24 09:29:03,899 - INFO - Processing done for ieee (2023-07-24).
2025-02-24 09:29:03,900 - INFO - Processing newspaper: anwaltsverein (2023-07-24)
2025-02-24 09:29:03,940 - INFO - No 'klima' mentions found in anwaltsverein for 2023-07-24.
2025-02-24 09:29:03,941 - INFO - Processing done for anwaltsverein (2023-07-24).
2025-02-24 09:29:03,942 - INFO - Processing newspaper: wiwo (2023-07-24)
2025-02-24 09:29:04,177 - INFO - 4 'klima' mentions in wiwo for 2023-07-24.
2025-02-24 09:29:04,178 - INFO - Processing done for wiwo (2023-0

2025-02-24 09:29:07,706 - INFO - Processing newspaper: kdnuggets (2021-04-24)
2025-02-24 09:29:07,727 - INFO - No 'klima' mentions found in kdnuggets for 2021-04-24.
2025-02-24 09:29:07,728 - INFO - Processing done for kdnuggets (2021-04-24).
2025-02-24 09:29:07,729 - INFO - Processing newspaper: handelsblatt (2021-04-24)
2025-02-24 09:29:07,885 - INFO - 5 'klima' mentions in handelsblatt for 2021-04-24.
2025-02-24 09:29:07,886 - INFO - Processing done for handelsblatt (2021-04-24).
2025-02-24 09:29:07,887 - INFO - Processing newspaper: ntv (2021-04-24)
2025-02-24 09:29:08,343 - INFO - 5 'klima' mentions in ntv for 2021-04-24.
2025-02-24 09:29:08,344 - INFO - Processing done for ntv (2021-04-24).
2025-02-24 09:29:08,344 - INFO - Processing newspaper: danielmiessler (2021-04-24)
2025-02-24 09:29:08,359 - INFO - No 'klima' mentions found in danielmiessler for 2021-04-24.
2025-02-24 09:29:08,360 - INFO - Processing done for danielmiessler (2021-04-24).
2025-02-24 09:29:08,360 - INFO - Pro

2025-02-24 09:29:11,895 - INFO - Processing newspaper: dw (2021-04-24)
2025-02-24 09:29:11,973 - INFO - No 'klima' mentions found in dw for 2021-04-24.
2025-02-24 09:29:11,974 - INFO - Processing done for dw (2021-04-24).
2025-02-24 09:29:11,975 - INFO - Processing newspaper: dav (2021-04-24)
2025-02-24 09:29:12,087 - INFO - 10 'klima' mentions in dav for 2021-04-24.
2025-02-24 09:29:12,087 - INFO - Processing done for dav (2021-04-24).
2025-02-24 09:29:12,088 - INFO - Processing newspaper: sz (2025-01-24)
2025-02-24 09:29:12,089 - ERROR - Error processing sz: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-24-sz.html'
2025-02-24 09:29:12,090 - ERROR - Error processing sz for 2025-01-24: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-24-sz.html'
2025-02-24 09:29:12,091 - INFO - Processing newspaper: zeit (2025-01-24)
2025-02-24 09:29:12,459 - INFO - 21 'klima' mentions in zeit for 2025-01-24.
2025-02-24 09:29:12,459 - INFO - Processing done for ze

2025-02-24 09:29:16,137 - INFO - Processing done for dw-de (2025-01-24).
2025-02-24 09:29:16,139 - INFO - Processing newspaper: spiegel (2025-01-24)
2025-02-24 09:29:17,368 - INFO - 5 'klima' mentions in spiegel for 2025-01-24.
2025-02-24 09:29:17,372 - INFO - Processing done for spiegel (2025-01-24).
2025-02-24 09:29:17,374 - INFO - Processing newspaper: cnn (2025-01-24)
2025-02-24 09:29:17,375 - ERROR - Error processing cnn: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-24-cnn.html'
2025-02-24 09:29:17,376 - ERROR - Error processing cnn for 2025-01-24: [Errno 2] No such file or directory: 'data_input/data-lake/2025-01-24-cnn.html'
2025-02-24 09:29:17,377 - INFO - Processing newspaper: bbc (2025-01-24)
2025-02-24 09:29:17,536 - INFO - No 'klima' mentions found in bbc for 2025-01-24.
2025-02-24 09:29:17,538 - INFO - Processing done for bbc (2025-01-24).
2025-02-24 09:29:17,539 - INFO - Processing newspaper: dhv (2025-01-24)
2025-02-24 09:29:17,668 - INFO - No 'klim

2025-02-24 09:29:21,808 - INFO - Processing newspaper: berliner (2023-07-02)
2025-02-24 09:29:21,809 - ERROR - Error processing berliner: [Errno 2] No such file or directory: 'data_input/data-lake/2023-07-02-berliner.html'
2025-02-24 09:29:21,810 - ERROR - Error processing berliner for 2023-07-02: [Errno 2] No such file or directory: 'data_input/data-lake/2023-07-02-berliner.html'
2025-02-24 09:29:21,811 - INFO - Processing newspaper: welt (2023-07-02)
2025-02-24 09:29:22,157 - INFO - 7 'klima' mentions in welt for 2023-07-02.
2025-02-24 09:29:22,158 - INFO - Processing done for welt (2023-07-02).
2025-02-24 09:29:22,159 - INFO - Processing newspaper: medium (2023-07-02)
2025-02-24 09:29:22,195 - INFO - No 'klima' mentions found in medium for 2023-07-02.
2025-02-24 09:29:22,195 - INFO - Processing done for medium (2023-07-02).
2025-02-24 09:29:22,196 - INFO - Processing newspaper: esslinger (2023-07-02)
2025-02-24 09:29:22,346 - INFO - 5 'klima' mentions in esslinger for 2023-07-02.
20

2025-02-24 09:29:26,379 - INFO - 3 'klima' mentions in zvw for 2023-07-02.
2025-02-24 09:29:26,380 - INFO - Processing done for zvw (2023-07-02).
2025-02-24 09:29:26,381 - INFO - Processing newspaper: ieee (2023-07-02)
2025-02-24 09:29:26,485 - INFO - No 'klima' mentions found in ieee for 2023-07-02.
2025-02-24 09:29:26,486 - INFO - Processing done for ieee (2023-07-02).
2025-02-24 09:29:26,486 - INFO - Processing newspaper: anwaltsverein (2023-07-02)
2025-02-24 09:29:26,525 - INFO - No 'klima' mentions found in anwaltsverein for 2023-07-02.
2025-02-24 09:29:26,526 - INFO - Processing done for anwaltsverein (2023-07-02).
2025-02-24 09:29:26,527 - INFO - Processing newspaper: wiwo (2023-07-02)
2025-02-24 09:29:26,848 - INFO - 7 'klima' mentions in wiwo for 2023-07-02.
2025-02-24 09:29:26,849 - INFO - Processing done for wiwo (2023-07-02).
2025-02-24 09:29:26,850 - INFO - Processing newspaper: dw-en (2023-07-02)
2025-02-24 09:29:26,854 - INFO - No 'klima' mentions found in dw-en for 2023

2025-02-24 09:29:30,201 - INFO - Processing done for kdnuggets (2021-04-14).
2025-02-24 09:29:30,201 - INFO - Processing newspaper: handelsblatt (2021-04-14)
2025-02-24 09:29:30,358 - INFO - 4 'klima' mentions in handelsblatt for 2021-04-14.
2025-02-24 09:29:30,359 - INFO - Processing done for handelsblatt (2021-04-14).
2025-02-24 09:29:30,359 - INFO - Processing newspaper: ntv (2021-04-14)
2025-02-24 09:29:30,835 - INFO - No 'klima' mentions found in ntv for 2021-04-14.
2025-02-24 09:29:30,836 - INFO - Processing done for ntv (2021-04-14).
2025-02-24 09:29:30,837 - INFO - Processing newspaper: pioneer (2021-04-14)
2025-02-24 09:29:30,838 - ERROR - Error processing pioneer: [Errno 2] No such file or directory: 'data_input/data-lake/2021-04-14-pioneer.html'
2025-02-24 09:29:30,839 - ERROR - Error processing pioneer for 2021-04-14: [Errno 2] No such file or directory: 'data_input/data-lake/2021-04-14-pioneer.html'
2025-02-24 09:29:30,839 - INFO - Processing newspaper: suedwest (2021-04-1

2025-02-24 09:29:34,509 - INFO - Processing done for dav (2021-04-14).


In [23]:
# Convert to DataFrame after processing all newspapers
final_metadata_df = pd.DataFrame(metadata_collection)
final_context_df = pd.DataFrame(context_collection)

In [25]:
final_metadata_df

Unnamed: 0,newspaper_name,data_published,klima_mentions_count,newspaper_id
0,zeit,2025-01-14,8,1
1,heise,2025-01-14,0,2
2,golem,2025-01-14,0,3
3,tagesspiegel,2025-01-14,3,4
4,taz,2025-01-14,10,5
...,...,...,...,...
404,ieee,2021-04-14,0,405
405,anwaltsverein,2021-04-14,0,406
406,wiwo,2021-04-14,2,407
407,dw,2021-04-14,0,408


In [24]:
final_context_df

Unnamed: 0,pre_context,post_context,prefix,suffix,newspaper_id
0,Wünsche für mehr,Folge 18 :,,schutz,1
1,Wünsche für mehr,Endlich wirksame Maßnahmen,,schutz,1
2,die Wirtschaft zum,zwingen: So stellen,,schutz,1
3,sich eine gute,vor. Von Mona,,politik,1
4,"2025 Demokratie schützen,",bekämpfen und mehr,,wandel,1
...,...,...,...,...,...
885,Arten an neue,gewöhnen und nicht,,bedingungen,409
886,"Gletscher, Naturgefahren und",Interview mit dem,,wandel,409
887,Interview mit dem,Tobias Hipp Mehr,DAV,experten,409
888,Alpenverein gesprochen. Der,ist studierter Physischer,,experte,409


In [26]:
# Save the results to the database
save_dataframe_to_db(final_metadata_df, "newspapers", db_path="data_output/dwh_data.db", if_exists="replace")
save_dataframe_to_db(final_context_df, "context", db_path="data_output/dwh_data.db", if_exists="replace")

2025-02-24 09:30:36,180 - INFO - Data saved to table 'newspapers' in 'data_output/dwh_data.db' successfully.
2025-02-24 09:30:36,235 - INFO - Data saved to table 'context' in 'data_output/dwh_data.db' successfully.


In [27]:
# check the saved data
temp_data_head = read_table_as_dataframe("newspapers", "data_output/dwh_data.db").head()
temp_data_head

2025-02-24 09:30:45,165 - INFO - Data read from table 'newspapers' in 'data_output/dwh_data.db' successfully.


Unnamed: 0,newspaper_name,data_published,klima_mentions_count,newspaper_id
0,zeit,2025-01-14,8,1
1,heise,2025-01-14,0,2
2,golem,2025-01-14,0,3
3,tagesspiegel,2025-01-14,3,4
4,taz,2025-01-14,10,5


In [28]:
temp_data_head = read_table_as_dataframe("context", "data_output/dwh_data.db").head()
temp_data_head

2025-02-24 09:30:50,529 - INFO - Data read from table 'context' in 'data_output/dwh_data.db' successfully.


Unnamed: 0,pre_context,post_context,prefix,suffix,newspaper_id
0,Wünsche für mehr,Folge 18 :,,schutz,1
1,Wünsche für mehr,Endlich wirksame Maßnahmen,,schutz,1
2,die Wirtschaft zum,zwingen: So stellen,,schutz,1
3,sich eine gute,vor. Von Mona,,politik,1
4,"2025 Demokratie schützen,",bekämpfen und mehr,,wandel,1
