This notebook reads the newspaper HTML files, transforms them, and prepares the simple dimensions needed for the subsequent analysis. The analysis is about the word "klima" and how its use changes over time in each newspaper. See the README for more.

In [None]:
import os
import sys
import logging
import glob
sys.path.append(os.path.abspath("pylib"))

import pandas as pd
from handle_sqlite import save_dataframe_to_db, read_table_as_dataframe
from handle_data_processing import process_newspaper_with_context


# Load a part of the data for testing newspapers
Here we load the CSV for one day.

In [None]:
# Use glob to list all CSV files in the specified directory with date format in their names.
# glob returns matches in arbitrary (file-system dependent) order; sorting makes the
# processing order - and therefore the sequential newspaper_id values assigned later -
# reproducible between runs.
csv_files = sorted(glob.glob('data_input/data-lake/small_part_test/*-*.csv'))

In [None]:
# Collect one record dict per successfully downloaded newspaper page.
newspapers = []

# Only these columns are needed by the processing step further down.
wanted_columns = ['name', 'date', 'file_name', 'encoding']

for csv_path in csv_files:
    listing = pd.read_csv(csv_path)

    # Keep rows whose 'status' column equals 200, restricted to the needed columns.
    ok_rows = listing.loc[listing['status'] == 200, wanted_columns]

    # One dict per row, appended to the overall list.
    newspapers += ok_rows.to_dict(orient='records')

In [None]:
# Bare expression: when run as a notebook cell this renders the collected list of record dicts.
newspapers

In [None]:
# Peek at the first record only (nothing is printed when the list is empty).
if newspapers:
    newspaper = newspapers[0]
    print(newspaper)

In [None]:
# Accumulators filled by the loop below:
#   metadata_collection - one metadata mapping per successfully processed newspaper
#   context_collection  - the context entries of every newspaper that mentions 'klima'
metadata_collection = []
context_collection = []

for newspaper in newspapers:
    try:
        # Project helper; returns a (metadata, context_data) pair.
        # NOTE(review): metadata is used as a dict and context_data as a list of dicts
        # below - confirm against handle_data_processing.
        metadata, context_data = process_newspaper_with_context(
            name=newspaper['name'],
            date=newspaper['date'],
            file_path='data_input/' + newspaper['file_name'],
            encoding=newspaper['encoding'],
        )

        logging.info("Processing done for %s (%s).", newspaper['name'], newspaper['date'])

        # 1-based running counter used as the key linking context rows to their newspaper.
        # Failed newspapers are never appended, so the IDs stay gap-free.
        newspaper_id = len(metadata_collection) + 1
        metadata["newspaper_id"] = newspaper_id

        # Append the metadata to its respective collection
        metadata_collection.append(metadata)

        # Context rows are only kept if 'klima' was found at least once
        if metadata['klima_mentions_count'] > 0:
            # Tag every context entry with the ID of the newspaper it came from
            for context in context_data:
                context["newspaper_id"] = newspaper_id
            # extend (not append): context_data is already a list of entries
            context_collection.extend(context_data)

    except Exception as e:
        # Best-effort batch: one broken newspaper must not abort the whole run.
        # logging.exception logs at ERROR level like before, but also records the
        # traceback so the failing step can actually be diagnosed.
        logging.exception(
            "Error processing %s for %s: %s", newspaper['name'], newspaper['date'], e
        )

In [None]:
# Build both result frames once all newspapers have been processed:
# one from the collected metadata records, one from the collected context records.
final_metadata_df, final_context_df = (
    pd.DataFrame(data=metadata_collection),
    pd.DataFrame(data=context_collection),
)

In [None]:
# Bare expression: when run as a notebook cell this renders the metadata DataFrame.
final_metadata_df

In [None]:
# Bare expression: when run as a notebook cell this renders the context DataFrame.
final_context_df

In [None]:
# Persist both result frames into the same database file, one table each.
# Each call is made with if_exists="replace", metadata table first.
for frame, table_name in (
    (final_metadata_df, "newspapers"),
    (final_context_df, "context"),
):
    save_dataframe_to_db(frame, table_name, db_path="data_output/dwh_data.db", if_exists="replace")

In [None]:
# check the saved data
# Read the "newspapers" table back from data_output/dwh_data.db and preview it;
# head() shows the first rows when run as a notebook cell.
meta_data = read_table_as_dataframe("newspapers", "data_output/dwh_data.db")
meta_data.head()

In [None]:
# Export the re-read metadata table to a CSV file (relative path, so it lands in the
# current working directory); index=False leaves the DataFrame index out of the file.
meta_data.to_csv("dwh_newspaper_meta.csv", index=False)

In [None]:
# Read the "context" table back from data_output/dwh_data.db and preview it.
# NOTE: this rebinds context_data, the name the processing loop also used for the
# per-newspaper context list.
context_data = read_table_as_dataframe("context", "data_output/dwh_data.db")
context_data.head()

In [None]:
# Export the re-read context table to a CSV file (relative path, so it lands in the
# current working directory); index=False leaves the DataFrame index out of the file.
context_data.to_csv("dwh_newspaper_context.csv", index=False)