here we want to
- count klima by time to see if number rises or shrinks
- dynamik je lemma/CAGR over '21 - '25
- wochentagsvergleich und monatsweise
- fuzzy match the suffixes so we can
    (- analyse the top suffix in total)
    - see progress of tops ranking over time (rolling by week/month)
    - see all of these arising top suffixes in comparison over time by relative part (ANTEIL)
- see over time the entropy/diversity of the klima suffix (staying the same, getting smaller?) / Outlet-Heterogenität: Gini/Herfindahl



In [None]:
import os
import sys
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import pandas as pd
from textblob import TextBlob

# Base directory for all relative paths: the file's own folder when run as a
# script, otherwise (no __file__ defined, e.g. in a notebook kernel) the
# current working directory.
if '__file__' in globals():
    notebook_dir = os.path.dirname(os.path.abspath(__file__))
else:
    notebook_dir = os.getcwd()

# Put the project's helper modules (../pylib) on the import path.
sys.path.append(os.path.join(notebook_dir, "..", "pylib"))

from handle_sqlite import read_table_as_dataframe

# Location of the DWH database file read by the cells below.
db_path = os.path.join(notebook_dir, "..", "data_output", "dwh_data.db")

In [None]:
# Plot defaults shared by every figure below: seaborn theme first, then the
# default figure size on top of it.
sns.set_theme(context="talk", style="whitegrid", palette="deep")
plt.rcParams.update({"figure.figsize": (12, 6)})

In [None]:
# Read both DWH tables: the per-newspaper/per-date records with their klima
# usage, and the individual klima words that were found.
metadata = read_table_as_dataframe("newspapers_processed", db_path)
context = read_table_as_dataframe("context_processed", db_path)

# Preview the first three rows of each table.
display(metadata.iloc[:3])
display(context.iloc[:3])

In [None]:
# Cast the four text columns to the pandas 'string' dtype, then print the
# resulting column/dtype summary.
context = context.astype(
    dict.fromkeys(['pre_context', 'post_context', 'prefix', 'suffix'], 'string')
)
context.info()

In [None]:
# Sanity check: show context rows whose newspaper_id does not occur in
# metadata. An empty result means every context row has a matching newspaper.
context.loc[~context['newspaper_id'].isin(metadata['newspaper_id'])]

In [None]:
# Parse 'data_published' into datetimes so the .dt accessors and time-based
# grouping used further down work on it.
metadata['data_published'] = metadata['data_published'].pipe(pd.to_datetime)

In [None]:
# Join every context row with its newspaper record; the inner join keeps only
# newspaper_ids that occur in both tables.
merged = context.merge(metadata, how="inner", on="newspaper_id")

# Report the number of distinct newspaper_ids before and after the join.
print(
    f"Unique newspaper_id in metadata: {metadata['newspaper_id'].nunique()}",
    f"Unique newspaper_id in context: {context['newspaper_id'].nunique()}",
    f"Unique newspaper_id in merged: {merged['newspaper_id'].nunique()}",
    sep="\n",
)

In [None]:
# Share of Klima compounds (klima_mentions_count from newspaper) among all headlines (30-day rolling)

In [None]:
# Render the full context table as this cell's output.
context

In [None]:
# Render the full metadata table as this cell's output.
metadata

In [None]:
# Render the joined context/metadata table as this cell's output.
merged

In [None]:
# Per publication date: total klima mentions, the number of metadata rows with
# a newspaper name, and their ratio (average mentions per newspaper).
daily_stats = (
    metadata
    .groupby('data_published')
    .agg(
        total_klima_mentions=('klima_mentions_count', 'sum'),
        num_newspapers=('newspaper_name', 'count'),
    )
    .reset_index()
    .sort_values('data_published')
    .assign(
        avg_klima_per_newspaper=lambda df: df['total_klima_mentions'] / df['num_newspapers']
    )
)

# Line chart of the daily average.
plt.figure(figsize=(12, 6))
sns.lineplot(data=daily_stats, x='data_published', y='avg_klima_per_newspaper', color='steelblue')
plt.title('Average Klima Mentions per Newspaper per Day')
plt.xlabel('Date Published')
plt.ylabel('Avg Klima Mentions per Newspaper')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:

# Total klima mentions per publication date, in chronological order.
daily_klima = (
    metadata.groupby('data_published')['klima_mentions_count']
    .sum()
    .reset_index()
    .sort_values('data_published')
)

# Rolling sum over 30 rows. The window has to be 30 to match the column name,
# axis label and title; a window of 1 would merely echo the daily totals.
# NOTE(review): an integer window counts rows (dates present in the data), not
# calendar days -- if publication dates are missing, the span covered is longer
# than 30 days. Confirm whether a time-based window is wanted instead.
daily_klima['klima_mentions_30d'] = daily_klima['klima_mentions_count'].rolling(window=30).sum()

# Plot the rolling sum as a line chart (the first 29 points are NaN and are
# therefore not drawn).
plt.figure(figsize=(12, 6))
sns.lineplot(x='data_published', y='klima_mentions_30d', data=daily_klima, color='steelblue')
plt.xlabel('Date Published')
plt.ylabel('30-Day Rolling Sum of Klima Mentions')
plt.title('Rolling 30-Day Sum of Klima Mentions per Day')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:

# Daily klima totals in chronological order.
daily_klima = (
    metadata
    .groupby('data_published')['klima_mentions_count']
    .sum()
    .reset_index()
    .sort_values('data_published')
)

# Sum over a sliding window of 30 rows; the first 29 rows have no full window
# and stay NaN.
rolling_30 = daily_klima['klima_mentions_count'].rolling(window=30)
daily_klima['klima_mentions_30d'] = rolling_30.sum()

# Line chart of the rolling sum.
plt.figure(figsize=(12, 6))
sns.lineplot(data=daily_klima, x='data_published', y='klima_mentions_30d', color='steelblue')
plt.title('Rolling 30-Day Sum of Klima Mentions per Day')
plt.xlabel('Date Published')
plt.ylabel('30-Day Rolling Sum of Klima Mentions')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

In [None]:
# Mean of the 30-day rolling sum per calendar year, drawn as dashed horizontal
# reference lines.
# The year is derived from the date column directly: daily_klima as built by
# the rolling-sum cell has no 'year' column, so grouping by that name would
# raise a KeyError.
yearly_avg = (
    daily_klima
    .groupby(daily_klima['data_published'].dt.year)['klima_mentions_30d']
    .mean()
    .rename_axis('year')
    .dropna()  # a year whose rolling values are all NaN has no average to draw
)

for i, (year, avg) in enumerate(yearly_avg.items()):
    # axhline does not advance the colour cycle, so give each year its own
    # colour to keep the legend entries distinguishable.
    plt.axhline(avg, linestyle='--', color=f'C{i}', label=f'{year} avg')

plt.legend()


In [None]:
import numpy as np
import matplotlib.dates as mdates

# Base data: total klima mentions per publication date.
# Aggregated from `metadata` rather than `merged`: `merged` holds one row per
# context record (assuming newspaper_id is unique in metadata), so a newspaper
# with several context rows would have its klima_mentions_count added several
# times and the daily totals would be inflated.
daily_klima = (
    metadata.groupby('data_published')['klima_mentions_count']
    .sum()
    .reset_index()
    .sort_values('data_published')
)
# min_periods=1 gives partial sums for the first rows instead of NaN, so the
# polyfit below never sees missing values.
daily_klima['klima_mentions_30d'] = daily_klima['klima_mentions_count'].rolling(window=30, min_periods=1).sum()
daily_klima['year'] = daily_klima['data_published'].dt.year

fig, ax = plt.subplots(figsize=(12, 6))

# 1) Rolling curve
sns.lineplot(x='data_published', y='klima_mentions_30d',
             data=daily_klima, color='steelblue', ax=ax, label='30d rolling sum')

# 2) One linear trend line per year via polyfit
for year, dfy in daily_klima.groupby('year'):
    if len(dfy) < 2:  # too few points for a line fit
        continue

    # Dates as matplotlib date numbers so they can be used as regressors.
    x_num = mdates.date2num(dfy['data_published'])
    y = dfy['klima_mentions_30d'].values

    # Linear fit (degree 1)
    coeffs = np.polyfit(x_num, y, deg=1)
    poly = np.poly1d(coeffs)

    # Evaluate the fit across this year's date range only
    x_line = np.linspace(x_num.min(), x_num.max(), 100)
    y_line = poly(x_line)

    ax.plot(mdates.num2date(x_line), y_line, lw=2, label=f'{year} trend')

# 3) Axis formatting
locator = mdates.AutoDateLocator()
formatter = mdates.ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)

ax.set_xlabel('Date Published')
ax.set_ylabel('30-Day Rolling Sum of Klima Mentions')
ax.set_title('Rolling 30-Day Sum with Yearly Linear Trendlines (polyfit)')
ax.legend(ncol=3, fontsize=9)
ax.grid(True, linewidth=0.5, alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.dates as mdates

# Base data: total klima mentions per publication date.
# Aggregated from `metadata` rather than `merged`: `merged` holds one row per
# context record (assuming newspaper_id is unique in metadata), so a newspaper
# with several context rows would have its klima_mentions_count added several
# times and the daily totals would be inflated.
daily_klima = (
    metadata.groupby('data_published')['klima_mentions_count']
    .sum()
    .reset_index()
    .sort_values('data_published')
)
# min_periods=1 gives partial sums for the first rows instead of NaN, so the
# polyfit below never sees missing values.
daily_klima['klima_mentions_30d'] = daily_klima['klima_mentions_count'].rolling(window=30, min_periods=1).sum()
daily_klima['year'] = daily_klima['data_published'].dt.year

fig, ax = plt.subplots(figsize=(12, 6))

# 1) Rolling curve
sns.lineplot(x='data_published', y='klima_mentions_30d',
             data=daily_klima, color='steelblue', ax=ax, label='30d rolling sum')

# 2) One linear trend line per year via polyfit
for year, dfy in daily_klima.groupby('year'):
    if len(dfy) < 2:  # too few points for a line fit
        continue

    # Dates as matplotlib date numbers so they can be used as regressors.
    x_num = mdates.date2num(dfy['data_published'])
    y = dfy['klima_mentions_30d'].values

    # Linear fit (degree 1)
    coeffs = np.polyfit(x_num, y, deg=1)
    poly = np.poly1d(coeffs)

    # Evaluate the fit across this year's date range only
    x_line = np.linspace(x_num.min(), x_num.max(), 100)
    y_line = poly(x_line)

    ax.plot(mdates.num2date(x_line), y_line, lw=2, label=f'{year} trend')

# 3) Axis formatting
locator = mdates.AutoDateLocator()
formatter = mdates.ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)

ax.set_xlabel('Date Published')
ax.set_ylabel('30-Day Rolling Sum of Klima Mentions')
ax.set_title('Rolling 30-Day Sum with Yearly Linear Trendlines (polyfit)')
ax.legend(ncol=3, fontsize=9)
ax.grid(True, linewidth=0.5, alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import seaborn as sns

# Base data: total klima mentions per publication date.
# Aggregated from `metadata` rather than `merged`: `merged` holds one row per
# context record (assuming newspaper_id is unique in metadata), so a newspaper
# with several context rows would have its klima_mentions_count added several
# times and the daily totals would be inflated.
daily_klima = (
    metadata.groupby('data_published')['klima_mentions_count']
    .sum()
    .reset_index()
    .sort_values('data_published')
)
# min_periods=1 gives partial sums for the first rows instead of NaN, so the
# polyfit below never sees missing values.
daily_klima['klima_mentions_30d'] = (
    daily_klima['klima_mentions_count']
    .rolling(window=30, min_periods=1).sum()
)

fig, ax = plt.subplots(figsize=(12, 6))

# 1) Rolling curve
sns.lineplot(
    x='data_published', y='klima_mentions_30d',
    data=daily_klima, color='steelblue', ax=ax, label='30d rolling sum'
)

# 2) Polynomial trend across all years
x_num = mdates.date2num(daily_klima['data_published'])
y = daily_klima['klima_mentions_30d'].values

# Centre the date numbers before fitting: raw date numbers are large compared
# with their spread, which makes the powers of x nearly collinear. Shifting x
# leaves the fitted curve mathematically unchanged but better conditioned.
x_center = x_num.mean()

# Degree-3 fit (adjust as needed: 2 = quadratic, higher = more flexible)
coeffs = np.polyfit(x_num - x_center, y, deg=3)
poly = np.poly1d(coeffs)  # expects centred x values

x_line = np.linspace(x_num.min(), x_num.max(), 300)
y_line = poly(x_line - x_center)

ax.plot(mdates.num2date(x_line), y_line,
        color='darkred', lw=2, label='Polynomial Trend (deg=3)')

# 3) Axis formatting
locator = mdates.AutoDateLocator()
formatter = mdates.ConciseDateFormatter(locator)
ax.xaxis.set_major_locator(locator)
ax.xaxis.set_major_formatter(formatter)

ax.set_xlabel('Date Published')
ax.set_ylabel('30-Day Rolling Sum of Klima Mentions')
ax.set_title('Rolling 30-Day Sum with Polynomial Trendline')
ax.legend()
ax.grid(True, linewidth=0.5, alpha=0.3)
plt.tight_layout()
plt.show()


In [None]:
# Scatter the raw daily totals (one dot per publication date) so unusually
# high or low days stand out.
fig, ax = plt.subplots(figsize=(12, 6))

ax.scatter(
    daily_klima['data_published'],
    daily_klima['klima_mentions_count'],
    color='steelblue',
    alpha=0.7,
    label='Daily Klima Mentions',
)

ax.set(
    xlabel='Date Published',
    ylabel='Klima Mentions per Day',
    title='Dot Plot of Daily Klima Mentions (Outlier Detection)',
)
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

### kpis

In [None]:
# --- KPIs: overview of klima mentions ---

# Grand total across all metadata rows.
total_klima_mentions = metadata['klima_mentions_count'].sum()
print(f"Total klima mentions: {total_klima_mentions}")

# Totals per calendar year (adds a 'year' column to metadata).
metadata['year'] = metadata['data_published'].dt.year
klima_per_year = metadata.groupby('year')['klima_mentions_count'].sum()
print("\nKlima mentions per year:", klima_per_year, sep="\n")

# Totals per outlet, largest first.
klima_per_newspaper = (
    metadata.groupby('newspaper_name')['klima_mentions_count']
    .sum()
    .sort_values(ascending=False)
)
print("\nKlima mentions per newspaper:", klima_per_newspaper, sep="\n")

# Mean mentions per metadata row for every year/outlet pair, as a
# year x newspaper table; missing combinations are shown as 0.
avg_klima_per_newspaper_year = (
    metadata.groupby(['year', 'newspaper_name'])['klima_mentions_count']
    .mean()
    .unstack()
    .fillna(0)
)
print("\nAverage klima mentions per newspaper per year:", avg_klima_per_newspaper_year, sep="\n")

# Distinct outlets seen in each year.
newspapers_per_year = metadata.groupby('year')['newspaper_name'].nunique()
print("\nNumber of newspapers per year:", newspapers_per_year, sep="\n")

# Publication dates whose summed mentions are above zero.
days_with_klima = metadata.groupby('data_published')['klima_mentions_count'].sum().gt(0).sum()
print(f"\nNumber of days with at least one klima mention: {days_with_klima}")

# Outlets whose all-time total is exactly zero.
zero_klima_newspapers = klima_per_newspaper.loc[klima_per_newspaper.eq(0)]
print("\nNewspapers with zero klima mentions:", zero_klima_newspapers, sep="\n")


In [None]:
# Render metadata again, now including the 'year' column added by the KPI cell.
metadata

In [None]:
# --- Average Klima Mentions per Weekday and Month ---

# Display order for the bar charts; the labels match the English names
# returned by .dt.day_name() / .dt.month_name() with their default locale.
weekday_order = [
    'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'
]
month_order = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December'
]

# Add weekday and month columns
metadata['weekday'] = metadata['data_published'].dt.day_name()
metadata['month'] = metadata['data_published'].dt.month_name()

# Total klima mentions per weekday
weekday_stats = metadata.groupby('weekday')['klima_mentions_count'].sum().reindex(weekday_order)

# Number of metadata rows falling on each weekday
weekday_counts = metadata['weekday'].value_counts().reindex(weekday_order)

# Average klima mentions per weekday.
# NOTE(review): the denominator counts metadata rows, not distinct dates, so
# this is an average per row although the axis label says "per Day" -- confirm
# which of the two is intended.
avg_klima_per_weekday = weekday_stats / weekday_counts

plt.figure(figsize=(10, 5))
sns.barplot(x=avg_klima_per_weekday.index, y=avg_klima_per_weekday.values)
plt.xlabel('Weekday')
plt.ylabel('Avg Klima Mentions per Day')
plt.title('Average Klima Mentions per Weekday')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Total klima mentions per month. Defined here, mirroring the weekday
# aggregation, because month_stats and month_order were otherwise undefined
# and the cell failed with a NameError.
month_stats = metadata.groupby('month')['klima_mentions_count'].sum().reindex(month_order)

# Number of metadata rows falling in each month (NaN for months without data)
month_counts = metadata['month'].value_counts().reindex(month_order)

# Average klima mentions per month (same per-row denominator as above)
avg_klima_per_month = month_stats / month_counts

plt.figure(figsize=(10, 5))
sns.barplot(x=avg_klima_per_month.index, y=avg_klima_per_month.values)
plt.xlabel('Month')
plt.ylabel('Avg Klima Mentions per Day')
plt.title('Average Klima Mentions per Month')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Render the per-month row counts as this cell's output.
month_counts

In [None]:
# --- Coverage check: months and newspapers ---
# Distinct publication dates per month name, to spot sparse or missing months.
print("Number of unique publication days per month:")
print(metadata.groupby('month')['data_published'].nunique())

# All metadata rows published in a February.
february_rows = metadata.loc[metadata['month'].eq('February')]

# Outlets that appear in February.
feb_papers = february_rows['newspaper_name'].unique()
print("Newspapers present in February:")
print(feb_papers)

# Distinct February publication dates.
feb_dates = february_rows['data_published'].unique()
print("Publication dates in February:")
print(feb_dates)

# Outlets that occur somewhere in the data but never in February.
all_papers = set(metadata['newspaper_name'].unique())
feb_papers_set = set(feb_papers)
missing_in_feb = all_papers.difference(feb_papers_set)
print("Newspapers missing in February:")
print(missing_in_feb)

In [None]:
import calendar

# Distinct publication dates per calendar month, pooled over all years;
# months without any data are kept with a count of 0.
month_numbers = metadata['data_published'].dt.month
unique_days_per_month = (
    metadata
    .groupby(month_numbers)['data_published']
    .nunique()
    .reindex(range(1, 13), fill_value=0)
)
# The index is exactly 1..12 after the reindex, so swap it for the month names
# (calendar.month_name[0] is an empty placeholder and is skipped).
unique_days_per_month.index = list(calendar.month_name)[1:]

plt.figure(figsize=(12, 5))
sns.barplot(x=unique_days_per_month.index, y=unique_days_per_month.values, color='steelblue')
plt.title('Number of Unique Publication Days per Month (all years)')
plt.xlabel('Month')
plt.ylabel('Unique publication days')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()