In [None]:
import os
import pandas as pd
import json
from pathlib import Path

# Locate the directory this code lives in. `__file__` only exists when the
# code runs as a plain script; inside a notebook kernel the lookup raises
# NameError, so the current working directory is used instead.
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path.cwd()

# The prepared data sits in a sibling folder of BASE_DIR.
INPUT_FILE = BASE_DIR.parent.joinpath('data_prepared', 'articles.json')

# Report whether the input is where we expect it (warn only, do not abort).
if not INPUT_FILE.exists():
    print(f"❌ Warning: Input file NOT found at {INPUT_FILE}")
    print(f"Current BASE_DIR: {BASE_DIR}")
else:
    print(f"✅ Setup complete. Input file found: {INPUT_FILE}")

In [None]:
# Parse the prepared articles file (UTF-8 encoded JSON)
with INPUT_FILE.open(encoding='utf-8') as f:
    raw_data = json.load(f)

# Turn the parsed JSON into the working DataFrame
articles_df = pd.DataFrame(data=raw_data)

# Report how many records were loaded and preview the first three
print(f"✅ Successfully processed {len(articles_df)} records.")
display(articles_df.head(n=3))

In [None]:
# Parse the publication date strings into datetimes; values that cannot be
# parsed become NaT instead of raising (errors='coerce').
articles_df['publishedDate'] = pd.to_datetime(
    articles_df['publishedDate'],
    errors='coerce',
)

# Show which columns the frame currently has
print("--- Final Column List ---")
print(list(articles_df.columns))

# Confirm the dtype of the converted column
print("\n--- Data Types Check ---")
print(articles_df[['publishedDate']].dtypes)

display(articles_df.head())

In [None]:
def get_article_count(year_val, month_val=None):
    """
    Count the articles published in a given year, optionally narrowed to one month.

    Reads the module-level ``articles_df`` and uses the ``.dt`` accessor on its
    ``publishedDate`` column, so that column must be datetime-typed. Rows with a
    missing date (NaT) never match.

    Args:
        year_val: Calendar year to match against ``publishedDate``.
        month_val: Optional month. Either a number in 1-12, a numeric string
            such as "12", or a Hungarian month name / abbreviation from the
            lookup table below. Strings are matched case-insensitively and
            dots / surrounding whitespace are ignored, so "Dec.", "december"
            and "szept" are all accepted.

    Returns:
        str: ``"Articles in <year> <month> : <count>"`` on success
        (``<month>`` is omitted for a year-only query), or a message starting
        with ``"⚠️ Error:"`` when the month cannot be interpreted or lies
        outside 1-12.
    """
    # Hungarian month names and their common abbreviations -> month number
    month_map = {
        "január": 1, "jan": 1, "február": 2, "feb": 2, "március": 3, "már":3, "április": 4, "ápr":4,
        "május": 5, "máj":5, "június": 6, "jún":6, "július": 7, "júl":7, "augusztus": 8, "aug":8,
        "szeptember": 9, "szept":9, "október": 10, "okt":10, "november": 11, "nov":11, "december": 12, "dec": 12
    }

    dates = articles_df['publishedDate']

    # Start with a mask for the year
    mask = dates.dt.year == year_val

    # If a month is provided, narrow the mask further
    if month_val is not None:
        if isinstance(month_val, str):
            # Normalise: trim, lowercase, drop abbreviation dots
            cleaned = month_val.strip().lower().replace(".", "").strip()
            if cleaned.isdecimal():
                # Numeric strings such as "12" are treated like the integer
                month_num = int(cleaned)
            else:
                month_num = month_map.get(cleaned)
                if month_num is None:
                    return f"⚠️ Error: '{month_val}' is not a valid month name."
        else:
            month_num = month_val

        # Reject out-of-range months instead of silently reporting 0 articles
        if not 1 <= month_num <= 12:
            return f"⚠️ Error: '{month_val}' is not a valid month number (expected 1-12)."

        mask = mask & (dates.dt.month == month_num)

    # Summing the boolean mask counts matches without copying the frame
    count = int(mask.sum())

    # `is not None` (rather than truthiness) keeps the label in sync with the filter
    scope = f"{month_val} " if month_val is not None else ""
    return f"Articles in {year_val} {scope}: {count}"

# Example queries covering each supported call shape
for args in [
    (2022, 12),      # Year + Integer Month
    (2005, "Feb"),   # Year + String Month
    (2024,),         # Year only
]:
    print(get_article_count(*args))