In [None]:
import os
import pandas as pd
import json
from pathlib import Path

# Resolve the folder this code lives in. `__file__` only exists when the code
# runs as a script (.py); in a notebook kernel the lookup raises NameError and
# the current working directory is used instead.
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path.cwd()

# The articles file is expected in a `data_prepared` folder that sits next to BASE_DIR.
INPUT_FILE = BASE_DIR.parent.joinpath('data_prepared', 'articles.json')

# Report whether the input file is where we expect it. This only prints a
# message; a missing file does not stop execution here.
if not INPUT_FILE.exists():
    print(f"❌ Warning: Input file NOT found at {INPUT_FILE}")
    print(f"Current BASE_DIR: {BASE_DIR}")
else:
    print(f"✅ Setup complete. Input file found: {INPUT_FILE}")

In [None]:
# Read the articles file and parse its contents as JSON
with INPUT_FILE.open(mode='r', encoding='utf-8') as f:
    raw_data = json.load(f)

# Build the main DataFrame straight from the parsed JSON
articles_df = pd.DataFrame(raw_data)

print(f"✅ Successfully processed {articles_df.shape[0]} records.")
display(articles_df.head(n=3))

In [None]:
# Parse the publication date strings into datetime values. With
# errors='coerce', entries that cannot be parsed become NaT instead of raising.
articles_df['published_date'] = articles_df['published_date'].pipe(pd.to_datetime, errors='coerce')

# Show which columns are present, then confirm the dtype of the converted column
print("--- Final Column List ---", articles_df.columns.to_list(), sep="\n")
print("\n--- Data Types Check ---", articles_df.dtypes.loc[['published_date']], sep="\n")

display(articles_df.head())

In [None]:
# Total number of articles = row count of the articles DataFrame

total_articles = articles_df.shape[0]
print(f"Total articles in database: {total_articles}")

In [None]:
# Which year has the most articles published?

# Drop missing dates BEFORE extracting the year: when a datetime column
# contains NaT, `.dt.year` returns floats (2025.0) instead of integers, which
# would leak into the printed year below.
publication_years = articles_df['published_date'].dropna().dt.year

# Number of articles per year, ordered from the busiest year downwards
yearly_stats = publication_years.value_counts()

if yearly_stats.empty:
    # idxmax() raises ValueError on an empty Series, so report this case instead
    print("No articles with a valid publication date were found.")
else:
    # Identify the top year and how many articles it has
    top_year = yearly_stats.idxmax()
    max_articles = yearly_stats.max()

    print(f"The most productive year was {top_year} with {max_articles} articles.")

    print("\nArticles per year:")
    print(yearly_stats)

In [None]:
# Count the articles published in one chosen year

target_year = 2025

# Keep only the rows whose publication year equals the target year
articles_in_year = articles_df.loc[articles_df['published_date'].dt.year.eq(target_year)]

print(f"Articles in {target_year}: {articles_in_year.shape[0]}")

In [None]:
# Which is the most active month in a specific year?
target_year = 2025

# Restrict to the articles published in the target year
year_filter = articles_df[articles_df['published_date'].dt.year == target_year]

if year_filter.empty:
    print(f"No articles found for the year {target_year}.")
else:
    # Tally articles per month name once and reuse the result for both the
    # winning month and its count (previously the tally was computed twice).
    month_counts = year_filter['published_date'].dt.month_name().value_counts()
    top_month = month_counts.idxmax()
    count = month_counts.max()
    print(f"In {target_year}, the most active month was {top_month} with {count} articles.")

In [None]:
# Count the articles that belong to one chosen category

target_category = "Táplálkozás"

# Element-wise equality gives a boolean Series; summing it counts the matches
category_count = articles_df['category'].eq(target_category).sum()

print(f"Articles in category '{target_category}': {category_count}")

In [None]:
# How many articles carry a given tag?
# Each row's `tags` value is tested for membership with .apply(), so the
# DataFrame does not have to be expanded to one row per tag.

target_tag = "étkezés"

# True where the row's tag collection contains the target tag. Rows whose
# `tags` value is not a list/tuple/set (for example a missing value) are
# treated as untagged instead of raising TypeError on the `in` test.
tag_mask = articles_df['tags'].apply(
    lambda tags: isinstance(tags, (list, tuple, set)) and target_tag in tags
).astype(bool)  # force a boolean mask even when the frame has no rows
tag_count = tag_mask.sum()

print(f"Articles tagged with '{target_tag}': {tag_count}")

# Select rows and columns in a single .loc call rather than chained indexing
display(articles_df.loc[tag_mask, ['published_date', 'title']])

In [None]:
# Rank the calendar months by number of published articles, across all years

monthly_counts = articles_df['published_date'].dt.month_name().value_counts()
print("Articles published by month (all time):", monthly_counts, sep="\n")

In [None]:
# What is the total list of unique tags used across the whole site?

# Collect every tag into a set (deduplicates), then sort for a stable listing.
# Rows whose `tags` value is not a list/tuple/set (for example a missing
# value) are skipped instead of raising TypeError when iterated.
unique_tags = sorted({
    tag
    for tags in articles_df['tags']
    if isinstance(tags, (list, tuple, set))
    for tag in tags
})
print(f"Total unique tags ({len(unique_tags)}):")
print(unique_tags)

In [None]:
# Which tags are used most often?

# Flatten all per-article tag collections into one Series and count each tag.
# Rows whose `tags` value is not a list/tuple/set (for example a missing
# value) are skipped instead of raising TypeError when iterated.
# dtype='object' keeps the Series well-defined even when no tags exist.
tag_counts = pd.Series(
    [
        tag
        for tags in articles_df['tags']
        if isinstance(tags, (list, tuple, set))
        for tag in tags
    ],
    dtype='object',
).value_counts()

if tag_counts.empty:
    # Without this guard, tag_counts.index[0] below would raise IndexError
    print("No tags found.")
else:
    print("Top 10 most used tags:")
    print(tag_counts.head(10))

    # value_counts() sorts by frequency, so the first entry is the most used tag.
    # The figure is the number of occurrences; it equals the number of articles
    # as long as no article lists the same tag twice.
    most_popular_tag = tag_counts.index[0]
    print(f"\nYour most frequent topic is '{most_popular_tag}', appearing in {tag_counts.iloc[0]} articles.")