In [None]:
import os
import pandas as pd
import json
from pathlib import Path

# 1. Locate the directory this code runs from.
#    When executed as a .py script, __file__ is defined and we use its folder.
#    In a notebook kernel __file__ does not exist, so the NameError branch
#    falls back to the current working directory.
try:
    BASE_DIR = Path(__file__).resolve().parent
except NameError:
    BASE_DIR = Path.cwd()

# 2. Build the input path: go one level above BASE_DIR, then into
#    'data_prepared' and pick 'blog_posts.json'.
INPUT_FILE = BASE_DIR.parent.joinpath('data_prepared', 'blog_posts.json')

# 3. Sanity check: report whether the input file is where we expect it.
if not INPUT_FILE.exists():
    print(f"❌ Warning: Input file NOT found at {INPUT_FILE}")
    # Show BASE_DIR to help diagnose a wrong working directory
    print(f"Current BASE_DIR: {BASE_DIR}")
else:
    print(f"✅ Setup complete. Input file found: {INPUT_FILE}")

In [None]:
# Read the raw JSON export into memory
with open(INPUT_FILE, 'r', encoding='utf-8') as f:
    raw_data = json.load(f)

# 1. Build the base DataFrame from the loaded records
df = pd.DataFrame(raw_data)

# 2. Expand ONLY the nested 'metrics' column into its own columns, and
# 3. give every resulting column a 'metrics_' prefix so its origin stays obvious
metrics_df = pd.json_normalize(df['metrics']).add_prefix("metrics_")

# 4. Replace the original dict column with the flattened columns (side-by-side concat)
df_without_metrics = df.drop(columns=['metrics'])
blog_posts_df = pd.concat([df_without_metrics, metrics_df], axis="columns")

print(f"✅ Successfully processed {len(blog_posts_df)} records.")
print(f"Flattened columns: {list(metrics_df.columns)}")

# Show the first few rows of the result
display(blog_posts_df.head(3))

In [None]:
# Parse the date strings into datetime values; anything unparseable becomes NaT
parsed_dates = pd.to_datetime(blog_posts_df['published_date'], errors='coerce')
blog_posts_df['published_date'] = parsed_dates

# Show which columns the frame has at this point
print("--- Final Column List ---")
print(list(blog_posts_df.columns))

# Confirm the date column now carries a datetime dtype
print("\n--- Data Types Check ---")
print(blog_posts_df.dtypes.loc[['published_date']])

display(blog_posts_df.head())

In [None]:
# Total number of blog posts across all time (one DataFrame row per record)

total_blog_posts = blog_posts_df.shape[0]
print(f"Total blog posts in database: {total_blog_posts}")

In [None]:
# Which year has the most blog posts published?

# Count publications per year.
# Dates that could not be parsed are NaT; drop them before extracting the year,
# otherwise the year column is forced to float and reports values like 2025.0.
publication_years = blog_posts_df['published_date'].dropna().dt.year.astype(int)
yearly_stats = publication_years.value_counts()

if yearly_stats.empty:
    # idxmax() raises on an empty Series, so report the situation instead
    print("No blog posts with a valid publication date were found.")
else:
    # Identify the top year and how many posts it had
    top_year = yearly_stats.idxmax()
    max_articles = yearly_stats.max()

    print(f"The most productive year was {top_year} with {max_articles} articles.")

    # Optional: show the full breakdown ("\n" gives the intended blank line;
    # the previous "\B" was an invalid escape that printed a literal backslash)
    print("\nBlog posts per year:")
    print(yearly_stats)

In [None]:
# Count the blog posts published in one chosen year

target_year = 2025
# Boolean mask: True for rows whose publication year equals target_year
published_in_target_year = blog_posts_df['published_date'].dt.year.eq(target_year)
blog_posts_in_year = blog_posts_df[published_in_target_year]

print(f"Blog posts in {target_year}: {len(blog_posts_in_year)}")

In [None]:
# Most active month within a chosen year
target_year = 2025

# Keep only the posts published in target_year
year_filter = blog_posts_df[blog_posts_df['published_date'].dt.year == target_year]

if year_filter.empty:
    print(f"No articles found for the year {target_year}.")
else:
    # Tally posts per month name once, then read both the winner and its count
    posts_per_month = year_filter['published_date'].dt.month_name().value_counts()
    top_month = posts_per_month.idxmax()
    count = posts_per_month.max()
    print(f"In {target_year}, the most active month was {top_month} with {count} blog posts.")

In [None]:
# Count the posts whose 'categories' entry contains a given category
target_cat = "#ManóDuma"
# Membership test per row; summing the booleans yields the number of matches
has_target_cat = blog_posts_df['categories'].map(lambda post_cats: target_cat in post_cats)
cat_count = has_target_cat.sum()

print(f"Category '{target_cat}': {cat_count} posts")

In [None]:
# Count (and list) the posts whose 'tags' entry contains a given tag
target_tag = "Erőt adó"
# Row-wise membership test; True where the post carries target_tag
tag_mask = blog_posts_df['tags'].map(lambda post_tags: target_tag in post_tags)
tag_count = tag_mask.sum()

print(f"Tag '{target_tag}': {tag_count} posts")

# Show date and title of every matching post
display(blog_posts_df.loc[tag_mask, ['published_date', 'title']])

In [None]:
# Posts per calendar month across all years (useful for spotting gaps in the content calendar)
publication_month_names = blog_posts_df['published_date'].dt.month_name()
monthly_counts = publication_month_names.value_counts()
print("Blog posts published by month (all time):")
print(monthly_counts)

In [None]:
# Every distinct tag used anywhere on the site, in sorted order
# (a set comprehension de-duplicates while flattening the per-post tag collections)
distinct_tags = {tag for post_tags in blog_posts_df['tags'] for tag in post_tags}
unique_tags = sorted(distinct_tags)
print(f"Total unique tags ({len(unique_tags)}):")
print(unique_tags)

In [None]:
# Tag usage ranking

# Flatten every post's tags into one list, then count occurrences per tag
all_tags = [tag for post_tags in blog_posts_df['tags'] for tag in post_tags]
tag_counts = pd.Series(all_tags).value_counts()

print("Top 10 most used tags:")
print(tag_counts.head(10))

# value_counts() sorts descending, so the first entry is the most used tag
most_popular_tag = tag_counts.index[0]
most_popular_tag_uses = tag_counts.iloc[0]
print(f"\nYour most frequent topic is '{most_popular_tag}', appearing in {most_popular_tag_uses} articles.")

In [None]:
# All-time leaders by views and by likes

# Row with the largest 'metrics_views' value
most_views_label = blog_posts_df['metrics_views'].idxmax()
top_viewed_post = blog_posts_df.loc[most_views_label]
print(f"Post with highest views: '{top_viewed_post['title']}' ({top_viewed_post['metrics_views']} views)")

# Row with the largest 'metrics_likes' value
most_likes_label = blog_posts_df['metrics_likes'].idxmax()
top_liked_post = blog_posts_df.loc[most_likes_label]
print(f"Post with highest likes: '{top_liked_post['title']}' ({top_liked_post['metrics_likes']} likes)")

In [None]:

# Highest views & likes within a specific year
target_year = 2025
# Restrict to the posts published in target_year
year_df = blog_posts_df[blog_posts_df['published_date'].dt.year == target_year]

if not year_df.empty:
    # Rows holding the maximum view / like counts inside the filtered year
    top_year_views = year_df.loc[year_df['metrics_views'].idxmax()]
    top_year_likes = year_df.loc[year_df['metrics_likes'].idxmax()]

    print(f"In {target_year}:")
    print(f"- Most Viewed: '{top_year_views['title']}' ({top_year_views['metrics_views']} views)")
    print(f"- Most Liked: '{top_year_likes['title']}' ({top_year_likes['metrics_likes']} likes)")
else:
    # Say so explicitly instead of printing nothing, matching the
    # "most active month" cell's message for an empty year
    print(f"No articles found for the year {target_year}.")

In [None]:
# Category with the highest average views
# explode() gives each category of a post its own row, so a post counts
# toward the mean of every category it belongs to
views_by_category = blog_posts_df.explode('categories').groupby('categories')['metrics_views']
avg_views_per_cat = views_by_category.mean()

top_cat = avg_views_per_cat.idxmax()
top_cat_avg_views = avg_views_per_cat.max()
print(f"Category with highest average views: {top_cat} ({top_cat_avg_views:.2f} avg views)")

In [None]:
# Distinct authors present in the data
author_column = blog_posts_df['author']
unique_authors = author_column.unique()
print(f"Authors on the site: {unique_authors}")

# Number of posts per author
author_counts = author_column.value_counts()
print("\nPost counts by author:")
print(author_counts)

# Number of posts written by one specific author (exact name match)
target_author = "Dr. Prezenszki Zsuzsanna"
author_posts_count = author_column.eq(target_author).sum()
print(f"\n{target_author} has written {author_posts_count} posts.")