In [None]:
# Imports and Configuration
import pandas as pd
import os
import json

# Relative paths to the raw exports and to the prepared-data output
INPUT_FILE_POSTS = "../data/wix_blog_posts_data.json"
INPUT_FILE_TAGS = "../data/wix_blog_tags_data.json"
INPUT_FILE_CATEGORIES = "../data/wix_blog_categories_data.json"
OUTPUT_FOLDER = "../data_prepared/"
OUTPUT_FILE = os.path.join(OUTPUT_FOLDER, "blog_posts_intermediate.json")


def check_input_file(path):
    """Report whether an input path exists.

    Prints a success line when `path` exists and a warning line otherwise.
    Nothing is raised: this is only a quick sanity check.

    Args:
        path: Path to test with `os.path.exists`.

    Returns:
        True if the path exists, False otherwise.
    """
    found = os.path.exists(path)
    if found:
        print(f"✅ Setup complete. Input files found: {path}")
    else:
        print(f"❌ Warning: Input file NOT found at {path}")
    return found


# Simple check to verify that the input files are where we think they are
for input_path in (INPUT_FILE_POSTS, INPUT_FILE_TAGS, INPUT_FILE_CATEGORIES):
    check_input_file(input_path)


In [None]:
# Data Loading and Directory Check

# Make sure the output folder exists before any later cell writes to it.
# exist_ok=True keeps this safe if the folder appears between check and creation.
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    print(f"Created folder: {OUTPUT_FOLDER}")

# Load the raw data.
# Failures are reported and then re-raised: if they were swallowed, `posts_df`
# would be left undefined (or stale from an earlier run of this kernel) and the
# following cells would fail later or silently work on old data.
try:
    posts_df = pd.read_json(INPUT_FILE_POSTS)
except FileNotFoundError:
    print(f"Error: The file {INPUT_FILE_POSTS} was not found.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

print(f"Successfully loaded {len(posts_df)} records.")

# Preview the first rows and list every column, one per line
display(posts_df.head(3))

print("\nAvailable columns:", *posts_df.columns, sep="\n")

In [None]:
# Transformation Logic

def _extract_metrics(raw_metrics):
    """Reduce a post's raw metrics to its like and view counts.

    Args:
        raw_metrics: The value of the post's 'metrics' field.

    Returns:
        A dict with keys 'likes' and 'views'. Each defaults to 0 when the key
        is absent, or when `raw_metrics` is not a dict (e.g. a missing value).
    """
    if not isinstance(raw_metrics, dict):
        return {'likes': 0, 'views': 0}
    return {
        'likes': raw_metrics.get('likes', 0),
        'views': raw_metrics.get('views', 0),
    }


# Create a new DataFrame with basic mappings.
# Every derived column reads from `posts_df`, the frame loaded earlier in this notebook.
final_posts_df = (
    posts_df[['title', 'excerpt', 'contentText', 'url', 'tagIds', 'memberId']]
    .assign(
        # Map renamed fields & constants
        id=posts_df['_id'],
        author="",

        # Anything that is not a list becomes an empty list
        categoryIds=posts_df['categoryIds'].apply(
            lambda x: x if isinstance(x, list) else []
        ),

        # Keep only the date part of the first-published timestamp, as a string
        publishedDate=pd.to_datetime(posts_df['firstPublishedDate']).dt.date.astype(str),

        # Group metrics into a small {'likes', 'views'} dict
        metrics=posts_df['metrics'].apply(_extract_metrics),
    )
    # Reorder columns immediately at the end of the chain
    [['id', 'title', 'excerpt', 'contentText', 'categoryIds',
      'tagIds', 'publishedDate', 'metrics', 'url', 'author', 'memberId']]
)

display(final_posts_df.head(3))

In [None]:
# Map Tag IDs to Readable Labels
# Read the raw tag export, then index each tag's label by its '_id'
with open(INPUT_FILE_TAGS, 'r', encoding='utf-8') as tags_file:
    tags_raw = json.load(tags_file)

tags_lookup = dict((tag['_id'], tag['label']) for tag in tags_raw)

def translate_tags(id_list):
    """Translate a list of tag IDs into their labels.

    IDs that are not present in `tags_lookup` are passed through unchanged.
    Any input that is not a list yields an empty list.
    """
    if not isinstance(id_list, list):
        return []

    labels = []
    for tag_id in id_list:
        labels.append(tags_lookup.get(tag_id, tag_id))
    return labels

# Create 'tags' from 'tagIds' and drop the ID column in the same step.
# If 'tagIds' is already gone (this cell was run before), the existing 'tags'
# column is kept as it is, so re-running the cell is safe.
if 'tagIds' in final_posts_df.columns:
    final_posts_df = (
        final_posts_df
        .assign(tags=lambda frame: frame['tagIds'].apply(translate_tags))
        .drop(columns=['tagIds'])
    )
elif 'tags' not in final_posts_df.columns:
    # Neither the source nor the result column exists: nothing to work with.
    raise KeyError("final_posts_df has neither a 'tagIds' nor a 'tags' column")

print("✅ Tags processed.")
display(final_posts_df[['title', 'tags']].head())

In [None]:
# Map Category IDs to Labels
# Load the raw category export. Without this step `cats_raw` is never defined
# in the notebook and the lookup below fails with a NameError on a fresh kernel.
with open(INPUT_FILE_CATEGORIES, 'r', encoding='utf-8') as f:
    cats_raw = json.load(f)

# Create lookup dictionary: category '_id' -> category 'label'
cats_lookup = {cat['_id']: cat['label'] for cat in cats_raw}

# Translation function
def translate_categories(id_list):
    """Translate a list of category IDs into their labels.

    IDs missing from `cats_lookup` are returned as-is; a non-list input
    produces an empty list.
    """
    if not isinstance(id_list, list):
        return []
    return list(map(lambda cat_id: cats_lookup.get(cat_id, cat_id), id_list))

# Create 'categories' from 'categoryIds' and drop the ID column in the same step.
# If 'categoryIds' is already gone (this cell was run before), an existing
# 'categories' column is kept untouched.
if 'categoryIds' in final_posts_df.columns:
    final_posts_df = (
        final_posts_df
        .assign(
            categories=lambda frame: frame['categoryIds'].apply(translate_categories)
        )
        .drop(columns=['categoryIds'])
    )
elif 'categories' not in final_posts_df.columns:
    # Neither column exists: give every row its own empty list. A single bare
    # [] would not match the frame's length and `assign` would reject it.
    final_posts_df = final_posts_df.assign(
        categories=pd.Series(
            [[] for _ in range(len(final_posts_df))],
            index=final_posts_df.index,
            dtype=object,
        )
    )

print("✅ Categories processed.")
display(final_posts_df[['title', 'categories']].head())

In [None]:
# Export to JSON as a list of records.
# force_ascii=False writes non-ASCII characters (e.g. Hungarian letters) as-is
# instead of escaping them; indent=4 pretty-prints the output.
final_posts_df.to_json(
    OUTPUT_FILE,
    orient='records',
    force_ascii=False,
    indent=4,
)
print(f"🚀 Data successfully exported to {OUTPUT_FILE}")