In [None]:
# Imports and Configuration
import pandas as pd
import os
import json
from bs4 import BeautifulSoup
import re

# Relative locations of the raw Wix exports and of the prepared output
INPUT_FILE_ARTICLES = "../data/wix_articles_data.json"
INPUT_FILE_CATEGORIES = "../data/wix_articles-category_data.json"
OUTPUT_FOLDER = "../data_prepared/"
OUTPUT_FILE = os.path.join(OUTPUT_FOLDER, "articles.json")

# Sanity check: report, for each input file, whether it exists at the
# configured location (this only prints; nothing is raised when one is missing)
for _input_path in (INPUT_FILE_ARTICLES, INPUT_FILE_CATEGORIES):
    if not os.path.exists(_input_path):
        print(f"‚ùå Warning: Input file NOT found at {_input_path}")
    else:
        print(f"‚úÖ Setup complete. Input files found: {_input_path}")


In [None]:
# Data Loading for Article

# Make sure the output folder exists. exist_ok=True keeps this from failing
# if the folder appears between the existence check and the creation.
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    print(f"Created folder: {OUTPUT_FOLDER}")

# Only the file read is guarded. Each handler prints a readable message and
# then re-raises: swallowing the error would leave `articles_df` undefined
# (or stale from an earlier run of this cell), and the following cells would
# then fail later with a far less helpful error.
try:
    with open(INPUT_FILE_ARTICLES, 'r', encoding='utf-8') as f:
        raw_json_ptr = json.load(f)
except FileNotFoundError:
    print(f"‚ùå Error: The file {INPUT_FILE_ARTICLES} was not found.")
    raise
except json.JSONDecodeError:
    print(f"‚ùå Error: {INPUT_FILE_ARTICLES} contains invalid JSON formatting.")
    raise
except Exception as e:
    print(f"‚ùå An unexpected error occurred: {e}")
    raise

# Extract the 'items' list and convert to DataFrame.
# Using .get() yields an empty frame instead of a KeyError if 'items' is missing.
articles_df = pd.DataFrame(raw_json_ptr.get('items', []))

if not articles_df.empty:
    print(f"‚úÖ Successfully loaded {len(articles_df)} records from 'items'.")
    print("\n--- First 3 rows of raw data ---")
    display(articles_df.head(3))

    # One column name per line
    print("\nAvailable columns:", *articles_df.columns, sep="\n")
else:
    print("‚ö†Ô∏è The 'items' list was empty or missing.")

In [None]:
# Transformation Logic

# Columns of the prepared dataset, in output order.
final_columns = [
    'id', 'title', 'excerpt', 'contentText', 'categoryId',
    'tags', 'publishedDate', 'url', 'author'
]

# New/renamed columns, keyed by their output name. They are attached to the
# raw frame in one .assign() call below.
_derived_columns = {
    # Straight copies of raw fields under their output names
    'id': articles_df['_id'],
    'title': articles_df['title'],
    'excerpt': articles_df['lead'],
    'contentText': articles_df['articletext'],
    'categoryId': articles_df['category'],

    # Constant value, broadcast to every row
    'author': "Dr. Prezenszki Zsuzsanna",

    # Tags: keep list values as they are, substitute [] for anything else
    'tags': [
        raw_tags if isinstance(raw_tags, list) else []
        for raw_tags in articles_df['arraystring']
    ],

    # Date: parse, keep only the calendar date, render it as a string
    # (str() of a date is its ISO form, YYYY-MM-DD)
    'publishedDate': pd.to_datetime(articles_df['_createdDate']).dt.date.astype(str),

    # URL: fixed site prefix + first path segment + "/" + second path segment
    'url': (
        "https://www.kiutarakbol.hu/tudastar/"
        + articles_df['urlvege1'].astype(str)
        + "/"
        + articles_df['urlvege'].astype(str)
    ),
}

# Attach the derived columns, then keep only the output columns in order.
final_articles_df = articles_df.assign(**_derived_columns)[final_columns]

print(f"Transformation complete. Processed {len(final_articles_df)} articles.")
display(final_articles_df.head(3))

In [None]:
# Clean HTML tags from the blog body text

def clean_wix_html(raw_html):
    """
    Convert a raw HTML string into plain, human-readable text.

    Steps: parse with BeautifulSoup's "html.parser", drop <script> and
    <style> elements, extract the text with a newline between text nodes,
    replace non-breaking spaces with regular spaces, collapse runs of blank
    lines into a single blank line, and strip the result.

    Parameters
    ----------
    raw_html : any
        Expected to be an HTML string. Any non-string value (None, NaN,
        numbers, lists, pandas.NA, ...) and the empty string are accepted.

    Returns
    -------
    str
        The cleaned text, or "" when `raw_html` is empty or not a string.
    """
    # The type check comes first on purpose: evaluating `not raw_html` on
    # objects with ambiguous truthiness (e.g. pandas.NA or a numpy array)
    # raises, so non-strings must be rejected before any truthiness test.
    if not isinstance(raw_html, str) or not raw_html:
        return ""

    # Initialize BeautifulSoup with the standard HTML parser
    soup = BeautifulSoup(raw_html, "html.parser")

    # Remove 'script' and 'style' tags entirely so their internal code
    # (Javascript/CSS) doesn't get extracted as readable text
    for script_or_style in soup(["script", "style"]):
        script_or_style.decompose()

    # Join the text pieces with '\n' so that adjacent elements
    # (like <div> or <p>) do not merge into one word
    text = soup.get_text(separator="\n")

    # Replace non-breaking spaces (\xa0) with a standard space
    text = text.replace('\xa0', ' ')

    # Collapse multiple consecutive newlines (possibly with whitespace in
    # between) into exactly two newlines for clean paragraph spacing
    text = re.sub(r'\n\s*\n+', '\n\n', text)

    # Remove leading and trailing whitespace from the final result
    return text.strip()

# Run every article body through the HTML cleaner and write the result back
# into the 'contentText' column
_cleaned_bodies = final_articles_df['contentText'].apply(clean_wix_html)
final_articles_df['contentText'] = _cleaned_bodies

print("‚úÖ HTML Cleaning complete.")
_preview_columns = ['title', 'contentText']
display(final_articles_df[_preview_columns].head(3))

In [None]:
# Map Category IDs to Human-Readable Labels

# Read the category export; its 'items' list (empty if the key is absent)
# holds the category records
with open(INPUT_FILE_CATEGORIES, 'r', encoding='utf-8') as f:
    categories_json = json.load(f)
categories_raw = categories_json.get('items', [])

# Lookup table from a category's '_id' to its 'title',
# e.g. { 'unique-id-123': 'Health & Wellness' }
categories_lookup = dict(
    (entry['_id'], entry['title']) for entry in categories_raw
)

# Define translation logic to handle both single IDs and lists of IDs
def translate_category(category_val, lookup=None):
    """
    Translate category ID(s) into their human-readable title(s).

    Parameters
    ----------
    category_val : list or hashable
        Either a single category ID or a list of category IDs.
    lookup : dict, optional
        Mapping from category ID to title. When omitted, the module-level
        `categories_lookup` is used, so existing single-argument calls
        (e.g. `Series.apply(translate_category)`) behave as before.

    Returns
    -------
    list or object
        For a list input, a list of the same length with each ID replaced by
        its title. For a single value, its title. IDs that are not present in
        the mapping are returned unchanged.
    """
    # Resolve the mapping at call time so the function can be used (and
    # tested) without relying on notebook-level state.
    if lookup is None:
        lookup = categories_lookup

    # If the cell contains a list of IDs, translate each item in the list
    if isinstance(category_val, list):
        return [lookup.get(c_id, c_id) for c_id in category_val]

    # Otherwise, translate the single ID; return the ID itself if not found
    return lookup.get(category_val, category_val)

# Pick the column that currently holds the category IDs: 'categoryId' when it
# is present, otherwise 'category'
if 'categoryId' in final_articles_df.columns:
    target_col = 'categoryId'
else:
    target_col = 'category'

# Translate the IDs of every row and write them back into the same column
_translated_categories = final_articles_df[target_col].apply(translate_category)
final_articles_df[target_col] = _translated_categories

# The output column is always called 'category'
if target_col == 'categoryId':
    final_articles_df = final_articles_df.rename(columns={'categoryId': 'category'})

_sample_titles = list(categories_lookup.values())[:2]
print(f"‚úÖ Categories processed. Sample lookup: {_sample_titles}")
display(final_articles_df[['title', 'category']].head())

In [None]:
# Write the prepared articles as a JSON array of records.
# force_ascii=False keeps Hungarian characters as-is instead of \u escapes.
_export_options = {'orient': 'records', 'force_ascii': False, 'indent': 4}
final_articles_df.to_json(OUTPUT_FILE, **_export_options)
print(f"üöÄ Data successfully exported to {OUTPUT_FILE}")