In [None]:
# Imports and Configuration
import pandas as pd
import os
import json
import re

# Paths are relative to the notebook: raw export in, prepared JSON out
INPUT_FILE_EVENTS = "../data/wix_events_data.json"
OUTPUT_FOLDER = "../data_prepared/"
OUTPUT_FILE = os.path.join(OUTPUT_FOLDER, "events_intermediate.json")

# Report up front whether the raw export is reachable from here
if not os.path.exists(INPUT_FILE_EVENTS):
    print(f"‚ùå Warning: Input file NOT found at {INPUT_FILE_EVENTS}")
else:
    print(f"‚úÖ Setup complete. Input file found: {INPUT_FILE_EVENTS}")

In [None]:
# Data Loading and Directory Check
# Make sure the export folder exists before any later cell writes into it
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    print(f"Created folder: {OUTPUT_FOLDER}")

# Load the raw data and FLATTEN it.
# Only the read + flatten step is guarded; on failure the message is printed and
# the error is re-raised so the run stops here instead of continuing with a
# missing (or stale, from an earlier run) `events_df`.
try:
    with open(INPUT_FILE_EVENTS, 'r', encoding='utf-8') as f:
        # Parse the raw JSON into Python objects
        raw_data = json.load(f)

    # Flatten the nested structure (location.type, etc.)
    events_df = pd.json_normalize(raw_data)
except FileNotFoundError:
    print(f"Error: The file {INPUT_FILE_EVENTS} was not found.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

print(f"Successfully loaded and flattened {len(events_df)} records.")

print("\n--- First 3 rows of flattened data ---")
# Only the raw categories column is previewed here
display(events_df[["categories.categories"]].head(3))

# These columns will now look like 'location.type', 'dateAndTimeSettings.startDate', etc.
# Every column is listed, one per line
print("\nAvailable flattened columns:", *events_df.columns, sep="\n")

In [None]:
# Transformation Logic - Part-1

# Key inside the nested eventGuests object -> flattened RSVP counter column
guest_fields = {
    'total': 'summaries.rsvps.totalCount',
    'going': 'summaries.rsvps.yesCount',
    'notGoing': 'summaries.rsvps.noCount',
    'waitlist': 'summaries.rsvps.waitlistCount',
}

# One dict of integer counters per event.
# fillna(0) runs first so a missing counter becomes 0 rather than NaN before int().
event_guests = events_df.fillna(0).apply(
    lambda row: {label: int(row.get(column, 0)) for label, column in guest_fields.items()},
    axis=1,
)

# Start timestamp reduced to its date part and stored as a string
event_dates = pd.to_datetime(events_df['dateAndTimeSettings.startDate']).dt.date.astype(str)

# Columns kept for the output, in their final order
final_columns = ['id', 'title', 'date', 'location', 'eventGuests', 'url']

# Derive the new columns from the flattened frame, then keep only the selection
final_events_df = events_df.assign(
    id=events_df['_id'],
    title=events_df['title'],
    location=events_df.get('location.type'),
    url=events_df['eventPageUrl'],
    date=event_dates,
    eventGuests=event_guests,
)[final_columns]

print(f"‚úÖ Transformed {len(final_events_df)} events.")
display(final_events_df.head(3))

In [None]:
# Transformation Logic - Part-2

def _names_from_categories(val):
    """Return the 'name' of every category entry, or [] when the cell is not a list (empty/NaN rows)."""
    if not isinstance(val, list):
        return []
    return [item.get('name') for item in val]

# Column selection including the new categories column
final_columns = ['id', 'title', 'date', 'location', 'categories', 'eventGuests', 'url']

# Attach the extracted names and reorder in one step
final_events_df = final_events_df.assign(
    categories=events_df['categories.categories'].apply(_names_from_categories)
)[final_columns]

print("‚úÖ Category extraction complete.")
display(final_events_df[['title', 'categories']].head())

In [None]:
# Transformation Logic - Part-3

# Clean categories
# Category name that is dropped from every event's category list
TAG_TO_REMOVE = 'publish'

def clean_and_join_categories(cat_list):
    """
    Turn a list of category names into one comma-separated string.

    Falsy entries (None, "") and the TAG_TO_REMOVE value are dropped.

    Args:
        cat_list: A list of category names. A string is treated as an
            already-joined result and is cleaned again, so applying this
            function twice (e.g. re-running the cell) does not blank the
            column. Any other value (None, NaN, ...) yields "".

    Returns:
        str: The remaining names joined with ", " (e.g. "egyéb, marketing").
    """
    separator = ", "

    # Already joined on a previous run: split back into individual names
    if isinstance(cat_list, str):
        cat_list = cat_list.split(separator)

    # Handle None/NaN and any other non-list value
    if not isinstance(cat_list, list):
        return ""

    # Filter out TAG_TO_REMOVE and any empty/None values
    kept_names = [c for c in cat_list if c and c != TAG_TO_REMOVE]

    return separator.join(kept_names)

# Swap each list of names for its cleaned, comma-separated string
final_events_df = final_events_df.assign(
    categories=final_events_df['categories'].apply(clean_and_join_categories)
)

print("‚úÖ 'publish' removed and categories converted to string.")
display(final_events_df[['title', 'categories']].head())


In [None]:
# Transformation Logic - Part-4

def extract_all_text(item):
    """
    Recursively collect every 'textData' -> 'text' string found in a nested
    structure of lists and dictionaries, no matter how deep it is.

    Args:
        item: A list, a dict, or any other value. Values that are neither a
            list nor a dict contribute no text.

    Returns:
        str: The text fragments concatenated in traversal order. A newline is
        emitted for every dict whose 'type' is PARAGRAPH, LIST_ITEM or
        HEADING, placed before that dict's nested content.
    """
    # Lists: concatenate whatever each element yields
    if isinstance(item, list):
        return "".join(extract_all_text(sub_item) for sub_item in item)

    # Scalars (str, None, NaN, numbers) hold no nested nodes
    if not isinstance(item, dict):
        return ""

    text_parts = []

    # Take the text only when 'textData' really is a dict holding a string;
    # a null or malformed entry is skipped instead of raising TypeError.
    text_data = item.get('textData')
    if isinstance(text_data, dict):
        text = text_data.get('text')
        if isinstance(text, str):
            text_parts.append(text)

    # Block-level nodes get a line break
    if item.get('type') in ('PARAGRAPH', 'LIST_ITEM', 'HEADING'):
        text_parts.append("\n")

    # Drill deeper: every nested list/dict value may contain more text
    for value in item.values():
        if isinstance(value, (list, dict)):
            text_parts.append(extract_all_text(value))

    return "".join(text_parts)

def clean_description(raw_nodes):
    """
    Entry point for the DataFrame apply: flatten the description nodes into
    plain text and tidy up the whitespace.

    Returns "" for empty input; otherwise the extracted text with runs of
    blank lines reduced to one blank line, repeated spaces reduced to one
    space, and leading/trailing whitespace stripped.
    """
    if raw_nodes:
        # Any newline / whitespace / newline run becomes exactly one blank line
        collapsed = re.sub(r'\n\s*\n+', '\n\n', extract_all_text(raw_nodes))
        # Squeeze repeated spaces (e.g. where bold and normal text fragments meet)
        return re.sub(r' +', ' ', collapsed).strip()
    return ""

# Build the plain-text description from the flattened 'description.nodes' column
final_events_df = final_events_df.assign(
    descriptionText=events_df['description.nodes'].apply(clean_description)
)

print("‚úÖ Description extraction should now capture the lists and nested paragraphs!")
display(final_events_df[['title', 'descriptionText']].head(2))

In [None]:
# Final column order, now including the reconstructed description
final_columns = ['id', 'title', 'date', 'location', 'categories', 'eventGuests', 'descriptionText', 'url']
final_events_df = final_events_df.loc[:, final_columns]

print("‚úÖ Description reconstructed.")
display(final_events_df.head(3))

In [None]:
# Export to JSON: one object per event, pretty-printed.
# force_ascii=False keeps the Hungarian characters as-is instead of \u escapes.
export_options = dict(orient='records', force_ascii=False, indent=4)
final_events_df.to_json(OUTPUT_FILE, **export_options)
print(f"üöÄ Data successfully exported to {OUTPUT_FILE}")