In [None]:
# Imports and Configuration
import pandas as pd
import os
import json

# Locations are written relative to this notebook's folder (/data_prepare/ in the
# project tree), so "../" steps up one level to the sibling /data/ and
# /data_prepared/ folders.
INPUT_FILE_POSTS = "../data_prepared/blog_posts_intermediate.json"
INPUT_FILE_MEMBERS = "../data/wix_members_data.json"
OUTPUT_FOLDER = "../data_prepared/"
OUTPUT_FILE = os.path.join(OUTPUT_FOLDER, "blog_posts.json")

# Sanity check: report, for each input in turn, whether it exists on disk.
# A missing file only produces a warning message here; nothing is raised.
for input_path in (INPUT_FILE_POSTS, INPUT_FILE_MEMBERS):
    if not os.path.exists(input_path):
        print(f"‚ùå Warning: Input file NOT found at {input_path}")
    else:
        print(f"‚úÖ Setup complete. Input files found: {input_path}")


In [None]:
# Data Loading and Directory Check
# Make sure the output directory exists before anything is written to it.
# exist_ok=True keeps the call safe if the folder appears between the check and the creation.
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    print(f"Created folder: {OUTPUT_FOLDER}")

# Load the intermediate blog posts into posts_df.
# Later cells all read posts_df, so a failed load is reported and then re-raised:
# execution stops here, at the real cause, instead of failing further down with
# an unrelated NameError on posts_df.
try:
    posts_df = pd.read_json(INPUT_FILE_POSTS)
except FileNotFoundError:
    print(f"Error: The file {INPUT_FILE_POSTS} was not found.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise
else:
    print(f"Successfully loaded {len(posts_df)} records.")

    # Quick look at the first rows and at the available column names
    display(posts_df.head(3))

    print("\nAvailable columns:", *posts_df.columns, sep="\n")

In [None]:
# 1. Parse the members export (read as UTF-8 text) into plain Python objects
with open(INPUT_FILE_MEMBERS, encoding='utf-8') as members_file:
    raw_data = json.load(members_file)

# 2. The member records are the list stored under the top-level "_items" key.
# Nested objects are not flattened: each 'profile' stays a dictionary inside its cell.
member_items = raw_data["_items"]
df_members = pd.DataFrame(member_items)

print(f"Successfully loaded {len(df_members)} member records.")
display(df_members.head(3))

In [None]:
bernadett_name = "Ferenci Bernadett"
# Pull the nested 'nickname' out of every member's 'profile' entry and keep the
# rows whose nickname is exactly the target name.
member_nicknames = df_members['profile'].str.get('nickname')
bernadett_row = df_members[member_nicknames == bernadett_name]

if bernadett_row.empty:
    # No member carries this nickname; bernadett_id is not assigned in this case
    print(f"Warning: Member '{bernadett_name}' not found in the data.")
else:
    # Take the contactId of the first matching member
    bernadett_id = bernadett_row['contactId'].values[0]

    # Blog records whose memberId equals that contactId
    df_bernadett_blogs = posts_df[posts_df['memberId'] == bernadett_id]

    # Summary line plus the identifying columns, for a visual check
    print(f"Matching ID: {bernadett_id}")
    print(f"Found {len(df_bernadett_blogs)} posts by {bernadett_name}.")
    display(df_bernadett_blogs[['id','title','memberId']])

In [None]:
# Rebind posts_df to an independent copy so the assignment below cannot trigger
# a SettingWithCopy warning if posts_df happened to be a slice of another frame.
posts_df = posts_df.copy()

# Write the author name onto every post whose memberId equals the ID
# found by the lookup cell.
is_bernadett_post = posts_df['memberId'] == bernadett_id
posts_df.loc[is_bernadett_post, 'author'] = bernadett_name

# Verification: gather every row that now carries this author name and count them
bernadett_posts = posts_df[posts_df['author'] == bernadett_name]
updated_count = len(bernadett_posts)
print(f"‚úÖ Updated {updated_count} rows with author name: {bernadett_name}")

# Preview the first few of those rows
display(bernadett_posts[['title', 'memberId', 'author']].head())

In [None]:
orsolya_name = "Serester Orsolya"
# Pull the nested 'nickname' out of every member's 'profile' entry and keep the
# rows whose nickname is exactly the target name.
member_nicknames = df_members['profile'].str.get('nickname')
orsolya_row = df_members[member_nicknames == orsolya_name]

if orsolya_row.empty:
    # No member carries this nickname; orsolya_id is not assigned in this case
    print(f"Warning: Member '{orsolya_name}' not found in the data.")
else:
    # Take the contactId of the first matching member
    orsolya_id = orsolya_row['contactId'].values[0]

    # Blog records whose memberId equals that contactId
    df_orsolya_blogs = posts_df[posts_df['memberId'] == orsolya_id]

    # Summary line plus the identifying columns, for a visual check
    print(f"Matching ID: {orsolya_id}")
    print(f"Found {len(df_orsolya_blogs)} posts by {orsolya_name}.")
    display(df_orsolya_blogs[['id','title','memberId']])

In [None]:
# Rebind posts_df to an independent copy so the assignment below cannot trigger
# a SettingWithCopy warning if posts_df happened to be a slice of another frame.
posts_df = posts_df.copy()

# Write the author name onto every post whose memberId equals the ID
# found by the lookup cell.
is_orsolya_post = posts_df['memberId'] == orsolya_id
posts_df.loc[is_orsolya_post, 'author'] = orsolya_name

# Verification: gather every row that now carries this author name and count them
orsolya_posts = posts_df[posts_df['author'] == orsolya_name]
updated_count = len(orsolya_posts)
print(f"‚úÖ Updated {updated_count} rows with author name: {orsolya_name}")

# Preview the first few of those rows
display(orsolya_posts[['title', 'memberId', 'author']].head())

In [None]:
# The accented characters must be real Unicode text: the lookup below is an exact
# string comparison against nicknames loaded from the UTF-8 members JSON, and this
# same value is later written into the 'author' column.
# NOTE(review): confirm the spelling against the 'nickname' stored in the members export.
icus_name = "Tóth Mártonné Icus"
# Pull the nested 'nickname' out of every member's 'profile' entry and keep the
# rows whose nickname is exactly the target name.
icus_row = df_members[df_members['profile'].str.get('nickname') == icus_name]

# Only extract an ID when the member was actually found
if not icus_row.empty:
    # Take the contactId of the first matching member
    icus_id = icus_row['contactId'].values[0]

    # Blog records whose memberId equals this member's contactId
    df_icus_blogs = posts_df[posts_df['memberId'] == icus_id]

    # Summary line plus the identifying columns, for a visual check
    print(f"Matching ID: {icus_id}")
    print(f"Found {len(df_icus_blogs)} posts by {icus_name}.")
    display(df_icus_blogs[['id','title','memberId']])
else:
    # The nickname search returned no rows; icus_id is not assigned in this case
    print(f"Warning: Member '{icus_name}' not found in the data.")

In [None]:
# Rebind posts_df to an independent copy so the assignment below cannot trigger
# a SettingWithCopy warning if posts_df happened to be a slice of another frame.
posts_df = posts_df.copy()

# Write the author name onto every post whose memberId equals the ID
# found by the lookup cell.
is_icus_post = posts_df['memberId'] == icus_id
posts_df.loc[is_icus_post, 'author'] = icus_name

# Verification: gather every row that now carries this author name and count them
icus_posts = posts_df[posts_df['author'] == icus_name]
updated_count = len(icus_posts)
print(f"‚úÖ Updated {updated_count} rows with author name: {icus_name}")

# Preview the first few of those rows
display(icus_posts[['title', 'memberId', 'author']].head())

In [None]:
# The accented characters must be real Unicode text: the lookup below is an exact
# string comparison against nicknames loaded from the UTF-8 members JSON, and this
# same value is later written into the 'author' column.
# NOTE(review): confirm the spelling against the 'nickname' stored in the members export.
tatar_name = "Dr. Tatár-Kiss Klára Krisztina"
# Pull the nested 'nickname' out of every member's 'profile' entry and keep the
# rows whose nickname is exactly the target name.
tatar_row = df_members[df_members['profile'].str.get('nickname') == tatar_name]

# Only extract an ID when the member was actually found
if not tatar_row.empty:
    # Take the contactId of the first matching member
    tatar_id = tatar_row['contactId'].values[0]

    # Blog records whose memberId equals this member's contactId
    df_tatar_blogs = posts_df[posts_df['memberId'] == tatar_id]

    # Summary line plus the identifying columns, for a visual check
    print(f"Matching ID: {tatar_id}")
    print(f"Found {len(df_tatar_blogs)} posts by {tatar_name}.")
    display(df_tatar_blogs[['id','title','memberId']])
else:
    # The nickname search returned no rows; tatar_id is not assigned in this case
    print(f"Warning: Member '{tatar_name}' not found in the data.")

In [None]:
# Rebind posts_df to an independent copy so the assignment below cannot trigger
# a SettingWithCopy warning if posts_df happened to be a slice of another frame.
posts_df = posts_df.copy()

# Write the author name onto every post whose memberId equals the ID
# found by the lookup cell.
posts_df.loc[posts_df['memberId'] == tatar_id, 'author'] = tatar_name

# Verify the update: count the rows carrying THIS author's name
# (the count must use tatar_name, the same name that is reported in the message).
updated_count = len(posts_df[posts_df['author'] == tatar_name])
print(f"‚úÖ Updated {updated_count} rows with author name: {tatar_name}")

# Show a sample of the updated records
display(posts_df[posts_df['author'] == tatar_name][['title', 'memberId', 'author']].head())

In [None]:
# 1. Metrics. An author counts as "empty" when it is NaN, the "Unknown"
#    placeholder, or an empty string.
author_col = posts_df['author']
empty_mask = author_col.isna() | author_col.eq("Unknown") | author_col.eq("")

total_posts = len(posts_df)
empty_authors_count = int(empty_mask.sum())
filled_authors_count = total_posts - empty_authors_count

# 2. Summary table: one row per metric
summary_df = pd.DataFrame(
    [
        ("Total Posts", total_posts),
        ("Posts with Authors", filled_authors_count),
        ("Posts without Authors", empty_authors_count),
    ],
    columns=["Metric", "Count"],
)
display(summary_df)

print("-" * 30)

# 3. Rows that already have an author: the complement (~) of the empty mask
df_filled = posts_df[~empty_mask]

if df_filled.empty:
    print("‚ÑπÔ∏è No rows with authors found yet.")
else:
    print(f"‚úÖ Displaying {len(df_filled)} rows with assigned authors:")
    display(df_filled[['title', 'author', 'memberId']])

In [None]:
# Name written into every post that still has no author
target_author = "Dr. Prezenszki Zsuzsanna"

# 1. "Empty" means NaN, the "Unknown" placeholder, or an empty string
author_col = posts_df['author']
empty_mask = author_col.isna() | author_col.eq("Unknown") | author_col.eq("")

# 2. Number of posts about to be updated, kept for the log line below
authors_to_fill = empty_mask.sum()

# 3. Fill the empty slots with the target author
posts_df.loc[empty_mask, 'author'] = target_author

print(f"‚úÖ Successfully assigned '{target_author}' to {authors_to_fill} posts.")

# 4. Final verification: re-read the column after the update and re-apply the
#    same "empty" test.
author_col = posts_df['author']
still_empty = author_col.isna() | author_col.eq("Unknown") | author_col.eq("")
remaining_empty = posts_df[still_empty]
print(f"üìä Remaining empty authors: {len(remaining_empty)}")

# Preview a few rows that carry the target author
target_rows = posts_df[posts_df['author'] == target_author]
display(target_rows[['title', 'author']].head())

In [None]:
# Columns kept for the export, in output order. Anything not listed here
# (for example 'memberId') is dropped from posts_df by the selection below.
final_columns = [
    'id',
    'title',
    'excerpt',
    'contentText',
    'categories',
    'tags',
    'publishedDate',
    'metrics',
    'url',
    'author',
]

# Restrict posts_df to exactly those columns (raises KeyError if one is missing)
posts_df = posts_df.loc[:, final_columns]

display(posts_df.head(3))

In [None]:
# Write the posts as a JSON array with one object per row (orient='records'),
# indented by 4 spaces. force_ascii=False keeps non-ASCII text such as Hungarian
# accented characters as-is instead of escaping it.
posts_df.to_json(
    OUTPUT_FILE,
    orient='records',
    force_ascii=False,
    indent=4,
)
print(f"üöÄ Data successfully exported to {OUTPUT_FILE}")