In [None]:
# Imports and Configuration
import pandas as pd
import os
import json
import re

# Locations of the intermediate input and of the final export,
# all relative to the folder the notebook is run from
OUTPUT_FOLDER = "../data_prepared/"
INPUT_FILE_EVENTS = "../data_prepared/events_intermediate.json"
OUTPUT_FILE = os.path.join(OUTPUT_FOLDER, "events.json")

# Report up front whether the input file is present (nothing is loaded here)
if not os.path.exists(INPUT_FILE_EVENTS):
    print(f"‚ùå Warning: Input file NOT found at {INPUT_FILE_EVENTS}")
else:
    print(f"‚úÖ Setup complete. Input file found: {INPUT_FILE_EVENTS}")

In [None]:
# Data Loading and Directory Check
# Make sure the export folder exists before anything is written to it.
# exist_ok=True keeps the call safe if the folder appears between the check
# and the creation.
if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    print(f"Created folder: {OUTPUT_FOLDER}")

# Load the intermediate events file into a DataFrame.
# Only the read itself is guarded, so an unrelated failure in the preview
# below is never reported as a missing file.
try:
    events_df = pd.read_json(INPUT_FILE_EVENTS)
except FileNotFoundError:
    print(f"Error: The file {INPUT_FILE_EVENTS} was not found.")
    # Re-raise: every later cell needs events_df, so stopping here gives a
    # clear error instead of a confusing NameError further down.
    raise
else:
    print(f"Successfully loaded {len(events_df)} records.")

    # Quick look at the first rows and at the column names
    display(events_df.head(3))

    print("\nAvailable columns:", *events_df.columns, sep="\n")

In [None]:
# Every event starts without a speaker; "" marks "not assigned yet"
events_df['speaker'] = ""

# Column layout of the final export
final_columns = [
    'id',
    'title',
    'date',
    'speaker',
    'location',
    'categories',
    'eventGuests',
    'descriptionText',
    'url',
]

# Keep exactly these columns, in this order; the copy detaches the result
# from the original frame so later assignments do not trigger
# SettingWithCopy warnings
events_df = events_df.loc[:, final_columns].copy()

print("‚úÖ Added 'speaker' column and reordered columns.")
print(f"New shape: {events_df.shape}")

display(events_df.head(3))

In [None]:
# Reusable function

# Show full cell contents when snippets are displayed. This is set once here
# instead of inside extract_snippet, which runs once per row via .apply and
# should not change global pandas options as a side effect.
pd.set_option('display.max_colwidth', None)


def extract_snippet(text, keyword, length=500):
    """
    Return the first occurrence of ``keyword`` in ``text`` together with
    the ``length`` characters that follow it.

    Parameters
    ----------
    text : object
        Text to search. Anything that is not a ``str`` (NaN, None, ...)
        yields ``None``.
    keyword : str
        Literal phrase to look for. It is matched case-insensitively and
        regex metacharacters in it (``?``, ``.``, ``*``, ...) are escaped.
    length : int, default 500
        Number of characters to include after the matched phrase.

    Returns
    -------
    str or None
        The matched phrase as it appears in ``text`` plus the following
        characters (fewer if the text ends earlier), or ``None`` when the
        phrase does not occur.
    """
    # Non-string cells cannot be searched
    if not isinstance(text, str):
        return None

    match = re.search(re.escape(keyword), text, re.IGNORECASE)
    if match is None:
        return None

    # Slice from the start of the hit to `length` characters past its end
    return text[match.start(): match.end() + length]

In [None]:
# Find the events whose description contains the phrase in search_term_1
search_term_1 = "ki vezeti"

# Case-insensitive match; rows with a missing description count as no match
filtered_1_df = events_df.loc[
    events_df['descriptionText'].str.contains(search_term_1, case=False, na=False)
].copy()

# Attach the text around each hit so the matches can be checked by eye
filtered_1_df['snippet'] = filtered_1_df['descriptionText'].apply(
    extract_snippet, args=(search_term_1,)
)

print(f"‚úÖ Found {len(filtered_1_df)} matches.")

display(filtered_1_df.loc[:, ['id', 'title', 'snippet']])

In [None]:
# Write the speaker into the main frame for every row found by the first filter
events_df.loc[filtered_1_df.index, 'speaker'] = "Dr. Prezenszki Zsuzsanna"

# How many rows were just touched
print(f"‚úÖ Updated {len(filtered_1_df)} rows with the new speaker.")

# Tally of assigned vs. still-empty speakers over the whole dataset
total_has_speaker = events_df['speaker'].ne("").sum()
total_no_speaker = events_df['speaker'].eq("").sum()

print(f"üìä Global Speaker Statistics (Total Dataset):")
print(f"    - Events with a speaker: {total_has_speaker}")
print(f"    - Events without a speaker: {total_no_speaker}")
print(f"    - Total events: {len(events_df)}")

# Spot-check a few of the updated rows
display(
    events_df.loc[filtered_1_df.index, ['id', 'title', 'speaker']].head(4)
)

In [None]:
# Second phrase to look for in the descriptions
search_term_2 = "ki seg√≠t neked"

# Select rows that mention the phrase (case-insensitive, missing text = no match)
# and that have no speaker assigned so far
mask = (
    events_df['descriptionText'].str.contains(search_term_2, case=False, na=False)
    & events_df['speaker'].eq("")
)

# Independent working copy of the selected rows
filtered_2_df = events_df.loc[mask].copy()

# Context around each hit, slightly longer than the default
filtered_2_df['snippet'] = filtered_2_df['descriptionText'].apply(
    extract_snippet, keyword=search_term_2, length=550
)

print(f"‚úÖ Found {len(filtered_2_df)} matches with no speaker assigned.")
display(filtered_2_df.loc[:, ['id', 'title', 'snippet']])

In [None]:
# Name to look for inside the second batch
sub_search_term_2_matuszka = "Matuszka"

# Narrow filtered_2_df to the rows whose description mentions that name,
# regardless of capitalisation
filtered_matuszka_df = filtered_2_df.loc[
    filtered_2_df['descriptionText'].str.contains(
        sub_search_term_2_matuszka, case=False, na=False
    )
].copy()

# Size of the batch vs. size of the sub-selection
print(f"üìä Filtering Results:")
print(f"   - Total rows in filtered_2_df: {len(filtered_2_df)}")
print(f"   - Rows specifically mentioning '{sub_search_term_2_matuszka}': {len(filtered_matuszka_df)}")

display(filtered_matuszka_df.loc[:, ['id', 'title', 'speaker', 'snippet']])

In [None]:
# Write the speaker into the main frame for the rows of the sub-selection
events_df.loc[filtered_matuszka_df.index, 'speaker'] = "Dr. Matuszka Istv√°n"

# How many rows were just touched
print(f"‚úÖ Updated {len(filtered_matuszka_df)} rows with the new speaker.")

# Progress inside the second batch: read the current speaker values of the
# rows that were in filtered_2_df and count filled vs. empty ones
batch_speakers = events_df.loc[filtered_2_df.index, 'speaker']
batch_filtered_2_has_speaker = batch_speakers.ne("").sum()
batch_filtered_2_no_speaker = batch_speakers.eq("").sum()

print(f"üìä Batch Statistics ('ki seg√≠t neked' group):")
print(f"   - Already assigned in this batch: {batch_filtered_2_has_speaker}")
print(f"   - Remaining to assign in this batch: {batch_filtered_2_no_speaker}")

# Spot-check a few of the updated rows
print("\n‚úÖ Verification of updated 'Matuszka' rows:")
display(
    events_df.loc[filtered_matuszka_df.index, ['id', 'title', 'speaker']].head(4)
)

In [None]:

# Re-select from the main frame (not from the earlier copy) so that speakers
# assigned in the meantime are taken into account: rows that still have an
# empty speaker and whose description mentions search_term_2
mask_remaining = (
    events_df['speaker'].eq("")
    & events_df['descriptionText'].str.contains(search_term_2, case=False, na=False)
)

# Independent working copy of what is left
rest_filtered_2_df = events_df.loc[mask_remaining].copy()

# Context around each hit for manual review
rest_filtered_2_df['snippet'] = rest_filtered_2_df['descriptionText'].apply(
    extract_snippet, keyword=search_term_2, length=500
)

print(f"‚úÖ Found {len(rest_filtered_2_df)} total events matching '{search_term_2}' that still need a speaker.")

display(rest_filtered_2_df.loc[:, ['id', 'title', 'snippet']])

In [None]:
# Two spellings of the same leader line to look for in the remaining rows
sub_search_term_2_andrea_1 = "vezet≈ëje: Gy≈ërfi Andrea, klinikai szakpszichol√≥gus"
sub_search_term_2_andrea_2 = "vezet≈ëje:\n‚ÄãGy≈ërfi Andrea, pszichol√≥gus"

# A row qualifies when its description contains either spelling
# (case-insensitive). na=False treats a missing description as "no match",
# consistent with every other filter in this notebook; without it a NaN in
# the mask would make the boolean selection below fail.
mask_andrea_clean = (
    rest_filtered_2_df['descriptionText'].str.contains(sub_search_term_2_andrea_1, case=False, na=False)
    | rest_filtered_2_df['descriptionText'].str.contains(sub_search_term_2_andrea_2, case=False, na=False)
)

# Independent working copy of the matching rows
filtered_df_andrea = rest_filtered_2_df[mask_andrea_clean].copy()

# Size of the remaining batch vs. size of the sub-selection
print(f"üìä Filtering Results:")
print(f"   - Total rows in rest_filtered_2_df: {len(rest_filtered_2_df)}")
print(f"   - Rows specifically mentioning '{sub_search_term_2_andrea_1}' or '{sub_search_term_2_andrea_2}': {len(filtered_df_andrea)}")

display(filtered_df_andrea[['id', 'title', 'speaker', 'snippet']])

In [None]:
# Write the speaker into the main frame for the rows of the sub-selection
events_df.loc[filtered_df_andrea.index, 'speaker'] = "Gy≈ërfi Andrea"

# Progress inside the second batch: read the current speaker values of the
# rows that were in filtered_2_df and count filled vs. empty ones
batch_speakers = events_df.loc[filtered_2_df.index, 'speaker']
batch_filtered_2_has_speaker = batch_speakers.ne("").sum()
batch_filtered_2_no_speaker = batch_speakers.eq("").sum()

print(f"üìä Batch Statistics ('ki seg√≠t neked' group):")
print(f"   - Already assigned in this batch: {batch_filtered_2_has_speaker}")
print(f"   - Remaining to assign in this batch: {batch_filtered_2_no_speaker}")

# Spot-check a few of the updated rows
print("\n‚úÖ Verification of updated 'Gy≈ërfi Andrea' rows:")
display(
    events_df.loc[filtered_df_andrea.index, ['id', 'title', 'speaker']].head(4)
)

In [None]:
# Re-select from the main frame once more, now that further speakers have
# been assigned: rows that still have an empty speaker and whose description
# mentions search_term_2
mask_remaining = (
    events_df['speaker'].eq("")
    & events_df['descriptionText'].str.contains(search_term_2, case=False, na=False)
)

# Independent working copy of what is left
rest_filtered_2_df = events_df.loc[mask_remaining].copy()

# Context around each hit for manual review
rest_filtered_2_df['snippet'] = rest_filtered_2_df['descriptionText'].apply(
    extract_snippet, keyword=search_term_2, length=500
)

print(f"‚úÖ Found {len(rest_filtered_2_df)} total events matching '{search_term_2}' that still need a speaker.")

display(rest_filtered_2_df.loc[:, ['id', 'title', 'snippet']])

In [None]:
# Assign the speaker to every row that is still left over in the second batch
events_df.loc[rest_filtered_2_df.index, 'speaker'] = "Dr. Prezenszki Zsuzsanna"

# Statistics for the WHOLE project (events_df)
total_has_speaker = (events_df['speaker'] != "").sum()
total_no_speaker = (events_df['speaker'] == "").sum()

# Statistics specifically for the CURRENT working batch (filtered_2_df):
# the current speaker values of those rows are read back from events_df
batch_filtered_2_has_speaker = (events_df.loc[filtered_2_df.index, 'speaker'] != "").sum()
batch_filtered_2_no_speaker = (events_df.loc[filtered_2_df.index, 'speaker'] == "").sum()

print(f"üìä Global Speaker Statistics (Total Dataset):")
print(f"   - Events with a speaker: {total_has_speaker}")
print(f"   - Events without a speaker: {total_no_speaker}")
print(f"   - Total events: {len(events_df)}")
print("-" * 30)
print(f"üìä Batch Statistics ('ki seg√≠t neked' group):")
print(f"   - Already assigned in this batch: {batch_filtered_2_has_speaker}")
print(f"   - Remaining to assign in this batch: {batch_filtered_2_no_speaker}")

# The heading names the speaker that was actually assigned in this cell
# (it previously named a different speaker, left over from a copied cell)
print("\n‚úÖ Verification of updated 'Dr. Prezenszki Zsuzsanna' rows:")
display(events_df.loc[rest_filtered_2_df.index, ['id', 'title', 'speaker']].head(4))

In [None]:
# Name to look for in the descriptions, and two title phrases to exclude
search_term_3 = "Czimer Gy√∂rgyi"
search_term_3_a = "Er≈ëforr√°sgy≈±jt≈ë H√©tv√©ge"
search_term_3_b = "Csob√°nkai Er≈ëforr√°st√°bor"

# Build the selection from three case-insensitive checks
# (missing values count as no match in each of them)
description_has_name = events_df['descriptionText'].str.contains(search_term_3, case=False, na=False)
title_has_phrase_a = events_df['title'].str.contains(search_term_3_a, case=False, na=False)
title_has_phrase_b = events_df['title'].str.contains(search_term_3_b, case=False, na=False)

# Keep rows that mention the name but whose title contains neither phrase
mask_filtered_3_df = description_has_name & ~title_has_phrase_a & ~title_has_phrase_b

filtered_3_df = events_df.loc[mask_filtered_3_df].copy()

# Context around each hit for manual review
filtered_3_df['snippet'] = filtered_3_df['descriptionText'].apply(
    extract_snippet, keyword=search_term_3, length=500
)

print(f"‚úÖ Found {len(filtered_3_df)} matches.")
display(filtered_3_df.loc[:, ['id', 'title', 'snippet']])

In [None]:
# Write the speaker into the main frame for every row found by the third filter
events_df.loc[filtered_3_df.index, 'speaker'] = "Czimer Gy√∂rgyi"

# Tally of assigned vs. still-empty speakers over the whole dataset
total_has_speaker = events_df['speaker'].ne("").sum()
total_no_speaker = events_df['speaker'].eq("").sum()

print(f"üìä Global Speaker Statistics (Total Dataset):")
print(f"   - Events with a speaker: {total_has_speaker}")
print(f"   - Events without a speaker: {total_no_speaker}")
print(f"   - Total events: {len(events_df)}")

# Spot-check a few of the updated rows
print("\n‚úÖ Verification of updated 'Czimer Gy√∂rgyi' rows:")
display(
    events_df.loc[filtered_3_df.index, ['id', 'title', 'speaker']].head(4)
)

In [None]:
# Name to look for in the descriptions
search_term_4 = "T√≥tiv√°n Tibor"

# Rows whose description mentions the name (case-insensitive, missing text =
# no match), taken as an independent copy
filtered_4_df = events_df.loc[
    events_df['descriptionText'].str.contains(search_term_4, case=False, na=False)
].copy()

# Shorter context window around each hit
filtered_4_df['snippet'] = filtered_4_df['descriptionText'].apply(
    extract_snippet, keyword=search_term_4, length=250
)

print(f"‚úÖ Found {len(filtered_4_df)} matches.")

display(filtered_4_df.loc[:, ['id', 'title', 'snippet']])

In [None]:
# Write the speaker into the main frame for every row found by the fourth filter
events_df.loc[filtered_4_df.index, 'speaker'] = "T√≥tiv√°n Tibor"

# Tally of assigned vs. still-empty speakers over the whole dataset
total_has_speaker = events_df['speaker'].ne("").sum()
total_no_speaker = events_df['speaker'].eq("").sum()

print(f"üìä Global Speaker Statistics (Total Dataset):")
print(f"   - Events with a speaker: {total_has_speaker}")
print(f"   - Events without a speaker: {total_no_speaker}")
print(f"   - Total events: {len(events_df)}")

# Spot-check a few of the updated rows
print("\n‚úÖ Verification of updated 'T√≥tiv√°n Tibor' rows:")
display(
    events_df.loc[filtered_4_df.index, ['id', 'title', 'speaker']].head(4)
)

In [None]:
# Exact event titles that get a speaker assigned by title alone
target_titles = [
    "H√≥vir√°gt√∫ra a Normaf√°n",
    "√ñsszehangolva a Simonton m√≥dszerrel (online, okt. 16.)",
    "Tavaszi meg√∫jul√°s - Budapest",
    "√ñsszehangolva a V√°rosligetben - SIMONTON KLUB",
]

# A row is updated only if its speaker is missing or empty
# and its title is one of the listed titles
speaker_unset = events_df['speaker'].isna() | events_df['speaker'].eq("")
title_listed = events_df['title'].isin(target_titles)
mask = speaker_unset & title_listed

# Fill in the speaker for exactly those rows
events_df.loc[mask, 'speaker'] = "Dr. Prezenszki Zsuzsanna"

print(f"‚úÖ Successfully updated {mask.sum()} rows.")

# Show every row with a listed title, whether or not it was just updated
display(events_df.loc[events_df['title'].isin(target_titles), ['title', 'speaker']])

In [None]:
# Count events per speaker; empty speakers are relabelled so they show up
# as their own clearly marked row
labelled_speakers = events_df['speaker'].replace("", "‚ö†Ô∏è MISSING/UNASSIGNED")
speaker_summary = labelled_speakers.value_counts().reset_index()
speaker_summary.columns = ['Speaker Name', 'Count']

print("üìä FINAL PROJECT SUMMARY")
print("-" * 30)
display(speaker_summary)

# Short preview of the resulting data
print("\nüëÄ DATA PREVIEW (First 5 rows):")
display(events_df.loc[:, ['id', 'title', 'speaker']].head(5))

In [None]:
# Export to JSON
# If the date column holds datetimes, turn it into plain YYYY-MM-DD strings first
date_is_datetime = pd.api.types.is_datetime64_any_dtype(events_df['date'])
if date_is_datetime:
    events_df['date'] = events_df['date'].dt.strftime('%Y-%m-%d')

# Writer settings:
#   orient='records'   -> one JSON object per row
#   indent=4           -> indented output
#   force_ascii=False  -> keep non-ASCII characters as they are instead of escaping them
#   date_format='iso'  -> any remaining datetime values are written as ISO strings
export_options = dict(
    orient='records',
    indent=4,
    force_ascii=False,
    date_format='iso',
)
events_df.to_json(OUTPUT_FILE, **export_options)

print(f"‚úÖ Success! Cleaned data is saved to: {OUTPUT_FILE}")