In [31]:
import json
import re
from dataclasses import dataclass, field, asdict
from pathlib import Path

import pandas as pd

# NOTE(review): the deduplication cells also use a `fuzz` object that is never
# imported in the visible cells - add the import for whichever fuzzy-matching
# package this project uses.

In [25]:
# Load the playground records that the cells below operate on.
# The file is expected to hold a list of dicts (later cells iterate over it
# and call .get() on each element).
_ls = []

source_path = Path("outdoor_playgrounds/playgrounds_deduplicated.json")
with source_path.open('r', encoding='utf-8') as f:
    data = json.load(f)

_ls = data

In [34]:
from src.core.event import Event
from src.utils.file_utils import save_to_json

# Turn each raw record into an Event, attach address/coordinates and images,
# then serialise the whole list to final.json.
event_obj_ls = []
for event_dict in _ls:
    event_obj = Event.from_dict(event_dict)
    # get_address_n_coord() is unpacked into three values:
    # full address, latitude, longitude.
    (
        event_obj.full_address,
        event_obj.latitude,
        event_obj.longitude,
    ) = event_obj.get_address_n_coord()
    event_obj.images = event_obj.get_images(Path("outdoor_playgrounds/images"))
    event_obj_ls.append(event_obj)

# asdict() converts each Event into a plain dict for saving.
events_dict_ls_ = list(map(asdict, event_obj_ls))
save_to_json(events_dict_ls_, Path('outdoor_playgrounds/final.json'), 2)

│ │ │ [custom_search.search_valid_url] Search failed: 'items'
│ │ │ [custom_search.search_images] No images found from any source. Query: Toa Payoh Lorong 4 Playground: Shadow Play Fun by Local Community
│ │ │ [custom_search.search_valid_url] Search failed: 'items'
│ │ │ [custom_search.search_images] No images found from any source. Query: Toa Payoh Central Playground: Dry & Wet Fun by Local Community
│ │ │ [custom_search.search_valid_url] Search failed: 'items'
│ │ │ [custom_search.search_images] No images found from any source. Query: SkyResidences @ Dawson: Treehouse & Obstacles by Playpoint
│ │ │ [custom_search.search_images] No images found from any source. Query: St George’s Tower Playground: Climber’s Paradise by Playpoint
│ │ │ [custom_search.search_valid_url] Search failed: 'items'
│ │ │ [custom_search.search_images] No images found from any source. Query: Whampoa Park Playground: Forest-themed Obstacles by Playpoint
│ │ │ [custom_search.search_valid_url] Search failed: 'items

True

In [16]:
def preprocess_title(title):
    """
    Normalize a title for fuzzy matching.

    The title is lowercased, every run of characters that is not a letter or
    digit (punctuation, underscores, whitespace) is collapsed into a single
    space, and leading/trailing spaces are stripped.

    Args:
        title: Raw title string. Falsy values (None, "") are accepted.

    Returns:
        The normalized title, or "" when `title` is falsy.
    """
    if not title:
        return ""

    # [\W_] matches anything that is not a letter or digit. The underscore is
    # listed explicitly because \w counts it as a word character, so a plain
    # [^\w\s] filter would let it through and "foo_bar" / "foo bar" would
    # normalize differently. Whitespace is part of \W, so separators are
    # replaced and collapsed in one pass.
    return re.sub(r'[\W_]+', ' ', title.lower()).strip()

def fuzzy_deduplicate(items, title_key='title', similarity_threshold=85, include_ratio=True):
    """
    Identify potential duplicates using fuzzy string matching on titles.

    Each not-yet-grouped item becomes the "anchor" of a candidate group and is
    compared against every later, not-yet-grouped item. The comparison score
    is the maximum of `fuzz.ratio`, `fuzz.partial_ratio`,
    `fuzz.token_sort_ratio` and `fuzz.token_set_ratio` on the normalized
    titles. Items scoring at or above the threshold join the anchor's group.

    Items whose normalized title is empty (missing, None or punctuation-only
    titles) are never grouped: there is nothing to compare, and some
    fuzzy-matching libraries score two empty strings as a perfect match,
    which would flag every untitled record as a duplicate of the others.

    Relies on `fuzz` and `preprocess_title` being available in the notebook
    namespace.

    Args:
        items: Iterable of dictionaries
        title_key: Key containing the title field
        similarity_threshold: Minimum similarity score (0-100) to consider as duplicate
        include_ratio: Whether to include similarity ratios in results

    Returns:
        A list of duplicate groups. Each group is a list of dicts with the keys
        'index' (position in `items`), 'original_title', 'processed_title' and
        'item'. Non-anchor members also carry 'similarity_score' (their score
        against the group's first member) when `include_ratio` is true.
    """
    # Normalize every title once; position in `records` == index in `items`.
    records = []
    for item in items:
        original_title = item.get(title_key, '')
        records.append({
            'original': original_title,
            'processed': preprocess_title(original_title),
            'item': item,
        })

    total = len(records)
    grouped = set()          # indices already assigned to some group
    scores = {}              # index -> similarity to its group's anchor
    duplicate_groups = []    # list of index lists, anchor first

    for i in range(total):
        if i in grouped:
            continue

        anchor_title = records[i]['processed']
        if not anchor_title:
            # Nothing meaningful to match on - never treat as a duplicate.
            continue

        members = [i]
        for j in range(i + 1, total):
            if j in grouped:
                continue

            other_title = records[j]['processed']
            if not other_title:
                continue

            # Take the most optimistic of the four scorers so that reordered
            # words and substring titles are both caught.
            max_similarity = max(
                fuzz.ratio(anchor_title, other_title),
                fuzz.partial_ratio(anchor_title, other_title),
                fuzz.token_sort_ratio(anchor_title, other_title),
                fuzz.token_set_ratio(anchor_title, other_title),
            )

            if max_similarity >= similarity_threshold:
                members.append(j)
                if include_ratio:
                    scores[j] = max_similarity

        # Only groups with more than one item are actual duplicates.
        if len(members) > 1:
            duplicate_groups.append(members)
            grouped.update(members)

    # Format results
    results = []
    for member_indices in duplicate_groups:
        group = []
        for idx in member_indices:
            item_info = {
                'index': idx,
                'original_title': records[idx]['original'],
                'processed_title': records[idx]['processed'],
                'item': records[idx]['item'],
            }
            if idx in scores:
                item_info['similarity_score'] = scores[idx]
            group.append(item_info)
        results.append(group)

    return results

# Run the detector over the loaded records with a threshold of 80
# (looser than the function's default of 85).
print("Running fuzzy deduplication...")
duplicate_groups = fuzzy_deduplicate(_ls, similarity_threshold=80)
group_total = len(duplicate_groups)
print(f"Found {group_total} groups of potential duplicates")


Running fuzzy deduplication...
Found 35 groups of potential duplicates


In [11]:
def display_duplicates(duplicate_groups, max_groups=10):
    """
    Print duplicate groups in a readable format.

    Args:
        duplicate_groups: Groups as returned by `fuzzy_deduplicate` - each
            member is a dict with 'index', 'original_title', 'item' and an
            optional 'similarity_score'.
        max_groups: Maximum number of groups to print.

    Returns:
        None. Output goes to stdout.
    """
    shown = duplicate_groups[:max_groups]

    # Report the number of groups actually printed.
    print(f"=== DUPLICATE GROUPS (showing first {len(shown)}) ===\n")

    for group_no, group in enumerate(shown, start=1):
        print(f"GROUP {group_no}: ({len(group)} items)")
        print("-" * 50)

        for position, entry in enumerate(group, start=1):
            similarity_text = ""
            if 'similarity_score' in entry:
                similarity_text = f" (similarity: {entry['similarity_score']:.1f}%)"

            print(f"  {position}. [{entry['index']}] {entry['original_title']}{similarity_text}")

            # Show some additional context (venue, organiser). `or 'N/A'`
            # also covers fields that are present but empty or None, which a
            # plain .get(key, 'N/A') would print as a blank value.
            record = entry['item']
            venue = record.get('venue_name') or 'N/A'
            organiser = record.get('organiser') or 'N/A'
            print(f"      Venue: {venue}")
            print(f"      Organiser: {organiser}")
            print()

        print()

def remove_duplicates_interactive(items, duplicate_groups):
    """
    Resolve duplicate groups by asking the user which member of each group survives.

    For every group the members are listed and the user is prompted via
    `input()`. Answering with a valid 1-based number keeps that member and
    marks the rest of the group for removal. Answering 'skip' - or giving an
    out-of-range or non-numeric answer - leaves the whole group untouched.

    Args:
        items: Original list of items
        duplicate_groups: Groups whose members carry 'index' (position in
            `items`), 'original_title' and optionally 'similarity_score'

    Returns:
        A new list containing the items that were not marked for removal.
    """
    print("For each group, choose which item to KEEP (others will be removed)")
    print("Enter the number of the item to keep, or 'skip' to keep all items in the group\n")

    discarded = set()

    for group_no, group in enumerate(duplicate_groups, start=1):
        print(f"\nGROUP {group_no}:")
        for position, entry in enumerate(group, start=1):
            if 'similarity_score' in entry:
                suffix = f" (similarity: {entry['similarity_score']:.1f}%)"
            else:
                suffix = ""
            print(f"  {position}. {entry['original_title']}{suffix}")

        answer = input(f"Which item to keep? (1-{len(group)} or 'skip'): ").strip().lower()
        if answer == 'skip':
            continue

        try:
            keep_position = int(answer) - 1
        except ValueError:
            print("Invalid input, skipping this group")
            continue

        if not (0 <= keep_position < len(group)):
            print("Invalid choice, skipping this group")
            continue

        # Everything in the group except the chosen member gets dropped.
        for position, entry in enumerate(group):
            if position != keep_position:
                discarded.add(entry['index'])
        print(f"Keeping: {group[keep_position]['original_title']}")

    # Rebuild the list without the discarded positions.
    cleaned_items = [item for position, item in enumerate(items) if position not in discarded]

    print(f"\nRemoved {len(discarded)} duplicates")
    print(f"Original count: {len(items)}")
    print(f"After deduplication: {len(cleaned_items)}")

    return cleaned_items

def remove_duplicates_automatic(items, duplicate_groups, keep_strategy='first'):
    """
    Drop all but one member of every duplicate group, chosen by a fixed rule.

    Args:
        items: Original list of items
        duplicate_groups: Groups whose members carry 'index' (position in
            `items`) and 'original_title'
        keep_strategy: Which member survives - 'first', 'last',
            'longest_title' or 'shortest_title'. Any other value behaves
            like 'first'.

    Returns:
        A new list containing the items that were not marked for removal.
    """
    def survivor_position(group):
        """Return the position within `group` of the member to keep."""
        def title_length(position):
            return len(group[position]['original_title'])

        if keep_strategy == 'last':
            return len(group) - 1
        if keep_strategy == 'longest_title':
            return max(range(len(group)), key=title_length)
        if keep_strategy == 'shortest_title':
            return min(range(len(group)), key=title_length)
        # 'first' and every unrecognised strategy keep the leading member.
        return 0

    discarded = set()
    for group in duplicate_groups:
        keep_position = survivor_position(group)
        discarded.update(
            member['index']
            for position, member in enumerate(group)
            if position != keep_position
        )

    # Rebuild the list without the discarded positions.
    cleaned_items = [item for position, item in enumerate(items) if position not in discarded]

    print("Automatic deduplication complete!")
    print(f"Strategy used: {keep_strategy}")
    print(f"Removed {len(discarded)} duplicates")
    print(f"Original count: {len(items)}")
    print(f"After deduplication: {len(cleaned_items)}")

    return cleaned_items

# Print the detected groups for review (capped at the first 10).
display_duplicates(duplicate_groups, max_groups=10)


=== DUPLICATE GROUPS (showing first 10) ===

GROUP 1: (3 items)
--------------------------------------------------
  1. [0] PropNex Family Zone: Supertree-inspired Playground
      Venue: Active Garden, Gardens by the Bay
      Organiser: PropNex & Gardens by the Bay

  2. [67] Propnex Family Zone Supertree Fun (similarity: 95.4%)
      Venue: Propnex Family Zone, Gardens by the Bay
      Organiser: Gardens by the Bay

  3. [103] PropNex Family Zone (similarity: 100.0%)
      Venue: PropNex Family Zone
      Organiser: 


GROUP 2: (2 items)
--------------------------------------------------
  1. [1] COMO Adventure Grove: Botanic Garden Fun Zone
      Venue: COMO Adventure Grove, Singapore Botanic Gardens
      Organiser: Singapore Botanic Gardens & PlayPoint Asia

  2. [101] COMO Adventure Grove (similarity: 100.0%)
      Venue: COMO Adventure Grove
      Organiser: 


GROUP 3: (3 items)
--------------------------------------------------
  1. [2] Jacob Ballas Children’s Garden: Asia’s 

In [21]:
# Resolve the duplicate groups found above and save the result.
# The interactive pass (option 2) is the one that runs; the other options are
# kept as commented-out alternatives.

# OPTION 1: Automatic deduplication (keeps first item from each duplicate group)
# print("=== AUTOMATIC DEDUPLICATION ===")
# cleaned_list_auto = remove_duplicates_automatic(_ls, duplicate_groups, keep_strategy='first')

# OPTION 2 (active): Interactive deduplication - prompts once per group
print("\n=== INTERACTIVE DEDUPLICATION ===")
cleaned_list_interactive = remove_duplicates_interactive(_ls, duplicate_groups)

# OPTION 3: Try different similarity thresholds
# print("\n=== TRYING DIFFERENT SIMILARITY THRESHOLDS ===")
# for threshold in [70, 80, 85, 90]:
#     groups = fuzzy_deduplicate(_ls, similarity_threshold=threshold)
#     print(f"Threshold {threshold}%: Found {len(groups)} groups of duplicates")

# OPTION 4: Save the cleaned data as JSON
# NOTE(review): this writes to the current working directory, whereas the
# loading cell reads outdoor_playgrounds/playgrounds_deduplicated.json -
# confirm which location is intended.
print("\n=== SAVING CLEANED DATA ===")
output_path = Path('playgrounds_deduplicated.json')
with output_path.open('w', encoding='utf-8') as f:
    json.dump(cleaned_list_interactive, f, indent=2, ensure_ascii=False)

# Optional CSV export and summary for the automatic variant (option 1):
# df_clean = pd.DataFrame(cleaned_list_auto)
# df_clean.to_csv('playgrounds_deduplicated.csv', index=False)

# print(f"Saved deduplicated data to:")
# print("- playgrounds_deduplicated.json")
# print("- playgrounds_deduplicated.csv")
# print(f"Final count: {len(cleaned_list_auto)} playgrounds")



=== INTERACTIVE DEDUPLICATION ===
For each group, choose which item to KEEP (others will be removed)
Enter the number of the item to keep, or 'skip' to keep all items in the group


GROUP 1:
  1. PropNex Family Zone: Supertree-inspired Playground
  2. Propnex Family Zone Supertree Fun (similarity: 95.4%)
  3. PropNex Family Zone (similarity: 100.0%)
Keeping: PropNex Family Zone: Supertree-inspired Playground

GROUP 2:
  1. COMO Adventure Grove: Botanic Garden Fun Zone
  2. COMO Adventure Grove (similarity: 100.0%)
Keeping: COMO Adventure Grove: Botanic Garden Fun Zone

GROUP 3:
  1. Jacob Ballas Children’s Garden: Asia’s Largest
  2. Jacob Ballas Children's Garden Adventures (similarity: 89.2%)
  3. Jacob Ballas Children’s Garden (similarity: 100.0%)
Keeping: Jacob Ballas Children’s Garden: Asia’s Largest

GROUP 4:
  1. Far East Organization Children’s Garden Wet Fun
  2. Far East Organization Children's Garden (similarity: 100.0%)
  3. Far East Organization Children’s Playground (sim

In [17]:
# UTILITY FUNCTIONS FOR FINE-TUNING

def analyze_duplicates(duplicate_groups):
    """
    Print summary statistics about the duplicate groups.

    Reports the number of groups, the average and largest group size, how
    many items would be removed if one member per group were kept, and the
    min/max/average of the recorded similarity scores.

    Args:
        duplicate_groups: Groups whose members are dicts, optionally carrying
            a numeric 'similarity_score'.

    Returns:
        None. Output goes to stdout.
    """
    print("=== DUPLICATE ANALYSIS ===")
    print(f"Total duplicate groups: {len(duplicate_groups)}")

    # With no groups the averages below would divide by zero and max() would
    # fail on an empty list, so stop after reporting the count.
    if not duplicate_groups:
        print("No duplicate groups to analyze")
        return

    group_sizes = [len(group) for group in duplicate_groups]
    print(f"Average group size: {sum(group_sizes) / len(group_sizes):.1f}")
    print(f"Largest group size: {max(group_sizes)}")
    # One member per group survives, so removals = total members - group count.
    print(f"Total duplicates that would be removed: {sum(group_sizes) - len(duplicate_groups)}")

    # Similarity score distribution (only members that carry a score).
    all_scores = [
        member['similarity_score']
        for group in duplicate_groups
        for member in group
        if 'similarity_score' in member
    ]

    if all_scores:
        print(f"Similarity scores - Min: {min(all_scores):.1f}, Max: {max(all_scores):.1f}, Avg: {sum(all_scores)/len(all_scores):.1f}")

def find_specific_duplicates(items, search_term, similarity_threshold=80, title_key='title'):
    """
    Find the items whose title fuzzily matches a search term.

    The search term and every title are normalized with `preprocess_title`.
    The score is the maximum of `fuzz.ratio`, `fuzz.partial_ratio`,
    `fuzz.token_sort_ratio` and `fuzz.token_set_ratio`. Matches are printed
    from highest to lowest score.

    Relies on `fuzz` and `preprocess_title` being available in the notebook
    namespace.

    Args:
        items: Iterable of dictionaries
        search_term: Text to look for (e.g. a playground name)
        similarity_threshold: Minimum similarity score (0-100) to report
        title_key: Key containing the title field (defaults to 'title',
            matching `fuzzy_deduplicate`)

    Returns:
        List of match dicts with the keys 'index', 'title', 'similarity' and
        'venue', in the order the items were encountered.
    """
    matches = []
    search_processed = preprocess_title(search_term)

    for i, item in enumerate(items):
        title = item.get(title_key, '')
        processed_title = preprocess_title(title)

        # Take the most optimistic of the four scorers.
        max_similarity = max(
            fuzz.ratio(search_processed, processed_title),
            fuzz.partial_ratio(search_processed, processed_title),
            fuzz.token_sort_ratio(search_processed, processed_title),
            fuzz.token_set_ratio(search_processed, processed_title),
        )

        if max_similarity >= similarity_threshold:
            matches.append({
                'index': i,
                'title': title,
                'similarity': max_similarity,
                'venue': item.get('venue_name', 'N/A')
            })

    print(f"Found {len(matches)} matches for '{search_term}':")
    # Sorting is for display only; the returned list keeps encounter order.
    for match in sorted(matches, key=lambda m: m['similarity'], reverse=True):
        print(f"  {match['similarity']:.1f}% - {match['title']} ({match['venue']})")

    return matches

# Print summary statistics (group count and sizes, similarity score range)
# for the duplicate groups found above.
analyze_duplicates(duplicate_groups)

# Example: list every record whose title fuzzily matches a search term
# find_specific_duplicates(_ls, "Gardens by the Bay")


=== DUPLICATE ANALYSIS ===
Total duplicate groups: 35
Average group size: 2.5
Largest group size: 4
Total duplicates that would be removed: 51
Similarity scores - Min: 80.0, Max: 100.0, Avg: 92.0
