In [5]:
import pandas as pd
import ast
from rapidfuzz import process, fuzz

# Step 1: Load data
# Read the property listings CSV (path is relative to the current working directory) into `df`.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which looks like a
# previously saved index — consider passing index_col=0 here; confirm against the file.
df = pd.read_csv("fuzzy_grouped_properties.csv")

In [6]:
# Display the first rows of `df` as the cell output to sanity-check the loaded columns.
df.head()

Unnamed: 0,PropertyName,PropertySubName,NearbyLocations,LocationAdvantages,Link,PriceDetails,TopFacilities
0,Smartworld One DXP,"2, 3, 4 BHK Apartment in Sector 113, Gurgaon","['Bajghera Road', 'Palam Vihar Halt', 'DPSG Pa...","{'Bajghera Road': '800 Meter', 'Palam Vihar Ha...",https://www.99acres.com/smartworld-one-dxp-sec...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Salon', 'Restaurant', 'Spa'..."
1,M3M Crown,"3, 4 BHK Apartment in Sector 111, Gurgaon","['DPSG Palam Vihar Gurugram', 'The NorthCap Un...","{'DPSG Palam Vihar Gurugram': '1.4 Km', 'The N...",https://www.99acres.com/m3m-crown-sector-111-g...,"{'3 BHK': {'building_type': 'Apartment', 'area...","['Bowling Alley', 'Mini Theatre', 'Manicured G..."
2,Adani Brahma Samsara Vilasa,"Land, 3, 4 BHK Independent Floor in Sector 63,...","['AIPL Business Club Sector 62', 'Heritage Xpe...","{'AIPL Business Club Sector 62': '2.7 Km', 'He...",https://www.99acres.com/adani-brahma-samsara-v...,{'3 BHK': {'building_type': 'Independent Floor...,"['Terrace Garden', 'Gazebo', 'Fountain', 'Amph..."
3,Sobha City,"2, 3, 4 BHK Apartment in Sector 108, Gurgaon","['The Shikshiyan School', 'WTC Plaza', 'Luxus ...","{'The Shikshiyan School': '2.9 KM', 'WTC Plaza...",https://www.99acres.com/sobha-city-sector-108-...,"{'2 BHK': {'building_type': 'Apartment', 'area...","['Swimming Pool', 'Volley Ball Court', 'Aerobi..."
4,Signature Global City 93,"2, 3 BHK Independent Floor in Sector 93 Gurgaon","['Pranavananda Int. School', 'DLF Site central...","{'Pranavananda Int. School': '450 m', 'DLF Sit...",https://www.99acres.com/signature-global-city-...,{'2 BHK': {'building_type': 'Independent Floor...,"['Mini Theatre', 'Doctor on Call', 'Concierge ..."


In [9]:
import pandas as pd
import ast
from rapidfuzz import process, fuzz

# Step 1: Load data
# Re-read the CSV from disk and rebind `df`, so the steps below start from the file contents.
df = pd.read_csv("fuzzy_grouped_properties.csv")  # change to your actual file

# Step 2: Parse LocationAdvantage dicts safely
def parse_location_dict(x):
    """Parse one stringified dict from the ``LocationAdvantages`` column.

    Args:
        x: Raw cell value. Expected to be the text of a Python dict literal,
            but may be anything (NaN, free text, another literal type).

    Returns:
        dict: The parsed dictionary, or ``{}`` when ``x`` is not a valid
        Python literal or the literal is not a dict.
    """
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the exceptions ast.literal_eval raises for malformed input;
        # catching them explicitly (instead of a bare except) lets
        # KeyboardInterrupt and genuine programming errors surface.
        return {}
    # A well-formed non-dict literal (list, number, ...) would otherwise be
    # returned as-is and break callers that use .keys() / .items().
    return parsed if isinstance(parsed, dict) else {}

# Parse every LocationAdvantages cell once and keep the result in a temporary column.
df['__ParsedLocDict__'] = df['LocationAdvantages'].apply(parse_location_dict)  # Temp parsed column

# Step 3: Extract all unique place names
all_places = set()
for loc_dict in df['__ParsedLocDict__']:
    all_places.update(loc_dict.keys())
# Sort instead of list(set): set iteration order for strings changes between
# interpreter runs (hash randomization), and the grouping step below picks the
# first name it meets as the canonical one. Sorting makes that choice reproducible.
# key=str keeps the sort from failing should a key not be a string.
all_places = sorted(all_places, key=str)

# Step 4: Fuzzy group similar names
# mapped_places: place name -> canonical name of the group it was assigned to.
# visited: names that already belong to a group.
mapped_places = {}
visited = set()

for place in all_places:
    if place in visited:
        continue
    # limit=None is required: process.extract returns at most 5 results by
    # default, which silently truncates any group with more than five members.
    matches = process.extract(
        place,
        all_places,
        scorer=fuzz.token_sort_ratio,
        score_cutoff=90,
        limit=None,
    )
    canonical = place
    # Register the canonical name itself explicitly rather than relying on it
    # showing up among its own matches.
    mapped_places[canonical] = canonical
    visited.add(canonical)
    for match, _score, _ in matches:
        # Keep the first assignment: without this check a name already mapped to
        # an earlier canonical could be re-pointed at a later one.
        if match in visited:
            continue
        mapped_places[match] = canonical
        visited.add(match)

# Step 5: Apply mapping to replace keys in each dictionary
def replace_keys_with_fuzzy(d, mapping):
    """Return a new dict with each key of ``d`` swapped for ``mapping[key]``.

    Keys absent from ``mapping`` are kept unchanged. When several keys map to
    the same name, the value of the last one iterated is the one retained.
    """
    renamed = {}
    for original_key, payload in d.items():
        target_key = mapping.get(original_key, original_key)
        renamed[target_key] = payload
    return renamed

# Rewrite each parsed dict so its keys use the canonical names from mapped_places.
df['__CleanedLocDict__'] = df['__ParsedLocDict__'].apply(
    replace_keys_with_fuzzy, mapping=mapped_places
)

# Step 6: Replace original column with cleaned version (as stringified dict to keep format)
df['LocationAdvantages'] = df['__CleanedLocDict__'].apply(str)

# Step 7: Drop temp columns
temp_columns = ['__ParsedLocDict__', '__CleanedLocDict__']
df = df.drop(columns=temp_columns)

# Step 8: Save updated data
df.to_csv("location_advantage_fuzzy_cleaned.csv", index=False)

print("✔️ LocationAdvantage column updated with fuzzy-matched values.")


✔️ LocationAdvantage column updated with fuzzy-matched values.


In [12]:
import pandas as pd
import ast
import re # Import re for regex operations
from rapidfuzz import process, fuzz

# Step 1: Load data
# Re-read the CSV from disk and rebind `df`, so the steps below start from the file contents.
df = pd.read_csv("fuzzy_grouped_properties.csv") # change to your actual file

# Step 2: Parse LocationAdvantage dicts safely and normalize keys immediately
def parse_location_dict(x):
    """Parse one stringified dict and normalize its keys.

    Each key is converted to ``str``, lower-cased, stripped, and has runs of
    whitespace collapsed to a single space. Values are passed through untouched.

    Args:
        x: Raw cell value; expected to be the text of a Python dict literal.

    Returns:
        dict: ``{normalized_key: value}``, or ``{}`` when ``x`` is not a valid
        Python literal or the literal is not a dict.
    """
    try:
        raw_dict = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Exceptions ast.literal_eval raises for malformed input.
        return {}
    if not isinstance(raw_dict, dict):
        # A valid literal that is not a dict (e.g. a list) has no .items();
        # treat it like any other unparseable cell.
        return {}
    return {
        re.sub(r'\s+', ' ', str(key).lower().strip()): value
        for key, value in raw_dict.items()
    }

# Parse every LocationAdvantages cell once and keep the result in a temporary column.
df['__ParsedLocDict__'] = df['LocationAdvantages'].apply(parse_location_dict) # Temp parsed column

# Step 3: Extract all unique place names (now already normalized from Step 2)
all_places_normalized = set()
for loc_dict in df['__ParsedLocDict__']:
    all_places_normalized.update(loc_dict.keys()) # Keys are already normalized here
# Sort instead of list(set): set iteration order for strings changes between
# interpreter runs (hash randomization), and the grouping step picks the first
# name it meets as the canonical one. Sorting makes that choice reproducible.
all_places_normalized_list = sorted(all_places_normalized)

# Diagnostic dump: list every normalized place name (alphabetically) and the
# total count, so the input to the fuzzy grouping step can be inspected.
print("\n--- all_places_normalized_list BEFORE Fuzzy Grouping (inspect this!) ---")
places_in_order = sorted(all_places_normalized_list)
for place_name in places_in_order:
    print(f"- {place_name}")
entry_count = len(all_places_normalized_list)
print(f"Total entries BEFORE fuzzy grouping: {entry_count}")
print("------------------------------------------------------------------\n")


# Step 4: Fuzzy group similar names based on normalized forms
# mapped_places: normalized name -> canonical name of the group it was assigned to.
# visited: names that already belong to a group.
mapped_places = {}
visited = set()

for place_norm in all_places_normalized_list: # 'place_norm' is already normalized
    if place_norm in visited:
        continue

    # limit=None is required: process.extract returns at most 5 results by
    # default, which silently truncates any group with more than five members.
    matches = process.extract(
        place_norm,
        all_places_normalized_list,
        scorer=fuzz.token_sort_ratio,
        score_cutoff=90,
        limit=None,
    )

    canonical_form = place_norm
    # Register the canonical name itself explicitly rather than relying on it
    # showing up among its own matches.
    mapped_places[canonical_form] = canonical_form
    visited.add(canonical_form)
    for match_norm, _score, _ in matches:
        # Keep the first assignment: without this check a name already mapped to
        # an earlier canonical could be re-pointed at a later one.
        if match_norm in visited:
            continue
        mapped_places[match_norm] = canonical_form
        visited.add(match_norm)

# Step 5: Apply mapping (keys in __ParsedLocDict__ are already normalized, so direct mapping lookup)
def replace_keys_with_fuzzy(d, mapping):
    """Build a new dict from ``d`` with every key replaced by ``mapping[key]``.

    A key missing from ``mapping`` is kept as it is. If several keys resolve to
    the same name, the value belonging to the last one iterated is kept.
    """
    return {mapping.get(key, key): payload for key, payload in d.items()}

# Rewrite each parsed dict so its keys use the canonical names from mapped_places.
df['__CleanedLocDict__'] = df['__ParsedLocDict__'].apply(
    replace_keys_with_fuzzy, mapping=mapped_places
)

# Step 6: Replace original column with cleaned version (as stringified dict to keep format)
df['LocationAdvantages'] = df['__CleanedLocDict__'].apply(str)

# Step 7: Drop temp columns
temp_columns = ['__ParsedLocDict__', '__CleanedLocDict__']
df = df.drop(columns=temp_columns)

# Step 8: Save updated data
df.to_csv("location_advantage_fuzzy_cleaned.csv", index=False)

print("✔️ LocationAdvantage column updated with fuzzy-matched values.")


--- all_places_normalized_list BEFORE Fuzzy Grouping (inspect this!) ---
- aapno ghar
- aapnoghar
- aarvy healthcare
- aarvy healthcare hospital
- aarvy healthcare super speciality
- aarvy hospital
- aatish hospital
- accenture ddc5
- adarsh public school,garhi harsaru
- adarsh senior secondary school
- agri business management collage
- aiims
- aiims jhajjar
- aipl business centre
- aipl business club
- aipl business club sector 62
- aipl business co working space
- aipl business tower
- aipl joy street mall
- airia mall
- airia mall sector 68
- airport
- ajit stadium dhanwapur
- alfaa health care hospital
- alpine convent school
- alpine hospital
- alpine school
- altrade business centre
- aman hospital
- aman hospital & surgical centre
- ambience mall
- ambience mall new
- ambience public school
- american express
- amity
- amity university
- amity university gurugram
- amma hospital
- anand multispeciality hospital
- anand preschool
- ananta hospital
- ansal plaza
- anya gurgaon
-

In [20]:
import pandas as pd
import ast
import re
from rapidfuzz import process, fuzz

# Step 1: Load data
# IMPORTANT: Ensure 'fuzzy_grouped_properties.csv' is your original raw data file,
# not any intermediate fuzzy_cleaned versions.
# The frame is re-read from disk and rebound to `df`; Step 8 of this cell writes its result
# to a different file, so this input file is not overwritten by the cell.
df = pd.read_csv("fuzzy_grouped_properties.csv")

# Step 2: Parse LocationAdvantage dicts safely and aggressively normalize keys and VALUES immediately
def _normalize_place_key(key):
    """Lower-case a place name, drop parenthesised text and collapse whitespace."""
    normalized = str(key).lower()
    # Replace "(...)" with a space, not an empty string: deleting it together
    # with the surrounding whitespace would glue the neighbouring words
    # together ("dps (gurugram) school" -> "dpsschool").
    normalized = re.sub(r'\(.*?\)', ' ', normalized)
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    # Targeted fix for a known spelling variant that fuzzy matching misses.
    if normalized == 'aapnoghar':
        normalized = 'aapno ghar'
    return normalized


def _distance_to_meters(raw_value):
    """Convert a distance such as '800 Meter' or '2.5 KM' to meters (float).

    A value with no unit, or a unit other than KM/Meter, is taken to be in
    meters already. Returns 0.0 when no number can be extracted.
    """
    # strip() so a leading space does not defeat the start-anchored match.
    match = re.match(r'(\d+\.?\d*)\s*(KM|Meter)?', str(raw_value).strip(), re.IGNORECASE)
    if match:
        amount = float(match.group(1))
        unit = (match.group(2) or '').lower()
        return amount * 1000.0 if unit == 'km' else amount
    try:
        # No leading digits (e.g. ".5"): fall back to a direct conversion.
        return float(raw_value)
    except (TypeError, ValueError):
        # TypeError covers non-string, non-numeric values such as None.
        return 0.0


def parse_location_dict(x):
    """Parse one stringified ``{place: distance}`` dict into normalized form.

    Args:
        x: Raw cell value; expected to be the text of a Python dict literal.

    Returns:
        dict: ``{normalized place name: distance in meters as float}``, or
        ``{}`` when ``x`` is not a valid Python literal or not a dict. If two
        keys normalize to the same name, the last one iterated wins.
    """
    try:
        raw_dict = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Exceptions ast.literal_eval raises for malformed input.
        return {}
    if not isinstance(raw_dict, dict):
        # A valid literal that is not a dict (e.g. a list) has no .items().
        return {}

    parsed_dict = {}
    for key, value in raw_dict.items():
        parsed_dict[_normalize_place_key(key)] = _distance_to_meters(value)
    return parsed_dict

# Parse every LocationAdvantages cell once and keep the result in a temporary column.
df['__ParsedLocDict__'] = df['LocationAdvantages'].apply(parse_location_dict)

# Step 3: Extract all unique place names (already more aggressively normalized from Step 2)
all_places_normalized = set()
for loc_dict in df['__ParsedLocDict__']:
    all_places_normalized.update(loc_dict.keys())
# Sort instead of list(set): set iteration order for strings changes between
# interpreter runs (hash randomization), and the grouping step picks the first
# name it meets as the canonical one. Sorting makes that choice reproducible.
all_places_normalized_list = sorted(all_places_normalized)

# Diagnostic dump: list every normalized place name (alphabetically) and the
# total count, so the result of the normalization step can be inspected.
print("\n--- all_places_normalized_list AFTER All Normalization (inspect this!) ---")
places_in_order = sorted(all_places_normalized_list)
for place_name in places_in_order:
    print(f"- {place_name}")
entry_count = len(all_places_normalized_list)
print(f"Total entries AFTER all normalization: {entry_count}")
print("------------------------------------------------------------------\n")


# Step 4: Fuzzy group similar names based on normalized forms
# mapped_places: normalized name -> canonical name of the group it was assigned to.
# visited: names that already belong to a group.
mapped_places = {}
visited = set()

for place_norm in all_places_normalized_list:
    if place_norm in visited:
        continue

    # Use token_sort_ratio for robust comparison, with a cutoff of 85.
    # limit=None is required: process.extract returns at most 5 results by
    # default, which silently truncates any group with more than five members.
    matches = process.extract(
        place_norm,
        all_places_normalized_list,
        scorer=fuzz.token_sort_ratio,
        score_cutoff=85,
        limit=None,
    )

    canonical_form = place_norm # The current 'place_norm' becomes the canonical form for its group
    # Register the canonical name itself explicitly rather than relying on it
    # showing up among its own matches.
    mapped_places[canonical_form] = canonical_form
    visited.add(canonical_form)
    for match_norm, _score, _ in matches:
        # Keep the first assignment: without this check a name already mapped to
        # an earlier canonical could be re-pointed at a later one.
        if match_norm in visited:
            continue
        mapped_places[match_norm] = canonical_form
        visited.add(match_norm)

# Step 5: Apply mapping (keys in __ParsedLocDict__ are already normalized, so direct mapping lookup)
def replace_keys_with_fuzzy(d, mapping):
    """Re-key ``d`` by canonical place name, merging aliases of the same place.

    Args:
        d: ``{normalized place name: distance}`` with numeric distances.
        mapping: ``{normalized place name: canonical place name}``; names not
            present are kept unchanged.

    Returns:
        dict: ``{canonical place name: distance}``. When several keys of ``d``
        resolve to the same canonical name, the smallest positive distance is
        kept. Adding the distances together (the previous behaviour) yields a
        number that is not a distance to anything.
    """
    cleaned_dict = {}
    for normalized_key, value in d.items():
        canonical_normalized_key = mapping.get(normalized_key, normalized_key)
        previous = cleaned_dict.get(canonical_normalized_key)
        # 0.0 is what the parsing step in this cell stores for a distance it
        # could not read, so a positive distance always replaces it; otherwise
        # only a smaller positive distance replaces the one already stored.
        if previous is None or previous <= 0.0 or 0.0 < value < previous:
            cleaned_dict[canonical_normalized_key] = value
    return cleaned_dict

# Rewrite each parsed dict so its keys use the canonical names from mapped_places.
df['__CleanedLocDict__'] = df['__ParsedLocDict__'].apply(
    replace_keys_with_fuzzy, mapping=mapped_places
)

# Step 6: Replace original column with cleaned version (as stringified dict to keep format)
df['LocationAdvantages'] = df['__CleanedLocDict__'].apply(str)

# Step 7: Drop temporary columns
temp_columns = ['__ParsedLocDict__', '__CleanedLocDict__']
df = df.drop(columns=temp_columns)

# Step 8: Save updated data
df.to_csv("location_advantage_fuzzy_cleaned.csv", index=False)

# Completion messages, printed one per line in this order.
completion_messages = (
    "✔️ LocationAdvantage column updated with fuzzy-matched values and saved to 'location_advantage_fuzzy_cleaned.csv'.",
    "The script now also handles converting distance strings (e.g., '800 Meter', '2.5 KM') to numeric values (in meters) for proper aggregation.",
    "Please inspect 'location_advantage_fuzzy_cleaned.csv' and the printed list of unique places to confirm the merges and value conversions.",
)
for message in completion_messages:
    print(message)


--- all_places_normalized_list AFTER All Normalization (inspect this!) ---
- aapno ghar
- aarvy healthcare
- aarvy healthcare hospital
- aarvy healthcare super speciality
- aarvy hospital
- aatish hospital
- accenture ddc5
- adarsh public school,garhi harsaru
- adarsh senior secondary school
- agri business management collage
- aiims
- aiims jhajjar
- aipl business centre
- aipl business club
- aipl business club sector 62
- aipl business co working space
- aipl business tower
- aipl joy street mall
- airia mall
- airia mall sector 68
- airport
- ajit stadium dhanwapur
- alfaa health care hospital
- alpine convent school
- alpine hospital
- alpine school
- altrade business centre
- aman hospital
- aman hospital & surgical centre
- ambience mall
- ambience mall new
- ambience public school
- american express
- amity
- amity university
- amity university gurugram
- amma hospital
- anand multispeciality hospital
- anand preschool
- ananta hospital
- ansal plaza
- anya gurgaon
- ap sports

In [18]:
import pandas as pd
import ast

# --- Step 1: Load the fuzzy-cleaned CSV file ---
try:
    df_cleaned = pd.read_csv("location_advantage_fuzzy_cleaned.csv")
except FileNotFoundError:
    print("Error: 'location_advantage_fuzzy_cleaned.csv' not found. Please ensure your fuzzy cleaning script has been run and the file exists in the correct directory.")
    # Re-raise rather than calling exit(): exit() is the interactive-shell
    # helper and, inside a notebook kernel, asks the kernel to shut down.
    # Re-raising stops this cell with the original traceback instead.
    raise

# Ensure 'LocationAdvantages' column is treated as a string before parsing.
# Kept outside the try block so only the file read is guarded.
df_cleaned['LocationAdvantages'] = df_cleaned['LocationAdvantages'].astype(str)

# --- Helper function to parse stringified dictionaries safely ---
def parse_location_dict_safe(x):
    """Parse the text of a Python dict literal, never raising on bad input.

    Args:
        x: Cell value to parse.

    Returns:
        dict: The parsed dictionary, or ``{}`` when ``x`` is not a valid
        Python literal or the literal is not a dict.
    """
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Exceptions ast.literal_eval raises for malformed input.
        return {}  # Return an empty dict for unparseable strings
    # A valid non-dict literal (e.g. "[1, 2]" or "42") would otherwise be
    # returned as-is and fail when the caller asks for .keys().
    return parsed if isinstance(parsed, dict) else {}

# --- Step 2: Extract all unique location names from the 'LocationAdvantages' column ---
# Parse each cell and gather the keys of every resulting dict into one set.
all_canonical_locations = {
    place_name
    for cell_text in df_cleaned['LocationAdvantages']
    for place_name in parse_location_dict_safe(cell_text).keys()
}

# Convert to a sorted list for easy viewing
unique_places_in_csv = sorted(all_canonical_locations)

# --- Step 3: Print the list of unique places and their count ---
# Header lines first, then one "- name" line per place, then the total and a closing rule.
for header_line in (
    "--- Unique Canonical Places in location_advantage_fuzzy_cleaned.csv ---",
    "List of all unique canonical places:",
):
    print(header_line)
for place_name in unique_places_in_csv:
    print(f"- {place_name}")
place_total = len(unique_places_in_csv)
print(f"\nTotal number of unique places: {place_total}")
print("---------------------------------------------------------------")

--- Unique Canonical Places in location_advantage_fuzzy_cleaned.csv ---
List of all unique canonical places:
- aapno ghar
- aarvy healthcare
- aarvy healthcare hospital
- aarvy healthcare super speciality
- aarvy hospital
- aatish hospital
- accenture ddc5
- adarsh public school,garhi harsaru
- adarsh senior secondary school
- agri business management collage
- aiims
- aiims jhajjar
- aipl business centre
- aipl business club
- aipl business club sector 62
- aipl business co working space
- aipl business tower
- aipl joy street mall
- airia mall
- airia mall sector 68
- airport
- ajit stadium dhanwapur
- alfaa health care hospital
- alpine convent school
- alpine hospital
- alpine school
- altrade business centre
- aman hospital & surgical centre
- ambience mall
- ambience public school
- american express
- amity
- amity university
- amity university gurugram
- amma hospital
- anand multispeciality hospital
- anand preschool
- ananta hospital
- ansal plaza
- anya gurgaon
- ap sports cr