In [None]:
import pandas as pd
import json # For parsing the stringified dictionary if it's still a string
import ast  # For literal_eval if needed

# Load your fuzzy-cleaned data
df = pd.read_csv("location_advantage_fuzzy_cleaned.csv")

all_canonical_locations = set()

for index, row in df.iterrows():
    try:
        # Ensure the LocationAdvantages column is treated as a dictionary
        # If it's stored as a string, use ast.literal_eval or json.loads
        loc_dict = ast.literal_eval(row['LocationAdvantages'])
        all_canonical_locations.update(loc_dict.keys())
    except (ValueError, SyntaxError):
        # Handle cases where parsing might fail
        continue

# Convert to a sorted list for consistent column order in location_df
unique_location_columns = sorted(list(all_canonical_locations))

In [None]:
# Assuming 'PropertyName' is the unique identifier for your properties
property_names = df['PropertyName'].tolist()
location_df = pd.DataFrame(index=property_names, columns=unique_location_columns)

In [None]:
for index, row in df.iterrows():
    property_name = row['PropertyName']
    try:
        loc_dict = ast.literal_eval(row['LocationAdvantages'])
        for location, distance_str in loc_dict.items():
            # Convert distance string (e.g., '800 Meter', '2.5 KM') to numeric (e.g., meters)
            # You'll need a robust function for this conversion if not already done
            distance_value = your_distance_conversion_function(distance_str) # Implement this function
            location_df.loc[property_name, location] = distance_value
    except (ValueError, SyntaxError):
        continue

# Fill any NaN values (properties without a certain location advantage) with a suitable default
# e.g., a very large number, or 0, depending on your distance interpretation
location_df = location_df.fillna(0) # Or a large number like 999999 for "no advantage"

In [5]:
import pandas as pd
import ast
import re
import pickle

# --- Step 1: Load your fuzzy-cleaned data ---
# Make sure 'location_advantage_fuzzy_cleaned.csv' is the output from your previous fuzzy cleaning script.
try:
    df = pd.read_csv("location_advantage_fuzzy_cleaned.csv")
    # Ensure 'LocationAdvantages' column is treated as a string before parsing
    df['LocationAdvantages'] = df['LocationAdvantages'].astype(str)
except FileNotFoundError:
    print("Error: 'location_advantage_fuzzy_cleaned.csv' not found. Please ensure your fuzzy cleaning script has been run and the file exists.")
    exit()

# --- Helper function to parse stringified dictionaries safely ---
def parse_location_dict_safe(x):
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError):
        return {} # Return an empty dict for unparseable strings

# --- Helper function to convert distance strings to a consistent numeric format (e.g., meters) ---
def convert_distance_to_meters(distance_str):
    if not isinstance(distance_str, str):
        return None # Or a default value like 0, or raise an error

    distance_str = distance_str.strip().lower()
    
    # Regular expression to extract number and unit
    match = re.match(r'([\d.]+)\s*(meter|km)', distance_str)
    
    if match:
        value = float(match.group(1))
        unit = match.group(2)
        
        if unit == 'km':
            return value * 1000  # Convert kilometers to meters
        elif unit == 'meter':
            return value
    return None # Return None if conversion fails for any reason


# --- Step 2: Extract all unique canonical locations from the cleaned data ---
all_canonical_locations = set()

for index, row in df.iterrows():
    loc_dict = parse_location_dict_safe(row['LocationAdvantages'])
    all_canonical_locations.update(loc_dict.keys())

# Convert to a sorted list for consistent column order in location_df
unique_location_columns = sorted(list(all_canonical_locations))

# --- Step 3: Initialize location_df with properties as index and unique locations as columns ---
# Assuming 'PropertyName' is the column that uniquely identifies your properties
if 'PropertyName' not in df.columns:
    print("Error: 'PropertyName' column not found in the DataFrame. Please adjust the column name.")
    exit()

# Ensure 'PropertyName' is unique for indexing
# If 'PropertyName' is not unique, you might need to create a unique identifier
property_names = df['PropertyName'].unique().tolist() # Use unique property names for the index
location_df = pd.DataFrame(index=property_names, columns=unique_location_columns)

# --- Step 4: Populate location_df with distances ---
# Iterate through each row of your main DataFrame
for index, row in df.iterrows():
    property_name = row['PropertyName']
    loc_dict = parse_location_dict_safe(row['LocationAdvantages'])

    for location, distance_str in loc_dict.items():
        # Convert the distance string to a numeric value in meters
        distance_value = convert_distance_to_meters(distance_str)
        
        # Ensure the location exists in our unique columns before assigning
        if location in location_df.columns:
            location_df.loc[property_name, location] = distance_value

# --- Step 5: Fill any NaN values ---
# If a property doesn't have a specific location advantage, fill with a suitable default.
# For distances, a large number (e.g., 999999) or 0 might be appropriate, depending on your model's interpretation.
# Using 0 if missing distance means "no relevance" or "zero distance" for that location
location_df = location_df.fillna(0) # Or another appropriate default value



  location_df = location_df.fillna(0) # Or another appropriate default value


In [6]:
print("\n--- Debugging location_df Uniqueness ---")
print(f"Are there duplicate columns (locations)? {location_df.columns.duplicated().any()}")
if location_df.columns.duplicated().any():
    print(f"Duplicate column names: {location_df.columns[location_df.columns.duplicated()].tolist()}")

print(f"Are there duplicate index (property) names? {location_df.index.duplicated().any()}")
if location_df.index.duplicated().any():
    print(f"Duplicate index names: {location_df.index[location_df.index.duplicated()].tolist()}")
print("---------------------------------------")


--- Debugging location_df Uniqueness ---
Are there duplicate columns (locations)? False
Are there duplicate index (property) names? False
---------------------------------------


In [7]:
# Convert the DataFrame to a numeric type if necessary (after filling NaNs)
location_df = location_df.apply(pd.to_numeric, errors='coerce')


# --- Step 6: Pickle the cleaned and deduplicated location_df ---
try:
    with open('location_distance.pkl', 'wb') as f:
        pickle.dump(location_df, f)
    print("\n✔️ 'location_distance.pkl' created successfully with unique location data.")
    print("\nSample of the first 5 rows of the generated location_df:")
    print(location_df.head())
except Exception as e:
    print(f"Error pickling location_df: {e}")


✔️ 'location_distance.pkl' created successfully with unique location data.

Sample of the first 5 rows of the generated location_df:
                             AIIMS  AIIMS Jhajjar  AIPL Business Centre  \
Smartworld One DXP               0            0.0                   0.0   
M3M Crown                        0            0.0                   0.0   
Adani Brahma Samsara Vilasa      0            0.0                   0.0   
Sobha City                       0            0.0                   0.0   
Signature Global City 93         0            0.0                   0.0   

                             AIPL Business Club  AIPL Business Club Sector 62  \
Smartworld One DXP                          0.0                           0.0   
M3M Crown                                   0.0                           0.0   
Adani Brahma Samsara Vilasa                 0.0                        2700.0   
Sobha City                                  0.0                           0.0   
Signature 