<a href="https://colab.research.google.com/github/anhpdd/ml-property-valuation-klang-valley/blob/main/notebooks/1_2_prop_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
# Install dependencies
!pip install -q osmnx

# Core
import pandas as pd
import numpy as np
import geopandas as gpd
import osmnx as ox
import networkx as nx

# Geospatial
from shapely.geometry import Point, LineString, Polygon, MultiPolygon
from shapely.ops import unary_union, polygonize
from shapely import wkt
from geopy.distance import great_circle

# Utilities
import requests
import xml.etree.ElementTree as ET
from typing import List, Dict, Any, Optional
from tqdm.auto import tqdm
from IPython.display import display

# Visualization
import folium

# Mount Google Drive (Colab only) so files under /content/drive can be read by later cells
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [24]:
# Read the assessed-roads workbook from Drive into a pandas DataFrame
ASSESSED_ROADS_PATH = '/content/drive/MyDrive/Colab/Capstone 1/assessed_roads.xlsx'
road_df = pd.read_excel(ASSESSED_ROADS_PATH)

# Show column names, dtypes and non-null counts
road_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68677 entries, 0 to 68676
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   records_id         68677 non-null  int64 
 1   property_type      68677 non-null  object
 2   district           68677 non-null  object
 3   mukim              68677 non-null  object
 4   scheme_name        68677 non-null  object
 5   road_name          68677 non-null  object
 6   date               68677 non-null  object
 7   tenure             68677 non-null  object
 8   land_m2            68677 non-null  int64 
 9   property_m2        68677 non-null  int64 
 10  transaction_price  68677 non-null  int64 
 11  year               68677 non-null  int64 
 12  unit_level         68677 non-null  int64 
 13  way_id             68677 non-null  int64 
 14  geometry           68676 non-null  object
dtypes: int64(7), object(8)
memory usage: 7.9+ MB


## Test 2: Fetch OSM road geometry for records that are missing it

In [61]:
# Root of the OpenStreetMap editing API (v0.6); element URLs are built by appending
# "/way/<id>/full" or "/relation/<id>/full" to it.
OSM_API_BASE_URL = "https://api.openstreetmap.org/api/0.6"

# Headers sent with every OSM request (see fetch_osm_data).
# NOTE(review): the project URL and e-mail below look like placeholders
# (example.com) -- replace them with real contact details before bulk use.
HEADERS = {
    'User-Agent': 'MyDataProject/1.0 (https://example.com; myemail@example.com)'
}

def fetch_osm_data(url: str, timeout: int = 25) -> ET.Element | None:
    """
    Fetches data from the OSM API and parses it as XML.
    Includes a required User-Agent header.
    """
    try:
        # Add the headers to your request
        response = requests.get(url, timeout=timeout, headers=HEADERS)
        response.raise_for_status()
        return ET.fromstring(response.content)

    except requests.exceptions.HTTPError as e:
        # Your excellent 404 handling
        if e.response is not None and e.response.status_code == 404:
            return None
        print(f"HTTP Error for {url}: {e}")
        return None # Explicitly return None on other HTTP errors

    except requests.exceptions.RequestException as e:
        print(f"Request failed for {url}: {e}")
        return None # Explicitly return None on failure

    except ET.ParseError as e:
        print(f"XML parsing failed for {url}. Error: {e}")
        return None # Explicitly return None on failure


def extract_line_geometry(input_df: pd.DataFrame, name_column: str, id_column: str) -> pd.DataFrame:
    """
    Fetch each OSM way referenced in `input_df` and attach its road geometry.

    Every distinct, non-null value of `id_column` is requested once from
    "<OSM_API_BASE_URL>/way/<id>/full". The way's node references are resolved
    to (lon, lat) pairs and turned into a LineString when at least two of them
    resolve. The input frame is not modified.

    Args:
        input_df: DataFrame with Way IDs
        name_column: Column containing road/location name (accepted for
            interface compatibility; not used by the extraction itself)
        id_column: Column containing OSM Way ID

    Returns:
        Copy of the input DataFrame with a 'geometry' column (NaN for rows
        whose way could not be fetched or had fewer than two usable nodes)
    """
    total_rows = len(input_df)
    print(f"üìç Extracting geometry for {total_rows:,} Way IDs...")

    # way_id -> LineString, filled only for ways that were fetched successfully
    way_id_to_geometry = {}

    # One request per distinct way, however many rows share it
    unique_way_ids = input_df[id_column].dropna().unique()

    for way_id in tqdm(unique_way_ids, desc="Fetching geometries"):
        way_id_str = str(int(way_id))

        url = f"{OSM_API_BASE_URL}/way/{way_id_str}/full"
        root = fetch_osm_data(url)

        # Compare with None explicitly: the truth value of an Element reflects
        # whether it has children and testing it is deprecated.
        if root is None:
            continue

        road_way_elem = root.find(f".//way[@id='{way_id_str}']")
        if road_way_elem is None:
            continue

        # node id -> (lon, lat); nodes with missing or non-numeric coordinates are skipped
        node_coords_cache = {}
        for node_elem in root.findall(".//node"):
            try:
                node_id = node_elem.get('id')
                lat = float(node_elem.get('lat'))
                lon = float(node_elem.get('lon'))
                node_coords_cache[node_id] = (lon, lat)
            except (TypeError, ValueError):
                continue

        # Follow the way's <nd> references in order, keeping those that resolved
        road_coords = [
            node_coords_cache[nd_ref_elem.get('ref')]
            for nd_ref_elem in road_way_elem.findall('nd')
            if nd_ref_elem.get('ref') in node_coords_cache
        ]

        # A LineString needs at least two points
        if len(road_coords) >= 2:
            way_id_to_geometry[way_id] = LineString(road_coords)

    # Map geometries back onto a copy of the original DataFrame
    output_df = input_df.copy()
    output_df['geometry'] = output_df[id_column].map(way_id_to_geometry)

    # Summary (guarded so an empty input reports 0.0% instead of dividing by zero)
    success_count = output_df['geometry'].notna().sum()
    success_pct = success_count / total_rows * 100 if total_rows else 0.0
    print(f"‚úÖ Extracted geometry for {success_count:,}/{total_rows:,} rows ({success_pct:.1f}%)")

    return output_df

def generate_geometry_summary(
    result_gdf: Optional[pd.DataFrame],
    original_df: pd.DataFrame,
    name_column: str
) -> Optional[pd.DataFrame]:
    """
    Analyzes the result of the geometry extraction and prints a summary.

    Success is judged per unique road name: a name counts as successful when
    at least one result row carrying that name has a non-null 'geometry'.
    Result rows whose geometry is null are failures even though they are
    present in the result (extract_line_geometry returns every input row).
    If the result has no 'geometry' column, every result row counts as a success.

    Args:
        result_gdf (Optional[pd.DataFrame]): The frame returned by
            extract_line_geometry (a GeoDataFrame is accepted too). None or
            an empty frame means the extraction failed entirely.
        original_df (pd.DataFrame): The original DataFrame that was passed to the
            geometry extraction function.
        name_column (str): The name of the column containing the road/location names.

    Returns:
        Optional[pd.DataFrame]: The rows of `original_df` whose road name produced
            no geometry, or None when there was nothing to check or every
            road succeeded. When `result_gdf` is None or empty, `original_df`
            itself is returned.
    """
    print("\n--- Geometry Extraction Summary ---")

    if original_df.empty:
        print("Original DataFrame is empty. No roads to process.")
        return None

    # Missing names are ignored, matching the previous nunique()-based count.
    original_names = set(original_df[name_column].dropna())
    total_unique_roads = len(original_names)
    print(f"Total unique roads attempted: {total_unique_roads}")

    if total_unique_roads == 0:
        # Nothing to divide by below; every name in the input is missing.
        print("No named roads to evaluate.")
        return None

    if result_gdf is None or result_gdf.empty:
        print("No geometries were successfully created.")
        print("Success rate: 0.00%")
        print("Failure rate: 100.00%")
        return original_df  # All failed records

    # Only rows that actually carry a geometry count as successes.
    if 'geometry' in result_gdf.columns:
        rows_with_geometry = result_gdf[result_gdf['geometry'].notna()]
    else:
        rows_with_geometry = result_gdf

    successful_names = set(rows_with_geometry[name_column].dropna()) & original_names
    failed_names = original_names - successful_names

    successful_count = len(successful_names)
    failed_count = len(failed_names)

    print(f"Successfully created geometries: {successful_count} roads ({successful_count/total_unique_roads:.2%})")
    print(f"Failed to create geometries: {failed_count} roads ({failed_count/total_unique_roads:.2%})")

    if failed_names:
        print(f"\nReturning a DataFrame with {len(failed_names)} failed records for review.")
        failed_df = original_df[original_df[name_column].isin(failed_names)].copy()
        return failed_df

    print("\nCongratulations! All road geometries were created successfully.")
    return None

In [59]:
# Distinct (road, way, district) combinations that still have no geometry.
# NOTE(review): `df` is not defined in any cell visible above (the only read into
# `df` further down is commented out) -- confirm which frame this should be so
# the notebook survives Restart & Run All.
road_df = (
    df.loc[df['geometry'].isna(), ['road_name', 'way_id', 'district']]
    .drop_duplicates()
    .reset_index(drop=True)
)
road_df

Unnamed: 0,road_name,way_id,district
0,JALAN MUTIARA 3/8A,435014514,GOMBAK


In [62]:
# Fetch the OSM geometry for every way in road_df, then store way_id as int
road_route_details = (
    extract_line_geometry(road_df, 'road_name', 'way_id')
    .reset_index(drop=True)
    .astype({'way_id': int})
)

# Preview the first rows
road_route_details.head()

üìç Extracting geometry for 1 Way IDs...


Fetching geometries:   0%|          | 0/1 [00:00<?, ?it/s]

‚úÖ Extracted geometry for 1/1 rows (100.0%)


  if not root:
  if not road_way_elem:


Unnamed: 0,road_name,way_id,district,geometry
0,JALAN MUTIARA 3/8A,435014514,GOMBAK,"LINESTRING (101.6693384 3.2622569, 101.6689928..."


In [63]:
# Generate the Test 2 summary: compares the extraction result with the roads that
# were requested and returns the failed rows (None when nothing failed)
generate_geometry_summary(road_route_details, road_df, 'road_name')


--- Geometry Extraction Summary ---
Total unique roads attempted: 1
Successfully created geometries: 1 roads (100.00%)
Failed to create geometries: 0 roads (0.00%)

Congratulations! All road geometries were created successfully.


## Test 3: Check that each road lies in its recorded district

In [64]:
def get_district_data(object_id: str) -> dict | None:
    """
    Fetch an OSM relation and assemble its member ways into polygon rings.

    The relation is downloaded from "<OSM_API_BASE_URL>/relation/<id>/full".
    Its way members are split into outer and inner segments by role (a missing
    or empty role is treated as 'outer'), stitched into closed rings with
    unary_union + polygonize, and each inner ring is attached as a hole to the
    first outer polygon that contains it.

    Only the relation path is implemented in this cell; there is no way/node
    fallback here, so any failure results in None.

    Args:
        object_id: OSM relation ID as a string.

    Returns:
        None when no polygon could be derived; otherwise a dict with keys
        'name' (the relation's name tag or None), 'tags' (all tags),
        'all_polygons_coordinates', 'id' and 'type' ("relation").
        'all_polygons_coordinates' is a list with one entry per polygon; each
        entry is a list of rings of (lon, lat) tuples, exterior first
        (counter-clockwise) followed by the holes (clockwise).
    """
    final_object_type = None
    final_tags = {}
    final_name_tag = None
    final_all_polygons_coordinates = []
    processed_successfully = False

    # --- Try to process as a RELATION ---
    try:
        relation_url = f"{OSM_API_BASE_URL}/relation/{object_id}/full"
        root_xml = fetch_osm_data(relation_url)
        # Compare with None explicitly: the truth value of an Element reflects
        # whether it has children and testing it is deprecated.
        if root_xml is None:
            raise ValueError("XML data could not be fetched.")

        relation_element = root_xml.find(f".//relation[@id='{object_id}']")
        if relation_element is None:
            raise ValueError("Relation element not found in XML.")

        current_tags = {tag.get('k'): tag.get('v') for tag in relation_element.findall('tag') if tag.get('k')}
        current_name_tag = current_tags.get('name')

        # node id -> (lon, lat) for every node in the response
        nodes_coords_cache = {
            node.get('id'): (float(node.get('lon')), float(node.get('lat')))
            for node in root_xml.findall('.//node') if node.get('id') and node.get('lat') and node.get('lon')
        }

        # Index the ways once instead of running an XPath search per member.
        # setdefault keeps the first occurrence, as find() did.
        ways_by_id = {}
        for way_elem in root_xml.findall('.//way'):
            ways_by_id.setdefault(way_elem.get('id'), way_elem)

        # Separate member ways into outer / inner coordinate segments
        outer_way_segments = []
        inner_way_segments = []
        for member in relation_element.findall("member[@type='way']"):
            way_elem = ways_by_id.get(member.get('ref'))
            if way_elem is None:
                continue

            coords = [nodes_coords_cache[nd.get('ref')] for nd in way_elem.findall('nd') if nd.get('ref') in nodes_coords_cache]
            # A LineString needs two points; skipping a degenerate segment keeps
            # one bad way from aborting the whole relation.
            if len(coords) < 2:
                continue

            # An attribute that is present but empty (role="") must also fall
            # back to 'outer'; .get('role', 'outer') only covers a missing one.
            role = member.get('role') or 'outer'
            if role == 'outer':
                outer_way_segments.append(coords)
            elif role == 'inner':
                inner_way_segments.append(coords)

        # Convert coordinate segments into Shapely LineString objects
        outer_lines = [LineString(segment) for segment in outer_way_segments]
        inner_lines = [LineString(segment) for segment in inner_way_segments]

        # Merge all connecting lines, then form polygons from the closed rings
        merged_outer_lines = unary_union(outer_lines)
        merged_inner_lines = unary_union(inner_lines)
        stitched_outer_polygons = list(polygonize(merged_outer_lines))
        stitched_inner_polygons = list(polygonize(merged_inner_lines))

        final_shapely_polygons = []
        # Inner polygons not yet claimed by an outer polygon
        remaining_inners = list(stitched_inner_polygons)

        for outer_poly in stitched_outer_polygons:
            holes_for_this_poly = []
            unassigned_inners = []

            for inner_poly in remaining_inners:
                # An inner ring becomes a hole of the first outer polygon containing it
                if outer_poly.contains(inner_poly):
                    holes_for_this_poly.append(inner_poly.exterior.coords)
                else:
                    unassigned_inners.append(inner_poly)

            remaining_inners = unassigned_inners

            final_shapely_polygons.append(Polygon(outer_poly.exterior.coords, holes_for_this_poly))

        if final_shapely_polygons:
            for shp_poly in final_shapely_polygons:
                # Normalise winding: exterior counter-clockwise, holes clockwise
                exterior_coords = list(shp_poly.exterior.coords)
                if not shp_poly.exterior.is_ccw:
                    exterior_coords.reverse()
                poly_data = [exterior_coords]
                for interior_ring in shp_poly.interiors:
                    interior_coords = list(interior_ring.coords)
                    if interior_ring.is_ccw:
                        interior_coords.reverse()
                    poly_data.append(interior_coords)
                final_all_polygons_coordinates.append(poly_data)

            final_tags = current_tags
            final_name_tag = current_name_tag
            final_object_type = "relation"
            processed_successfully = True
            print(f"Successfully processed ID {object_id} as RELATION with {len(final_shapely_polygons)} polygon(s).")
        else:
            print(f"INFO: Relation {object_id} could not form valid polygons.")

    except Exception as e_relation:
        # Best effort: report the problem and fall through to the final None return.
        print(f"Error processing relation for {object_id}: {e_relation}. Trying as way.")
        processed_successfully = False

    # NOTE: the way/node fallback announced by the message above is not part of
    # this cell; only relations are handled here.

    # --- Final Return ---
    if not processed_successfully or not final_all_polygons_coordinates:
        print(f"FINAL: Could not derive usable geometry for OSM object ID {object_id}.")
        return None

    return {
        'name': final_name_tag,
        'tags': final_tags,
        'all_polygons_coordinates': final_all_polygons_coordinates,
        'id': object_id,
        'type': final_object_type
    }

In [65]:
def create_geometry(row):
    """
    Build a Shapely geometry from one row of fetched OSM data.

    Args:
        row: Mapping (for example a DataFrame row) with the keys
            'all_polygons_coordinates', 'type' and 'id'. The expected
            coordinate layout depends on 'type':
              - 'node':        [[(lon, lat)]]
              - 'way_polygon': [ring]  (closed here if it is open)
              - 'way_line':    [coords]
              - 'relation':    [[exterior, hole, ...], ...]

    Returns:
        A Point, Polygon, LineString or MultiPolygon, or None when the row has
        no coordinates, too few points, an unsupported type, or no usable polygon.
    """
    coords_list = row['all_polygons_coordinates']

    # Rows without an OSM match after a left merge carry NaN here rather than a
    # list; treat them like an empty coordinate list.
    if not isinstance(coords_list, (list, tuple)) or not coords_list:
        return None

    geom_type = row['type']

    if geom_type == 'node':
        # A node is a single ring holding a single coordinate
        if len(coords_list[0]) == 1:
            return Point(coords_list[0][0])
        return None

    if geom_type == 'way_polygon':
        ring = coords_list[0]
        if ring and len(ring) >= 4:
            if ring[0] != ring[-1]:
                print(f"Warning: way_polygon {row['id']} was not closed, closing it now.")
                return Polygon(list(ring) + [ring[0]])
            return Polygon(ring)
        print(f"Warning: way_polygon {row['id']} has insufficient points ({len(ring)}). Cannot form Polygon.")
        return None

    if geom_type == 'way_line':
        line_coords = coords_list[0]
        if line_coords and len(line_coords) >= 2:
            return LineString(line_coords)
        print(f"Warning: way_line {row['id']} has insufficient points ({len(line_coords)}). Cannot form LineString.")
        return None

    if geom_type == 'relation':
        polygons = []
        for poly_data in coords_list:
            if not poly_data:
                continue

            exterior_coords = poly_data[0]
            interior_coords_list = poly_data[1:]

            try:
                poly = Polygon(exterior_coords, interior_coords_list)
                if poly.is_empty:
                    # An empty polygon is "valid" but carries no area; drop it
                    continue
                if poly.is_valid:
                    polygons.append(poly)
                    continue

                print(f"Warning: Invalid polygon created for relation {row['id']}. Attempting to make valid.")
                # buffer(0) is the usual repair; it may yield a MultiPolygon or nothing
                valid_poly = poly.buffer(0)
                if valid_poly.is_empty or not valid_poly.is_valid:
                    print(f"Warning: Could not make polygon valid for relation {row['id']}.")
                elif isinstance(valid_poly, MultiPolygon):
                    polygons.extend(valid_poly.geoms)
                else:
                    polygons.append(valid_poly)
            except Exception as e:
                # Best effort per polygon: one bad ring must not drop the others
                print(f"Error creating polygon for relation {row['id']}: {e}")

        if polygons:
            return MultiPolygon(polygons) if len(polygons) > 1 else polygons[0]
        return None

    print(f"Unsupported or old type '{geom_type}'. Skipping geometry creation.")
    return None


def create_amen_gdf(df, id_column):
    """
    Fetch OSM geometry for every ID in `df` and return it as a GeoDataFrame.

    Each distinct ID is requested once through get_district_data; the fetched
    records are left-merged back onto the input on `id_column` == 'id', a
    'geometry' column is built with create_geometry, and rows without a
    geometry are dropped. The caller's DataFrame is not modified.

    Args:
        df: DataFrame holding the OSM IDs (any dtype; compared as strings).
        id_column: Name of the column in `df` that holds the OSM IDs.

    Returns:
        GeoDataFrame in EPSG:4326 with the input columns plus the fetched
        fields and 'geometry', or None when nothing could be fetched.
    """
    # Work on a copy: casting the ID column in place would silently change the
    # caller's frame.
    source_df = df.copy()
    source_df[id_column] = source_df[id_column].astype(str)

    # Use unique IDs to avoid redundant API calls
    fetched_records = []
    for osm_id in source_df[id_column].unique():
        result = get_district_data(osm_id)
        if result:
            fetched_records.append(result)

    if not fetched_records:
        print("No valid district data was fetched.")
        return None

    fetched_df = pd.DataFrame(fetched_records)

    # Left merge keeps every input row; IDs that failed to fetch get NaN fields
    merge_df = pd.merge(source_df, fetched_df, left_on=id_column, right_on='id', how='left')

    merge_df['geometry'] = merge_df.apply(create_geometry, axis=1)

    # Keep only rows for which a geometry could be built
    merge_df = merge_df[merge_df['geometry'].notnull()].reset_index(drop=True)

    return gpd.GeoDataFrame(merge_df, geometry='geometry', crs="EPSG:4326")

In [66]:
# --- Main Execution ---
# OSM relation ID of each district boundary, keyed by district name
district_osm_dict = {
    'GOMBAK': '12438352',
    'HULU LANGAT': '12438351',
    'KLANG': '12391135',
    'HULU SELANGOR': '10714199',
    'KUALA LANGAT': '10743362',
    'KUALA LUMPUR': '2939672',
    'KUALA SELANGOR': '10714137',
    'PETALING': '12391134',
    'PUTRAJAYA': '4443881',
    'SABAK BERNAM': '10714136',
    'SEPANG': '10743315'
}

# One row per district: 'district' holds the name, 'id' the relation ID as a string
district_df = pd.DataFrame(
    list(district_osm_dict.items()),
    columns=['district', 'id'],
).astype({'id': str})

display(district_df.head())

Unnamed: 0,district,id
0,GOMBAK,12438352
1,HULU LANGAT,12438351
2,KLANG,12391135
3,HULU SELANGOR,10714199
4,KUALA LANGAT,10743362


In [67]:
# Fetch every district boundary from OSM and build a GeoDataFrame (EPSG:4326);
# `dis` is None if nothing could be fetched
dis = create_amen_gdf(district_df, 'id')
dis

  if not root_xml:


Successfully processed ID 12438352 as RELATION with 1 polygon(s).
Successfully processed ID 12438351 as RELATION with 1 polygon(s).
Successfully processed ID 12391135 as RELATION with 7 polygon(s).
Successfully processed ID 10714199 as RELATION with 1 polygon(s).
Successfully processed ID 10743362 as RELATION with 1 polygon(s).
Successfully processed ID 2939672 as RELATION with 1 polygon(s).
Successfully processed ID 10714137 as RELATION with 8 polygon(s).
Successfully processed ID 12391134 as RELATION with 1 polygon(s).
Successfully processed ID 4443881 as RELATION with 1 polygon(s).
Successfully processed ID 10714136 as RELATION with 1 polygon(s).
Successfully processed ID 10743315 as RELATION with 1 polygon(s).


Unnamed: 0,district,id,name,tags,all_polygons_coordinates,type,geometry
0,GOMBAK,12438352,Gombak,"{'admin_level': '6', 'boundary': 'administrati...","[[[(101.7378884, 3.3761076), (101.7374094, 3.3...",relation,"POLYGON ((101.73789 3.37611, 101.73741 3.37704..."
1,HULU LANGAT,12438351,Hulu Langat,"{'admin_level': '6', 'alt_name': 'Ulu Langat',...","[[[(101.8536932, 3.2757966), (101.8533789, 3.2...",relation,"POLYGON ((101.85369 3.2758, 101.85338 3.2753, ..."
2,KLANG,12391135,Klang,"{'admin_level': '6', 'alt_name:ar': 'ŸÉŸÑÿßŸÜÿ∫;ŸÉŸÑÿß...","[[[(101.2633578, 2.9140149), (101.2659556, 2.9...",relation,"MULTIPOLYGON (((101.26336 2.91401, 101.26596 2..."
3,HULU SELANGOR,10714199,Hulu Selangor,"{'admin_level': '6', 'alt_name': 'Ulu Selangor...","[[[(101.3271374, 3.7865726), (101.3268815, 3.7...",relation,"POLYGON ((101.32714 3.78657, 101.32688 3.78625..."
4,KUALA LANGAT,10743362,Kuala Langat,"{'admin_level': '6', 'alt_name:ar': 'ŸÉŸàÿßŸÑÿß ŸÑÿßŸÜ...","[[[(101.5756689, 2.9796746), (101.5745482, 2.9...",relation,"POLYGON ((101.57567 2.97967, 101.57455 2.97941..."
5,KUALA LUMPUR,2939672,Kuala Lumpur,"{'admin_level': '4', 'alt_name:ar': 'ŸÉŸàÿßŸÑÿßŸÑŸÖÿ®Ÿà...","[[[(101.6609188, 3.242458), (101.6608132, 3.24...",relation,"POLYGON ((101.66092 3.24246, 101.66081 3.24152..."
6,KUALA SELANGOR,10714137,Kuala Selangor,"{'admin_level': '6', 'alt_name:ar': 'ŸÉŸàÿßŸÑÿß ÿ≥ŸÑÿß...","[[[(101.4277331, 3.1919875), (101.4332286, 3.1...",relation,"MULTIPOLYGON (((101.42773 3.19199, 101.43323 3..."
7,PETALING,12391134,Petaling,"{'admin_level': '6', 'alt_name:ar': 'ÿ®Ÿäÿ™ÿßŸÑŸäŸÜÿ∫;...","[[[(101.4817252, 3.0596404), (101.482797, 3.06...",relation,"POLYGON ((101.48173 3.05964, 101.4828 3.06044,..."
8,PUTRAJAYA,4443881,Putrajaya,"{'admin_level': '4', 'alt_name:ar': 'ÿ®Ÿàÿ™ÿ±ÿßÿ¨ÿßŸä'...","[[[(101.7003795, 2.9130896), (101.707218, 2.91...",relation,"POLYGON ((101.70038 2.91309, 101.70722 2.91329..."
9,SABAK BERNAM,10714136,Sabak Bernam,"{'admin_level': '6', 'boundary': 'administrati...","[[[(101.3397558, 3.6948587), (101.3397456, 3.6...",relation,"POLYGON ((101.33976 3.69486, 101.33975 3.69483..."


In [68]:
def _usable_geometry(geom) -> bool:
    """Return True when `geom` is a non-empty geometry (not None/NaN and not empty)."""
    return geom is not None and not getattr(geom, 'is_empty', True)


def validate_record_locations(
    records_gdf: gpd.GeoDataFrame,
    districts_gdf: gpd.GeoDataFrame,
    district_col: str = 'district'
) -> gpd.GeoDataFrame:
    """
    Finds the actual district for each road (LineString) based on maximum length overlap.

    For every record, the road is intersected with each district polygon and
    the district holding the greatest share of the road's length wins. Lengths
    are measured in the coordinate units of the GeoDataFrames' CRS, so
    'overlap_percentage' is the comparable figure. Neither input is modified.

    Args:
        records_gdf (gpd.GeoDataFrame): Roads to check. Must have a 'geometry'
            column and `district_col`. Rows with a missing or empty geometry are
            reported as 'Outside any district' with zero lengths.
        districts_gdf (gpd.GeoDataFrame): District boundaries with Polygon
            geometry and `district_col`.
        district_col (str): The column name linking records to districts.

    Returns:
        gpd.GeoDataFrame: The records with 'actual_district',
            'is_in_correct_district', 'overlap_length', 'total_road_length' and
            'overlap_percentage' columns. When either input is empty, a copy of
            `records_gdf` is returned with 'actual_district' set to 'N/A',
            'is_in_correct_district' False and zero overlap.
    """
    # Validation and Preparation
    if records_gdf.empty or districts_gdf.empty:
        print("Warning: One or both GeoDataFrames are empty.")
        # Copy so the caller's frame is untouched, and return the same columns
        # as the normal path.
        unvalidated = records_gdf.copy()
        unvalidated['actual_district'] = 'N/A'
        unvalidated['is_in_correct_district'] = False
        unvalidated['overlap_length'] = 0.0
        if 'geometry' in unvalidated.columns:
            unvalidated['total_road_length'] = [
                geom.length if _usable_geometry(geom) else 0.0
                for geom in unvalidated['geometry']
            ]
        else:
            unvalidated['total_road_length'] = 0.0
        unvalidated['overlap_percentage'] = 0.0
        return unvalidated

    # Ensure both GeoDataFrames use the same CRS
    if records_gdf.crs != districts_gdf.crs:
        print(f"Warning: CRS mismatch. Reprojecting records_gdf to match districts_gdf.")
        records_gdf = records_gdf.to_crs(districts_gdf.crs)

    # buffer(0) repairs invalid district polygons
    districts_clean = districts_gdf.copy()
    districts_clean['geometry'] = districts_clean['geometry'].buffer(0)

    # Materialise (name, polygon) pairs once instead of re-iterating the
    # district frame for every record; unusable polygons are skipped.
    district_shapes = [
        (district[district_col], district.geometry)
        for _, district in districts_clean.iterrows()
        if _usable_geometry(district.geometry)
    ]

    # Find District with Maximum Road Length Overlap
    results = []

    for _, record in records_gdf.iterrows():
        road_geom = record.geometry
        assigned_district = record[district_col]

        best_district = None
        max_overlap_length = 0.0
        total_road_length = 0.0

        if _usable_geometry(road_geom):
            total_road_length = road_geom.length

            for district_name, district_geom in district_shapes:
                if not district_geom.intersects(road_geom):
                    continue

                # .length is 0 for empty or point intersections and sums the
                # parts of multi-part results, so no special-casing is needed.
                overlap_length = district_geom.intersection(road_geom).length

                if overlap_length > max_overlap_length:
                    max_overlap_length = overlap_length
                    best_district = district_name

        # Build result record
        record_dict = record.to_dict()
        record_dict['actual_district'] = best_district if best_district else 'Outside any district'
        record_dict['is_in_correct_district'] = (assigned_district == best_district) if best_district else False
        record_dict['overlap_length'] = max_overlap_length
        record_dict['total_road_length'] = total_road_length
        record_dict['overlap_percentage'] = (max_overlap_length / total_road_length * 100) if total_road_length > 0 else 0

        results.append(record_dict)

    # Create Result GeoDataFrame
    validated_gdf = gpd.GeoDataFrame(results, crs=records_gdf.crs)

    return validated_gdf

In [69]:
# Wrap both inputs as GeoDataFrames in EPSG:4326 so they share one CRS
districts_gdf = gpd.GeoDataFrame(dis, crs="EPSG:4326")
# Copy first so road_route_details itself is left untouched
input_tprop = road_route_details.copy()
records_gdf = gpd.GeoDataFrame(input_tprop, crs="EPSG:4326")

In [70]:
# Assign each road to the district it overlaps most and flag mismatches with its recorded district
validated_records = validate_record_locations(records_gdf, districts_gdf)

# Merge once all roads pass test 3

In [None]:
#df = pd.read_excel('/content/drive/MyDrive/Colab/Capstone 1/tprop_df_validated.xlsx')

In [76]:
#validated_df = df.merge(validated_records, on =['road_name', 'way_id'], how = 'left')
#validated_df

Unnamed: 0,records_id,property_type,district_x,mukim,scheme_name,road_name_x,date,tenure,land_m2,property_m2,...,way_id,geometry_x,road_name_y,district_y,geometry_y,actual_district,is_in_correct_district,overlap_length,total_road_length,overlap_percentage
0,1,1 - 1 1/2 Storey Semi-Detached,GOMBAK,BANDAR SELAYANG,TAMAN SELAYANG MUTIARA,JALAN MUTIARA 3/8A,October 2023,Leasehold,20100,7400,...,435014514,,JALAN MUTIARA 3/8A,GOMBAK,"LINESTRING (101.66934 3.26226, 101.66899 3.26287)",GOMBAK,True,0.000702,0.000702,100.0
1,2,1 - 1 1/2 Storey Semi-Detached,GOMBAK,BANDAR SELAYANG,TAMAN SELAYANG MUTIARA,JALAN MUTIARA 4/4A,December 2023,Leasehold,20500,7400,...,435014515,"LINESTRING (101.6671296 3.2609005, 101.6662603...",,,,,,,,
2,3,1 - 1 1/2 Storey Semi-Detached,GOMBAK,BANDAR SELAYANG,TAMAN SELAYANG MUTIARA,JALAN MUTIARA 5/2,December 2023,Leasehold,20500,7400,...,612175755,"LINESTRING (101.6644568 3.2603757, 101.6644387...",,,,,,,,
3,4,1 - 1 1/2 Storey Semi-Detached,GOMBAK,BANDAR SELAYANG,TAMAN SELAYANG MUTIARA,JALAN 43,September 2023,Leasehold,22400,7000,...,221469662,"LINESTRING (101.6618551 3.2546375, 101.663127 ...",,,,,,,,
4,5,1 - 1 1/2 Storey Semi-Detached,HULU LANGAT,BANDAR SELAYANG,TAMAN SELAYANG MUTIARA,JALAN MUTIARA 3/3,September 2023,Leasehold,30300,7400,...,865847604,"LINESTRING (101.8813142 2.9427858, 101.8815441...",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68672,68090,Condominium/Apartment,PETALING,SUNGAI BULUH,ARMANEE TERRACE CONDOMINIUM-PJU 8,ARMANEE TERRACE CONDOMINIUM-PJU 8,March 2023,Leasehold,27323,27323,...,161066643,"LINESTRING (101.6035423 3.1729937, 101.6048322...",,,,,,,,
68673,68178,Condominium/Apartment,PETALING,SUNGAI BULUH,PERDANA EXCLUSIVE CONDOMINIUM,PERDANA EXCLUSIVE CONDOMINIUM,October 2023,Leasehold,29468,29468,...,161012065,"LINESTRING (101.6050438 3.1701204, 101.6044376...",,,,,,,,
68674,68678,Town House,SEPANG,DENGKIL,BANDAR SIERRA PUCHONG (ODORA PARKHOMES),JALAN SIERRA 9/2,January 2025,Leasehold,18100,18100,...,399044222,"LINESTRING (101.6576867 2.9699101, 101.657549 ...",,,,,,,,
68675,68679,Town House,SEPANG,DENGKIL,KOTA WARISAN,JALAN WARISAN PERMAI 2/1,January 2025,Freehold,16400,16400,...,42850167,"LINESTRING (101.7023756 2.8253799, 101.7023462...",,,,,,,,


In [None]:
#validated_records.to_excel('/content/drive/MyDrive/Colab/Capstone 1/validated_df.xlsx')