# Notebook 4.3 - Feature engineering type of road

# Import libraries

In [61]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, Polygon
from shapely.validation import explain_validity
import shutil

# Choose the city

In [62]:
#Choose city ("Madrid", "Barcelona", or "Valencia")
city = ("Madrid")

__Load cleaned data and external open source data file on types of road__

In [63]:
#Read corresponding files (cleaned data and road data)
if city == "Madrid":
    data = pd.read_csv('../../data/4_data_cleaned/madrid_cleaned_base_features.csv')
    roads_gdf = gpd.read_file('../../data/3_external_data/open_street_map/open_street_mad/gis_osm_roads_free_1.shp')
elif city == "Barcelona":
    data = pd.read_csv('../../data/4_data_cleaned/barcelona_cleaned_base_features.csv')
    roads_gdf = gpd.read_file('../../data/3_external_data/open_street_map/open_street_barc/gis_osm_roads_free_1.shp')
elif city == "Valencia":
    data = pd.read_csv('../../data/4_data_cleaned/valencia_cleaned_base_features.csv')
    roads_gdf = gpd.read_file('../../data/3_external_data/open_street_map/open_street_val/gis_osm_roads_free_1.shp')
else:
    raise ValueError("City not recognized. Please choose either 'Madrid', 'Barcelona', or 'Valencia'.")

In [64]:
data.head()

Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,DISTANCE_TO_METRO,DISTANCE_TO_MAIN_STREET,LONGITUDE,LATITUDE,NEIGHBORHOOD,ZIP_CODE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812
0,A10000037964896093228,255000,97,3,2,3,0,1.0,0,1,...,0.254412,3.027988,-3.650253,40.473921,Pinar del Rey,28033,0,0,1,0
1,A10000072440601830803,82000,62,2,1,3,0,1.0,0,1,...,0.268472,4.693939,-3.640243,40.384968,Palomeras sureste,28018,0,1,0,0
2,A10000538600815177437,133000,67,3,1,3,0,1.0,1,0,...,1.061146,2.623258,-3.665263,40.384547,San Diego,28018,1,0,0,0
3,A10000654405436195291,204000,180,3,2,3,0,1.0,0,1,...,0.427977,3.131739,-3.65179,40.430336,Ventas,28017,1,0,0,0
4,A10000872160480475600,161000,54,2,1,3,0,1.0,0,0,...,0.377045,2.702218,-3.725637,40.384103,Buena Vista,28019,0,0,0,1


In [65]:
roads_gdf.head()

Unnamed: 0,osm_id,code,fclass,name,ref,oneway,maxspeed,layer,bridge,tunnel,geometry
0,3996189,5111,motorway,M-40,M-40,F,100,0,F,F,"LINESTRING (-3.74414 40.36183, -3.74372 40.361..."
1,3996190,5113,primary,Rotonda de Arcentales,,B,0,1,F,F,"LINESTRING (-3.59691 40.43058, -3.59703 40.430..."
2,3996191,5131,motorway_link,,,F,70,0,F,F,"LINESTRING (-3.59786 40.42672, -3.59771 40.427..."
3,3996192,5113,primary,Plaza de Grecia,,B,0,0,F,F,"LINESTRING (-3.60803 40.43482, -3.60800 40.43479)"
4,3996195,5122,residential,Calle de Mánchester,,F,0,0,F,F,"LINESTRING (-3.60687 40.42931, -3.60662 40.429..."


# Create feature on types of road

__Find type of road for each house__

In [66]:
# Create function to create Point Objects from LATITUDE and LONGITUDE Columns
def create_point_from_lat_lon(lat, lon):
    try:
        return Point(float(lon), float(lat))
    except Exception as e:
        print(f"Error creating point from lat/lon: {e}")
        return None
    
# Create Geometry Column Using LATITUDE and LONGITUDE Columns
data['GEOMETRY'] = data.apply(lambda row: create_point_from_lat_lon(row['LATITUDE'], row['LONGITUDE']), axis=1)

In [67]:
# Convert houses DataFrame to GeoDataFrame
sale_gdf = gpd.GeoDataFrame(data, geometry='GEOMETRY')
sale_gdf.set_crs(epsg=4326, inplace=True)
sale_gdf.head()

Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,DISTANCE_TO_MAIN_STREET,LONGITUDE,LATITUDE,NEIGHBORHOOD,ZIP_CODE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812,GEOMETRY
0,A10000037964896093228,255000,97,3,2,3,0,1.0,0,1,...,3.027988,-3.650253,40.473921,Pinar del Rey,28033,0,0,1,0,POINT (-3.65025 40.47392)
1,A10000072440601830803,82000,62,2,1,3,0,1.0,0,1,...,4.693939,-3.640243,40.384968,Palomeras sureste,28018,0,1,0,0,POINT (-3.64024 40.38497)
2,A10000538600815177437,133000,67,3,1,3,0,1.0,1,0,...,2.623258,-3.665263,40.384547,San Diego,28018,1,0,0,0,POINT (-3.66526 40.38455)
3,A10000654405436195291,204000,180,3,2,3,0,1.0,0,1,...,3.131739,-3.65179,40.430336,Ventas,28017,1,0,0,0,POINT (-3.65179 40.43034)
4,A10000872160480475600,161000,54,2,1,3,0,1.0,0,0,...,2.702218,-3.725637,40.384103,Buena Vista,28019,0,0,0,1,POINT (-3.72564 40.38410)


In [68]:
# Ensure the roads geometry is parsed correctly
roads_gdf = roads_gdf.to_crs(epsg=4326)
roads_gdf.head()

Unnamed: 0,osm_id,code,fclass,name,ref,oneway,maxspeed,layer,bridge,tunnel,geometry
0,3996189,5111,motorway,M-40,M-40,F,100,0,F,F,"LINESTRING (-3.74414 40.36183, -3.74372 40.361..."
1,3996190,5113,primary,Rotonda de Arcentales,,B,0,1,F,F,"LINESTRING (-3.59691 40.43058, -3.59703 40.430..."
2,3996191,5131,motorway_link,,,F,70,0,F,F,"LINESTRING (-3.59786 40.42672, -3.59771 40.427..."
3,3996192,5113,primary,Plaza de Grecia,,B,0,0,F,F,"LINESTRING (-3.60803 40.43482, -3.60800 40.43479)"
4,3996195,5122,residential,Calle de Mánchester,,F,0,0,F,F,"LINESTRING (-3.60687 40.42931, -3.60662 40.429..."


In [69]:
# Reproject to a projected CRS (e.g., EPSG:3857)
sale_gdf = sale_gdf.to_crs(epsg=3857)
roads_gdf = roads_gdf.to_crs(epsg=3857)

# Ensure unique indices
sale_gdf = sale_gdf.reset_index(drop=True)
roads_gdf = roads_gdf.reset_index(drop=True)

# Convert all column names to lowercase to avoid case sensitivity issues
roads_gdf.columns = roads_gdf.columns.str.lower()
sale_gdf.columns = sale_gdf.columns.str.lower()

In [70]:
# Convert houses DataFrame to GeoDataFrame
sale_gdf = gpd.GeoDataFrame(data, geometry='GEOMETRY')
sale_gdf.set_crs(epsg=4326, inplace=True)

# Ensure the roads geometry is parsed correctly
roads_gdf = roads_gdf.to_crs(epsg=4326)

# Reproject to a projected CRS (e.g., EPSG:3857)
sale_gdf = sale_gdf.to_crs(epsg=3857)
roads_gdf = roads_gdf.to_crs(epsg=3857)

# Ensure unique indices
sale_gdf = sale_gdf.reset_index(drop=True)
roads_gdf = roads_gdf.reset_index(drop=True)

# Convert all column names to uppercase to avoid case sensitivity issues
roads_gdf.columns = roads_gdf.columns.str.upper()
sale_gdf.columns = sale_gdf.columns.str.upper()

In [71]:
roads_gdf.head()

Unnamed: 0,OSM_ID,CODE,FCLASS,NAME,REF,ONEWAY,MAXSPEED,LAYER,BRIDGE,TUNNEL,GEOMETRY
0,3996189,5111,motorway,M-40,M-40,F,100,0,F,F,"LINESTRING (-416795.792 4918662.674, -416749.4..."
1,3996190,5113,primary,Rotonda de Arcentales,,B,0,1,F,F,"LINESTRING (-400406.702 4928711.027, -400419.5..."
2,3996191,5131,motorway_link,,,F,70,0,F,F,"LINESTRING (-400511.587 4928146.762, -400495.7..."
3,3996192,5113,primary,Plaza de Grecia,,B,0,0,F,F,"LINESTRING (-401643.773 4929332.231, -401640.2..."
4,3996195,5122,residential,Calle de Mánchester,,F,0,0,F,F,"LINESTRING (-401515.266 4928526.513, -401487.5..."


In [72]:
sale_gdf.head()

Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,DISTANCE_TO_MAIN_STREET,LONGITUDE,LATITUDE,NEIGHBORHOOD,ZIP_CODE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812,GEOMETRY
0,A10000037964896093228,255000,97,3,2,3,0,1.0,0,1,...,3.027988,-3.650253,40.473921,Pinar del Rey,28033,0,0,1,0,POINT (-406344.353 4935052.081)
1,A10000072440601830803,82000,62,2,1,3,0,1.0,0,1,...,4.693939,-3.640243,40.384968,Palomeras sureste,28018,0,1,0,0,POINT (-405230.002 4922043.426)
2,A10000538600815177437,133000,67,3,1,3,0,1.0,1,0,...,2.623258,-3.665263,40.384547,San Diego,28018,1,0,0,0,POINT (-408015.174 4921981.971)
3,A10000654405436195291,204000,180,3,2,3,0,1.0,0,1,...,3.131739,-3.65179,40.430336,Ventas,28017,1,0,0,0,POINT (-406515.454 4928675.980)
4,A10000872160480475600,161000,54,2,1,3,0,1.0,0,0,...,2.702218,-3.725637,40.384103,Buena Vista,28019,0,0,0,1,POINT (-414736.046 4921917.057)


In [73]:
# Set the geometry column explicitly
sale_gdf = sale_gdf.set_geometry('GEOMETRY')
roads_gdf = roads_gdf.set_geometry('GEOMETRY') 

In [74]:
# Define a distance threshold in meters
distance_threshold = 30

# Classify houses based on road proximity
road_types = ['motorway', 'motorway_link', 'primary', 'primary_link', 'secondary', 'secondary_link', 'tertiary', 'tertiary_link', 'pedestrian']

# Create a copy of the houses GeoDataFrame to track remaining houses
remaining_houses_gdf = sale_gdf.copy()
classified_houses = []

for road_type in road_types:
    # Filter roads by road type
    filtered_roads = roads_gdf[roads_gdf['FCLASS'] == road_type]

    if not filtered_roads.empty:
        # Perform the nearest spatial join
        nearest_gdf = gpd.sjoin_nearest(remaining_houses_gdf, filtered_roads, how='left', distance_col='DISTANCE')

        # Classify roads based on distance threshold
        nearest_gdf['CLASSIFIED_ROAD_FCLASS'] = nearest_gdf.apply(
            lambda row: road_type if row['DISTANCE'] <= distance_threshold else 'other', axis=1
        )

        # Filter only the classified houses
        classified = nearest_gdf[nearest_gdf['CLASSIFIED_ROAD_FCLASS'] != 'other']
        classified_houses.append(classified[['ASSETID', 'CLASSIFIED_ROAD_FCLASS']])

        # Remove classified houses from the remaining pool
        remaining_houses_gdf = remaining_houses_gdf[~remaining_houses_gdf['ASSETID'].isin(classified['ASSETID'])]

In [75]:
# Combine all classified houses
combined_classified_df = pd.concat(classified_houses).drop_duplicates('ASSETID')

# Check for houses not classified yet and classify based on nearest road
remaining_houses = sale_gdf[~sale_gdf['ASSETID'].isin(combined_classified_df['ASSETID'])]
nearest_remaining_gdf = gpd.sjoin_nearest(remaining_houses, roads_gdf, how='left', distance_col='DISTANCE')

nearest_remaining_gdf['CLASSIFIED_ROAD_FCLASS'] = nearest_remaining_gdf.apply(
    lambda row: row['FCLASS'] if row['DISTANCE'] <= distance_threshold else 'other', axis=1
)

# Combine all results
final_classified_df = pd.concat([combined_classified_df, nearest_remaining_gdf[['ASSETID', 'CLASSIFIED_ROAD_FCLASS']]])

# Merge back to the original GeoDataFrame
merged_gdf = pd.merge(sale_gdf, final_classified_df[['ASSETID', 'CLASSIFIED_ROAD_FCLASS']], left_on='ASSETID', right_on='ASSETID', how='left')

__Aggregating type of roads__

In [76]:
# Define road type groups
road_type_groups = {
    "motorway/primary": ['motorway', 'motorway_link', 'trunk', 'trunk_link', 'primary', 'primary_link'],
    "secondary/tertiary": ['secondary', 'secondary_link', 'tertiary', 'tertiary_link'],
    "pedestrian": ['pedestrian', 'footway']
}

# Function to map the classified_road_fclass to the defined groups
def map_road_classification(road_class):
    for group, types in road_type_groups.items():
        if road_class in types:
            return group
    return 'other'

# Apply the mapping function to the classified_road_fclass column
merged_gdf['ROAD_CLASS_GROUP'] = merged_gdf['CLASSIFIED_ROAD_FCLASS'].apply(map_road_classification)

In [77]:
merged_gdf.head()

Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,LATITUDE,NEIGHBORHOOD,ZIP_CODE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812,GEOMETRY,CLASSIFIED_ROAD_FCLASS,ROAD_CLASS_GROUP
0,A10000037964896093228,255000,97,3,2,3,0,1.0,0,1,...,40.473921,Pinar del Rey,28033,0,0,1,0,POINT (-406344.353 4935052.081),residential,other
1,A10000072440601830803,82000,62,2,1,3,0,1.0,0,1,...,40.384968,Palomeras sureste,28018,0,1,0,0,POINT (-405230.002 4922043.426),residential,other
2,A10000538600815177437,133000,67,3,1,3,0,1.0,1,0,...,40.384547,San Diego,28018,1,0,0,0,POINT (-408015.174 4921981.971),residential,other
3,A10000654405436195291,204000,180,3,2,3,0,1.0,0,1,...,40.430336,Ventas,28017,1,0,0,0,POINT (-406515.454 4928675.980),residential,other
4,A10000872160480475600,161000,54,2,1,3,0,1.0,0,0,...,40.384103,Buena Vista,28019,0,0,0,1,POINT (-414736.046 4921917.057),residential,other


__Apply categorical encoding__

- One-hot encoding of each type of road as there is no inherent order in the types of roads

In [78]:
# One-hot encode the grouped road types
encoded_df = pd.get_dummies(merged_gdf['ROAD_CLASS_GROUP'])

# Merge the encoded columns back into the original dataframe
merged_gdf = pd.concat([merged_gdf, encoded_df], axis=1)

#Change all new encoded columns to upper for consistency with cleaned data
merged_gdf.columns = merged_gdf.columns.str.upper()
merged_gdf.columns = merged_gdf.columns.str.upper()

# Debug: Check the final dataframe with encoded columns
print("Final Houses GeoDataFrame with encoded road class groups (first 5 rows):")
merged_gdf.head()

Final Houses GeoDataFrame with encoded road class groups (first 5 rows):


Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,PERIOD_201806,PERIOD_201809,PERIOD_201812,GEOMETRY,CLASSIFIED_ROAD_FCLASS,ROAD_CLASS_GROUP,MOTORWAY/PRIMARY,OTHER,PEDESTRIAN,SECONDARY/TERTIARY
0,A10000037964896093228,255000,97,3,2,3,0,1.0,0,1,...,0,1,0,POINT (-406344.353 4935052.081),residential,other,False,True,False,False
1,A10000072440601830803,82000,62,2,1,3,0,1.0,0,1,...,1,0,0,POINT (-405230.002 4922043.426),residential,other,False,True,False,False
2,A10000538600815177437,133000,67,3,1,3,0,1.0,1,0,...,0,0,0,POINT (-408015.174 4921981.971),residential,other,False,True,False,False
3,A10000654405436195291,204000,180,3,2,3,0,1.0,0,1,...,0,0,0,POINT (-406515.454 4928675.980),residential,other,False,True,False,False
4,A10000872160480475600,161000,54,2,1,3,0,1.0,0,0,...,0,0,1,POINT (-414736.046 4921917.057),residential,other,False,True,False,False


- Drop original column before one-hot encoding

In [79]:
merged_gdf = merged_gdf.drop(columns=["GEOMETRY", "CLASSIFIED_ROAD_FCLASS", "ROAD_CLASS_GROUP"])
merged_gdf.head()

Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,NEIGHBORHOOD,ZIP_CODE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812,MOTORWAY/PRIMARY,OTHER,PEDESTRIAN,SECONDARY/TERTIARY
0,A10000037964896093228,255000,97,3,2,3,0,1.0,0,1,...,Pinar del Rey,28033,0,0,1,0,False,True,False,False
1,A10000072440601830803,82000,62,2,1,3,0,1.0,0,1,...,Palomeras sureste,28018,0,1,0,0,False,True,False,False
2,A10000538600815177437,133000,67,3,1,3,0,1.0,1,0,...,San Diego,28018,1,0,0,0,False,True,False,False
3,A10000654405436195291,204000,180,3,2,3,0,1.0,0,1,...,Ventas,28017,1,0,0,0,False,True,False,False
4,A10000872160480475600,161000,54,2,1,3,0,1.0,0,0,...,Buena Vista,28019,0,0,0,1,False,True,False,False


# Write the resulting GeoDataFrame to csv

In [80]:
if city == "Madrid":
    merged_gdf.to_csv("../../data/5_cleaned_and_feature_engineering/feature_road_type/madrid_cleaned_incl_road_type.csv", index=False)
elif city == "Barcelona":
    merged_gdf.to_csv("../../data/5_cleaned_and_feature_engineeringfeature_/road_type/barcelona_cleaned_incl_road_type.csv", index=False)
elif city == "Valencia":
    merged_gdf.to_csv("../../data/5_cleaned_and_feature_engineering/feature_road_type/valencia_cleaned_incl_road_type.csv", index=False)