# Notebook 4.4 - Feature engineering parks

# Import libraries

In [36]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, Polygon
from shapely.validation import explain_validity
import shutil

# Choose the city

In [37]:
#Choose city ("Madrid", "Barcelona", or "Valencia")
city = ("Valencia")

__Load cleaned data and external open source data file on types of road__

In [38]:
#Read corresponding files (cleaned data and road data)
if city == "Madrid":
    data = pd.read_csv('../../data/4_data_cleaned/madrid_cleaned_base_features.csv')
    parks_gdf = gpd.read_file('../../data/3_external_data/open_street_map/open_street_mad/gis_osm_pois_a_free_1.shp')
elif city == "Barcelona":
    data = pd.read_csv('../../data/4_data_cleaned/barcelona_cleaned_base_features.csv')
    parks_gdf = gpd.read_file('../../data/3_external_data/open_street_map/open_street_barc/gis_osm_pois_a_free_1.shp')
elif city == "Valencia":
    data = pd.read_csv('../../data/4_data_cleaned/valencia_cleaned_base_features.csv')
    parks_gdf = gpd.read_file('../../data/3_external_data/open_street_map/open_street_val/gis_osm_pois_a_free_1.shp')
else:
    raise ValueError("City not recognized. Please choose either 'Madrid', 'Barcelona', or 'Valencia'.")

In [39]:
data.head()

Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,DISTANCE_TO_METRO,DISTANCE_TO_MAIN_STREET,LONGITUDE,LATITUDE,NEIGHBORHOOD,ZIP_CODE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812
0,A10000434603646497633,126500,70,2,1,3,0,1.0,0,0,...,0.734973,0.85468,-0.330256,39.463535,El Cabanyal-El Canyamelar,46011,1,0,0,0
1,A10001334147587469388,630000,199,4,2,3,1,1.0,1,1,...,0.724361,2.102044,-0.357513,39.456699,Ciutat de les Arts i de les Ciencies,46005,0,0,0,1
2,A10002599312155392987,215000,100,3,2,3,0,1.0,1,1,...,0.42313,0.800718,-0.33432,39.463025,El Grau,46022,0,0,0,1
3,A10002648121225460937,240000,112,3,2,1,0,1.0,0,1,...,0.378732,3.256303,-0.401935,39.469964,Nou Moles,46018,0,0,0,1
4,A10002658173109908582,1160000,289,5,4,2,0,1.0,0,0,...,0.186091,0.770513,-0.367956,39.472377,El Pla del Remei,46004,0,0,0,1


In [40]:
parks_gdf.head()

Unnamed: 0,osm_id,code,fclass,name,geometry
0,4790961,2204,park,Parc de la Cultura,"POLYGON ((-0.38336 39.47045, -0.38336 39.47050..."
1,4826234,2204,park,,"POLYGON ((-0.38224 39.47257, -0.38198 39.47259..."
2,5007139,2016,market_place,Mercat Central,"POLYGON ((-0.37958 39.47318, -0.37953 39.47321..."
3,5007139,2721,attraction,Mercat Central,"POLYGON ((-0.37958 39.47318, -0.37953 39.47321..."
4,7724652,2204,park,Parque del Parterre,"POLYGON ((-0.37097 39.47155, -0.37086 39.47190..."


# Create feature on share of parks within 500 meter radius

__Prepare the two datasets as GeoDataFrames__

In [41]:
# Create function to create Point Objects from LATITUDE and LONGITUDE Columns
def create_point_from_lat_lon(lat, lon):
    try:
        return Point(float(lon), float(lat))
    except Exception as e:
        print(f"Error creating point from lat/lon: {e}")
        return None
    
# Create Geometry Column Using LATITUDE and LONGITUDE Columns
data['GEOMETRY'] = data.apply(lambda row: create_point_from_lat_lon(row['LATITUDE'], row['LONGITUDE']), axis=1)

In [42]:
# Convert houses DataFrame to GeoDataFrame
sale_gdf = gpd.GeoDataFrame(data, geometry='GEOMETRY')
sale_gdf.set_crs(epsg=4326, inplace=True)
sale_gdf.head()

# Ensure the roads geometry is parsed correctly
parks_gdf = parks_gdf.to_crs(epsg=4326)
parks_gdf.head()

Unnamed: 0,osm_id,code,fclass,name,geometry
0,4790961,2204,park,Parc de la Cultura,"POLYGON ((-0.38336 39.47045, -0.38336 39.47050..."
1,4826234,2204,park,,"POLYGON ((-0.38224 39.47257, -0.38198 39.47259..."
2,5007139,2016,market_place,Mercat Central,"POLYGON ((-0.37958 39.47318, -0.37953 39.47321..."
3,5007139,2721,attraction,Mercat Central,"POLYGON ((-0.37958 39.47318, -0.37953 39.47321..."
4,7724652,2204,park,Parque del Parterre,"POLYGON ((-0.37097 39.47155, -0.37086 39.47190..."


In [43]:
parks_gdf = parks_gdf[parks_gdf['fclass'] == 'park']
parks_gdf.set_crs(epsg=4326, inplace=True)

# Reproject both GeoDataFrames to the same CRS (e.g., EPSG:3857)
sale_gdf = sale_gdf.to_crs(epsg=3857)
parks_gdf = parks_gdf.to_crs(epsg=3857)

__Calculate share of parks within 500 meters__

In [44]:
# Define a radius threshold (e.g., 500 meters)
radius_threshold = 500  # in meters

# Buffer the house points to create a 500-meter radius around each house
sale_gdf['buffer'] = sale_gdf['GEOMETRY'].buffer(radius_threshold)

# Function to calculate the percentage of park area within the buffer
def calculate_park_area_percentage(house_buffer, parks_gdf):
    # Intersect the buffer with parks
    intersection = parks_gdf.intersection(house_buffer)

    # Calculate the area of the intersection
    intersection_area = intersection.area.sum()

    # Calculate the area of the buffer
    buffer_area = house_buffer.area

    # Calculate the percentage of the area covered by parks

    park_area_percentage = (intersection_area / buffer_area) * 100

    return park_area_percentage

# Apply the function to each house
sale_gdf['park_area_percentage'] = sale_gdf['buffer'].apply(lambda x: calculate_park_area_percentage(x, parks_gdf))

# Drop the buffer column and GEOMETRY as no longer needed
sale_gdf = sale_gdf.drop(columns=['buffer', 'GEOMETRY'])

# Inspect data incl. new feature

In [45]:
sale_gdf.tail(20)

Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,DISTANCE_TO_MAIN_STREET,LONGITUDE,LATITUDE,NEIGHBORHOOD,ZIP_CODE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812,park_area_percentage
27344,A9987738260333417458,372000,110,2,2,3,0,1.0,1,1,...,1.627467,-0.379503,39.469025,Sant Francesc,46002,0,0,0,1,7.020542
27345,A9988124641878621594,81000,85,3,1,3,0,1.0,0,1,...,3.556986,-0.393052,39.455096,Barrio de Favara,46007,0,0,1,0,1.24839
27346,A9988580647876636096,168000,150,2,2,3,1,1.0,1,1,...,2.939921,-0.384187,39.456915,La Raiosa,46007,0,1,0,0,2.096035
27347,A9989056763083051067,244000,97,3,1,2,0,1.0,0,1,...,1.081695,-0.367234,39.469189,El Pla del Remei,46004,1,0,0,0,12.429896
27348,A998950546678228445,214000,100,3,1,3,0,1.0,0,1,...,2.073874,-0.36915,39.460296,Russafa,46005,0,0,1,0,0.328778
27349,A9990023624093089262,143000,103,3,2,3,1,1.0,1,1,...,1.375533,-0.374775,39.489639,Sant Antoni,46020,1,0,0,0,1.43118
27350,A999162102955720358,181000,122,3,2,3,1,1.0,0,1,...,3.001681,-0.381683,39.455192,La Raiosa,46007,1,0,0,0,9.273214
27351,A9991884652594369696,38000,77,3,1,3,0,1.0,0,0,...,1.845228,-0.376528,39.49375,Torrefiel,46020,0,0,0,1,2.693717
27352,A9992993634176194683,55500,105,4,2,3,0,1.0,1,1,...,3.248125,-0.394243,39.498707,Ciutat Fallera,46015,1,0,0,0,18.164038
27353,A9993650448768923173,316000,89,1,1,3,0,1.0,0,0,...,0.87334,-0.370776,39.472359,La Xerea,46004,0,0,1,0,7.738606


# Write the resulting Geodataframe to CSV   

In [46]:
if city == "Madrid":
    sale_gdf.to_csv("../../data/5_cleaned_and_feature_engineering/feature_parks/madrid_cleaned_incl_parks.csv", index=False)
elif city == "Barcelona":
    sale_gdf.to_csv("../../data/5_cleaned_and_feature_engineering/feature_parks/barcelona_cleaned_incl_parks.csv", index=False)
elif city == "Valencia":
    sale_gdf.to_csv("../../data/5_cleaned_and_feature_engineering/feature_parks/valencia_cleaned_incl_parks.csv", index=False)