# Notebook 1 - Merge of raw sales data and polygons

__Import libraries__

In [8]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point, Polygon
from shapely.validation import explain_validity

__Choose the city whose data should be merged__

In [9]:
# Choose the city to process: "Madrid", "Barcelona", or "Valencia".
# This value selects which raw CSV files are read and which output CSV is
# written; the loading step raises ValueError for any other value.
city = "Barcelona"

__Load dataset__

In [10]:
# Raw input files for every supported city: (polygon CSV, sales CSV)
raw_files_by_city = {
    "Madrid": (
        '../../data/1_raw_idealista_data/Madrid_pol.csv',
        '../../data/1_raw_idealista_data/Madrid_sale.csv',
    ),
    "Barcelona": (
        '../../data/1_raw_idealista_data/Barcelona_pol.csv',
        '../../data/1_raw_idealista_data/Barcelona_sale.csv',
    ),
    "Valencia": (
        '../../data/1_raw_idealista_data/Valencia_pol.csv',
        '../../data/1_raw_idealista_data/Valencia_sale.csv',
    ),
}

# Reject an unsupported city before any file is opened
if city not in raw_files_by_city:
    raise ValueError("City not recognized. Please choose either 'Madrid', 'Barcelona', or 'Valencia'.")

# Read the polygon file first, then the sales file
polygon_csv_path, sale_csv_path = raw_files_by_city[city]
data_polygon = pd.read_csv(polygon_csv_path)
data_sale = pd.read_csv(sale_csv_path)

__Merge sales data with polygons__

In [11]:
# Function to parse the geometry string and create Point objects
def parse_house_point(geometry_str):
    """Convert a "lon|lat" string into a shapely Point.

    Args:
        geometry_str: Text holding exactly two float-parsable values
            separated by '|'. The first value becomes the point's x
            coordinate (lon) and the second its y coordinate (lat).

    Returns:
        Point(lon, lat), or None when the value is not a string or does not
        contain exactly two numeric fields. A message is printed for every
        value that cannot be parsed.
    """
    try:
        lon, lat = map(float, geometry_str.split('|'))
    # Only data problems are tolerated here: AttributeError/TypeError for
    # cells that are not text (e.g. NaN or None), ValueError for non-numeric
    # fields or a wrong number of fields. Any other exception is a
    # programming error (such as a missing import) and must surface instead
    # of silently producing None geometries for every row.
    except (AttributeError, TypeError, ValueError) as e:
        print(f"Error parsing point geometry: {e}")
        return None
    return Point(lon, lat)

# Function to parse the geometry string and create Polygon objects
def parse_polygon(geometry_str):
    """Build a shapely Polygon from a '|'-separated coordinate string.

    The string is read as a flat list of floats whose first half holds the
    longitudes and whose second half holds the latitudes; the i-th longitude
    is paired with the i-th latitude to form the ring.

    Args:
        geometry_str: '|'-separated numeric values, longitudes first.

    Returns:
        The Polygon when it is valid, the result of ``buffer(0)`` when it is
        not (a message with the reason is printed), or None when anything
        goes wrong while parsing or building the geometry (the error is
        printed).
    """
    try:
        values = [float(token) for token in geometry_str.split('|')]

        # An odd number of values cannot be split into equal lon/lat halves
        if len(values) % 2:
            raise ValueError("Mismatch in the number of longitudes and latitudes")

        half = len(values) // 2
        ring = list(zip(values[:half], values[half:]))
        shape = Polygon(ring)

        if shape.is_valid:
            return shape

        print(f"Invalid Polygon: {explain_validity(shape)}")
        # Zero-width buffer: attempt to repair the invalid polygon
        return shape.buffer(0)
    except Exception as e:
        print(f"Error parsing polygon: {e}")
        return None

# Create a new geometry column for points (houses)
data_sale['geometry'] = data_sale['geometry'].apply(parse_house_point)

# Convert houses DataFrame to GeoDataFrame and declare its CRS as EPSG:4326
data_sale_gdf = gpd.GeoDataFrame(data_sale, geometry='geometry')
data_sale_gdf.set_crs(epsg=4326, inplace=True)

# Parse the geometry column to create polygons
data_polygon['geometry'] = data_polygon['geometry'].apply(parse_polygon)

# Remove any rows where the geometry parsing failed
data_polygon = data_polygon[data_polygon['geometry'].notnull()]

# Convert polygons DataFrame to GeoDataFrame (same CRS as the houses, as required by sjoin)
data_polygon_gdf = gpd.GeoDataFrame(data_polygon, geometry='geometry')
data_polygon_gdf.set_crs(epsg=4326, inplace=True)

# Debug: Check the first few rows of each GeoDataFrame
print("House GeoDataFrame (first 5 rows):")
print(data_sale_gdf.head())

print("Polygon GeoDataFrame (first 5 rows):")
print(data_polygon_gdf.head())

# Perform the spatial join: every house is kept (how='left') and matched with
# the polygon(s) it lies within; houses without a match get NaN attributes
joined_gdf = gpd.sjoin(data_sale_gdf, data_polygon_gdf, how='left', predicate='within', lsuffix='left', rsuffix='right')

# Debug: Check the joined GeoDataFrame
print("Joined GeoDataFrame (first 5 rows):")
print(joined_gdf.head())

# The left join keeps the houses' index but emits one row per matching polygon,
# so a house lying inside overlapping polygons shows up several times. Keep only
# the first match per house. Resetting the index here instead would shift every
# row after a duplicate and attach neighborhoods to the wrong houses.
joined_gdf = joined_gdf[~joined_gdf.index.duplicated(keep='first')]

# Add 'LOCATIONNAME' to the original houses GeoDataFrame; the assignment is
# aligned on the shared house index, not on row position
data_sale_gdf['neighborhood'] = joined_gdf['LOCATIONNAME']

# Debug: Check the final results
print("Final Houses GeoDataFrame with 'neighborhood' (first 5 rows):")
print(data_sale_gdf.head())

House GeoDataFrame (first 5 rows):
                 ASSETID  PERIOD     PRICE    UNITPRICE  CONSTRUCTEDAREA  \
0  A11898131848556022319  201803  323000.0  3845.238095               84   
1  A18099432772155664747  201803  217000.0  2583.333333               84   
2   A2003099089407882787  201803  114000.0  1407.407407               81   
3   A1010373782315301134  201803  378000.0  4784.810127               79   
4  A12978912200216838006  201803  434000.0  3909.909910              111   

   ROOMNUMBER  BATHNUMBER  HASTERRACE  HASLIFT  HASAIRCONDITIONING  ...  \
0           4           1           1        1                   1  ...   
1           3           2           0        1                   1  ...   
2           2           1           0        1                   1  ...   
3           2           1           0        1                   0  ...   
4           4           2           1        1                   1  ...   

   CADASTRALQUALITYID  BUILTTYPEID_1  BUILTTYPEID_2  BUIL

__Save combined sales and polygon data to a CSV file__

In [12]:
# Pick the output file that corresponds to the selected city
if city == "Madrid":
    output_csv_path = '../../data/2_raw_idealista_data_incl_polygon/madrid_sale_with_polygon.csv'
elif city == "Barcelona":
    output_csv_path = '../../data/2_raw_idealista_data_incl_polygon/barcelona_sale_with_polygon.csv'
elif city == "Valencia":
    output_csv_path = '../../data/2_raw_idealista_data_incl_polygon/valencia_sale_with_polygon.csv'
else:
    # Fail loudly instead of silently writing nothing, mirroring the check
    # performed when the raw files are loaded
    raise ValueError("City not recognized. Please choose either 'Madrid', 'Barcelona', or 'Valencia'.")

# Write the houses (including the 'neighborhood' column) without the index column
data_sale_gdf.to_csv(output_csv_path, index=False)