# Find nearest centroids for census data

The purpose of this notebook is to map the processed census data to specific locations, using geographic information system (GIS) tools.
This will enable us to integrate it with our real estate information.
More details are available in the README file.

### Import libraries

In [59]:
import pandas as pd
from geopy.distance import geodesic

### Read data

In [146]:
# Postal-code table; both code columns are read as strings rather than numbers.
df_pc = pd.read_csv(
    '../../../../data/3_external_data/census/geodata/Spain_Postal_Code.csv',
    dtype={'Admin_Code3': str, 'Postal_Code': str},
)

# Census-section centroid table; the municipality code is read as a string too.
df_cs = pd.read_csv(
    '../../../../data/3_external_data/census/geodata/seccion_censal_centroids.csv',
    dtype={'CUMUN': str},
)

In [147]:
# Keep only the columns used by the matching step. The explicit .copy() makes
# each trimmed frame independent of the frame it was sliced from, so the column
# assignments below act on df_pc itself and cannot raise SettingWithCopyWarning.
df_pc = df_pc[['Postal_Code', 'Latitude', 'Longitude', 'Admin_Code3']].copy()
df_cs = df_cs[['CUSEC', 'CUMUN', 'longitude', 'latitude']].copy()

# Left-pad both codes with zeros to a fixed width of five characters.
# NOTE(review): df_cs['CUMUN'] is later compared for equality with Admin_Code3
# but is not padded here -- confirm the centroid CSV stores it zero-padded.
df_pc['Admin_Code3'] = df_pc['Admin_Code3'].astype(str).str.zfill(5)
df_pc['Postal_Code'] = df_pc['Postal_Code'].astype(str).str.zfill(5)

In [148]:
# Preview the first five rows of the trimmed postal-code table.
df_pc.head(n=5)

Unnamed: 0,Postal_Code,Latitude,Longitude,Admin_Code3
0,39812,43.2881,-3.5391,39067
1,15688,43.1063,-8.4458,15060
2,39213,42.9659,-4.1171,39027
3,4409,36.9658,-2.6021,4012
4,33844,43.2217,-6.2505,33005


### Find the nearest centroids

In [149]:
def find_nearest_centroid(lat, lon, municipality, centroids_df, max_distance_m=20000):
    """Return the centroid closest to a point, preferring the point's own municipality.

    The search first considers only the rows of ``centroids_df`` whose
    ``CUMUN`` equals ``municipality``. The best of those is returned when its
    distance does not exceed ``max_distance_m``. Otherwise (no row matches the
    municipality, or the best match is too far away) every row of
    ``centroids_df`` is searched.

    Parameters
    ----------
    lat, lon : float
        Coordinates of the point to match; passed unchanged to ``_find_nearest``.
    municipality : str
        Municipality code compared for equality with ``centroids_df['CUMUN']``.
    centroids_df : pandas.DataFrame
        Centroid table. Must contain ``CUMUN`` plus the columns read by
        ``_find_nearest`` (``CUSEC``, ``latitude``, ``longitude``).
    max_distance_m : float, optional
        Largest distance, in metres, at which a same-municipality match is
        still accepted. Defaults to 20000 (20 km).

    Returns
    -------
    tuple
        ``(nearest_centroid_id, distance)`` exactly as produced by
        ``_find_nearest``.
    """
    # Restrict the candidates to the point's own municipality first.
    same_municipality_centroids = centroids_df[centroids_df['CUMUN'] == municipality]

    if not same_municipality_centroids.empty:
        nearest_centroid_id, min_distance = _find_nearest(lat, lon, same_municipality_centroids)
        # Accept the municipal match only when it lies within the allowed radius.
        if min_distance <= max_distance_m:
            return nearest_centroid_id, min_distance

    # No centroid in the municipality, or the closest one is farther than
    # max_distance_m: fall back to searching the whole table.
    return _find_nearest(lat, lon, centroids_df)

def _find_nearest(lat, lon, centroids):
    """Scan ``centroids`` and return the id and distance of the row closest to a point.

    Parameters
    ----------
    lat, lon : float
        Coordinates of the reference point, given to ``geodesic`` as ``(lat, lon)``.
    centroids : pandas.DataFrame
        Table with the columns ``CUSEC``, ``latitude`` and ``longitude``.

    Returns
    -------
    tuple
        ``(nearest_centroid_id, min_distance)`` where ``min_distance`` is the
        geodesic distance in metres. For an empty table the result is
        ``(None, inf)``. On ties the first row encountered wins.
    """
    # The reference point is the same for every row, so build it once.
    address_coords = (lat, lon)
    min_distance = float('inf')
    nearest_centroid_id = None

    # Iterate the three needed columns directly: unlike iterrows(), this builds
    # no per-row Series and never upcasts the CUSEC id to a common row dtype.
    candidates = zip(centroids['CUSEC'], centroids['latitude'], centroids['longitude'])
    for centroid_id, centroid_lat, centroid_lon in candidates:
        distance = geodesic(address_coords, (centroid_lat, centroid_lon)).meters
        if distance < min_distance:
            min_distance = distance
            nearest_centroid_id = centroid_id
    return nearest_centroid_id, min_distance

In [150]:
# Add the two result columns, initially filled with None, before the matching loop.
df_pc = df_pc.assign(Nearest_Centroid_ID=None, Distance_to_Centroid=None)

In [151]:
# Match every postal code to a centroid, one row at a time, and store the
# matched id and its distance on that same row.
for row_label, pc_lat, pc_lon, pc_municipality in zip(
    df_pc.index, df_pc['Latitude'], df_pc['Longitude'], df_pc['Admin_Code3']
):
    section_id, metres = find_nearest_centroid(pc_lat, pc_lon, pc_municipality, df_cs)
    df_pc.at[row_label, 'Nearest_Centroid_ID'] = section_id
    df_pc.at[row_label, 'Distance_to_Centroid'] = metres

In [152]:
# Show the row(s) whose matched centroid is farthest away.
df_pc.loc[lambda frame: frame['Distance_to_Centroid'] == frame['Distance_to_Centroid'].max()]

Unnamed: 0,Postal_Code,Latitude,Longitude,Admin_Code3,Nearest_Centroid_ID,Distance_to_Centroid
25555,30382,37.6333,-0.0833,30016,3001606008,55495.527921


In [158]:
# Rename the matched-id column to the name used in the exported mapping.
df_pc = df_pc.rename({"Nearest_Centroid_ID": "Census_Section"}, axis="columns")


In [166]:
# Count the postal codes that were left without a census section.
df_pc['Census_Section'].isna().sum()

0

### Write resulting mapping data to CSV

In [None]:
# Export the postal-code -> census-section mapping. The row index is written
# as the first (unnamed) column, which is the to_csv default made explicit here.
df_pc.to_csv(
    '../../../../data/3_external_data/census/mapping_data/map_cs_pc.csv',
    index=True,
)