In [12]:
import pandas as pd

# display all pandas columns
pd.set_option('display.max_columns', None)  # Show all columns

## Data import

In [13]:
df_traffic_data = pd.read_excel('../data_deliveries/1_all_traffic_sensors.xlsx') 
df_air_quality_and_locations = pd.read_parquet("../data_deliveries/3_df_air_quality_and_locations_from_2018.parquet")

In [14]:
len(df_traffic_data)

5095

In [15]:
len(df_traffic_data.drop_duplicates(subset=['longitud', 'latitud']))

5094

In [16]:
len(df_air_quality_and_locations)

1428083

In [17]:
len(df_air_quality_and_locations.drop_duplicates(subset=["LONGITUD", "LATITUD"]))

24

In [18]:
# Remove duplicates for air quality data
df_air_quality_and_locations = df_air_quality_and_locations.drop_duplicates(subset=["LONGITUD", "LATITUD"])

# Remove duplicates for traffic data
df_traffic_data = df_traffic_data.drop_duplicates(subset=['longitud', 'latitud'])  # Assuming 'id' is the identifier for the traffic sensor

In [19]:
df_air_quality_and_locations.columns = df_air_quality_and_locations.columns.str.lower()
df_air_quality_and_locations = df_air_quality_and_locations.rename(columns = {'codigo':'id_no2'})

In [20]:
len(df_air_quality_and_locations)

24

## Visualization

In [34]:
import folium


# Create a map centered around the average coordinates of the NO2 sensors
map_center = [df_air_quality_and_locations["latitud"].mean(), df_air_quality_and_locations["longitud"].mean()]
m = folium.Map(location=map_center, zoom_start=12)


# Add CircleMarkers for traffic sensors
for _, row in df_traffic_data.iterrows():
    folium.CircleMarker(
        location=[row["latitud"], row["longitud"]],
        radius=1,  # Size of the circle
        color='red',  # Color for traffic sensors
        fill=True,
        fill_color='red',
        fill_opacity=0.1,
        popup=f'Sensor Traffic: {row["id_trafico"]}, {row["latitud"]}, {row["longitud"]}'
    ).add_to(m)

# Add CircleMarkers for NO2 sensors
for _, row in df_air_quality_and_locations.iterrows():
    folium.CircleMarker(
        location=[row["latitud"], row["longitud"]],
        radius=3,  # Size of the circle
        color='blue',  # Color for NO2 sensors
        fill=True,
        fill_color='blue',
        fill_opacity=0.9,
        popup=f'Sensor NO2: {row["longitud"]}, {row["latitud"]}'
    ).add_to(m)

# Add marker for meteorological sensor
folium.CircleMarker(
    location=[40.5, -3.7],
    radius=10,
    color='yellow',
    fill=True,
    fill_color='yellow',
    fill_opacity=0.9,
    popup='Sensor Meteorológico: 40.5, -3.7'
).add_to(m)

# Add legend
legend_html = '''
<div style="position: fixed; 
            bottom: 50px; left: 50px; width: 200px; height: 130px; 
            background-color: white; border:2px solid grey; z-index:9999; 
            font-size:14px; padding: 10px
            ">
<p><b>Leyenda</b></p>
<p><i class="fa fa-circle" style="color:red"></i> Sensores de Tráfico</p>
<p><i class="fa fa-circle" style="color:blue"></i> Sensores de NO2</p>
<p><i class="fa fa-circle" style="color:yellow"></i> Sensor Meteorológico</p>
</div>
'''
m.get_root().html.add_child(folium.Element(legend_html))

# Display the map
m

## Mapear Sensores de NO2 con sensores de Trafico

`Para ello, como primera iteracion, nos quedaremos solo con los sensores de NO2 que tengan al menos un sensor de trafico en el radio de 200 metros. En el caso de que no haya sensores, no usaremos ese sensor de NO2. En el caso de que haya mas de uno, usaremos los valores intercuartiles.`

In [22]:
2stop

SyntaxError: invalid decimal literal (386111058.py, line 1)

In [None]:
#porque antes he eliminado duplicados...
df_traffic_data = pd.read_excel('../data_deliveries/1_all_traffic_sensors.xlsx') 

In [None]:
df_air_quality_and_locations = df_air_quality_and_locations[['id_no2','latitud','longitud']]
df_traffic_data = df_traffic_data[['id_trafico','latitud','longitud']]

df_air_quality_and_locations = df_air_quality_and_locations.rename(columns={'CODIGO':'id_no2'})

df_air_quality_and_locations['id_no2'] = df_air_quality_and_locations['id_no2'].astype(str)
df_traffic_data['id_trafico'] = df_traffic_data['id_trafico'].astype(str)


In [None]:
df_air_quality_and_locations = df_air_quality_and_locations.rename(columns = {'latitud':'latitud_no2','longitud':'longitud_no2'})

In [None]:
df_traffic_data = df_traffic_data.rename(columns = {'latitud':'latitud_trafico','longitud':'longitud_trafico'})

In [None]:
import numpy as np
import pandas as pd
from scipy.spatial import cKDTree

# ---------------------------------
# 1. Constantes y utilidad Haversine
# ---------------------------------
R = 6_371_000  # radio medio de la Tierra en metros

def haversine(lat1, lon1, lat2, lon2):
    """
    Distancia gran‑círculo entre dos puntos en radianes.
    Regresa metros.
    """
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    c = 2 * np.arcsin(np.sqrt(a))
    return R * c

# ------------------------------------------------
# 2. Convertir coordenadas a radianes (una sola vez)
# ------------------------------------------------
df_air_quality_and_locations[['latitud_rad_no2', 'longitud_rad_no2']] = np.radians(df_air_quality_and_locations[['latitud_no2', 'longitud_no2']])
df_traffic_data[['latitud_rad_trafico', 'longitud_rad_trafico']] = np.radians(df_traffic_data[['latitud_trafico', 'longitud_trafico']])


# ------------------------------------------
# 3. Construir KDTree para filtrar candidatos
# ------------------------------------------
tree = cKDTree(df_traffic_data[['latitud_rad_trafico', 'longitud_rad_trafico']].values)

# Radio de búsqueda: 200 m → arco en radianes
radio = 200 / R

# -----------------------------------------------
# 4. Buscar y calcular distancias Haversine reales
# -----------------------------------------------
asignaciones = []

for _, no2 in df_air_quality_and_locations.iterrows():
    # índices de sensores de tráfico dentro de 200 m (aprox)
    idx_cercanos = tree.query_ball_point([no2.latitud_rad_no2, no2.longitud_rad_no2], radio)

    # para cada candidato, distancia exacta
    for idx in idx_cercanos:
        traf = df_traffic_data.iloc[idx]
        dist = haversine(
            no2.latitud_rad_no2, no2.longitud_rad_no2,
            traf.latitud_rad_trafico, traf.longitud_rad_trafico
        )
        asignaciones.append({
            'id_no2':      no2.id_no2,
            'latitud_no2':  no2.latitud_no2,
            'longitud_no2':  no2.longitud_no2,
            'id_trafico':  traf.id_trafico,
            'distance_m':  round(dist, 1),  
            'latitud_trafico':  traf.latitud_trafico,
            'longitud_trafico':  traf.longitud_trafico
        })

df_mapping_no2_to_traffic = (
    pd.DataFrame(asignaciones)
      .sort_values(['id_no2', 'distance_m'])
      .reset_index(drop=True)
)

In [None]:
df_mapping_no2_to_traffic.to_csv("4_no2_to_traffic_sensor_mapping.csv" ,index=False)

In [None]:
#df_mapping_no2_to_traffic.to_csv("../../data/processed/mapping/no2_to_traffic_sensor_mapping.csv" ,index=False)