In [1]:
import pandas as pd
import pyarrow
import os
# save and load data
import pickle
save_dir = "/saved_data"
wd = "/content/drive/MyDrive/Datathon"

def save_variable(variable, filename):
  f = open(filename, "wb")
  pickle.dump(variable, f)
  f.close()
  return filename

def load_variable(filename):
  f = open(filename, "rb")
  variable = pd.read_pickle(f)
  f.close()
  return variable

In [2]:
# Get the current directory
current_dir = os.getcwd()

# List all files in the directory
files = os.listdir(current_dir)

# Filter out the Parquet files
parquet_files = [file for file in files if file.endswith('.parquet.gzip') or file.endswith('.parquet')]

# Loop through each Parquet file, read it, and assign to a DataFrame
for file in parquet_files:
    # Construct the full file path
    file_path = os.path.join(current_dir, file)
    
    # Extract dataframe name from file name without extension
    dataframe_name = os.path.splitext(file)[0]
    # Remove the extension .gzip if present
    dataframe_name = dataframe_name.replace('.gzip', '')
    # Remove the extension .parquet if present
    dataframe_name = dataframe_name.replace('.parquet', '')
    # Add the _df suffix
    #dataframe_name += '_df'
    
    # Read the Parquet file into a DataFrame and assign it to a dynamically created variable
    globals()[dataframe_name] = pd.read_parquet(file_path)

    # Now you have individual variables named interventions_df, x_df, etc., each containing a DataFrame
    # You can directly access these DataFrames and perform operations on them, for example:
    print(f"DataFrame '{dataframe_name}' shape: {globals()[dataframe_name].shape}")
    print(globals()[dataframe_name].head())  # Example operation on the DataFrame
     

DataFrame 'aed_locations' shape: (15227, 11)
     id  type                address  number  postal_code municipality  \
0  13.0  None    Blvd. Fr. Roosevelt    24.0       7060.0     SOIGNIES   
1  70.0  None        Ch. De Wégimont    76.0       4630.0      Ayeneux   
2  71.0  None  Place Saint - Lambert     NaN       4020.0        Liège   
3  72.0  None          Rue du Doyard     NaN       4990.0     Lierneux   
4  73.0  None     Fond Saint Servais     NaN       4000.0        Liège   

  province location public available hours  
0  Hainaut     None      Y      None  None  
1    Liège     None   None      None  None  
2    Liège     None   None      None  None  
3    Liège     None   None      None  None  
4    Liège     None   None      None  None  
DataFrame 'ambulance_locations' shape: (279, 9)
                                     base  \
0             BA ANTW [Borgerhout] AMBUCE   
1                BA ANTW [Hoboken] AMBUCE   
2  BA ANTW [Hof Ter Schelde] AC Antwerpen   
3           

DataFrame 'interventions2' shape: (200627, 46)
    Mission ID                       Service Name  PostalCode permanence  \
0  50221920087       FH PDS COMI Wallonie Picarde                 7784.0   
1  50221920088       FH PDS TOUR Wallonie Picarde                 7500.0   
2  50221920089         FH PDS LOUV Hainaut Centre                 7100.0   
3  50221920090  FW HVP HEUV [Nieuwkerke] Westhoek                 8950.0   
4  50221920092                BH SGHI Croix Rouge                 7331.0   

           CityName permanence      StreetName permanence  \
0  Comines-Warneton (Warneton)          Chauss√©e d'Ypres   
1            Tournai (Tournai)            Avenue de Maire   
2  La Louvi√®re (La Louvi√®re)  Boulevard du Roi Baudouin   
3      Heuvelland (Nieuwkerke)            Dranouterstraat   
4     Saint-Ghislain (Baudour)             Rue Louis Caty   

  HouseNumber permanence  Latitude permanence  Longitude permanence  \
0                   None             50.76034             

DataFrame 'interventions_bxl2' shape: (38620, 36)
    Mission ID                T0         Cityname Intervention  \
0  20221520003  01JUN22:00:02:45  1030 schaerbeek (schaerbeek)   
1  20221520005  01JUN22:00:04:58  1030 schaerbeek (schaerbeek)   
2  20221520006  01JUN22:00:07:43  1081 koekelberg (koekelberg)   
3  20221520006  01JUN22:00:07:43  1081 koekelberg (koekelberg)   
4  20221520007  01JUN22:00:09:18  1030 schaerbeek (schaerbeek)   

   Longitude intervention  Latitude intervention  description_nl  \
0            4.406731e+10           5.085516e+10  Nieuwe melding   
1            4.368798e+10           5.086318e+10  Nieuwe melding   
2            4.332312e+10           5.085755e+10  Nieuwe melding   
3            4.332312e+10           5.085755e+10  Nieuwe melding   
4            4.383695e+10           5.085225e+10  Nieuwe melding   

  ic_description_nl                           EventType and EventLevel  \
0           Medisch                                  P033 N05 - TRAUMA

In [3]:
###############################################################################
# 0. check the variables and nan's
# print(interventions_bxl.info())
print(interventions_bxl.columns)

###############################################################################
# 1. select potentially useful variables
# keys = ["postalcode_intervention", "latitude_intervention", "longitude_intervention",
#         "eventtype_firstcall", "eventLevel_firstcall", "eventtype_trip",
#         "eventlevel_trip", "t0", "t3", "waiting_time"]
keys = ["postalcode_intervention", "latitude_intervention", "longitude_intervention",
        "eventtype_firstcall", "eventLevel_firstcall", "eventtype_trip",
        "eventlevel_trip", "waiting_time"]
interventions_bxl_1 = interventions_bxl[keys]

###############################################################################
# 2. drop data with nan's
interventions_bxl_2 = interventions_bxl_1.dropna()

###############################################################################
# 3. check event type
# 999 cases' event type changed on trip compared with on first call
# print((interventions_bxl_2["eventtype_firstcall"] != \
#        interventions_bxl_2["eventtype_trip"]).sum())

# related to heart: "P011 - Chest pain", "P039 - Cardiac problem
#  (other than thoracic pain)", "P003 - Cardiac arrest"
# need AED: "P003 - Cardiac arrest"
# print(interventions_bxl_2["eventtype_firstcall"].unique())
heart_related = ["P011 - Chest pain",
                 "P039 - Cardiac problem (other than thoracic pain)",
                 "P003 - Cardiac arrest"]
interventions_bxl_3 = interventions_bxl_2[interventions_bxl_2\
                      ["eventtype_firstcall"].isin(heart_related)]
# print(interventions_bxl_3) # 6358 × 6

###############################################################################
# 4. compare event type and level at the first call and at the trip
# 78 changed event type & 490 changed event level
# use the ones of first call because people who pass by the patient may not be able to distinguish
# levels generally decreases
# print((interventions_bxl_3["eventtype_firstcall"] != \
#        interventions_bxl_3["eventtype_trip"]).sum())
# print((interventions_bxl_3["eventLevel_firstcall"] != \
#        interventions_bxl_3["eventlevel_trip"]).sum())
# print(interventions_bxl_3[["eventtype_firstcall", "eventtype_trip"]]\
#                           [interventions_bxl_3["eventtype_firstcall"] != \
#                            interventions_bxl_3["eventtype_trip"]])
# print(interventions_bxl_3[["eventLevel_firstcall", "eventlevel_trip"]]\
#                           [interventions_bxl_3["eventLevel_firstcall"] != \
#                            interventions_bxl_3["eventlevel_trip"]])

# check event levels
level = interventions_bxl_3["eventLevel_firstcall"].apply(lambda x: int(x[1:]))
interventions_bxl_4 = interventions_bxl_3.assign(level_num = level)
# print(interventions_bxl_4)

###############################################################################
# 5. delete outliers in waiting_time
# define the function to delete outliers in a numeric variable
def delete_outliers(dataframe, column_name, threshold = 2):
  # data is a column of dataframe
  data = dataframe[column_name]
  m = data.mean()
  sd  = data.std()
  return dataframe[(data - m).abs() < threshold * sd]

# some waiting time is more than 20 hours, 1 is 999999999.0, which do not make sense
interventions_bxl_5 = interventions_bxl_4[interventions_bxl_4["waiting_time"] < 200]
# print(interventions_bxl_5)

###############################################################################
# 6. delete outliers in latitude & longtitude
# brussels: 50.8467°N 4.3525°E
interventions_bxl_6 = interventions_bxl_5[\
                      (interventions_bxl_5["latitude_intervention"] > 4500000) &\
                      (interventions_bxl_5["latitude_intervention"] < 5500000) &\
                      (interventions_bxl_5["longitude_intervention"] > 400000) &\
                      (interventions_bxl_5["longitude_intervention"] < 500000)]
# print(interventions_bxl_6) # 5156 x 9

###############################################################################
# 7. change the scale of latitude & longtitude
interventions_bxl_7 = interventions_bxl_6
interventions_bxl_7["latitude_intervention"] = \
                   interventions_bxl_6["latitude_intervention"].div(100000)
interventions_bxl_7["longitude_intervention"] = \
                   interventions_bxl_6["longitude_intervention"].div(100000)
print(interventions_bxl_7)

Index(['mission_id', 'service_name', 'postalcode_permanence',
       'cityname_permanence', 'streetname_permanence',
       'housenumber_permanence', 'latitude_permanence', 'longitude_permanence',
       'permanence_short_name', 'permanence_long_name', 'vector_type',
       'eventtype_firstcall', 'eventLevel_firstcall', 'eventtype_trip',
       'eventlevel_trip', 'postalcode_intervention', 'cityname_intervention',
       'latitude_intervention', 'longitude_intervention', 't0', 't1',
       't1confirmed', 't2', 't3', 't4', 't5', 't6', 't7', 't9',
       'intervention_time_t1reported', 'waiting_time', 'intervention_duration',
       'departure_time_t1reported', 'unavailable_time',
       'name_destination_hospital', 'postalcode_destination_hospital',
       'cityname_destination_hospital', 'streetname_destination_hospital',
       'housenumber_destination_hospital', 'calculated_traveltime_departure_',
       'calculated_distance_departure_to', 'calculated_traveltime_destinatio',
       '

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interventions_bxl_7["latitude_intervention"] = \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interventions_bxl_7["longitude_intervention"] = \


In [4]:
aed_location= aed_locations

In [5]:
###############################################################################
# 0. check the variables and nan's
# print(aed_location.info())
# print(aed_location["address"])
# print(aed_location) # 15225 × 11

###############################################################################
# 1. only keep data in brussels
aed_location_1 = aed_location[aed_location["province"] == "Bruxelles-Brussel"]
# print(aed_location_1) # 2042 x 11

# print(aed_location_1["number"].isna().sum()) # 209 are nan's

###############################################################################
# 2. add the latitude and the longitude of corresponding address
from geopy.geocoders import Nominatim
import time

def find_location(address, number, province = "Bruxelles-Brussel",
                  country = "Belgium"):
  full_address = "{} {}, {}, {}".format(int(number), address, province, country)
  geolocator = Nominatim(user_agent = "my_app")
  location = geolocator.geocode(full_address, timeout=None)
  if location:
    return location.latitude, location.longitude
  return float("nan")

# calculating the latitude and the longitude data
# cost ~20 min...
# do not delete print()!
# lat_and_lon = []
# for i in range(aed_location_1.shape[0]):
#   address = list(aed_location_1["address"])[i]
#   number_raw = list(aed_location_1["number"])[i]
#   if pd.isna(number_raw):
#     lat_and_lon.append(float("nan"))
#     print("index = {}".format(i)) # slow the speed
#     continue
#   number = int(number_raw)
#   rst = find_location(address, number)
#   lat_and_lon.append(rst)
#   print("index = {}".format(i))

# save_variable(lat_and_lon, wd + save_dir + "/lat_and_lon.txt")
lat_and_lon = load_variable("lat_and_lon.txt")

latitude = []
longitude = []
for item in lat_and_lon:
  if pd.isna(item):
    latitude.append(float("nan"))
    longitude.append(float("nan"))
  else:
    latitude.append(item[0])
    longitude.append(item[1])

aed_location_2 = aed_location_1.assign(latitude = latitude,
                                       longitude = longitude)
# print(aed_location_2)

###############################################################################
# 3. only keep latitude & longitude data
keys = ["id", "postal_code", "latitude", "longitude", 'address']
aed_location_3 = aed_location_2[keys]

###############################################################################
# 4. drop data with nan's
aed_location_4 = aed_location_3.dropna()
print(aed_location_4) # 1562 x 4

            id  postal_code   latitude  longitude                 address
11        86.0       1040.0  50.842792   4.384351            Schumanplein
14        96.0       1090.0  50.882962   4.335248  Graafschap - Jettelaan
18       101.0       1070.0  50.831942   4.328980      Tweestationsstraat
60       315.0       1040.0  50.845150   4.369893               Wetstraat
69       335.0       1130.0  50.884114   4.419841      Chaussée de Haecht
...        ...          ...        ...        ...                     ...
15084  16518.0       1030.0  50.865560   4.378253        Rue de Jerusalem
15092  16526.0       1000.0  50.849255   4.363412          Koningsstraat 
15192  16629.0       1210.0  50.855420   4.357981     Avenue du Boulevard
15204  16643.0       1070.0  50.815393   4.294843     Avenue Emile Gryson
15225  16666.0       1050.0  50.821723   4.362921     Albert Leemansplein

[1562 rows x 5 columns]


In [6]:
import math

df = aed_location_2[["address", "number", "latitude", "longitude"]]
# x = list(df["latitude"])
# for i in range(1, len(x)):
#   if abs(x[i] - x[i - 1]) < 1e-8:
#     print("{}: ({}, {})".format(i, x[i], x[i-1]))
df.iloc[53:57, :]

Unnamed: 0,address,number,latitude,longitude
320,Bourgetlaan,42.0,50.877313,4.432468
321,Bourgetlaan,42.0,50.877313,4.432468
322,Bourgetlaan,42.0,50.877313,4.432468
327,Woluwedal,102.0,50.855719,4.444934


In [7]:
aed_location_4

Unnamed: 0,id,postal_code,latitude,longitude,address
11,86.0,1040.0,50.842792,4.384351,Schumanplein
14,96.0,1090.0,50.882962,4.335248,Graafschap - Jettelaan
18,101.0,1070.0,50.831942,4.328980,Tweestationsstraat
60,315.0,1040.0,50.845150,4.369893,Wetstraat
69,335.0,1130.0,50.884114,4.419841,Chaussée de Haecht
...,...,...,...,...,...
15084,16518.0,1030.0,50.865560,4.378253,Rue de Jerusalem
15092,16526.0,1000.0,50.849255,4.363412,Koningsstraat
15192,16629.0,1210.0,50.855420,4.357981,Avenue du Boulevard
15204,16643.0,1070.0,50.815393,4.294843,Avenue Emile Gryson


# See the Statistics on Critical Points 

In [8]:
# Loading the DataFrame from the pickle file
critical_points = pd.read_pickle("merged_df.pkl")
critical_points

Unnamed: 0,streetname_permanence,initial_count,late_interventions_count,average_speed,average_time_difference,proportion_late_interventions,midpoint_coordinates,street_distance,start_point,end_point
0,Avenue De FrÈ - De FrÈlaan,318,111,4.606633,0 days 00:17:24.561550093,0.349057,"(50.8031574, 4.3412639)",0.0,"(Avenue De Fré - De Frélaan, Stalle, Uccle - U...","(Avenue De Fré - De Frélaan, Stalle, Uccle - U..."
1,Avenue Hippocrate - Hippokrateslaan,268,82,3.722012,0 days 00:32:23.670930279,0.30597,"(50.853064849999996, 4.4535604499999994)",0.0,"(Brussels / UCL, Avenue Hippocrate - Hippokrat...","(Brussels / UCL, Avenue Hippocrate - Hippokrat..."
2,Avenue de l'Arbre Ballon,315,45,4.666605,0 days 00:13:27.738957354,0.142857,"(50.8930652, 4.3166745)",0.0,(Avenue de l'Arbre Ballon - Dikkebeuklaan / Av...,(Avenue de l'Arbre Ballon - Dikkebeuklaan / Av...
3,Avenue de l'HÈliport,807,140,3.479987,0 days 00:16:58.442625852,0.173482,"(50.8643118, 4.3556945)",0.0,"(Avenue de l'Héliport - Helihavenlaan, Quartie...","(Avenue de l'Héliport - Helihavenlaan, Quartie..."
4,Avenue de la ChÍnaie,96,27,4.389067,0 days 00:19:49.679810361,0.28125,,,,
5,Avenue du Laerbeek - Laarbeeklaan,162,65,5.728057,0 days 00:16:54.900619440,0.401235,"(50.886818649999995, 4.307567420033124)",0.0,"(Vrije Universiteit Brussel — Campus Jette, 10...","(Vrije Universiteit Brussel — Campus Jette, 10..."
6,Bergense Steenweg,664,106,3.578969,0 days 00:20:40.491780417,0.159639,"(50.8445236, 4.3377892)",0.0,"(Chaussée de Mons - Bergense Steenweg, Dansaer...","(Chaussée de Mons - Bergense Steenweg, Dansaer..."
7,Boulevard du Triomphe,464,90,3.788297,0 days 00:14:32.322593765,0.193966,"(50.8173133, 4.4011946)",0.0,"(Boulevard du Triomphe - Triomflaan, Cité Volt...","(Boulevard du Triomphe - Triomflaan, Cité Volt..."
8,Broekstraat,245,72,4.097009,0 days 00:15:09.900237107,0.293878,"(50.8526523, 4.3600212)",0.0,"(Rue du Marais - Broekstraat, Quartier Marais-...","(Rue du Marais - Broekstraat, Quartier Marais-..."
9,Charles Degrouxstraat,118,29,2.751265,0 days 00:30:01.388118609,0.245763,"(50.8415816, 4.4000444)",0.0,"(Rue Charles Degroux - Charles Degrouxstraat, ...","(Rue Charles Degroux - Charles Degrouxstraat, ..."


In [9]:
aed_location_4

Unnamed: 0,id,postal_code,latitude,longitude,address
11,86.0,1040.0,50.842792,4.384351,Schumanplein
14,96.0,1090.0,50.882962,4.335248,Graafschap - Jettelaan
18,101.0,1070.0,50.831942,4.328980,Tweestationsstraat
60,315.0,1040.0,50.845150,4.369893,Wetstraat
69,335.0,1130.0,50.884114,4.419841,Chaussée de Haecht
...,...,...,...,...,...
15084,16518.0,1030.0,50.865560,4.378253,Rue de Jerusalem
15092,16526.0,1000.0,50.849255,4.363412,Koningsstraat
15192,16629.0,1210.0,50.855420,4.357981,Avenue du Boulevard
15204,16643.0,1070.0,50.815393,4.294843,Avenue Emile Gryson


In [10]:
from sklearn.metrics.pairwise import haversine_distances
from math import radians

# Remove rows with missing values from both DataFrames
critical_points_clean = critical_points.dropna(subset=['midpoint_coordinates'])
aed_location_4_clean = aed_location_4.dropna(subset=['latitude', 'longitude'])

# Convert latitude and longitude from degrees to radians
critical_points_clean['midpoint_coordinates_rad'] = critical_points_clean['midpoint_coordinates'].apply(lambda x: [radians(coord) for coord in x])
aed_location_4_clean['coordinates_rad'] = aed_location_4_clean.apply(lambda row: [radians(row['latitude']), radians(row['longitude'])], axis=1)

# Check if there are still missing values after cleaning
if critical_points_clean.isnull().values.any() or aed_location_4_clean.isnull().values.any():
    print("Error: Missing values remain after cleaning.")
else:
    # Calculate Haversine distances between critical points and candidate points
    distances = haversine_distances(critical_points_clean['midpoint_coordinates_rad'].tolist(), aed_location_4_clean['coordinates_rad'].tolist())

    # Find the closest candidate point for each critical point
    closest_distances = distances.min(axis=1)
    closest_candidate_indices = distances.argmin(axis=1)

    # Update critical_points DataFrame with closest candidate distance and index
    critical_points_clean['closest_candidate_distance'] = closest_distances * 6371  # Multiply by Earth's radius to get distance in kilometers
    critical_points_clean['closest_candidate_index'] = closest_candidate_indices

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  critical_points_clean['midpoint_coordinates_rad'] = critical_points_clean['midpoint_coordinates'].apply(lambda x: [radians(coord) for coord in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  critical_points_clean['closest_candidate_distance'] = closest_distances * 6371  # Multiply by Earth's radius to get distance in kilometers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.

In [11]:
critical_points_clean

Unnamed: 0,streetname_permanence,initial_count,late_interventions_count,average_speed,average_time_difference,proportion_late_interventions,midpoint_coordinates,street_distance,start_point,end_point,midpoint_coordinates_rad,closest_candidate_distance,closest_candidate_index
0,Avenue De FrÈ - De FrÈlaan,318,111,4.606633,0 days 00:17:24.561550093,0.349057,"(50.8031574, 4.3412639)",0.0,"(Avenue De Fré - De Frélaan, Stalle, Uccle - U...","(Avenue De Fré - De Frélaan, Stalle, Uccle - U...","[0.8866823670389219, 0.07576934875296987]",0.238565,116
1,Avenue Hippocrate - Hippokrateslaan,268,82,3.722012,0 days 00:32:23.670930279,0.30597,"(50.853064849999996, 4.4535604499999994)",0.0,"(Brussels / UCL, Avenue Hippocrate - Hippokrat...","(Brussels / UCL, Avenue Hippocrate - Hippokrat...","[0.8875534163626962, 0.07772929328910029]",0.158783,757
2,Avenue de l'Arbre Ballon,315,45,4.666605,0 days 00:13:27.738957354,0.142857,"(50.8930652, 4.3166745)",0.0,(Avenue de l'Arbre Ballon - Dikkebeuklaan / Av...,(Avenue de l'Arbre Ballon - Dikkebeuklaan / Av...,"[0.8882515541721465, 0.07534018276187997]",0.834109,1320
3,Avenue de l'HÈliport,807,140,3.479987,0 days 00:16:58.442625852,0.173482,"(50.8643118, 4.3556945)",0.0,"(Avenue de l'Héliport - Helihavenlaan, Quartie...","(Avenue de l'Héliport - Helihavenlaan, Quartie...","[0.8877497126710036, 0.07602121023600816]",0.153142,1156
5,Avenue du Laerbeek - Laarbeeklaan,162,65,5.728057,0 days 00:16:54.900619440,0.401235,"(50.886818649999995, 4.307567420033124)",0.0,"(Vrije Universiteit Brussel — Campus Jette, 10...","(Vrije Universiteit Brussel — Campus Jette, 10...","[0.8881425313077559, 0.07518123423121557]",0.0,635
6,Bergense Steenweg,664,106,3.578969,0 days 00:20:40.491780417,0.159639,"(50.8445236, 4.3377892)",0.0,"(Chaussée de Mons - Bergense Steenweg, Dansaer...","(Chaussée de Mons - Bergense Steenweg, Dansaer...","[0.8874043434279604, 0.0757087037974508]",0.0,142
7,Boulevard du Triomphe,464,90,3.788297,0 days 00:14:32.322593765,0.193966,"(50.8173133, 4.4011946)",0.0,"(Boulevard du Triomphe - Triomflaan, Cité Volt...","(Boulevard du Triomphe - Triomflaan, Cité Volt...","[0.886929434102505, 0.07681533679099482]",0.115694,184
8,Broekstraat,245,72,4.097009,0 days 00:15:09.900237107,0.293878,"(50.8526523, 4.3600212)",0.0,"(Rue du Marais - Broekstraat, Quartier Marais-...","(Rue du Marais - Broekstraat, Quartier Marais-...","[0.8875462160068672, 0.07609672539675419]",0.036115,1314
9,Charles Degrouxstraat,118,29,2.751265,0 days 00:30:01.388118609,0.245763,"(50.8415816, 4.4000444)",0.0,"(Rue Charles Degroux - Charles Degrouxstraat, ...","(Rue Charles Degroux - Charles Degrouxstraat, ...","[0.8873529958413666, 0.07679526201393838]",0.219002,51
10,ChaussÈe de Haecht,552,76,3.658295,0 days 00:22:06.379834910,0.137681,"(50.8838454, 4.420667)",0.0,"(Chaussée de Haecht - Haachtsesteenweg, Haren,...","(Chaussée de Haecht - Haachtsesteenweg, Haren,...","[0.8880906383057711, 0.07715519428426017]",0.06517,4


In [12]:
critical_points_clean

Unnamed: 0,streetname_permanence,initial_count,late_interventions_count,average_speed,average_time_difference,proportion_late_interventions,midpoint_coordinates,street_distance,start_point,end_point,midpoint_coordinates_rad,closest_candidate_distance,closest_candidate_index
0,Avenue De FrÈ - De FrÈlaan,318,111,4.606633,0 days 00:17:24.561550093,0.349057,"(50.8031574, 4.3412639)",0.0,"(Avenue De Fré - De Frélaan, Stalle, Uccle - U...","(Avenue De Fré - De Frélaan, Stalle, Uccle - U...","[0.8866823670389219, 0.07576934875296987]",0.238565,116
1,Avenue Hippocrate - Hippokrateslaan,268,82,3.722012,0 days 00:32:23.670930279,0.30597,"(50.853064849999996, 4.4535604499999994)",0.0,"(Brussels / UCL, Avenue Hippocrate - Hippokrat...","(Brussels / UCL, Avenue Hippocrate - Hippokrat...","[0.8875534163626962, 0.07772929328910029]",0.158783,757
2,Avenue de l'Arbre Ballon,315,45,4.666605,0 days 00:13:27.738957354,0.142857,"(50.8930652, 4.3166745)",0.0,(Avenue de l'Arbre Ballon - Dikkebeuklaan / Av...,(Avenue de l'Arbre Ballon - Dikkebeuklaan / Av...,"[0.8882515541721465, 0.07534018276187997]",0.834109,1320
3,Avenue de l'HÈliport,807,140,3.479987,0 days 00:16:58.442625852,0.173482,"(50.8643118, 4.3556945)",0.0,"(Avenue de l'Héliport - Helihavenlaan, Quartie...","(Avenue de l'Héliport - Helihavenlaan, Quartie...","[0.8877497126710036, 0.07602121023600816]",0.153142,1156
5,Avenue du Laerbeek - Laarbeeklaan,162,65,5.728057,0 days 00:16:54.900619440,0.401235,"(50.886818649999995, 4.307567420033124)",0.0,"(Vrije Universiteit Brussel — Campus Jette, 10...","(Vrije Universiteit Brussel — Campus Jette, 10...","[0.8881425313077559, 0.07518123423121557]",0.0,635
6,Bergense Steenweg,664,106,3.578969,0 days 00:20:40.491780417,0.159639,"(50.8445236, 4.3377892)",0.0,"(Chaussée de Mons - Bergense Steenweg, Dansaer...","(Chaussée de Mons - Bergense Steenweg, Dansaer...","[0.8874043434279604, 0.0757087037974508]",0.0,142
7,Boulevard du Triomphe,464,90,3.788297,0 days 00:14:32.322593765,0.193966,"(50.8173133, 4.4011946)",0.0,"(Boulevard du Triomphe - Triomflaan, Cité Volt...","(Boulevard du Triomphe - Triomflaan, Cité Volt...","[0.886929434102505, 0.07681533679099482]",0.115694,184
8,Broekstraat,245,72,4.097009,0 days 00:15:09.900237107,0.293878,"(50.8526523, 4.3600212)",0.0,"(Rue du Marais - Broekstraat, Quartier Marais-...","(Rue du Marais - Broekstraat, Quartier Marais-...","[0.8875462160068672, 0.07609672539675419]",0.036115,1314
9,Charles Degrouxstraat,118,29,2.751265,0 days 00:30:01.388118609,0.245763,"(50.8415816, 4.4000444)",0.0,"(Rue Charles Degroux - Charles Degrouxstraat, ...","(Rue Charles Degroux - Charles Degrouxstraat, ...","[0.8873529958413666, 0.07679526201393838]",0.219002,51
10,ChaussÈe de Haecht,552,76,3.658295,0 days 00:22:06.379834910,0.137681,"(50.8838454, 4.420667)",0.0,"(Chaussée de Haecht - Haachtsesteenweg, Haren,...","(Chaussée de Haecht - Haachtsesteenweg, Haren,...","[0.8880906383057711, 0.07715519428426017]",0.06517,4


In [13]:
df_clusters = load_variable("df_clusters.txt")
df_clusters_all_nodes_haversine = load_variable("df_clusters_all_nodes_haversine.txt")
cluster_centers = load_variable("cluster_centers.txt")

In [None]:
df_clusters_all_nodes_haversine

In [15]:
cluster_centers

Unnamed: 0,cluster,latitude_intervention,longitude_intervention
0,0,50.835390,4.316295
1,1,50.862820,4.390080
2,2,50.814988,4.368908
3,3,50.888980,4.317545
4,4,50.815910,4.436210
...,...,...,...
1557,1557,50.851402,4.320566
1558,1558,50.837935,4.285310
1559,1559,50.841840,4.467120
1560,1560,50.825910,4.305810


In [16]:
from sklearn.metrics.pairwise import haversine_distances
from math import radians

# Remove rows with missing values from both DataFrames
#critical_points_clean = critical_points.dropna(subset=['midpoint_coordinates'])
df_clusters_all_nodes_haversine = df_clusters_all_nodes_haversine.dropna(subset=['node_latitude', 'node_longitude'])

# Convert latitude and longitude from degrees to radians
critical_points_clean['midpoint_coordinates_rad'] = critical_points_clean['midpoint_coordinates'].apply(lambda x: [radians(coord) for coord in x])
df_clusters_all_nodes_haversine['coordinates_rad'] = df_clusters_all_nodes_haversine.apply(lambda row: [radians(row['node_latitude']), radians(row['node_longitude'])], axis=1)

# Check if there are still missing values after cleaning
if critical_points_clean.isnull().values.any() or df_clusters_all_nodes_haversine.isnull().values.any():
    print("Error: Missing values remain after cleaning.")
else:
    # Calculate Haversine distances between critical points and candidate points
    distances = haversine_distances(critical_points_clean['midpoint_coordinates_rad'].tolist(), df_clusters_all_nodes_haversine['coordinates_rad'].tolist())

    # Find the closest candidate point for each critical point
    closest_distances = distances.min(axis=1)
    closest_candidate_indices = distances.argmin(axis=1)

    # Update critical_points DataFrame with closest candidate distance and index
    critical_points_clean['closest_candidate_distance_after'] = closest_distances * 6371  # Multiply by Earth's radius to get distance in kilometers
    critical_points_clean['closest_candidate_index_after'] = closest_candidate_indices

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  critical_points_clean['midpoint_coordinates_rad'] = critical_points_clean['midpoint_coordinates'].apply(lambda x: [radians(coord) for coord in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  critical_points_clean['closest_candidate_distance_after'] = closest_distances * 6371  # Multiply by Earth's radius to get distance in kilometers
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.p

In [17]:
critical_points_clean

Unnamed: 0,streetname_permanence,initial_count,late_interventions_count,average_speed,average_time_difference,proportion_late_interventions,midpoint_coordinates,street_distance,start_point,end_point,midpoint_coordinates_rad,closest_candidate_distance,closest_candidate_index,closest_candidate_distance_after,closest_candidate_index_after
0,Avenue De FrÈ - De FrÈlaan,318,111,4.606633,0 days 00:17:24.561550093,0.349057,"(50.8031574, 4.3412639)",0.0,"(Avenue De Fré - De Frélaan, Stalle, Uccle - U...","(Avenue De Fré - De Frélaan, Stalle, Uccle - U...","[0.8866823670389219, 0.07576934875296987]",0.238565,116,0.064397,2862
1,Avenue Hippocrate - Hippokrateslaan,268,82,3.722012,0 days 00:32:23.670930279,0.30597,"(50.853064849999996, 4.4535604499999994)",0.0,"(Brussels / UCL, Avenue Hippocrate - Hippokrat...","(Brussels / UCL, Avenue Hippocrate - Hippokrat...","[0.8875534163626962, 0.07772929328910029]",0.158783,757,0.357337,3999
2,Avenue de l'Arbre Ballon,315,45,4.666605,0 days 00:13:27.738957354,0.142857,"(50.8930652, 4.3166745)",0.0,(Avenue de l'Arbre Ballon - Dikkebeuklaan / Av...,(Avenue de l'Arbre Ballon - Dikkebeuklaan / Av...,"[0.8882515541721465, 0.07534018276187997]",0.834109,1320,0.076178,1363
3,Avenue de l'HÈliport,807,140,3.479987,0 days 00:16:58.442625852,0.173482,"(50.8643118, 4.3556945)",0.0,"(Avenue de l'Héliport - Helihavenlaan, Quartie...","(Avenue de l'Héliport - Helihavenlaan, Quartie...","[0.8877497126710036, 0.07602121023600816]",0.153142,1156,0.09007,1708
5,Avenue du Laerbeek - Laarbeeklaan,162,65,5.728057,0 days 00:16:54.900619440,0.401235,"(50.886818649999995, 4.307567420033124)",0.0,"(Vrije Universiteit Brussel — Campus Jette, 10...","(Vrije Universiteit Brussel — Campus Jette, 10...","[0.8881425313077559, 0.07518123423121557]",0.0,635,0.314567,3322
6,Bergense Steenweg,664,106,3.578969,0 days 00:20:40.491780417,0.159639,"(50.8445236, 4.3377892)",0.0,"(Chaussée de Mons - Bergense Steenweg, Dansaer...","(Chaussée de Mons - Bergense Steenweg, Dansaer...","[0.8874043434279604, 0.0757087037974508]",0.0,142,0.043268,606
7,Boulevard du Triomphe,464,90,3.788297,0 days 00:14:32.322593765,0.193966,"(50.8173133, 4.4011946)",0.0,"(Boulevard du Triomphe - Triomflaan, Cité Volt...","(Boulevard du Triomphe - Triomflaan, Cité Volt...","[0.886929434102505, 0.07681533679099482]",0.115694,184,0.151169,75
8,Broekstraat,245,72,4.097009,0 days 00:15:09.900237107,0.293878,"(50.8526523, 4.3600212)",0.0,"(Rue du Marais - Broekstraat, Quartier Marais-...","(Rue du Marais - Broekstraat, Quartier Marais-...","[0.8875462160068672, 0.07609672539675419]",0.036115,1314,0.113969,775
9,Charles Degrouxstraat,118,29,2.751265,0 days 00:30:01.388118609,0.245763,"(50.8415816, 4.4000444)",0.0,"(Rue Charles Degroux - Charles Degrouxstraat, ...","(Rue Charles Degroux - Charles Degrouxstraat, ...","[0.8873529958413666, 0.07679526201393838]",0.219002,51,0.08596,4985
10,ChaussÈe de Haecht,552,76,3.658295,0 days 00:22:06.379834910,0.137681,"(50.8838454, 4.420667)",0.0,"(Chaussée de Haecht - Haachtsesteenweg, Haren,...","(Chaussée de Haecht - Haachtsesteenweg, Haren,...","[0.8880906383057711, 0.07715519428426017]",0.06517,4,0.190465,541
