# interventions_bxl

## Functions for Saving and Loading Variables

In [1]:
# save and load data
import pickle
save_dir = "/saved_data"
wd = "/content/drive/MyDrive/Datathon"

def save_variable(variable, filename):
  f = open(filename, "wb")
  pickle.dump(variable, f)
  f.close()
  return filename

def load_variable(filename):
  f = open(filename, "rb")
  variable = pickle.load(f)
  f.close()
  return variable

## Load Data

In [8]:
# load packages & connect to google drive
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim
import folium
import matplotlib.pyplot as plt

from google.colab import drive
drive.mount("/content/drive")

%cd /content/drive/MyDrive/Datathon

# load data
aed_location = pd.read_parquet("aed_locations.parquet.gzip", engine="pyarrow")

interventions1 = pd.read_parquet("interventions1.parquet", engine="pyarrow")
interventions2 = pd.read_parquet("interventions2.parquet", engine="pyarrow")
interventions3 = pd.read_parquet("interventions3.parquet", engine="pyarrow")

interventions_bxl = pd.read_parquet("interventions_bxl.parquet.gzip",
                                    engine="pyarrow") # ←
interventions_bxl2 = pd.read_parquet("interventions_bxl2.parquet.gzip",
                                     engine="pyarrow")

mug_location = pd.read_parquet("mug_locations.parquet.gzip", engine="pyarrow")
pit_location = pd.read_parquet("pit_locations.parquet.gzip", engine="pyarrow")
ambulance_location = pd.read_parquet("ambulance_locations.parquet.gzip",
                                     engine="pyarrow")
cad9 = pd.read_parquet("cad9.parquet.gzip", engine="pyarrow")

## Data Cleaning

In [4]:
###############################################################################
# 0. check the variables and nan's
# print(interventions_bxl.info())
print(interventions_bxl.columns)

###############################################################################
# 1. select potentially useful variables
# keys = ["postalcode_intervention", "latitude_intervention", "longitude_intervention",
#         "eventtype_firstcall", "eventLevel_firstcall", "eventtype_trip",
#         "eventlevel_trip", "t0", "t3", "waiting_time"]
keys = ["postalcode_intervention", "latitude_intervention", "longitude_intervention",
        "eventtype_firstcall", "eventLevel_firstcall", "eventtype_trip",
        "eventlevel_trip", "waiting_time"]
interventions_bxl_1 = interventions_bxl[keys]

###############################################################################
# 2. drop data with nan's
interventions_bxl_2 = interventions_bxl_1.dropna()

###############################################################################
# 3. check event type
# 999 cases' event type changed on trip compared with on first call
# print((interventions_bxl_2["eventtype_firstcall"] != \
#        interventions_bxl_2["eventtype_trip"]).sum())

# related to heart: "P011 - Chest pain", "P039 - Cardiac problem
#  (other than thoracic pain)", "P003 - Cardiac arrest"
# need AED: "P003 - Cardiac arrest"
# print(interventions_bxl_2["eventtype_firstcall"].unique())
heart_related = ["P011 - Chest pain",
                 "P039 - Cardiac problem (other than thoracic pain)",
                 "P003 - Cardiac arrest"]
interventions_bxl_3 = interventions_bxl_2[interventions_bxl_2\
                      ["eventtype_firstcall"].isin(heart_related)]
# print(interventions_bxl_3) # 6358 × 6

###############################################################################
# 4. compare event type and level at the first call and at the trip
# 78 changed event type & 490 changed event level
# use the ones of first call because people who pass by the patient may not be able to distinguish
# levels generally decreases
# print((interventions_bxl_3["eventtype_firstcall"] != \
#        interventions_bxl_3["eventtype_trip"]).sum())
# print((interventions_bxl_3["eventLevel_firstcall"] != \
#        interventions_bxl_3["eventlevel_trip"]).sum())
# print(interventions_bxl_3[["eventtype_firstcall", "eventtype_trip"]]\
#                           [interventions_bxl_3["eventtype_firstcall"] != \
#                            interventions_bxl_3["eventtype_trip"]])
# print(interventions_bxl_3[["eventLevel_firstcall", "eventlevel_trip"]]\
#                           [interventions_bxl_3["eventLevel_firstcall"] != \
#                            interventions_bxl_3["eventlevel_trip"]])

# check event levels
level = interventions_bxl_3["eventLevel_firstcall"].apply(lambda x: int(x[1:]))
interventions_bxl_4 = interventions_bxl_3.assign(level_num = level)
# print(interventions_bxl_4)

###############################################################################
# 5. delete outliers in waiting_time
# define the function to delete outliers in a numeric variable
def delete_outliers(dataframe, column_name, threshold = 2):
  # data is a column of dataframe
  data = dataframe[column_name]
  m = data.mean()
  sd  = data.std()
  return dataframe[(data - m).abs() < threshold * sd]

# some waiting time is more than 20 hours, 1 is 999999999.0, which do not make sense
interventions_bxl_5 = interventions_bxl_4[interventions_bxl_4["waiting_time"] < 200]
# print(interventions_bxl_5)

###############################################################################
# 6. delete outliers in latitude & longtitude
# brussels: 50.8467°N 4.3525°E
interventions_bxl_6 = interventions_bxl_5[\
                      (interventions_bxl_5["latitude_intervention"] > 4500000) &\
                      (interventions_bxl_5["latitude_intervention"] < 5500000) &\
                      (interventions_bxl_5["longitude_intervention"] > 400000) &\
                      (interventions_bxl_5["longitude_intervention"] < 500000)]
# print(interventions_bxl_6) # 5156 x 9

###############################################################################
# 7. change the scale of latitude & longtitude
interventions_bxl_7 = interventions_bxl_6
interventions_bxl_7["latitude_intervention"] = \
                   interventions_bxl_6["latitude_intervention"].div(100000)
interventions_bxl_7["longitude_intervention"] = \
                   interventions_bxl_6["longitude_intervention"].div(100000)
print(interventions_bxl_7)

Index(['mission_id', 'service_name', 'postalcode_permanence',
       'cityname_permanence', 'streetname_permanence',
       'housenumber_permanence', 'latitude_permanence', 'longitude_permanence',
       'permanence_short_name', 'permanence_long_name', 'vector_type',
       'eventtype_firstcall', 'eventLevel_firstcall', 'eventtype_trip',
       'eventlevel_trip', 'postalcode_intervention', 'cityname_intervention',
       'latitude_intervention', 'longitude_intervention', 't0', 't1',
       't1confirmed', 't2', 't3', 't4', 't5', 't6', 't7', 't9',
       'intervention_time_t1reported', 'waiting_time', 'intervention_duration',
       'departure_time_t1reported', 'unavailable_time',
       'name_destination_hospital', 'postalcode_destination_hospital',
       'cityname_destination_hospital', 'streetname_destination_hospital',
       'housenumber_destination_hospital', 'calculated_traveltime_departure_',
       'calculated_distance_departure_to', 'calculated_traveltime_destinatio',
       '

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interventions_bxl_7["latitude_intervention"] = \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interventions_bxl_7["longitude_intervention"] = \


## Exploratory Analysis

In [70]:
###############################################################################
# data used in the analysis
df = interventions_bxl_7

###############################################################################
# check correlation
# → no correlation between the level of the event and the waiting time
correlation = df["level_num"].corr(df["waiting_time"])
# print(correlation)

###############################################################################
# check different event type
# seems that different types do not differ
groupby_type = df.groupby("eventtype_firstcall")["waiting_time"].agg(["mean","std"])
# print(groupby_type)

###############################################################################
# see in the map
# create the center of the map
map_center = [df["latitude_intervention"].mean(), df["longitude_intervention"].mean()]

# create a Folium map
map_object = folium.Map(location = map_center, zoom_start = 12)

# add markers on the map
for index, row in df.iterrows():

    folium.CircleMarker(
        location = [row["latitude_intervention"], row["longitude_intervention"]],
        radius = 2,
        color = "blue",
        fill = True,
        fill_color = "blue"
    ).add_to(map_object)

# # save as html
# map_object.save("map_with_markers.html")
display(map_object)

## Try Network Analysis


In [23]:
###############################################################################
# data used in the analysis
df = interventions_bxl_7

# import packages
import osmnx as ox
import math
import matplotlib.pyplot as plt

###############################################################################
# obtain the road network
# method 1: use the map of brussels city
# but this may not fit the dataset above
# G_brussels = ox.graph_from_place("Brussels, Belgium", network_type = "walk")
# ox.plot_graph(G_brussels, node_size = 0, edge_color = "w", edge_linewidth = 0.2)

# method 2: draw a rectangle
max_lat = math.ceil(df["latitude_intervention"].max() * 1000) / 1000
min_lat = math.ceil(df["latitude_intervention"].min() * 1000) / 1000
max_lon = math.ceil(df["longitude_intervention"].max() * 1000) / 1000
min_lon = math.ceil(df["longitude_intervention"].min() * 1000) / 1000

# G_bbox = ox.graph_from_bbox(min_lat - 0.01, max_lat + 0.01,
#                             min_lon - 0.01, max_lon + 0.01,
#                             network_type = "walk")
# ox.plot_graph(G_bbox, node_size = 0, edge_color = "w", edge_linewidth = 0.2)

# save_variable(G_bbox, wd + save_dir + "/G_bbox.txt")
G_bbox = load_variable(wd + save_dir + "/G_bbox.txt")

###############################################################################
# draw locations of intervention on the map
# locations_intervention_nearest_nodes = ox.distance.nearest_nodes(G_bbox,
#        list(df["latitude_intervention"]), list(df["longitude_intervention"]))

fig, ax = ox.plot_graph(G_bbox, node_size = 0, edge_color = "w", edge_linewidth = 0.2)

# for node in locations_intervention_nearest_nodes:
#   x, y = G_bbox.nodes[node]["x"], G_bbox.nodes[node]["y"]
#   ax.scatter(x, y, c = "blue", s = 50, marker = "X", zorder = 5)

lat = list(df["latitude_intervention"])
lon = list(df["longitude_intervention"])
for i in range(len(lat)):
  ax.scatter(lat[i], lon[i], color = "red", s = 50, marker = "X", zorder = 5)

plt.show()

'/content/drive/MyDrive/Datathon/saved_data/G_bbox.txt'

# aed_location

## Data Cleaning

In [10]:
# save and load data
import pickle
save_dir = "/saved_data"
wd = "/content/drive/MyDrive/Datathon"

def save_variable(variable, filename):
  f = open(filename, "wb")
  pickle.dump(variable, f)
  f.close()
  return filename

def load_variable(filename):
  f = open(filename, "rb")
  variable = pickle.load(f)
  f.close()
  return variable

###############################################################################
# 0. check the variables and nan's
# print(aed_location.info())
# print(aed_location["address"])
# print(aed_location) # 15225 × 11

###############################################################################
# 1. only keep data in brussels
aed_location_1 = aed_location[aed_location["province"] == "Bruxelles-Brussel"]
# print(aed_location_1) # 2042 x 11

# print(aed_location_1["number"].isna().sum()) # 209 are nan's

###############################################################################
# 2. add the latitude and the longitude of corresponding address
from geopy.geocoders import Nominatim
import time

def find_location(address, number, province = "Bruxelles-Brussel",
                  country = "Belgium"):
  full_address = "{} {}, {}, {}".format(int(number), address, province, country)
  geolocator = Nominatim(user_agent = "my_app")
  location = geolocator.geocode(full_address, timeout=None)
  if location:
    return location.latitude, location.longitude
  return float("nan")

# calculating the latitude and the longitude data
# cost ~20 min...
# do not delete print()!
# lat_and_lon = []
# for i in range(aed_location_1.shape[0]):
#   address = list(aed_location_1["address"])[i]
#   number_raw = list(aed_location_1["number"])[i]
#   if pd.isna(number_raw):
#     lat_and_lon.append(float("nan"))
#     print("index = {}".format(i)) # slow the speed
#     continue
#   number = int(number_raw)
#   rst = find_location(address, number)
#   lat_and_lon.append(rst)
#   print("index = {}".format(i))

# save_variable(lat_and_lon, wd + save_dir + "/lat_and_lon.txt")
lat_and_lon = load_variable(wd + save_dir + "/lat_and_lon.txt")

latitude = []
longitude = []
for item in lat_and_lon:
  if pd.isna(item):
    latitude.append(float("nan"))
    longitude.append(float("nan"))
  else:
    latitude.append(item[0])
    longitude.append(item[1])

aed_location_2 = aed_location_1.assign(latitude = latitude,
                                       longitude = longitude)
# print(aed_location_2)

###############################################################################
# 3. only keep latitude & longitude data
keys = ["id", "postal_code", "latitude", "longitude"]
aed_location_3 = aed_location_2[keys]

###############################################################################
# 4. drop data with nan's
aed_location_4 = aed_location_3.dropna()
print(aed_location_4) # 1562 x 4

Unnamed: 0,id,type,address,number,postal_code,municipality,province,location,public,available,hours,latitude,longitude
11,86.0,,Schumanplein,11.0,1040.0,Brussel,Bruxelles-Brussel,,,,,50.842792,4.384351
14,96.0,,Graafschap - Jettelaan,2.0,1090.0,Brussel,Bruxelles-Brussel,,,,,50.882962,4.335248
15,97.0,,Sint-Pieterskerstraat,,1090.0,Brussel,Bruxelles-Brussel,,,,,,
16,98.0,,Wemmelsestweg,100.0,1090.0,Brussel,Bruxelles-Brussel,,,,,,
18,101.0,,Tweestationsstraat,80.0,1070.0,Brussel,Bruxelles-Brussel,,,,,50.831942,4.328980
...,...,...,...,...,...,...,...,...,...,...,...,...,...
15185,15100.0,Appareil fixe-Vast apparaat,Avenue Jean Van Horebeeck,196.0,1160.0,Brussel,Bruxelles-Brussel,,Oui-Ja,Non-Nee,,,
15192,16629.0,Appareil fixe-Vast apparaat,Avenue du Boulevard,21.0,1210.0,Saint-Josse-Ten-Noode,Bruxelles-Brussel,Bureau at 23rd floor,Non-Nee,Non-Nee,Monday to Friday from 8:00 to 18:00,50.855420,4.357981
15204,16643.0,Appareil fixe-Vast apparaat,Avenue Emile Gryson,1.0,1070.0,Bruxelles,Bruxelles-Brussel,,Oui-Ja,Non-Nee,,50.815393,4.294843
15213,16651.0,Appareil fixe-Vast apparaat,Antoon van Osslaan,,1120.0,Neder-over-Heembeek,Bruxelles-Brussel,Gelijkvloers,Non-Nee,Non-Nee,ma-vrij 8u-17u,,


In [18]:
import math

df = aed_location_2[["address", "number", "latitude", "longitude"]]
# x = list(df["latitude"])
# for i in range(1, len(x)):
#   if abs(x[i] - x[i - 1]) < 1e-8:
#     print("{}: ({}, {})".format(i, x[i], x[i-1]))
df.iloc[53:57, :]

Unnamed: 0,address,number,latitude,longitude
320,Bourgetlaan,42.0,50.877313,4.432468
321,Bourgetlaan,42.0,50.877313,4.432468
322,Bourgetlaan,42.0,50.877313,4.432468
327,Woluwedal,102.0,50.855719,4.444934


## Combine with the Intervention Dataset

In [34]:
import networkx as nx
# find the shortest distance to an AED device of each intervention location

# # choose dataframe
# df_intervention = interventions_bxl_7
# df_aed = aed_location_4

# # find nearest nodes for each location
# nearest_nodes_intervention = ox.distance.nearest_nodes(G_bbox,
#       list(df_intervention["latitude_intervention"]),
#       list(df_intervention["longitude_intervention"]))
# nearest_nodes_aed = ox.distance.nearest_nodes(G_bbox,
#       list(df_aed["latitude"]), list(df_aed["longitude"]))

# for each intervention location, find the nearst AED device
loc_test = df_intervention[["latitude_intervention", "longitude_intervention"]].iloc[0]
node_test = ox.distance.nearest_nodes(G_bbox, \
                list(df_intervention["latitude_intervention"])[110], \
                list(df_intervention["longitude_intervention"])[110])
target_test = ox.distance.nearest_nodes(G_bbox, \
                list(df_aed["latitude"])[20], \
                list(df_aed["longitude"])[20])
route_length = nx.shortest_path_length(G_bbox, source = node_test,
                        target = target_test, weight = "length")
print(node_test)
print(target_test)
print(route_length)

28        4.39229
44        4.35817
94        4.42592
95        4.42592
110       4.34743
           ...   
115427    4.37195
115428    4.37195
115449    4.43919
115451    4.37195
115499    4.43985
Name: longitude_intervention, Length: 5156, dtype: float64