# Uber Nairobi Ambulance Perambulation Challenge
The aim of this challenge is to place six virtual ambulances around the city of Nairobi, moving them around throughout the day with the goal of minimising the distance travelled when responding to crashes during the test period.





### Setup



In [None]:
!pip install --upgrade fastcore -q
!pip install --upgrade fastai -q
!pip install geopandas

In [209]:
from fastai.vision.all import * # Needs latest version, and sometimes a restart of the runtime after the pip installs
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.cluster import MiniBatchKMeans
import time
import torch.optim as optim

In [None]:
# Mount Google Drive (this cell only works inside Google Colab) and make the
# challenge folder the working directory, so that the relative paths used
# later in the notebook (e.g. 'Data/Train.csv') resolve against it.
from google.colab import drive
drive.mount('/content/drive')
%cd "/content/drive/MyDrive/DataScience Challenges/uber-nairobi-ambulance-perambulation"

### Load the Crash Data

In [211]:
# Read the crash records; the 'datetime' column is parsed into timestamps so
# it can be compared against dates further down. The path is relative to the
# current working directory.
df = pd.read_csv('Data/Train.csv', parse_dates=['datetime'])
df.head()

Unnamed: 0,uid,datetime,latitude,longitude
0,1,2018-01-01 00:25:46,-1.18885,36.931382
1,2,2018-01-01 02:02:39,-0.662939,37.20873
2,3,2018-01-01 02:31:49,-0.662939,37.20873
3,4,2018-01-01 03:04:01,-1.288087,36.826583
4,5,2018-01-01 03:58:49,-1.18885,36.931382


### Standardize features by removing the mean and scaling to unit variance



In [212]:
# Standardize the crash coordinates (column 0 = latitude, column 1 = longitude).
# The fitted `scaler` is kept because it is needed later to map the optimised
# ambulance positions back to latitude/longitude with inverse_transform.
# NOTE(review): StandardScaler scales each column by its own standard deviation,
# so distances measured in this scaled space are not the raw lat/lon Euclidean
# distances that score() sums — confirm this mismatch is acceptable.
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler()
data_sc = scaler.fit_transform(df[['latitude', 'longitude']].values)

In [213]:
def loss_fn(crash_locs, amb_locs):
  """Mean distance from each crash to its closest ambulance.

  Args:
    crash_locs: 2-D tensor with one row per crash; columns 0 and 1 are the
      two coordinates used for the distance.
    amb_locs: 2-D tensor with one row per ambulance, same column layout.
      Any number of ambulances (at least one) is supported.

  Returns:
    A scalar tensor: the mean over crashes of the Euclidean distance to the
    nearest ambulance. It stays attached to the autograd graph of `amb_locs`.
  """
  def _dists_to(amb_loc):
    # Euclidean distance from every crash to a single ambulance position.
    offsets = crash_locs - amb_loc
    return (offsets[:,0]**2 + offsets[:,1]**2)**0.5

  # Start with the first ambulance, then keep a running elementwise minimum
  # over the remaining ones. The count comes from amb_locs itself rather than
  # being fixed at six, so extra rows are no longer ignored and fewer rows no
  # longer raise an IndexError.
  min_dists = _dists_to(amb_locs[0])
  for i in range(1, amb_locs.shape[0]):
    min_dists = torch.min(min_dists, _dists_to(amb_locs[i]))

  return min_dists.mean()

### Score function

In [214]:
def score(sub, ref):
    """Sum, over all reference crashes, of the distance to the nearest ambulance.

    Args:
        sub: DataFrame with a 'date' column and, for k in 0..5, the columns
            'A{k}_Latitude' and 'A{k}_Longitude'.
        ref: DataFrame with 'datetime', 'latitude' and 'longitude' columns,
            one row per crash.

    Returns:
        The accumulated Euclidean distance (in raw coordinate units) between
        each crash and the closest of the six ambulances placed in the last
        `sub` row dated strictly before the crash.
    """
    lat_cols = [f'A{k}_Latitude' for k in range(6)]
    lon_cols = [f'A{k}_Longitude' for k in range(6)]

    total_distance = 0
    for crash_time, crash_lat, crash_lon in ref[['datetime', 'latitude', 'longitude']].values:
        # The placement in force is the latest row strictly before the crash
        # (hence .tail(1), not .head(1)).
        placement = sub.loc[sub.date < crash_time].tail(1)
        nearest = min(
            ((crash_lat - placement[lat].values[0])**2+(crash_lon - placement[lon].values[0])**2)**0.5
            for lat, lon in zip(lat_cols, lon_cols)
        )
        total_distance += nearest
    return total_distance

### Finding locations
- First, I use the MiniBatchKMeans algorithm to initialize the six ambulance positions

- Then, I use the L-BFGS optimizer to refine these positions by minimizing the mean distance from each crash to its nearest ambulance

In [215]:
def model_lbfgs_minikmeans(crash_locs, n_iter=1000, lr=1e-1, store_every=100, verbose=1):
  """Place six ambulances: MiniBatchKMeans initialisation refined with L-BFGS.

  Args:
    crash_locs: 2-D tensor of crash coordinates, one row per crash. It is
      passed both to MiniBatchKMeans.fit and to loss_fn.
    n_iter: number of optimizer.step calls to perform.
    lr: learning rate handed to torch.optim.LBFGS.
    store_every: when verbose is 1, the current loss is printed every
      `store_every` iterations (starting with iteration 0).
    verbose: set to 1 to print the loss; any other value prints nothing.

  Returns:
    A NumPy array with one row per ambulance (six rows) holding the optimised
    positions, in the same coordinate space as `crash_locs`.
  """
  # Seed the six positions with the cluster centres of a quick k-means fit.
  minikmeans = MiniBatchKMeans(n_clusters=6, max_iter=10, batch_size=6, random_state=44, max_no_improvement=15, reassignment_ratio=5e-9)
  model = minikmeans.fit(crash_locs)
  amb_locs = torch.tensor(model.cluster_centers_)
  amb_locs.requires_grad_()

  optimizer = optim.LBFGS([amb_locs], lr=lr)

  def closure():
    # LBFGS may re-evaluate the objective several times per step, so the
    # forward/backward pass has to be supplied as a callable.
    optimizer.zero_grad()
    loss = loss_fn(crash_locs, amb_locs)
    loss.backward()
    return loss

  for i in range(n_iter):
    # Evaluate the loss for logging only on the iterations that print it, and
    # without building an autograd graph; the optimisation itself uses closure.
    if verbose == 1 and i % store_every == 0:
      with torch.no_grad():
        print('Loss:', loss_fn(crash_locs, amb_locs).item()) # Print loss

    optimizer.step(closure)

  return amb_locs.detach().numpy()

# Turn the standardized crash coordinates into a tensor and optimise the six
# ambulance positions on the full data set. Note that lr=1e-3 is passed
# explicitly here instead of the function's default of 1e-1.
# `tensor` is presumably provided by the fastai star import above — confirm.
crash_locs = tensor(data_sc)
amb_locs = model_lbfgs_minikmeans(crash_locs, n_iter=1000, lr=1e-3, store_every=100, verbose=1)


Loss: 0.45005287116803316
Loss: 0.43066289889319825
Loss: 0.43048519200161456
Loss: 0.43035873406787406
Loss: 0.4302573804222081
Loss: 0.43017851405188534
Loss: 0.43010922973985694
Loss: 0.43004548528128533
Loss: 0.4299972117867011
Loss: 0.42995518817910655


### Local score

In [216]:
reference = df.loc[df.datetime > '2019-01-01'] # Using 2019 as our test set
# One row per 3-hour slot; the same six positions are used for every slot.
dates = pd.date_range('2019-01-01', '2020-01-01', freq='3h')
sub = pd.DataFrame({
    'date':dates
})

# Map the optimised positions back from standardized space to latitude/longitude
# once, instead of repeating the inverse transform for every column.
amb_latlon = scaler.inverse_transform(amb_locs)
for ambulance in range(6):
    sub[f'A{ambulance}_Latitude'] = amb_latlon[ambulance, 0]
    sub[f'A{ambulance}_Longitude'] = amb_latlon[ambulance, 1]

score(sub, reference)

91.30645429095912

### Submission

In [217]:
# Load the submission template; its 'date' column is parsed into timestamps.
# The ambulance coordinate columns are overwritten in the next cell.
ss = pd.read_csv('Data/SampleSubmission.csv', parse_dates=['date'])
ss.head()

Unnamed: 0,date,A0_Latitude,A0_Longitude,A1_Latitude,A1_Longitude,A2_Latitude,A2_Longitude,A3_Latitude,A3_Longitude,A4_Latitude,A4_Longitude,A5_Latitude,A5_Longitude
0,2019-07-01 00:00:00,0,0,0,0,0,0,0,0,0,0,0,0
1,2019-07-01 03:00:00,0,0,0,0,0,0,0,0,0,0,0,0
2,2019-07-01 06:00:00,0,0,0,0,0,0,0,0,0,0,0,0
3,2019-07-01 09:00:00,0,0,0,0,0,0,0,0,0,0,0,0
4,2019-07-01 12:00:00,0,0,0,0,0,0,0,0,0,0,0,0


In [218]:
# Map the optimised positions back from standardized space to latitude/longitude
# once, then broadcast each ambulance's fixed position to every row of the template.
amb_latlon = scaler.inverse_transform(amb_locs)
for ambulance in range(6):
    ss[f'A{ambulance}_Latitude'] = amb_latlon[ambulance, 0]
    ss[f'A{ambulance}_Longitude'] = amb_latlon[ambulance, 1]
ss.head()

Unnamed: 0,date,A0_Latitude,A0_Longitude,A1_Latitude,A1_Longitude,A2_Latitude,A2_Longitude,A3_Latitude,A3_Longitude,A4_Latitude,A4_Longitude,A5_Latitude,A5_Longitude
0,2019-07-01 00:00:00,-1.287253,36.821733,-1.224755,36.885941,-1.263419,36.741914,-1.490986,37.057649,-1.330657,36.885212,-1.109947,37.010332
1,2019-07-01 03:00:00,-1.287253,36.821733,-1.224755,36.885941,-1.263419,36.741914,-1.490986,37.057649,-1.330657,36.885212,-1.109947,37.010332
2,2019-07-01 06:00:00,-1.287253,36.821733,-1.224755,36.885941,-1.263419,36.741914,-1.490986,37.057649,-1.330657,36.885212,-1.109947,37.010332
3,2019-07-01 09:00:00,-1.287253,36.821733,-1.224755,36.885941,-1.263419,36.741914,-1.490986,37.057649,-1.330657,36.885212,-1.109947,37.010332
4,2019-07-01 12:00:00,-1.287253,36.821733,-1.224755,36.885941,-1.263419,36.741914,-1.490986,37.057649,-1.330657,36.885212,-1.109947,37.010332


In [207]:
# Write the filled-in submission to the current working directory, leaving
# out the DataFrame index column.
ss.to_csv('submission.csv', index=False)