In [None]:
# Packages
import os
import re
import time
import json
import pprint
import unidecode
import googlemaps
import urllib.request, json


import pandas as pd
import numpy as np
import seaborn as sns
import datetime as dt
import matplotlib.pyplot as plt


from pandas import Series, DataFrame

# Data Preprocessing

1) First we import Cajamarca's health data.

In [None]:
## Load the Cajamarca health-establishment workbook and preview the first rows
health_places_path = r'../_data/cajamarca_data_helth_estb.xlsx'
health_places = pd.read_excel(health_places_path)
health_places.head()

2) Then we import the district centroid data and keep only the latitude, longitude and UBIGEO columns, which are needed to merge it with the health data.

In [None]:
# Columns kept from the centroid workbook: the merge key plus the coordinates
centroid_columns = ["UBIGEO", "Centroid_Latitude", "Centroid_Longitude"]

centroids_raw = pd.read_excel(r'../_data/peru_districts_centroids.xlsx')
centroids = centroids_raw[centroid_columns]
centroids

3) Now we merge the health data with the corresponding district centroid using UBIGEO.

In [None]:
## Attach the district centroid to every row of the health data
dist_data = pd.merge(
    health_places,       # left frame: kept in full
    centroids,           # right frame: supplies the centroid coordinates
    how="left",          # keep every left row, fill in matches from the right
    on="UBIGEO",         # shared key column
    validate="m:1",      # raise if UBIGEO is duplicated in the right frame
)


def _lat_lng(frame, lat_col, lng_col):
    """Return a Series of 'lat,lng' strings built from two columns of `frame`."""
    return frame[lat_col].astype(str) + ',' + frame[lng_col].astype(str)


# Coordinate strings: the establishment itself and its district centroid
dist_data["origin"] = _lat_lng(dist_data, "latitud", "longitud")
dist_data["destination"] = _lat_lng(dist_data, "Centroid_Latitude", "Centroid_Longitude")

# Preview of the merged data
print(dist_data.shape)
dist_data.head()

# Distance from health place to centroid

For this we use the Euclidean distance, computed directly on the latitude/longitude values of the establishment and the centroid.

In [None]:
# Euclidean distance between each establishment and its district centroid,
# expressed in the same units as the coordinate columns.
lat_gap = dist_data["latitud"] - dist_data["Centroid_Latitude"]
lon_gap = dist_data["longitud"] - dist_data["Centroid_Longitude"]

dist_data["distance"] = (lat_gap**2 + lon_gap**2)**(1/2)
dist_data

Closest health establishment to each Cajamarca district centroid

In [None]:
# For every district, select the single row with the smallest distance to the
# centroid. Calling .min() on both columns reduces each column independently,
# pairing the minimum distance with the alphabetically first name, which is
# not necessarily the closest establishment.
# Rows without a distance are dropped first so idxmin always has a valid
# candidate; districts with no valid distance therefore do not appear.
closest_rows = (
    dist_data
    .dropna(subset=['distance'])
    .groupby('NOMBDIST')['distance']
    .idxmin()
)
dist_data.loc[closest_rows, ['NOMBDIST', 'distance', 'nombre']].set_index('NOMBDIST')

# Google Directions API

We generate some helper objects before calling the Directions API.

In [None]:
# Expose the row index as a regular column named "idx"
dist_data = dist_data.assign(idx=dist_data.index)
# Preview of the data
dist_data

In [None]:
# Plain Python lists consumed by the request loop: row ids plus the
# origin / destination coordinate strings, all in the same row order.
comb_idx = dist_data.index.tolist()
orig, dest = (dist_data[column].tolist() for column in ('origin', 'destination'))

# Container for the raw per-row API answers
data_distance = dict()

Calling the Directions API

In [None]:
# Query the Google Directions API for every origin/destination pair under each
# traffic model and collect the numeric results.
#
# Results (same names and layout as before):
#   traf_mod_dict[model] -> float array with one row per pair and the columns
#       [idx, distance_km, distance_m, duration_min, duration_s,
#        duration_in_traffic_min, duration_in_traffic_s]
#       Rows whose request failed hold NaN in every column except idx.
#   data_distance[idx]   -> the raw 'distance', 'duration' and
#       'duration_in_traffic' entries of the most recent successful request.

# Google Maps Directions API endpoint
endpoint = 'https://maps.googleapis.com/maps/api/directions/json?'

## Fixed parameters
# NOTE(review): 1643058000 is 2022-01-24 21:00 UTC. The Directions API
# documentation states departure_time must not be in the past, so this value
# presumably needs updating before a re-run -- confirm.
departure_time = '1643058000'
mode = 'driving'
region = 'PE'

# Credentials must never be committed with the notebook: the key is read from
# the environment and the cell fails immediately (KeyError) when it is not set.
api_key = os.environ['GOOGLE_MAPS_API_KEY']

traf_mod = ['best_guess', 'optimistic', 'pessimistic']
traf_mod_dict = {}
my_keys = ['distance', 'duration', 'duration_in_traffic']

for mod in traf_mod:
    # Start from NaN so a failed request simply leaves its row missing
    distance_info = np.full((len(comb_idx), 7), np.nan, dtype=float)

    for i, (c, o, d) in enumerate(zip(comb_idx, orig, dest)):
        distance_info[i][0] = c
        try:
            # Build the request URL
            nav_request = 'origin={}&destination={}&departure_time={}&traffic_model={}&mode={}&region={}&key={}'.format(
                o, d, departure_time, mod, mode, region, api_key)
            request = endpoint + nav_request

            # Send the request; the context manager closes the connection
            with urllib.request.urlopen(request) as http_response:
                response = http_response.read()

            # Load the response as JSON
            directions = json.loads(response)

            # Surface the API status instead of an opaque IndexError on 'routes'
            status = directions.get('status')
            if status != 'OK':
                raise RuntimeError('Directions API status: {}'.format(status))

            legs = directions['routes'][0]['legs'][0]
            info = {my_key: legs[my_key] for my_key in my_keys}

            # Derive kilometres and minutes from the numeric 'value' fields
            # (metres / seconds). Stripping non-digits from the display text
            # turns "1 hour 5 mins" into 15 and reads "350 m" as 350 km.
            distance_info[i][1:] = [
                info['distance']['value'] / 1000,
                info['distance']['value'],
                info['duration']['value'] / 60,
                info['duration']['value'],
                info['duration_in_traffic']['value'] / 60,
                info['duration_in_traffic']['value'],
            ]

            data_distance[c] = info

        except Exception as e:
            # Best effort: keep going, mark the row as missing and report why
            distance_info[i][1:] = np.nan
            print(mod, c, type(e).__name__, e)

    traf_mod_dict[mod] = distance_info

Final data form

In [None]:
## One data frame per traffic model, all sharing the same column layout
api_columns = ['Combination', 'Distance_Km', 'Distance_meters', 'Duration_min',
               'Duration_seconds', 'Duration_min_traf', 'Duration_seconds_traf']

dist_best, dist_opti, dist_pesi = (
    pd.DataFrame(traf_mod_dict[model], columns=api_columns)
    for model in ('best_guess', 'optimistic', 'pessimistic')
)

## Combined data set: row id, then traffic-aware travel time and distance
## for the best-guess, pessimistic and optimistic models (in that order)
headers = ['idx',
           'travel_time_best_guess', 'travel_time_pessimistic', 'travel_time_optimistic',
           'travel_distance_best_guess', 'travel_distance_pessimistic', 'travel_distance_optimistic']

model_frames = (dist_best, dist_pesi, dist_opti)
data_dist = (
    [dist_best['Combination']]
    + [frame['Duration_seconds_traf'] for frame in model_frames]
    + [frame['Distance_Km'] for frame in model_frames]
)

# Each Series becomes a row; transposing turns them back into columns
dist_api = pd.DataFrame(data_dist).T
dist_api.columns = headers

# Uncomment to save the Directions API results
#dist_api.to_csv( r'./group_5_ass_7_apidir.csv')
dist_api

The Google Directions API process was executed once and took about 2 hours 30 minutes to finish. We stored the results in a CSV file so that they can be loaded directly instead of running the process again.

In [None]:
# Uncomment to load the previously saved Directions API results
#dist_api = pd.read_csv( r'./group_5_ass_7_apidir.csv')
#dist_api

We combine the Cajamarca health data with the results of the Directions API.

In [None]:
## Join the API results onto the district data and drop helper columns
unused_columns = ['IDPROV', 'CODIGO', 'CNT_CCPP', 'DESCRIPCIO', 'origin', 'destination']
results = (
    dist_data
    .merge(dist_api, on='idx', how='left', validate='m:1')
    .drop(columns=unused_columns)
)

# Travel time in minutes (each travel-time column is divided by 60)
travel_time_columns = ["travel_time_best_guess", "travel_time_pessimistic", "travel_time_optimistic"]
results[travel_time_columns] = results[travel_time_columns] / 60
results

# Graphs from Google Directions

Histograms of travel time

In [None]:
# Long-format travel times: one row per (province, traffic model) observation
travel_time_labels = {
    'NOMBPROV': 'PROV',
    'travel_time_best_guess': 'best_guess',
    'travel_time_pessimistic': 'pessimistic',
    'travel_time_optimistic': 'optimistic',
}
df1_2plot = (
    results[list(travel_time_labels)]
    .rename(columns=travel_time_labels)
    .melt(id_vars=["PROV"])
)
df1_2plot

In [None]:
# One histogram panel per province, with bars coloured by traffic model.
# map_dataframe passes each facet its own subset of rows, so `hue` is resolved
# by column name within that subset. Passing the full-length `variable` Series
# through .map only works through implicit index alignment against the
# per-facet x values.
figure1 = sns.FacetGrid(df1_2plot, col="PROV", col_wrap=4)
figure1.map_dataframe(sns.histplot, x='value', hue='variable')
figure1.set_axis_labels("Travel time (minutes)", "Count")

Histograms of travel distance

In [None]:
# Long-format travel distances: one row per (province, traffic model) observation
travel_distance_labels = {
    'NOMBPROV': 'PROV',
    'travel_distance_best_guess': 'best_guess',
    'travel_distance_pessimistic': 'pessimistic',
    'travel_distance_optimistic': 'optimistic',
}
df2_2plot = (
    results[list(travel_distance_labels)]
    .rename(columns=travel_distance_labels)
    .melt(id_vars=["PROV"])
)
df2_2plot

In [None]:
# One histogram panel per province, with bars coloured by traffic model.
# map_dataframe passes each facet its own subset of rows, so `hue` is resolved
# by column name within that subset. Passing the full-length `variable` Series
# through .map only works through implicit index alignment against the
# per-facet x values.
figure2 = sns.FacetGrid(df2_2plot, col="PROV", col_wrap=4)
figure2.map_dataframe(sns.histplot, x='value', hue='variable')
figure2.set_axis_labels("Travel distance", "Count")

Data for the bar plot of mean values

In [None]:
# Sub data from travel time
time_2plot    = results.groupby(['NOMBPROV'])[['travel_time_best_guess',
                                               'travel_time_pessimistic',
                                               'travel_time_optimistic']].mean()
time_2plot.columns = ['best_guess','pessimistic','optimistic']
time_2plot["type"] = 'travel_time'

# Sub data from travel distance
distance_2plot = results.groupby(['NOMBPROV'])[['travel_distance_best_guess',
                                                'travel_distance_pessimistic',
                                                'travel_distance_optimistic']].mean()
distance_2plot.columns = ['best_guess','pessimistic','optimistic']
distance_2plot["type"] = 'travel_distance'

# time and distance
time_dist = pd.concat([time_2plot, distance_2plot])
time_dist["PROV"] = time_dist.index
time_dist.reset_index(drop=True,inplace=True)
time_dist_2plot = time_dist.melt(id_vars=["type", "PROV"])
time_dist_2plot

In [None]:
# Bar chart of the province means: one panel per measure ("type"),
# one bar colour per traffic model ("variable").
ag = sns.catplot(
    data=time_dist_2plot,
    kind="bar",
    x="PROV",
    y="value",
    hue="variable",
    col="type",
    height=10,
    aspect=1,
)

# Label the x axis, leave the y label empty and tilt the province names
ag.set_axis_labels("API results by province", "")
ag.set_xticklabels(rotation=30)