# This notebook merges and validates the datasets for Google mobility prediction

In [2]:
import pandas as pd

In [3]:
###############
# Import datasets
###############

# State lookup table carrying (at least) the ISO 3166-2 code and FIPS columns.
# Rows coded US-DC (District of Columbia) are filtered out because DC is not a state.
state_mapping_table = pd.read_csv(
    '../Input_datasets/Mobility_flow_prediction_shared/us_states_w_fips_and_coordinates.csv'
).loc[lambda table: table["iso_3166_2_code"].ne("US-DC")]

# Translation dictionaries in both directions: ISO code -> FIPS and FIPS -> ISO code
iso_2_fips_mapper = {
    iso_code: fips
    for iso_code, fips in zip(state_mapping_table['iso_3166_2_code'], state_mapping_table['FIPS'])
}
fips_iso_2_mapper = {
    fips: iso_code
    for fips, iso_code in zip(state_mapping_table['FIPS'], state_mapping_table['iso_3166_2_code'])
}

# Google Mobility dataset (source of the prediction targets)
google_mobility_dataset = pd.read_csv(
    '../Input_datasets/Google_mobility_flow_prediction/Google_mobility_data.csv'
)

# US state edge list: the two JSON columns are labelled origin / destination
edge_list = pd.read_json(
    '../Input_datasets/Mobility_flow_prediction_shared/us_states_edge_list.json'
).set_axis(['origin', 'destination'], axis=1)
# Keep only the edges that do not touch FIPS=11 (US-DC, District of Columbia),
# since DC is not a state
edge_list = edge_list.loc[
    lambda edges: ~((edges['origin'] == 11) | (edges['destination'] == 11))
]

# Additional edge characteristics: state-to-state distance table
# (the first two lines of the file are skipped, the first column becomes the index)
us_state_distances = pd.read_csv(
    '../Input_datasets/Mobility_flow_prediction_shared/US_state_distances.csv',
    skiprows=2,
    index_col=0,
)
# Reuse the row labels as column labels so both axes can be looked up with the same keys
us_state_distances.columns = us_state_distances.index

# Node characteristics - US state population
# Columns are relabelled, then the "state" column is discarded so that FIPS
# is the only identifier left in this table.
us_state_pop = (
    pd.read_csv(
        '../Input_datasets/Mobility_flow_prediction_shared/US_state_pop_2019_census.csv',
        index_col=0,
    )
    .set_axis(["state", "population_2019", "population_density_2019", "FIPS"], axis=1)
    .drop(columns=["state"])
)

# Node characteristics - OpenStreetMap features
# Identifier columns that are not needed are removed and "state_short" is renamed
# to "iso_3166_2_code" so the table shares that column name with the state mapping table.
overpass_features = (
    pd.read_csv(
        '../Input_datasets/Mobility_flow_prediction_shared/overpass_features.csv',
        index_col=0,
    )
    .drop(columns=["overpass_id", "state"])
    .rename(columns={"state_short": "iso_3166_2_code"})
)


In [4]:
###############
# Create node_list data
###############
# Start from the state mapping table and left-join the node characteristics:
# population columns are matched on FIPS, OpenStreetMap features on the ISO code.
node_list = (
    state_mapping_table
    .merge(us_state_pop, on="FIPS", how='left')
    .merge(overpass_features, on="iso_3166_2_code", how='left')
)


###############
# Drop Alaska and Hawaii from node list
###############
node_list = node_list.loc[~node_list['iso_3166_2_code'].isin(['US-AK', 'US-HI'])]

###############
# Create edge_list data
###############
# Look up the distance of every edge while origin/destination still hold the
# numeric codes used as labels of us_state_distances. A plain list is built
# (instead of a row-wise DataFrame.apply) so the assignment also works when the
# edge list happens to be empty.
edge_list["distances"] = [
    us_state_distances.loc[origin_code, destination_code]
    for origin_code, destination_code in zip(edge_list["origin"], edge_list["destination"])
]
# Translate FIPS codes into ISO 3166-2 codes; an unknown FIPS code raises a
# KeyError on purpose rather than silently producing a missing value.
edge_list['origin'] = edge_list['origin'].astype('int').apply(lambda x: fips_iso_2_mapper[x])
edge_list['destination'] = edge_list['destination'].astype('int').apply(lambda x: fips_iso_2_mapper[x])

###############
# Drop edges where origin = destination
###############

edge_list = edge_list[edge_list['origin'] != edge_list['destination']]

###############
# Drop edges touching Alaska or Hawaii
###############
# Both states are removed from the node list and the node target list, so any
# edge that references them would point at a node that no longer exists.
edge_list = edge_list[
    ~(edge_list['origin'].isin(['US-AK', 'US-HI'])
      | edge_list['destination'].isin(['US-AK', 'US-HI']))
]

###############
# Create node_target_list data
###############
# Remove the descriptive region columns; rows stay identified by iso_3166_2_code.
node_target_list = google_mobility_dataset.drop(columns=["country_region_code", "country_region", "sub_region_1"])

###############
# Drop Alaska, Hawaii and District of Columbia from node target list
###############
# Alaska and Hawaii are excluded from the node list. US-DC is excluded as well
# because it is removed from the state mapping table (and therefore the node
# list) and from the edge list; keeping its rows here would leave targets
# without a matching node.
node_target_list = node_target_list[
    ~node_target_list['iso_3166_2_code'].isin(['US-AK', 'US-HI', 'US-DC'])
]

###############
# Save to csv
###############
# Each table is written with pandas' default settings, so the DataFrame index
# ends up as the first column of the file.
node_list.to_csv(
    path_or_buf="../Output_datasets/Google_mobility_flow_prediction/node_list.csv"
)
edge_list.to_csv(
    path_or_buf="../Output_datasets/Google_mobility_flow_prediction/edge_list.csv"
)
node_target_list.to_csv(
    path_or_buf="../Output_datasets/Google_mobility_flow_prediction/node_target_list.csv"
)