# This notebook merges and validates the datasets for Google mobility prediction

In [42]:
import pandas as pd

In [43]:
###############
# Import datasets
###############

# Lookup table pairing each state's ISO 3166-2 code with its FIPS code
state_mapping_table = pd.read_csv('../Input_datasets/Mobility_flow_prediction_shared/us_states_w_fips_and_coordinates.csv')
# US-DC (District of Columbia) is not a state, so it is excluded
state_mapping_table = state_mapping_table.loc[state_mapping_table["iso_3166_2_code"].ne("US-DC")]
# Translation dictionaries in both directions (ISO -> FIPS and FIPS -> ISO)
iso_2_fips_mapper = dict(zip(state_mapping_table['iso_3166_2_code'], state_mapping_table['FIPS']))
fips_iso_2_mapper = dict(zip(state_mapping_table['FIPS'], state_mapping_table['iso_3166_2_code']))

# GeoDS state-to-state mobility flows
geods_mobility_dataset = pd.read_csv('../Input_datasets/GeoDS_mobility_flow_prediction/state2state_merged.csv')
# Remove every flow that starts or ends in FIPS 11 (US-DC) or FIPS 72 (Puerto Rico)
non_state_fips = [11, 72]
touches_non_state = (geods_mobility_dataset['geoid_o'].isin(non_state_fips)
                     | geods_mobility_dataset['geoid_d'].isin(non_state_fips))
geods_mobility_dataset = geods_mobility_dataset.loc[~touches_non_state]

# Edge list between US states
edge_list = pd.read_json('../Input_datasets/Mobility_flow_prediction_shared/us_states_edge_list.json')
edge_list.columns = ['origin', 'destination']
# Keep only edges where neither endpoint is FIPS=11 (US-DC, not a state)
edge_list = edge_list.loc[edge_list[['origin', 'destination']].ne(11).all(axis=1)]

# Edge characteristics - pairwise state distances
# (the first two lines of the file are skipped; the first column becomes the index)
us_state_distances = pd.read_csv('../Input_datasets/Mobility_flow_prediction_shared/US_state_distances.csv', skiprows=2, index_col=0)
# Reuse the row labels as column labels so the table can be addressed as [row, column] with the same keys
us_state_distances.columns = us_state_distances.index

# Node characteristics - US state population
us_state_pop = pd.read_csv('../Input_datasets/Mobility_flow_prediction_shared/US_state_pop_2019_census.csv', index_col=0)
us_state_pop.columns = ["state", "population_2019", "population_density_2019", "FIPS"]
# The state name is not needed; FIPS is the join key
us_state_pop = us_state_pop.drop(columns="state")

# Node characteristics - OpenStreetMap features
overpass_features = (
    pd.read_csv('../Input_datasets/Mobility_flow_prediction_shared/overpass_features.csv', index_col=0)
    .drop(columns=["overpass_id", "state"])
    # Align the key column name with the state mapping table
    .rename(columns={"state_short": "iso_3166_2_code"})
)


In [44]:
###############
# Create node_list data
###############
# Start from the state mapping table and left-join the node attributes:
# population figures are matched on FIPS, OpenStreetMap features on the ISO code.
node_list = (
    state_mapping_table
    .merge(us_state_pop, how='left', on="FIPS")
    .merge(overpass_features, how='left', on="iso_3166_2_code")
)

###############
# Create edge_list data
###############
def _edge_distance(edge):
    """Return the distance-table entry for one edge's (origin, destination) pair."""
    return us_state_distances.loc[edge["origin"], edge["destination"]]


# The distance lookup uses the raw endpoint values, so it has to run before
# the endpoints are recoded from FIPS to ISO 3166-2 below.
edge_list["distances"] = edge_list.apply(_edge_distance, axis=1)

# Recode both endpoints from FIPS to ISO 3166-2; an unknown FIPS raises KeyError
for endpoint in ('origin', 'destination'):
    edge_list[endpoint] = edge_list[endpoint].astype('int').map(lambda fips: fips_iso_2_mapper[fips])

###############
# Create edge_target_list data
###############
# Work on a copy so the loaded mobility dataset itself stays untouched
edge_target_list = geods_mobility_dataset.copy()

# Recode both id columns from FIPS to ISO 3166-2; an unknown FIPS raises KeyError
for fips_column in ('geoid_o', 'geoid_d'):
    edge_target_list[fips_column] = edge_target_list[fips_column].astype('int').map(lambda fips: fips_iso_2_mapper[fips])

# Use the same endpoint column names as edge_list
edge_target_list = edge_target_list.rename(columns={"geoid_o": "origin", "geoid_d": "destination"})

###############
# Validate mobility data
###############

# Collect the states that appear as flow origins and as flow destinations
origin_set = set(edge_target_list.origin.unique())
destination_set = set(edge_target_list.destination.unique())

# The two sets must be identical. Comparing them directly reports a mismatch
# when EITHER side contains a state the other lacks; the previous
# `(a - b != set()) & (b - a != set())` form only fired when both sides had
# extras at the same time, so a one-sided mismatch went unnoticed.
if origin_set != destination_set:
    print('Number of partners and reporters do no match!')

# Expected observations: one row per (origin, destination, start_date) combination
periods = set(edge_target_list.start_date.unique())
all_pairs = {
    (origin, destination, period)
    for origin in origin_set
    for destination in destination_set
    for period in periods
}
# Observations actually present in the data
real_pairs = set(edge_target_list[['origin', 'destination', 'start_date']].itertuples(index=False, name=None))

# all_pairs is built from the values found in the data, so real_pairs is always
# a subset of it and `real_pairs - all_pairs` is always empty. The previous
# `&` condition therefore could never be true; a direct comparison flags any
# missing combination.
if all_pairs != real_pairs:
    print('Number of expected and real observations do no match!')

###############
# Save to csv
###############
# Each frame is written to its own file (default to_csv settings, so the index is included)
for frame, csv_path in (
    (node_list, "../Output_datasets/GeoDS_mobility_flow_prediction/node_list.csv"),
    (edge_list, "../Output_datasets/GeoDS_mobility_flow_prediction/edge_list.csv"),
    (edge_target_list, "../Output_datasets/GeoDS_mobility_flow_prediction/edge_target_list.csv"),
):
    frame.to_csv(csv_path)