In [None]:
import os
import sys
# Put the directory two levels above the working directory on the import path
# so that the project package imported below can be resolved.
module_path = os.path.abspath('../..')
if module_path not in sys.path:
    sys.path.append(module_path)
# USE_PYGEOS='0' switches the PyGEOS backend of geopandas off; it is set here,
# ahead of the geopandas import further down.
os.environ['USE_PYGEOS'] = '0'

import pickle
import matplotlib.pyplot as plt
import geopandas as gpd
import networkx as nx
from tqdm.auto import tqdm

from gensit.utils import *
from gensit.utils.notebook_functions import *

%matplotlib inline

# AUTO RELOAD EXTERNAL MODULES
%load_ext autoreload
%autoreload 2

# Dataset summaries

In [None]:
# Print, for every benchmark dataset, the table dimensions and total together
# with the sizes of the zero / train / test / validation cell sets.
all_datasets = ['DC','Chi','LA','NYC','San','Sea']

# The file names are the same for every dataset; only the directory differs.
ds_table_filename = 'od_2015.npy'
ds_train_index_filename = 'train_index.pkl'
ds_test_index_filename = 'test_index.pkl'
ds_validation_index_filename = 'valid_index.pkl'

for ds in all_datasets:
    base_path = f'../../od_construction_benchmarks/methods/ANN/data/od/{ds}/'

    # Resolve the four input files of this dataset
    ds_table_path = os.path.join(base_path, ds_table_filename)
    ds_train_index_path = os.path.join(base_path, ds_train_index_filename)
    ds_test_index_path = os.path.join(base_path, ds_test_index_filename)
    ds_validation_index_path = os.path.join(base_path, ds_validation_index_filename)

    # Load the table and the pickled index sets.
    # NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
    ds_table = np.load(ds_table_path)
    with open(ds_train_index_path, 'rb') as f:
        ds_train_index = pickle.load(f)
    with open(ds_test_index_path, 'rb') as f:
        ds_test_index = pickle.load(f)
    with open(ds_validation_index_path, 'rb') as f:
        ds_validation_index = pickle.load(f)

    # Turn every index set into an array with one row per cell
    ds_zero_cells = np.argwhere(ds_table == 0)
    ds_train_cells = np.array(ds_train_index).T
    ds_test_cells = np.array(ds_test_index).T
    ds_validation_cells = np.array(ds_validation_index).T
    ds_zero_and_train_cells = np.concatenate((ds_zero_cells, ds_train_cells))
    ds_test_validation_cells = np.concatenate((ds_test_cells, ds_validation_cells))
    nrows, ncols = ds_table.shape

    print(f"{ds} dataset | I: {nrows}, J: {ncols}, Total: {ds_table.sum()}")
    print(f"Total cells: {nrows*ncols}")
    print(f"Train cells: {len(ds_train_cells)}, Zero cells: {len(ds_zero_cells)}, Train/zero cells: {len(ds_zero_and_train_cells)}")
    print(f"Test cells: {len(ds_test_cells)}, Test/Validation cells: {len(ds_test_validation_cells)}")
    # The two printed numbers agree when the cell sets partition the table
    print(f"Sanity check: {len(ds_zero_and_train_cells)+len(ds_test_validation_cells)} = {nrows*ncols}")
    print('\n')

## Import table and geometries

In [None]:
# Dataset to prepare and the directory holding its raw files.
dataset = 'DC'
base_path = f'../data/raw/{dataset}'

# Each raw input: file name first, then its full path under base_path.
table_filename = 'od_2015.npy'
table_path = os.path.join(base_path, table_filename)

cost_filename = 'distance.npy'
cost_path = os.path.join(base_path, cost_filename)

region_features_filename = f'region_features_{dataset}.csv'
region_features_path = os.path.join(base_path, region_features_filename)

region_geometries_filename = f'region_geometries_{dataset}.geojson'
region_geometries_path = os.path.join(base_path, region_geometries_filename)

neighbours_filename = "neighbors.npy"
neighbours_path = os.path.join(base_path, neighbours_filename)

train_index_filename = 'train_index.pkl'
train_index_path = os.path.join(base_path, train_index_filename)

test_index_filename = 'test_index.pkl'
test_index_path = os.path.join(base_path, test_index_filename)

validation_index_filename = 'valid_index.pkl'
validation_index_path = os.path.join(base_path, validation_index_filename)

In [None]:
# Ground-truth table, its dimensions and its row / column totals
table = np.load(table_path)
I, J = table.shape
rowsums = table.sum(axis=1)
colsums = table.sum(axis=0)

# Region-level feature table
region_features = pd.read_csv(region_features_path)

# Region geometries: index by integer GEOID in ascending order, then number
# the rows 1..I in that order.
region_geometries = gpd.read_file(region_geometries_path).set_index('GEOID')
region_geometries.index = region_geometries.index.astype(int)
region_geometries = region_geometries.sort_index()
region_geometries['LOCATIONID'] = list(range(1, I + 1))

# Cost and neighbourhood matrices
cost = np.load(cost_path)
neighbours = np.load(neighbours_path)

# Pickled train / test / validation index sets.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open(train_index_path, 'rb') as f:
    train_index = pickle.load(f)
with open(test_index_path, 'rb') as f:
    test_index = pickle.load(f)
with open(validation_index_path, 'rb') as f:
    validation_index = pickle.load(f)

# Variant of the cost matrix with 1000 added to every diagonal entry
cost_large_diagonal = cost + 1000 * np.eye(J)

# Neighbourhood matrix plus the identity (every region is adjacent to itself),
# and the cost restricted to adjacent pairs (NaN elsewhere).
adjacency_matrix = (neighbours + np.identity(I)).astype('int8')
cost_weigthed_adjacency_matrix = np.where(adjacency_matrix, cost, np.nan)

In [None]:
# Index the feature table by GEOID; 'Unnamed: 0' is the column that gets
# renamed to GEOID.
# NOTE(review): on a freshly read frame, reset_index() also turns the default
# positional index into a regular column named 'index', which then stays among
# the features — confirm this extra column is intended.
region_features = (
    region_features
    .reset_index()
    .rename(columns={'Unnamed: 0':'GEOID'})
    .set_index('GEOID')
)

# Sanity check: warn (do not raise) when the GEOIDs are not in ascending order.
# An explicit test replaces the former `try: assert ... except:` so that
# unrelated errors are no longer silently swallowed and the check still runs
# when assertions are disabled.
if not region_features.index.is_monotonic_increasing:
    print("Region features not monotonic")

# Keep only a subset of the data: GEOID plus total housing units
mini_region_features = deepcopy(region_features.reset_index()[["GEOID","Estimate!!Total housing units"]])

In [None]:
# Independent copies of the full and the reduced feature tables, one pair per
# demand variant.
region_features_with_origin_demand = region_features.copy()
mini_region_features_with_origin_demand = mini_region_features.copy()
region_features_with_origin_and_destination_demand = region_features.copy()
mini_region_features_with_origin_and_destination_demand = mini_region_features.copy()

# Origin demand (table row sums) goes into all four variants
for frame in (
    region_features_with_origin_demand,
    mini_region_features_with_origin_demand,
    region_features_with_origin_and_destination_demand,
    mini_region_features_with_origin_and_destination_demand,
):
    frame['Origin_Demand'] = rowsums

# Destination demand (table column sums) only into the "origin and destination" variants
for frame in (
    region_features_with_origin_and_destination_demand,
    mini_region_features_with_origin_and_destination_demand,
):
    frame['Destination_Demand'] = colsums

In [None]:
# Show the dataset name, the table dimensions (I rows, J columns) and the sum of all table entries.
dataset,I,J,table.sum()

In [None]:
# [row, column] positions of the table entries that equal zero
zero_cells = np.argwhere(table == 0)

# Transpose each loaded index set so that every row describes one cell
train_cells, test_cells, validation_cells = (
    np.array(index_set).T
    for index_set in (train_index, test_index, validation_index)
)

# Combined sets: zero cells followed by train cells, and test cells followed
# by validation cells.
zero_and_train_cells = np.concatenate((zero_cells, train_cells))
test_validation_cells = np.concatenate((test_cells, validation_cells))

In [None]:
# Neighbourhood split: every cell of the I x J table is assigned to
#   * validation - if it lies inside the bounding box of the validation cells,
#   * test       - else if it lies inside the bounding box of the test cells,
#   * zero/train - otherwise.
# The assignment is computed with boolean masks instead of a Python double
# loop that scanned `zero_and_train_cells` once per training cell.

def _bounding_box(cells):
    """Return (min_row, max_row, min_col, max_col) of an array of [row, col] pairs."""
    return (
        int(cells[:, 0].min()), int(cells[:, 0].max()),
        int(cells[:, 1].min()), int(cells[:, 1].max()),
    )

val_min_row, val_max_row, val_min_col, val_max_col = _bounding_box(validation_cells)
test_min_row, test_max_row, test_min_col, test_max_col = _bounding_box(test_cells)

# Column vector of origin ids and row vector of destination ids; comparing
# them broadcasts to (I, J) masks.
origin_ids = np.arange(I)[:, np.newaxis]
destination_ids = np.arange(J)[np.newaxis, :]

in_validation_box = (
    (origin_ids >= val_min_row) & (origin_ids <= val_max_row)
    & (destination_ids >= val_min_col) & (destination_ids <= val_max_col)
)
# The validation box takes precedence where the two boxes overlap
in_test_box = (
    (origin_ids >= test_min_row) & (origin_ids <= test_max_row)
    & (destination_ids >= test_min_col) & (destination_ids <= test_max_col)
    & ~in_validation_box
)
in_train_region = ~(in_validation_box | in_test_box)

# Every cell left for training must already be a known zero or train cell
known_zero_or_train = np.zeros((I, J), dtype=bool)
known_cells = zero_and_train_cells.astype(np.intp)
known_zero_or_train[known_cells[:, 0], known_cells[:, 1]] = True
unaccounted_cells = np.argwhere(in_train_region & ~known_zero_or_train)
assert len(unaccounted_cells) == 0, (
    f"{len(unaccounted_cells)} cell(s) outside the test/validation boxes are "
    f"neither zero nor train cells, e.g. {unaccounted_cells[:5].tolist()}"
)

# np.argwhere lists cells in row-major order, i.e. the same order in which an
# origin-outer / destination-inner loop would visit them.
new_zero_training_cells = np.argwhere(in_train_region).astype('int32')
new_validation_cells = np.argwhere(in_validation_box).astype('int32')
new_test_cells = np.argwhere(in_test_box).astype('int32')
assert len(new_validation_cells)+len(new_test_cells)+len(new_zero_training_cells) == I*J

In [None]:
# Encode the neighbourhood split as an image: 0 = train, 1 = test, 2 = validation
table_masks = np.zeros((I,J))
for origin, destination in new_test_cells:
    table_masks[origin, destination] = 1
for origin, destination in new_validation_cells:
    table_masks[origin, destination] = 2

fig, ax = plt.subplots(figsize=(10, 10))

# One colour per split, in the order of the codes above
colors = ['green','red','blue']
cmap = mpl.colors.ListedColormap(colors)
heatmap = ax.imshow(table_masks, cmap=cmap, aspect='equal', interpolation='nearest')

# Colour bar labelled with the split names instead of the numeric codes
cbar = fig.colorbar(heatmap, ax=ax, ticks=[0,1,2], fraction=0.046, pad=0.04)
cbar.set_ticklabels(['Train', 'Test','Validation'])
cbar.ax.tick_params(labelsize=14)

ax.tick_params(axis='both', labelsize=14)
ax.set_xlabel('Destinations', fontsize=16)
ax.set_ylabel('Origins', fontsize=16)
plt.show()

In [None]:
# Count the zero entries of every feature column, then keep only the columns
# whose count is not positive, i.e. the columns that contain no zeros at all.
zero_count_per_column = region_features.eq(0).sum(axis=0)
zeros_by_col = {
    column: n_zeros
    for column, n_zeros in zero_count_per_column.to_dict().items()
    if n_zeros <= 0
}
zeros_by_col

In [None]:
# Short name -> feature column used as a destination attraction
destination_attraction_attrs = {
    "households":"Total!!Estimate!!HOUSEHOLDS!!Households",
    "housing_units":"Estimate!!Total housing units",
    "population":"Estimate!!SEX AND AGE!!Total population"
}

# Pull each selected column out of the feature table as a plain array ...
destination_attractions = {
    short_name: region_features[feature_column].values
    for short_name, feature_column in destination_attraction_attrs.items()
}
# ... and add the column sums of the table as a fourth attraction
destination_attractions["demand"] = table.sum(axis=0)

In [None]:
# Image of the neighbourhood matrix
neighbours_fig, neighbours_ax = plt.subplots(figsize=(10, 10))
neighbours_ax.set_title('Neighbours')
neighbours_ax.imshow(neighbours, cmap='Set1_r', interpolation='nearest')
plt.show()

In [None]:
# Image of the table; non-positive entries are replaced by NaN before drawing
table_fig, table_ax = plt.subplots(figsize=(10, 10))
table_ax.set_title('Table')
positive_table = np.where(table<=0, np.nan, table)
table_ax.imshow(positive_table, cmap='hot', interpolation='nearest')
plt.show()

In [None]:
# Image of the cost matrix
cost_fig, cost_ax = plt.subplots(figsize=(10, 10))
cost_ax.set_title('Cost')
cost_ax.imshow(cost, cmap='hot', interpolation='nearest')
plt.show()

In [None]:
# Outline of every region: blue edges on a white fill
fig, ax = plt.subplots(figsize=(10, 10))
ax.set_title('Geometries')
_ = region_geometries.plot(edgecolor='blue', facecolor='white', ax=ax)

In [None]:
# One histogram per destination attraction, each normalised to sum to one;
# the title reports how many entries are non-positive.
n_panels = len(destination_attractions)
fig, axs = plt.subplots(1, n_panels, figsize=(5*n_panels, 5))

for panel, (name, values) in zip(axs, destination_attractions.items()):
    n_non_positive = sum(values<=0)
    panel.set_title(f"{name} with {n_non_positive} zeros out of {len(values)}")
    _ = panel.hist((values/values.sum()).ravel())

In [None]:
# Histogram of the sum-normalised cost entries, rescaled by a constant.
# NOTE(review): the origin of the factor 10994 is not visible here — confirm
# what it represents and consider naming it.
rescaled_cost = 10994*(cost/cost.sum()).ravel()
plt.title("Cost matrix")
_ = plt.hist(rescaled_cost, bins=30)

# Normalise data

In [None]:
# Origin demand as shares of the total
origin_demand_sum_normalised = rowsums / rowsums.sum()

# Cost matrix scaled by its sum and by its maximum
cost_sum_normalised = cost / cost.sum()
cost_max_normalised = cost / cost.max()

# Same two scalings for the large-diagonal variant
cost_large_diagonal_sum_normalised = cost_large_diagonal / cost_large_diagonal.sum()
cost_large_diagonal_max_normalised = cost_large_diagonal / cost_large_diagonal.max()

# Every destination attraction as shares of its own total, keyed by the name
# of the file it is exported to
destination_attractions_sum_normalised = {
    f"destination_attraction_{da_name}_ts_sum_normalised": da_data / da_data.sum()
    for da_name, da_data in destination_attractions.items()
}

# Export data to file

In [None]:
# Display the GEOID-indexed region feature table.
region_features

In [None]:
# Number of cells assigned to the zero/train set by the neighbourhood split.
len(new_zero_training_cells)

In [None]:
# Export every prepared input of `dataset` to ../data/inputs/<dataset>/.
output_dir = f'../data/inputs/{dataset}'
# Create the target directory when it is missing so the export also works on a
# fresh run; an already existing directory is left untouched.
os.makedirs(output_dir, exist_ok=True)

# Table, margins and cell index sets (text, np.savetxt default format),
# keyed by output file stem.
table_and_cell_exports = {
    'ground_truth_table': table,
    'rowsums': rowsums,
    'colsums': colsums,
    'zero_cells': zero_cells,
    'train_cells': train_cells,
    'zero_and_train_cells': zero_and_train_cells,
    'test_cells': test_cells,
    'validation_cells': validation_cells,
    'test_and_validation_cells': test_validation_cells,
    'neighbourhood_split_zero_and_train_cells': new_zero_training_cells,
    'neighbourhood_split_test_cells': new_test_cells,
    'neighbourhood_split_validation_cells': new_validation_cells,
    'origin_demand_sum_normalised': origin_demand_sum_normalised,
}
for file_stem, array in table_and_cell_exports.items():
    np.savetxt(f'{output_dir}/{file_stem}.txt', array)

# Feature tables (binary .npy), keyed by output file stem.
feature_exports = {
    'region_features': region_features,
    'mini_region_features': mini_region_features,
    'region_features_with_origin_demand': region_features_with_origin_demand,
    'mini_region_features_with_origin_demand': mini_region_features_with_origin_demand,
    'region_features_with_origin_and_destination_demand': region_features_with_origin_and_destination_demand,
    'mini_region_features_with_origin_and_destination_demand': mini_region_features_with_origin_and_destination_demand,
}
for file_stem, frame in feature_exports.items():
    np.save(f'{output_dir}/{file_stem}.npy', frame.to_numpy())

# Region geometries
region_geometries.to_file(f'{output_dir}/region_geometries.geojson')

# Cost matrices (text, np.savetxt default format), keyed by output file stem.
cost_exports = {
    'cost_matrix_max_normalised': cost_max_normalised,
    'cost_matrix_sum_normalised': cost_sum_normalised,
    'cost_matrix': cost,
    'cost_matrix_large_diagonal_sum_normalised': cost_large_diagonal_sum_normalised,
    'cost_matrix_large_diagonal_max_normalised': cost_large_diagonal_max_normalised,
    'cost_matrix_large_diagonal': cost_large_diagonal,
}
for file_stem, array in cost_exports.items():
    np.savetxt(f'{output_dir}/{file_stem}.txt', array)

# Adjacency matrix is written as integers; its cost-weighted version keeps the
# default format. The 'weigthed' spelling is the existing file name.
np.savetxt(f'{output_dir}/adjacency_matrix.txt', adjacency_matrix, fmt='%i')
np.savetxt(f'{output_dir}/cost_weigthed_adjacency_matrix.txt', cost_weigthed_adjacency_matrix)

# Normalised destination attractions, each written as a single column
for da_filename, da_data in destination_attractions_sum_normalised.items():
    np.savetxt(f'{output_dir}/{da_filename}.txt', da_data[:,np.newaxis])