## Load Packages

In [1]:
# Standard packages
import tempfile
import warnings
import urllib
import shutil
import os

# Less standard, but still pip- or conda-installable
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import re
import rtree
import pickle
import progressbar # pip install progressbar2, not progressbar
from tqdm import tqdm

#Install using Christoph Gohlke's wheels; https://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely
import shapely
import rasterio
from geopy.geocoders import Nominatim
from rasterio.windows import Window 

#Import module with functions
import data_eng.az_proc as ap

## Define Microsoft Azure Blob Root

In [2]:
# The (preferred) copy of NAIP in the West Europe Azure region
# NOTE(review): no blob-root variable is actually defined in this cell --
# presumably the `ap` helper module holds it; confirm.

# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")
# IPython magic: draw matplotlib figures inline in the notebook output.
%matplotlib inline

## Load the spatial index of NAIP tiles

In [3]:
# Spatial index that maps lat/lon to NAIP tiles. It is built eagerly here:
# a guard of the form `index = None` / `if index is None:` is always true,
# so it adds nothing, and every later cell relies on `index` being ready.
index = ap.NAIPTileIndex()

Bypassing download of already-downloaded file tile_index.dat
Bypassing download of already-downloaded file tile_index.idx
Bypassing download of already-downloaded file tiles.p


## EIA and HIFLD Petroleum Data Sources

# EIA and HIFLD 

### Load Data

Homeland Infrastructure Foundation-Level Data (HIFLD) - Petroleum Terminals

https://hifld-geoplatform.opendata.arcgis.com/datasets/7841aba67178425cbf33995fc914e2fe_0/data

In [4]:
# Load the HIFLD petroleum terminals table; its "X" column is kept as the
# longitude list and its "Y" column as the latitude list.
hifld_petroleum_terminals = pd.read_csv("image_download_azure/Petroleum_Terminals_HIFLD.csv")
hifld_lons, hifld_lats = (
    hifld_petroleum_terminals[column].tolist() for column in ("X", "Y")
)

EIA - Strategic Petroleum Reserves

https://atlas.eia.gov/datasets/strategic-petroleum-reserves?geometry=-159.521%2C0.792%2C-28.212%2C52.750

In [5]:
# Load the EIA strategic petroleum reserves table; its "X" column is kept as
# the longitude list and its "Y" column as the latitude list.
eia_strategic_petroleum_reserves = pd.read_csv("image_download_azure/Strategic_Petroleum_Reserves.csv")
eia_spr_lons, eia_spr_lats = (
    eia_strategic_petroleum_reserves[column].tolist() for column in ("X", "Y")
)

EIA - Petroleum Product Terminals

https://atlas.eia.gov/datasets/petroleum-product-terminals 

In [6]:
# Load the EIA petroleum product terminals table; its "X" column is kept as
# the longitude list and its "Y" column as the latitude list.
eia_petroleum_product_terminals = pd.read_csv("image_download_azure/Petroleum_Product_Terminals.csv")
eia_ppt_lons, eia_ppt_lats = (
    eia_petroleum_product_terminals[column].tolist() for column in ("X", "Y")
)

EIA - Northeast Petroleum Reserves

https://atlas.eia.gov/datasets/northeast-petroleum-reserves 

In [7]:
# Load the EIA northeast petroleum reserves table; its "X" column is kept as
# the longitude list and its "Y" column as the latitude list.
eia_northeast_petroleum_reserves = pd.read_csv("image_download_azure/Northeast_Petroleum_Reserves.csv")
eia_npr_lons, eia_npr_lats = (
    eia_northeast_petroleum_reserves[column].tolist() for column in ("X", "Y")
)

EIA - Petroleum Refineries

https://atlas.eia.gov/datasets/petroleum-refineries?geometry=-13.914%2C-56.555%2C151.320%2C84.803

In [8]:
# Load the EIA petroleum refineries table; its "X" column is kept as the
# longitude list and its "Y" column as the latitude list.
eia_petroleum_refineries = pd.read_csv("image_download_azure/Petroleum_Refineries.csv")
eia_pr_lons, eia_pr_lats = (
    eia_petroleum_refineries[column].tolist() for column in ("X", "Y")
)

EIA - Natural Gas Processing Plants

https://atlas.eia.gov/datasets/natural-gas-processing-plants

In [9]:
# Load the EIA natural gas processing plants table; its "X" column is kept as
# the longitude list and its "Y" column as the latitude list.
eia_natural_gas_processing_plants = pd.read_csv("image_download_azure/Natural_Gas_Processing_Plants.csv")
eia_ngpp_lons, eia_ngpp_lats = (
    eia_natural_gas_processing_plants[column].tolist() for column in ("X", "Y")
)

### Get the file paths, tile names, and tile URLs for the EIA and HIFLD data

In [10]:
# Map each source's coordinates to file paths using the spatial index.
hifld_file_pathways = ap.lons_lat_to_filepaths(hifld_lons, hifld_lats, index)
eia_spr_file_pathways = ap.lons_lat_to_filepaths(eia_spr_lons, eia_spr_lats, index)
eia_ppt_file_pathways = ap.lons_lat_to_filepaths(eia_ppt_lons, eia_ppt_lats, index)
eia_npr_file_pathways = ap.lons_lat_to_filepaths(eia_npr_lons, eia_npr_lats, index)
eia_pr_file_pathways = ap.lons_lat_to_filepaths(eia_pr_lons, eia_pr_lats, index)
eia_ngpp_file_pathways = ap.lons_lat_to_filepaths(eia_ngpp_lons, eia_ngpp_lats, index)

# Stack the file paths from all six sources and keep only the unique rows.
petrol_file_pathways = np.unique(
    np.vstack([
        hifld_file_pathways,
        eia_spr_file_pathways,
        eia_ppt_file_pathways,
        eia_npr_file_pathways,
        eia_pr_file_pathways,
        eia_ngpp_file_pathways,
    ]),
    axis=0,
)

 12%|█▏        | 290/2338 [00:00<00:01, 1152.09it/s]

No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile in

 26%|██▌       | 609/2338 [00:00<00:02, 846.25it/s] 

No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections


100%|██████████| 2338/2338 [00:05<00:00, 430.41it/s]


No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections


100%|██████████| 4/4 [00:00<00:00, 1333.64it/s]
  8%|▊         | 114/1476 [00:00<00:01, 1139.93it/s]

No tile intersections
No tile intersections
No tile intersections


 15%|█▌        | 228/1476 [00:00<00:01, 936.14it/s] 

No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections


 35%|███▌      | 518/1476 [00:00<00:01, 925.24it/s]

No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections


 41%|████▏     | 612/1476 [00:00<00:01, 835.85it/s]

No tile intersections
No tile intersections
No tile intersections


 53%|█████▎    | 780/1476 [00:00<00:00, 771.35it/s]

No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections


 63%|██████▎   | 932/1476 [00:01<00:00, 675.24it/s]

No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections


 76%|███████▌  | 1116/1476 [00:01<00:00, 514.97it/s]

No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections


 83%|████████▎ | 1227/1476 [00:01<00:00, 517.10it/s]

No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections


 91%|█████████ | 1339/1476 [00:02<00:00, 518.80it/s]

No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections


100%|██████████| 1476/1476 [00:02<00:00, 633.69it/s]


No tile intersections


100%|██████████| 6/6 [00:00<00:00, 999.44it/s]
100%|██████████| 135/135 [00:00<00:00, 1153.74it/s]


No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections


100%|██████████| 478/478 [00:00<00:00, 1191.91it/s]


In [11]:
# Derive the tile names and tile URLs from the combined file paths, then pair
# them column-wise so each row holds (tile name, tile URL).
tile_name_eia_hifld, tile_url_eia_hifld = ap.filepaths_to_tile_name_tile_url(
    petrol_file_pathways
)
tile_name_tile_url_eia_hifld = np.column_stack([tile_name_eia_hifld, tile_url_eia_hifld])

In [12]:
# Display the (rows, columns) shape of the EIA/HIFLD tile name/URL array.
np.shape(tile_name_tile_url_eia_hifld)

(2452, 2)

# Group Identified ASTs

In [22]:
# Load the group-identified sites; the "Lat" and "Lon" columns hold each
# site's coordinates.
group_identified_sites = pd.read_csv("image_download_azure/identified_sites.csv")
group_identified_sites_lat = group_identified_sites["Lat"].tolist()
group_identified_sites_lon = group_identified_sites["Lon"].tolist()
# Every latitude needs a matching longitude. The check must compare the two
# different lists; comparing a list with itself would always pass.
assert len(group_identified_sites_lat) == len(group_identified_sites_lon)
print(len(group_identified_sites_lat))

# Map the coordinates to file paths exactly once, so the lookup's progress
# output is not repeated by a second call whose result would be thrown away.
group_identified_sites_file_pathways = ap.lons_lat_to_filepaths(
    group_identified_sites_lon, group_identified_sites_lat, index
)

# Unpack names and URLs separately, then pair them column-wise so each row is
# (tile name, tile URL) -- the same construction used for the EIA/HIFLD tiles.
tile_name_group_identified_sites, tile_url_group_identified_sites = (
    ap.filepaths_to_tile_name_tile_url(group_identified_sites_file_pathways)
)
tile_name_tile_url_group_identified_sites = np.column_stack(
    (tile_name_group_identified_sites, tile_url_group_identified_sites)
)
tile_name_tile_url_group_identified_sites.shape

91


100%|██████████| 91/91 [00:00<00:00, 1123.40it/s]


No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections


100%|██████████| 91/91 [00:00<00:00, 1358.06it/s]

No tile intersections
No tile intersections
No tile intersections
No tile intersections
No tile intersections





(68, 2)

### Thirty Ports

In [23]:
# Read in the sheet of quadrangles of interest.
thirty_port_quads = pd.read_csv("image_download_azure/Quadrangles_of_interest.csv")

# Identify the tile names and URLs for those quadrangles.
tile_name_thirty_ports, tile_url_thirty_ports = ap.collected_quads_to_tile_name_tile_url(
    thirty_port_quads
)

# Pair names with URLs column-wise and display the resulting shape.
tile_name_tile_url_thirty_ports = np.column_stack([tile_name_thirty_ports, tile_url_thirty_ports])
np.shape(tile_name_tile_url_thirty_ports)

(148, 2)

### Combine tile names and URLs from the group-identified sources

In [24]:
# Stack the group-identified site tiles on top of the thirty-port tiles and
# report the shape before de-duplication.
all_group_identified_sites = np.vstack(
    [tile_name_tile_url_group_identified_sites, tile_name_tile_url_thirty_ports]
)
print(np.shape(all_group_identified_sites))

# Keep only the unique rows and report the shape again.
all_group_identified_sites = np.unique(all_group_identified_sites, axis=0)
print(np.shape(all_group_identified_sites))

(216, 2)
(216, 2)


# Combine tile names and urls from all identified sources 

In [25]:
# Stack the EIA/HIFLD tiles with the group-identified tiles and report the
# shape before de-duplication.
tile_name_tile_url_eia_hifld_group_identified_sites = np.vstack(
    [tile_name_tile_url_eia_hifld, all_group_identified_sites]
)
print(np.shape(tile_name_tile_url_eia_hifld_group_identified_sites))

# Keep only the unique rows and report the shape again.
tile_name_tile_url_eia_hifld_group_identified_sites = np.unique(
    tile_name_tile_url_eia_hifld_group_identified_sites, axis=0
)
print(np.shape(tile_name_tile_url_eia_hifld_group_identified_sites))

(2668, 2)
(2610, 2)


# Save tile names and URLs

In [26]:
# Save the complete array of all tile names and URLs.
# np.save appends ".npy" to the file name because no extension is given.
np.save(
    "image_download_azure/tile_name_tile_url_complete_array",
    tile_name_tile_url_eia_hifld_group_identified_sites,
)

In [49]:
# Determine the additional tiles: rows of the complete set that are not in the
# currently labeled set.
tile_name_tile_url_current_set = np.load('tile_name_tile_url_labeled.npy')
print(tile_name_tile_url_current_set.shape)
print(tile_name_tile_url_eia_hifld_group_identified_sites.shape)

# Compare whole rows as tuples so that set difference can be used.
complete_rows = set(map(tuple, tile_name_tile_url_eia_hifld_group_identified_sites))
current_rows = set(map(tuple, tile_name_tile_url_current_set))
# sorted() gives a reproducible row order (set iteration order is arbitrary),
# and reshape(-1, 2) keeps the two-column layout even when nothing differs.
dif = np.array(sorted(complete_rows - current_rows)).reshape(-1, 2)
print(dif.shape)

#save only the tiles that were not originally included in the assessment set
#np.save("image_download_azure/tile_name_tile_url_expanded_only", dif)

(2488, 2)
(2610, 2)
(2529, 2)


In [34]:
# Load the saved array of labeled tile names/URLs and report its shape.
labeled = np.load("tile_name_tile_url_labeled.npy")
print(np.shape(labeled))
# Disabled shape checks of the other saved arrays, kept for reference:
#current_set_full = np.load("image_download_azure/tile_name_tile_url_eia_hifld_thirty_ports.npy")
#print(current_set_full.shape)
#current_remaining = np.load("tile_name_tile_url_remaining_expanded.npy")
#print(current_remaining.shape)

(2514, 2)
(2610, 2)
(0, 2)


In [40]:
# Scratch arithmetic. 2514 matches the row count printed for `labeled`.
# NOTE(review): 67 is presumably the number of tiles that failed to download
# (see the disabled load below) -- confirm, and replace both literals with
# values computed from the arrays.
2514 + 67 
#np.load("tile_name_tile_url_error_downloading.npy")

2581

In [52]:
# Get the indices of the rows of `expanded_full` that still need to be annotated.
# NOTE(review): `expanded_full` is not defined in the visible cells -- presumably
# it is the complete tile name/URL array; confirm it is created before this cell.

# Every row index of the expanded full dataset. range(len(...)) already stops
# at the last valid index, so no "- 1" is needed (it would drop the last row).
index_expanded_full = set(range(len(expanded_full)))

# Indices of the rows of `expanded_full` that have already been labeled.
# Rows are compared as whole (name, url) tuples through a set lookup, which
# requires both columns to match and avoids rescanning the full array for
# every labeled row.
labeled_rows = set(map(tuple, labeled))
index_labeled = {
    row_number
    for row_number, row in enumerate(expanded_full)
    if tuple(row) in labeled_rows
}

# Remaining indices = all indices minus the labeled ones (A - B), in ascending
# order. The subtraction must use `index_labeled`, not the NAIP spatial `index`.
remaining = sorted(index_expanded_full - index_labeled)
#len(remaining)

In [57]:
# Reload the saved "remaining expanded" tile array and report its shape.
# The one-off update that appended expanded_full[remaining] and re-saved the
# file is kept below, disabled, for reference:
#current_remaining = np.concatenate((current_remaining, expanded_full[remaining]), axis=0)
#np.save("tile_name_tile_url_remaining_expanded.npy", current_remaining)

current_remaining = np.load("tile_name_tile_url_remaining_expanded.npy")
print(np.shape(current_remaining))

In [58]:
# Report the shape of the reloaded "remaining expanded" tile array.
print(np.shape(current_remaining))

(25, 2)


In [60]:
# Take the "/"-separated component at index 5 of every entry in column 1 of
# `expanded_full`, then count the distinct values.
# NOTE(review): presumably column 1 holds tile URLs and component 5 is the
# state code -- confirm against the URL layout.
states = [tile.split("/")[5] for tile in expanded_full[:,1]]
len(set(states))

47