In [1]:
import os
import pickle
import re
import shutil
import tempfile
import urllib.request
import warnings
from glob import glob

import geopandas as gpd
import numpy as np
import pandas as pd
import planetary_computer
import pystac_client

In [2]:
def combine_slosh_extents(slosh_extent_dir):
    """Read every shapefile in a directory and stack the results into one frame.

    Args:
        slosh_extent_dir (str): Directory that directly contains the ``*.shp`` files.

    Returns:
        The concatenation of all shapefiles read with ``gpd.read_file``, re-indexed
        from 0 (``ignore_index=True``).

    Raises:
        FileNotFoundError: If ``slosh_extent_dir`` does not exist.
        NotADirectoryError: If ``slosh_extent_dir`` exists but is not a directory.
        ValueError: If the directory contains no ``*.shp`` file.
    """
    if not os.path.isdir(slosh_extent_dir):
        error_type = NotADirectoryError if os.path.exists(slosh_extent_dir) else FileNotFoundError
        raise error_type("SLOSH extent directory not found: {}".format(slosh_extent_dir))

    # Sorted so the row order of the result does not depend on directory listing order.
    slosh_extent_file_paths = sorted(glob(os.path.join(slosh_extent_dir, "*.shp")))
    if not slosh_extent_file_paths:
        # pd.concat([]) would fail with a generic message; say what is actually missing.
        raise ValueError("No .shp files found in {}".format(slosh_extent_dir))

    slosh_extent_gdf = pd.concat([gpd.read_file(p) for p in slosh_extent_file_paths],
                                 ignore_index=True)
    return slosh_extent_gdf


In [9]:
def create_area_of_interest_dict(polygon):
    """Convert a polygonal geometry into a GeoJSON-style dictionary.

    Only outer rings are kept (interior rings / holes are dropped) and every
    coordinate tuple is converted to a list.

    Args:
        polygon: Object exposing ``__geo_interface__`` whose ``"type"`` is
            ``"Polygon"`` or ``"MultiPolygon"``.

    Returns:
        dict: ``{"type": ..., "coordinates": ...}``. For a Polygon the
        coordinates hold the single outer ring; for a MultiPolygon they hold
        one outer ring per part, so no part of the area is lost.

    Raises:
        TypeError: If the geometry type is neither Polygon nor MultiPolygon.
    """
    # Get the GeoJSON dictionary representation
    geojson_dict = polygon.__geo_interface__
    geometry_type = geojson_dict["type"]

    def _outer_ring(rings):
        # rings[0] is the exterior ring; any further rings are holes.
        return [list(point) for point in rings[0]]

    if geometry_type == "Polygon":
        coordinates = [_outer_ring(geojson_dict["coordinates"])]
    elif geometry_type == "MultiPolygon":
        # One [outer_ring] entry per part keeps the nesting of a MultiPolygon.
        coordinates = [[_outer_ring(part)] for part in geojson_dict["coordinates"]]
    else:
        raise TypeError(
            "Expected a Polygon or MultiPolygon geometry, got {!r}".format(geometry_type)
        )

    return {"type": geometry_type, "coordinates": coordinates}

def items_dict_to_list(item, property_keys = ("gsd", 'naip:year'),):
    """Flatten one STAC item into a list suitable for a table row.

    Args:
        item: Object with ``id``, a ``properties`` mapping and an ``assets``
            mapping whose ``"image"`` entry has an ``href`` attribute.
        property_keys: Names of the properties to append, in order. A key that
            is absent from ``item.properties`` yields ``None``.

    Returns:
        list: ``[item.id, image href, *requested property values]``.
    """
    # Honour the caller's keys (an immutable default avoids a shared mutable list).
    properties = [item.properties.get(key) for key in property_keys]
    image_path = item.assets["image"].href
    return [item.id, image_path] + properties


def extract_naip_tiles_by_tract(polygon, date_range):
    """Search the "naip" collection for items intersecting one geometry.

    Uses the module-level ``catalog`` client.

    Args:
        polygon: Geometry accepted by ``create_area_of_interest_dict``.
        date_range: Value passed through as the search's ``datetime`` argument.

    Returns:
        list: One ``items_dict_to_list`` row per item returned by the search.
    """
    search_area = create_area_of_interest_dict(polygon)
    naip_search = catalog.search(
        collections=["naip"],
        intersects=search_area,
        datetime=date_range,
    )
    naip_image_data = []
    for found_item in naip_search.item_collection():
        naip_image_data.append(items_dict_to_list(found_item))
    return naip_image_data

In [10]:
# Load the GeoJSON file whose rows are iterated by the extraction loop below
# (each row's "geometry" value is used as a search area).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
subset_tracts_in_slosh = gpd.read_file("/work/csr33/images_for_predictions/subset_tracts_in_slosh.geojson")

In [11]:
# Search settings: the datetime interval handed to the catalog search, the item
# properties to tabulate, and the resulting column names for the output table.
date_range = "2020-01-01/2024-01-31"
property_keys = ["gsd", "naip:year"]
columns = ["id", "image_path", *property_keys]

In [12]:
# Open the Planetary Computer STAC API. `planetary_computer.sign_inplace` is passed as the
# client's `modifier` (presumably so returned asset hrefs are signed — see the
# planetary_computer docs). `extract_naip_tiles_by_tract` reads this module-level `catalog`.
catalog = pystac_client.Client.open("https://planetarycomputer.microsoft.com/api/stac/v1",
                                     modifier=planetary_computer.sign_inplace,)

In [None]:
# Query the catalog once per row and accumulate one record per matching NAIP item.
extracted_naip = []
for index, row in subset_tracts_in_slosh.iterrows():
    if index % 1000 == 0:
        print(index)  # coarse progress indicator
    polygon = row["geometry"]
    # extend() appends in place, avoiding a full copy of the accumulator on every iteration.
    extracted_naip.extend(extract_naip_tiles_by_tract(polygon, date_range))

0


In [None]:
# Tabulate the collected rows (id, image_path, then the `property_keys` values, as named in
# `columns`) and write them to a CSV file in the current working directory.
extracted_naip_df = pd.DataFrame(extracted_naip, columns = columns) 
extracted_naip_df.to_csv("extracted_naip.csv")

In [47]:
#catalog.get_collection("naip")
import requests

def download_image(url, filename, chunk_size=1024 * 1024, timeout=None):
    """Download ``url`` to ``filename``, streaming the body to disk in chunks.

    Args:
        url (str): Address of the file to fetch.
        filename (str): Local path to write; overwritten if it exists.
        chunk_size (int): Number of bytes read from the response per write.
        timeout: Forwarded to ``requests.get``; ``None`` waits indefinitely.

    Returns:
        None. On a non-200 response nothing is written and a message is printed.
    """
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code == 200:
            with open(filename, 'wb') as file:
                # Iterating keeps memory use bounded; reading `.content` would load the whole
                # body at once and make `stream=True` pointless.
                for chunk in response.iter_content(chunk_size=chunk_size):
                    file.write(chunk)
        else:
            print(f"Unable to download image. HTTP response code: {response.status_code}")

In [49]:
# Usage
# NOTE(review): `image_path` and `filename` are not assigned at top level in any visible
# cell; define them before running this cell on a fresh kernel.
download_image(image_path, filename)

In [34]:
# Build a GeoDataFrame from `items_new`.
# NOTE(review): `items_new` is not defined in any visible cell — TODO confirm where it comes from.
gpd.GeoDataFrame.from_features(items_new)


Unnamed: 0,geometry,gsd,datetime,naip:year,proj:bbox,proj:epsg,providers,naip:state,proj:shape,proj:transform
0,"POLYGON ((-81.68342 30.12231, -81.68390 30.191...",0.6,2021-11-29T16:00:00Z,2021,"[427454.4, 3332535.0, 434165.4, 3340154.4]",26917,[{'url': 'https://www.fsa.usda.gov/programs-an...,fl,"[12699, 11185]","[0.6, 0.0, 427454.4, 0.0, -0.6, 3340154.4, 0.0..."
1,"POLYGON ((-81.62147 30.12231, -81.62190 30.191...",0.6,2021-11-29T16:00:00Z,2021,"[433523.4, 3332500.8000000003, 440134.2, 33401...",26917,[{'url': 'https://www.fsa.usda.gov/programs-an...,fl,"[12693, 11018]","[0.6, 0.0, 433523.4, 0.0, -0.6, 3340116.6, 0.0..."
2,"POLYGON ((-81.68343 30.18531, -81.68390 30.253...",0.6,2021-11-29T16:00:00Z,2021,"[427500.6, 3339516.6, 434206.8, 3347025.0]",26917,[{'url': 'https://www.fsa.usda.gov/programs-an...,fl,"[12514, 11177]","[0.6, 0.0, 427500.6, 0.0, -0.6, 3347025.0, 0.0..."
3,"POLYGON ((-81.62147 30.18530, -81.62189 30.253...",0.6,2021-11-29T16:00:00Z,2021,"[433565.4, 3339481.8000000003, 440172.0, 33469...",26917,[{'url': 'https://www.fsa.usda.gov/programs-an...,fl,"[12509, 11011]","[0.6, 0.0, 433565.4, 0.0, -0.6, 3346987.2, 0.0..."
4,"POLYGON ((-81.68342 30.24731, -81.68389 30.316...",0.6,2021-11-29T16:00:00Z,2021,"[427545.6, 3346387.2, 434249.39999999997, 3354...",26917,[{'url': 'https://www.fsa.usda.gov/programs-an...,fl,"[12699, 11173]","[0.6, 0.0, 427545.6, 0.0, -0.6, 3354006.6, 0.0..."
5,"POLYGON ((-81.62146 30.24730, -81.62189 30.316...",0.6,2021-11-29T16:00:00Z,2021,"[433606.8, 3346352.4000000004, 440210.39999999...",26917,[{'url': 'https://www.fsa.usda.gov/programs-an...,fl,"[12693, 11006]","[0.6, 0.0, 433606.8, 0.0, -0.6, 3353968.2, 0.0..."


['fl_m_3008151_sw_17_060_20211129',
 'fl_m_3008151_se_17_060_20211129',
 'fl_m_3008151_nw_17_060_20211129',
 'fl_m_3008151_ne_17_060_20211129',
 'fl_m_3008143_sw_17_060_20211129',
 'fl_m_3008143_se_17_060_20211129']

In [43]:
#items_new.features["id"]
# Build a GeoDataFrame from the dictionary form of `items_new`, with an explicit CRS (EPSG:4326).
# NOTE(review): `items_new` is not defined in any visible cell — TODO confirm where it comes from.
gpd.GeoDataFrame.from_features(items_new.to_dict(), crs="epsg:4326")


Unnamed: 0,geometry,gsd,datetime,naip:year,proj:bbox,proj:epsg,providers,naip:state,proj:shape,proj:transform
0,"POLYGON ((-81.68342 30.12231, -81.68390 30.191...",0.6,2021-11-29T16:00:00Z,2021,"[427454.4, 3332535.0, 434165.4, 3340154.4]",26917,[{'url': 'https://www.fsa.usda.gov/programs-an...,fl,"[12699, 11185]","[0.6, 0.0, 427454.4, 0.0, -0.6, 3340154.4, 0.0..."
1,"POLYGON ((-81.62147 30.12231, -81.62190 30.191...",0.6,2021-11-29T16:00:00Z,2021,"[433523.4, 3332500.8000000003, 440134.2, 33401...",26917,[{'url': 'https://www.fsa.usda.gov/programs-an...,fl,"[12693, 11018]","[0.6, 0.0, 433523.4, 0.0, -0.6, 3340116.6, 0.0..."
2,"POLYGON ((-81.68343 30.18531, -81.68390 30.253...",0.6,2021-11-29T16:00:00Z,2021,"[427500.6, 3339516.6, 434206.8, 3347025.0]",26917,[{'url': 'https://www.fsa.usda.gov/programs-an...,fl,"[12514, 11177]","[0.6, 0.0, 427500.6, 0.0, -0.6, 3347025.0, 0.0..."
3,"POLYGON ((-81.62147 30.18530, -81.62189 30.253...",0.6,2021-11-29T16:00:00Z,2021,"[433565.4, 3339481.8000000003, 440172.0, 33469...",26917,[{'url': 'https://www.fsa.usda.gov/programs-an...,fl,"[12509, 11011]","[0.6, 0.0, 433565.4, 0.0, -0.6, 3346987.2, 0.0..."
4,"POLYGON ((-81.68342 30.24731, -81.68389 30.316...",0.6,2021-11-29T16:00:00Z,2021,"[427545.6, 3346387.2, 434249.39999999997, 3354...",26917,[{'url': 'https://www.fsa.usda.gov/programs-an...,fl,"[12699, 11173]","[0.6, 0.0, 427545.6, 0.0, -0.6, 3354006.6, 0.0..."
5,"POLYGON ((-81.62146 30.24730, -81.62189 30.316...",0.6,2021-11-29T16:00:00Z,2021,"[433606.8, 3346352.4000000004, 440210.39999999...",26917,[{'url': 'https://www.fsa.usda.gov/programs-an...,fl,"[12693, 11006]","[0.6, 0.0, 433606.8, 0.0, -0.6, 3353968.2, 0.0..."


In [37]:
# Display `items_new`. NOTE(review): it is not defined in any visible cell.
items_new#.assets["image"].href

In [76]:
from shapely.geometry import Polygon

point1 = (-75.754727, 39.059650)
point2 = (-75.003601, 38.453883)
 

# The points are opposite corners of the rectangle: read as (lon, lat), point1 is the
# upper-left corner and point2 the lower-right one. The ring visits the four corners
# and closes back on point1.
# NOTE(review): despite its name, `polygon` is a plain list of tuples, not a shapely Polygon.
polygon = [point1, (point2[0], point1[1]), point2, (point1[0], point2[1]), point1]


[[-75.754727, 39.05965],
 [-75.003601, 39.05965],
 [-75.003601, 38.453883],
 [-75.754727, 38.453883],
 [-75.754727, 39.05965]]

In [82]:
# Build a GeoJSON-style Polygon dictionary directly from the corner list `polygon`.
# The commented-out pieces are an earlier variant that took the ring from a geometry's
# __geo_interface__ instead.
#polygon = slosh_extent_in_usa['geometry'].iloc[1]

# Get the GeoJSON dictionary representation
#geojson_dict = polygon.__geo_interface__

# Convert to your area_of_interest format
area_of_interest = {
    "type": "Polygon", #geojson_dict["type"],
    "coordinates": [[list(tup) for tup in polygon]] #geojson_dict["coordinates"][0]#[0]  # We only need the outer ring
}

In [83]:
# Display the dictionary to check its structure.
area_of_interest

{'type': 'Polygon',
 'coordinates': [[[-75.754727, 39.05965],
   [-75.003601, 39.05965],
   [-75.003601, 38.453883],
   [-75.754727, 38.453883],
   [-75.754727, 39.05965]]]}

In [90]:
# Display `items_new`. NOTE(review): it is not defined in any visible cell.
items_new

In [None]:

# Install using Christoph Gohlke's wheels: https://www.lfd.uci.edu/~gohlke/pythonlibs/#shapely
import shapely
import rasterio
from geopy.geocoders import Nominatim
from rasterio.windows import Window 
#Import module with functions
import src.az_proc as ap
import argparse

In [None]:
# Read the quadrangle index shapefile.
# Forward slashes only: a backslash inside a normal string literal starts an escape
# sequence and is not a path separator on POSIX systems.
x = gpd.read_file("/work/csr33/quads/map_indexes_QD12K/map_indexes/quads12k_a_extract.shp")

In [None]:
def get_args_parse():
    """Parse the command-line options of the tile-name / tile-URL script.

    Returns:
        argparse.Namespace: With attributes ``blob_root``, ``lat_lon_dir``,
        ``quad_dir``, ``output_dir`` and ``tile_names_tile_urls_filename``.
        All are strings; every option except ``blob_root`` defaults to ``None``.
    """
    option_specs = (
        ('--blob_root', dict(
            type=str, default='https://naipeuwest.blob.core.windows.net/naip',
            help='Define Microsoft Azure Blob Root; The(preferred) copy of NAIP in the West Europe Azure region')),
        ('--lat_lon_dir', dict(
            type=str, help='Directory holding datasets of lat lons')),
        ('--quad_dir', dict(
            type=str, help='Directory holding dataset of quads')),
        ('--output_dir', dict(
            type=str, default=None, help='Directory to save outputs')),
        ('--tile_names_tile_urls_filename', dict(
            type=str, default=None, help='Filename to save tile names and urls')),
    )

    arg_parser = argparse.ArgumentParser(description='obtain tile name and tile url')
    for flag, settings in option_specs:
        arg_parser.add_argument(flag, **settings)
    return arg_parser.parse_args()

In [None]:
# Display `slosh_extent_gdf`.
# NOTE(review): in the visible cells this name is only assigned as a local variable inside
# `combine_slosh_extents`; no top-level assignment is shown.
slosh_extent_gdf

In [None]:

def collected_quads_to_tile_name_tile_url(quads, blob_root): 
    """Build NAIP tile file names and tile URLs from a table of quadrangle records.

    Tiles are stored at:
        [blob root]/v002/[state]/[year]/[state]_[resolution]_[year]/[quadrangle]/filename

    Args:
        quads (pandas.DataFrame): One row per tile; columns are read by position:
            0 = underscore-separated file name (".tif" is appended to it),
            1 = state abbreviation (any case), 2 = year.
        blob_root (str): Root URL of the NAIP blob container.

    Returns:
        tuple: ``(tile_names, tile_urls)``, two parallel lists of strings.
    """
    tile_names = []
    tile_urls = []
    # Positions of the fields used below within the underscore-separated file name.
    qqname_field = 1
    resolution_field = 4
    two_digit_state_resolution = ("al", "ak", "az", "ar", "ca", "co", "ct", "de", "fl", "ga",
                                  "hi", "id", "il", "in", "ia", "ks", "ky", "la", "me", "md",
                                  "ma", "mi", "mn", "ms", "mo", "mt", "ne", "nv", "nh", "nj",
                                  "nm", "ny", "nc", "nd", "oh", "ok", "or", "pa", "ri", "sc",
                                  "sd", "tn", "tx", "ut", "vt", "va", "wa", "wv", "wi", "wy")
    url_root = blob_root.rstrip("/")

    for _, row in quads.iterrows():
        # .iloc keeps the access positional whatever the column labels are.
        file_name = row.iloc[0]  # filename
        name_fields = file_name.split('_')
        state = row.iloc[1].lower()  # state
        year = row.iloc[2]  # YYYY

        if state in two_digit_state_resolution:
            resolution = name_fields[resolution_field][1:3] + "cm"
        else:
            resolution = name_fields[resolution_field] + "cm"
        quadrangle = name_fields[qqname_field][0:5]  # qqname

        # Append the extension to the original string, not to the list of split fields.
        tile_name = file_name + '.tif'
        tile_names.append(tile_name)
        # URLs always use "/", so join explicitly instead of with os.path.join.
        tile_urls.append("/".join([url_root, "v002", state, str(year),
                                   state + '_' + str(resolution) + '_' + str(year),
                                   str(quadrangle), tile_name]))
    return (tile_names, tile_urls)

In [None]:
def get_urls_from_quads(quad_dir, blob_root='https://naipeuwest.blob.core.windows.net/naip'):
    """Collect tile names and URLs for every quadrangle CSV in a directory.

    Args:
        quad_dir (str): Directory that directly contains the ``*.csv`` quadrangle sheets.
        blob_root (str): Root URL handed to ``collected_quads_to_tile_name_tile_url``.

    Returns:
        pandas.DataFrame: Columns ``tile_name`` and ``tile_url``; the per-file
        frames are stacked without resetting the index. Empty (but with both
        columns) when the directory holds no CSV file.
    """
    per_file_frames = []
    # Sorted so the row order does not depend on directory listing order.
    for quad_path in sorted(glob(os.path.join(quad_dir, "*.csv"))):
        quad_dataset = pd.read_csv(quad_path) #read in sheet of quadrangles
        tile_names, tile_urls = collected_quads_to_tile_name_tile_url(quad_dataset, blob_root) # identify filespaths/urls for quads of interest
        per_file_frames.append(pd.DataFrame({"tile_name": tile_names, "tile_url": tile_urls}))

    if not per_file_frames:
        return pd.DataFrame(columns=["tile_name", "tile_url"])
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(per_file_frames)

In [None]:
def tile_name_tile_url_characteristics(tile_names_tile_urls, output_path = None): 
    """Tabulate and print the characteristics encoded in a set of tile URLs.

    Each URL is expected to end in
    ``.../[state]/[year]/[state]_[resolution]_[year]/[quadrangle]/[filename]``;
    the fields are read relative to the end of the URL.

    Args:
        tile_names_tile_urls: Two-column table (2-D array or DataFrame) whose
            second column holds the tile URLs.
        output_path (str, optional): If given, the unique state abbreviations
            are saved there as ``states_in_tile_urls.npy``.

    Returns:
        tuple: ``(num_states, state_abbreviations, years, resolutions,
        quad_array, filename_array)``. The first four describe the unique
        values; the last two are ``(n, 1)`` arrays with one entry per tile
        (``quad_array`` is numeric, ``filename_array`` holds strings).
    """
    # np.asarray lets both ndarrays and DataFrames be indexed positionally.
    table = np.asarray(tile_names_tile_urls, dtype=object)
    urls = table[:, 1] if table.size else np.empty(0, dtype=object)
    num_tiles = len(urls)

    state_array = np.empty((num_tiles, 1), dtype=object)
    year_array = np.empty((num_tiles, 1))
    quad_array = np.empty((num_tiles, 1))
    resolution_array = np.empty((num_tiles, 1), dtype=object)
    filename_array = np.empty((num_tiles, 1), dtype=object)

    for i, url in enumerate(urls):
        url_parts = url.split('/')
        state_array[i] = url_parts[-5]
        year_array[i] = float(url_parts[-4])
        quad_array[i] = float(url_parts[-2])
        filename_array[i] = url_parts[-1]
        resolution_array[i] = url_parts[-3].split('_')[1]

    state_abbreviations = np.unique(state_array)
    num_states = len(state_abbreviations)
    years = np.unique(year_array)
    resolutions = np.unique(resolution_array)

    print("the number of tiles includes", len(tile_names_tile_urls))
    print("The number of states included", num_states)
    print("Postal abriviations of the states included", state_abbreviations)
    print("The years in which the images were collected", years)
    print("The resolutions of the images", resolutions)
    if output_path is not None:
        np.save(os.path.join(output_path, 'states_in_tile_urls.npy'), state_abbreviations)

    return num_states, state_abbreviations, years, resolutions, quad_array, filename_array

In [None]:

class DownloadProgressBar():
    """
    Report hook for urlretrieve that drives a ``progressbar.ProgressBar``.

    The bar is created and started on the first call, updated with the number
    of bytes fetched so far, and finished once that number reaches the total.

    https://stackoverflow.com/questions/37748105/how-to-use-progressbar-module-with-urlretrieve
    """

    def __init__(self):
        # Created lazily: the total size is only known on the first callback.
        self.pbar = None

    def __call__(self, block_num, block_size, total_size):
        if not self.pbar:
            self.pbar = progressbar.ProgressBar(max_value=total_size)
            self.pbar.start()

        bytes_so_far = block_num * block_size
        if bytes_so_far >= total_size:
            self.pbar.finish()
            return
        self.pbar.update(bytes_so_far)


class NAIPTileIndex:
    """
    Utility class for performing NAIP tile lookups by location.

    Holds an rtree spatial index (``tile_rtree``) plus a lookup table
    (``tile_index``) mapping each index id to a ``(file path, geometry)`` pair.
    """

    tile_rtree = None
    tile_index = None
    base_path = None

    def __init__(self, base_path=None):
        """
        Load the tile index, downloading it first when no folder is given.

        Args:
            base_path (str, optional): Folder that already contains
                ``tile_index.dat``, ``tile_index.idx`` and ``tiles.p``. When
                omitted, the files are fetched into ``<tempdir>/naip``.
        """
        blob_root = 'https://naipeuwest.blob.core.windows.net/naip'
        index_files = ["tile_index.dat", "tile_index.idx", "tiles.p"]
        index_blob_root = re.sub('/naip$', '/naip-index/rtree/', blob_root)

        if base_path is None:

            base_path = os.path.join(tempfile.gettempdir(), 'naip')
            os.makedirs(base_path, exist_ok=True)

            for file_path in index_files:
                # download_url joins folder and filename itself, so pass the bare name.
                download_url(index_blob_root + file_path, destination_folder=base_path,
                             destination_filename=file_path,
                             progress_updater=DownloadProgressBar())

        self.base_path = base_path
        # NOTE(review): `rtree` is not imported in any visible cell.
        self.tile_rtree = rtree.index.Index(os.path.join(base_path, "tile_index"))
        # NOTE: unpickling can execute arbitrary code; only load index files from a trusted source.
        with open(os.path.join(base_path, "tiles.p"), "rb") as tiles_file:
            self.tile_index = pickle.load(tiles_file)

    def lookup_tile(self, lat, lon):
        """
        Given a lat/lon coordinate pair, return the list of NAIP tiles that contain
        that location.

        Returns a list of COG file paths, or None (after printing a message)
        when no tile contains the point.
        """

        point = shapely.geometry.Point(float(lon), float(lat))
        # The rtree query is a bounding-box pre-filter; contains() is the exact test.
        intersected_indices = list(self.tile_rtree.intersection(point.bounds))

        intersected_files = [
            self.tile_index[idx][0]
            for idx in intersected_indices
            if self.tile_index[idx][1].contains(point)
        ]

        if intersected_files:
            return intersected_files
        if intersected_indices:
            print("Error: there are overlaps with tile index, \n"
                  "                      but no tile completely contains selection")
        else:
            print("No tile intersections")
        return None


def download_url(url, destination_folder, destination_filename=None, progress_updater=None, force_download=False): 
    """
    Download a URL to a file, skipping the download if the file already exists.

    Args:
        url (str): URL to download.
        destination_folder (str): Existing directory to download into.
        destination_filename (str, optional): File name inside
            ``destination_folder``; derived from the URL when omitted.
        progress_updater (callable, optional): Report hook passed to
            ``urllib.request.urlretrieve``.
        force_download (bool): When True, download even if the file exists.

    Returns:
        str: Path of the downloaded (or already present) file.
    """
    if destination_filename is None:
        # This is not intended to guarantee uniqueness, we just know it happens to guarantee
        # uniqueness for this application.
        destination_filename = url.replace('://', '_').replace('/', '_')
    destination_filename = os.path.join(destination_folder, destination_filename)

    if os.path.isfile(destination_filename) and not force_download:
        print('Bypassing download of already-downloaded file {}'.format(os.path.basename(url)))
        return destination_filename

    #  print('Downloading file {} to {}'.format(os.path.basename(url),destination_filename),end='')
    urllib.request.urlretrieve(url, destination_filename, progress_updater)
    assert (os.path.isfile(destination_filename))
    nBytes = os.path.getsize(destination_filename)
    print('...done, {} bytes.'.format(nBytes))

    return destination_filename

In [None]:


def main(args):
    """Collect NAIP tile names/URLs from lat-lon and quadrangle inputs and save them.

    Args:
        args: Namespace providing ``lat_lon_dir``, ``blob_root``, ``quad_dir``,
            ``output_dir`` and ``tile_names_tile_urls_filename``.
    """
    warnings.filterwarnings("ignore")

    # Spatial index that maps lat/lon to NAIP tiles.
    tile_index = ap.NAIPTileIndex()

    urls_from_lat_lons = ap.get_urls_from_lat_lons(args.lat_lon_dir, tile_index, args.blob_root)
    urls_from_quads = ap.get_urls_from_quads(args.quad_dir)
    combined = pd.concat([urls_from_lat_lons, urls_from_quads])
    tile_names_tile_urls = combined.drop_duplicates()  # remove duplicates

    save_path = os.path.join(args.output_dir, args.tile_names_tile_urls_filename)
    np.save(save_path, tile_names_tile_urls)  # save

    # The returned attributes are not used here.
    ap.tile_name_tile_url_characteristics(tile_names_tile_urls, args.output_dir)


if __name__ == '__main__':
    # Get the arguments, then run.
    args = get_args_parse()
    main(args)