In [1]:
# Standard packages
import tempfile
import warnings
import urllib
import shutil
import os
# Less standard, but still pip- or conda-installable
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
#import rasterio
import re
import rtree
import shapely
import pickle
import data_eng.az_proc as ap
#from cartopy import crs
import collections
import cv2
import math
from glob import glob

# Load Data

In [5]:
parent_directory = "//oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//"

tile_names_tile_urls_complete_array = np.load("image_download_azure/tile_name_tile_url_complete_array.npy")

tiles_labeled = "tile_name_tile_url_labeled.npy"
tiles_labeled_from_complete_set = np.load(tiles_labeled)

tracker_file_path = 'outputs/tile_img_annotation_annotator.npy'
tile_img_annotation = np.load(tracker_file_path)

tiles_errors = 'tile_name_tile_url_error_downloading.npy'
tiles_errors = np.load(tiles_errors)

#create folder to hold tiles in completed dataset
path_to_complete_dataset = "verified/complete_dataset"
path_to_tiles_folder_complete_dataset = os.path.join(parent_directory, path_to_complete_dataset,"tiles")
os.makedirs(path_to_tiles_folder_complete_dataset, exist_ok=True)

# Make Functions

In [3]:
def add_formatted_and_standard_tile_names_to_tile_names_time_urls(tile_names_tile_urls):
    #get a list of the formated tile names
    tile_names = []
    for tile_url in tile_names_tile_urls:
        tile_url = tile_url[1].rsplit("/",3)
        #get the quad standard tile name 
        tile_name = tile_url[3]
        tile_name = os.path.splitext(tile_name)[0] 
        #get the tile name formated (1/14/2022)
        tile_name_formatted = tile_url[1] + "_" + tile_url[2] + "_" + tile_url[3]
        tile_name_formatted = os.path.splitext(tile_name_formatted)[0] 
        tile_names.append([tile_name, tile_name_formatted])
    
    #create array that contains the formated tile_names and tile_names
    tile_names_tile_urls_formatted_tile_names = np.hstack((tile_names_tile_urls, np.array(tile_names)))
    
    return(tile_names_tile_urls_formatted_tile_names)

def jpg_path_to_tile_name_formatted(tile_paths):
    tile_names = []
    for tile_path in tile_paths:
        base = os.path.basename(tile_path)
        jpg = os.path.splitext(base)[0] #name of tif with the extension removed
        tile_name_formated_name = jpg.rsplit("_",1)[0] #name of tif with the extension removed
        tile_names.append(tile_name_formated_name)
    return(tile_names)

# Tile names and urls for all tiles apart of the dataset (complete array)

In [4]:
tile_names_tile_urls_complete_array = add_formatted_and_standard_tile_names_to_tile_names_time_urls(tile_names_tile_urls_complete_array)
print(tile_names_tile_urls_complete_array.shape)

unique_tile_name_formatted, indicies = np.unique(tile_names_tile_urls_complete_array[:,3], return_index = True)
tile_names_tile_urls_complete_array_unique_formatted_tile_names = tile_names_tile_urls_complete_array[indicies,:]
print(tile_names_tile_urls_complete_array_unique_formatted_tile_names.shape) 

unique_tile_name_standard, indicies = np.unique(tile_names_tile_urls_complete_array[:,2], return_index = True)
tile_names_tile_urls_complete_array_unique_standard_tile_names = tile_names_tile_urls_complete_array[indicies,:]
print(tile_names_tile_urls_complete_array_unique_standard_tile_names.shape) 

(2610, 4)
(2540, 4)
(2452, 4)


# Tile names and urls for all tiles apart of the dataset that have been recorded as labeled

In [5]:
tiles_labeled_from_complete_set = add_formatted_and_standard_tile_names_to_tile_names_time_urls(tiles_labeled_from_complete_set)
print(tiles_labeled_from_complete_set.shape)

unique_tile_name_formatted, indicies = np.unique(tiles_labeled_from_complete_set[:,3], return_index = True)
tile_names_tile_urls_complete_array_unique_formatted_tile_names = tiles_labeled_from_complete_set[indicies,:]
print(tile_names_tile_urls_complete_array_unique_formatted_tile_names.shape) 

unique_tile_name_standard, indicies = np.unique(tiles_labeled_from_complete_set[:,2], return_index = True)
tile_names_tile_urls_labeled_array_unique_standard_tile_names = tiles_labeled_from_complete_set[indicies,:]
print(tile_names_tile_urls_labeled_array_unique_standard_tile_names.shape) 

(2514, 4)
(2473, 4)
(2447, 4)


# Identify specific file types from folders in the storge space

In [7]:
#The entire storage space
path_storage_space = os.path.join(parent_directory)
jpgs_storage_space = glob(path_storage_space + "/**/*.jpg", recursive = True)
print(len(jpgs_storage_space))

tile_names_formatted_storage_space = jpg_path_to_tile_name_formatted(jpgs_storage_space)
tile_names_formatted_storage_space = np.unique(tile_names_formatted_storage_space)
tile_names_formatted_storage_space.shape

2259453


(2382,)

In [68]:
tile_names_standard_storage_space = []
for tile_name_formatted_storage_space in tile_names_formatted_storage_space:
    tile_names_standard_storage_space.append(tile_name_formatted_storage_space.split("_",4)[4])
tile_names_in_storage_space_standard_formatted = np.hstack((np.array(tile_names_standard_storage_space)[:,None], tile_names_formatted_storage_space[:,None]))

# Determine which tiles included in the complete dataset array have not been labeled (tile names formatted)
- use for inquiry 

In [104]:
counter_0 = 0
counter_1 = 0
counter_gr1 = 0
for tile_names_formatted in tile_names_in_storage_space_standard_formatted[:,1]: #iterate over the tiles that have actually been annotated
    strings_with_substring = [string for string in tile_names_tile_urls_complete_array_unique_formatted_tile_names[:,3] if tile_names_formatted in string] #check if a given tile is in the complete_array 
    if len(strings_with_substring) == 0:
        counter_0 += 1
    if len(strings_with_substring) == 1:
        counter_1 += 1
    if len(strings_with_substring) > 1:
        counter_gr1 += 1
print(counter_0, counter_1, counter_gr1)

22 2360 0


In [105]:
#Check which formatted tile names that could be included in the dataset are missing
counter_0 = 0
counter_1 = 0
counter_gr1 = 0
for tile_name_formatted_complete_array in tile_names_tile_urls_complete_array_unique_formatted_tile_names: #iterate over the tiles that are meant to be in full set
    unlabeled = [string for string in tile_names_in_storage_space_standard_formatted[:,1] if tile_name_formatted_complete_array[3][0:46] in string] #check if a given tile has been annotated
    if len(unlabeled) == 0:
        counter_0 += 1
    if len(unlabeled) == 1:
        counter_1 += 1
    if len(unlabeled) > 1:
        counter_gr1 += 1
print(counter_0, counter_1, counter_gr1)

113 2360 0


# Determine which tiles included in the complete dataset array have not been labeled (standard naming convention)
- Use this to determine which tiles have not been annotated

In [106]:
counter_0 = 0
counter_1 = 0
counter_gr1 = 0
for tile_name_standard in tile_names_in_storage_space_standard_formatted[:,0]: #iterate over the tiles that have actually been annotated
    strings_with_substring = [string for string in tile_names_tile_urls_complete_array_unique_standard_tile_names[:,2] if tile_name_standard in string] #check if a given tile is in the complete_array 
    if len(strings_with_substring) == 0:
        counter_0 += 1
        print(tile_name_standard)
    if len(strings_with_substring) == 1:
        counter_1 += 1
    if len(strings_with_substring) > 1:
        counter_gr1 += 1
print(counter_0, counter_1, counter_gr1)

0 2356 26


#### Check which standard tile names that could be included in the dataset are missing + which tiles have been annotated in duplicate

In [107]:
counter_0 = 0
counter_1 = 0
counter_gr1 = 0
unlabeled_array = []
duplicated_standard_tile_names = []
for tile_name_standard_complete_array in tile_names_tile_urls_complete_array_unique_standard_tile_names: #iterate over the tiles that are meant to be in full set
    standard_tile_names = [string for string in tile_names_in_storage_space_standard_formatted[:,0] if tile_name_standard_complete_array[2][0:27] in string] #check if a given tile has been annotated
    if len(standard_tile_names) == 0:
        counter_0 += 1
        unlabeled_array.append(tile_name_standard_complete_array)
    if len(standard_tile_names) == 1:
        counter_1 += 1
    if len(standard_tile_names) > 1:
        counter_gr1 += 1
        duplicated_standard_tile_names.append(tile_name_standard_complete_array)
print(counter_0, counter_1, counter_gr1)
unlabeled_array = np.array(unlabeled_array)

103 2290 59


#### Check whether unlabeled tiles are known to contain errors 

In [141]:
counter_0 = 0
counter_geq1 = 0
unlabeled_array_no_known_errors = []

for tile_name in unlabeled_array: #iterate over the tiles that are meant to be in full set
    standard_tile_names_included_in_error_array = [string for string in tiles_errors[:,0] if tile_name[2][0:27] in string] #check if a given tile has been annotated
    if len(standard_tile_names_included_in_error_array) == 0:
        counter_0 += 1
        unlabeled_array_no_known_errors.append(tile_name)
        
    if len(standard_tile_names_included_in_error_array) >= 1:
        counter_geq1 += 1
print(counter_0, counter_geq1)

unlabeled_array_no_known_errors = np.array(unlabeled_array_no_known_errors)
np.save("unlabeled_tile_names_tile_urls.npy", unlabeled_array_no_known_errors[:,[0,1]])

56 47


In [None]:
Deter

# Move existing tiles

In [None]:
for tile_names in list_of_possible_tile_names:
    print(tile_names)

In [None]:
counter = 0
for tile_labeled in tiles_labeled_from_complete_set:
    tile_labeled_name = os.path.splitext(tile_labeled[0])[0]
    x = [string for string in tiles_recorded_as_labeled if tile_labeled_name in string]
    if len(x) > 0:
        counter += 1
        #print(x)
counter 

# Add tiles to completed dataset

In [None]:
#Move already downloaded tiles to completed dataset
for path in downloaded_tiles_in_verification_sets:
    base = os.path.basename(path)
    tif = os.path.splitext(base)[0] #name of tif with the extension removed
    #if tif in tile_img_annotation[:,0]:
    #    shutil.move(path, os.path.join(path_to_tiles_folder_complete_dataset,base)) # copy images with matching .xml files in the "chips_tank" folder

In [None]:
#Make a list of the tiles moved to completed dataset
tiles_downloaded = os.listdir(path_to_tiles_folder_complete_dataset)
tiles_downloaded_temp = []
for tile in tiles_downloaded:
     tiles_downloaded_temp.append(os.path.splitext(tile)[0]) #name of tif with the extension removed
tiles_downloaded = np.array(tiles_downloaded_temp)

# Download remaining tiles that correspond to ONLY to verified images

In [None]:
#Gather the locations of tiles that have already been downlaoded and verified 
path_positive_images_complete_dataset = "//oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/complete_dataset/chips_positive"
jpg_path_positive_images_complete_dataset = glob(path_positive_images_complete_dataset + "/*.jpg", recursive = True)
print(len(jpg_path_positive_images_complete_dataset))

tiles_to_be_downloaded = []
for path in jpg_path_positive_images_complete_dataset:
    base = os.path.basename(path)
    img = os.path.splitext(base)[0] #name of tif with the extension removed
    tiles_to_be_downloaded.append(img.rsplit("_",1)[0])
tiles_to_be_downloaded = np.array(tiles_to_be_downloaded)

tiles_to_be_downloaded = np.unique(tiles_to_be_downloaded)

In [None]:
#Identify tiles that have not already been downloaded
tile_names_to_download = []
for tile in tiles_to_be_downloaded: #index over the downloaded tiled
    if tile not in tiles_downloaded: #index over the tiles that should be downloded
        tile_names_to_download.append(tile)

In [None]:

tiles_labeled_from_complete_set

blob_root = 'https://naipblobs.blob.core.windows.net/naip'
#blob_root = 'https://naipeuwest.blob.core.windows.net/naip'  
tile_names = []
tile_urls = []

for tile in tile_names_to_download:
    tile_name = tile.split("_",4)[4] #+ ".tif"
    
    tile_name = [string for string in tiles_labeled_from_complete_set[:,0] if tile_name in string]  #check if the tile_name is in the string contained in the array

In [None]:
#Check how many border tiles there are 
counter = 0
for tile in tiles_to_be_downloaded:
    tile_name = tile.split("_",4)[4] #+ ".tif"
    tile_name = [string for string in tiles_labeled_from_complete_set[:,0] if tile_name in string]  #check if the tile_name is in the string contained in the array
    if len(tile_name) > 1:
        print(tile_name)
        counter += 1
counter

In [None]:
tiles_labeled_from_complete_set[tiles_labeled_from_complete_set[:,0] == 'm_4108616_sw_16_060_20180707_20181211.tif']

In [None]:
#https://www.kite.com/python/answers/how-to-check-if-a-list-contains-a-substring-in-python


In [None]:
import pandas as pd
pd.DataFrame(np.load("image_download_azure/tile_name_tile_url_complete_array.npy")).to_csv("image_download_azure/tile_name_tile_url_complete_array.csv")

In [None]:

#for tile in tiles_to_download:
#    print(tile+".tif")
    
    #print(tiles_labeled_from_complete_set[tiles_labeled_from_complete_set[:,0] == tile+".tif"])
            'https://naipblobs.blob.core.windows.net/naip/v002/va/2018/va_060cm_2018/36076/m_3607614_ne_18_060_20181001.tif'],

In [None]:
np.where(tiles_labeled_from_complete_set[:,0] == "m_3211520_se_11_060_20190305.tif")

In [None]:
        #if not os.path.exists(os.path.join(path_to_tiles_folder_complete_dataset,base)): #skip over tiles that already exist in complete dataset

In [None]:
#remove tiles / images with no annotations

In [None]:
#Download Tiles 

for labeled_image_tile_id in tile_id: # the tiles from images 
    if labeled_image_tile_id not in tiles: #the downloaded times 
        #print(labeled_image_tile_id)
        #download the tiles if they are not in the tiles folder 
        for tile_name_url in tiles_labeled_from_complete_set[:,]:
            if labeled_image_tile_id == tile_name_url[0].split(".",1)[0]:
                ###Download tile
                ap.download_url(tile_name_url[1], os.path.join(parent,annotation_directory,"tiles"),
                                                         progress_updater=ap.DownloadProgressBar())

In [None]:
# Determine the pixel location of the image within the tile

In [6]:
path_to_verified_complete_dataset_tiles = os.path.join(parent_directory, path_to_complete_dataset, "tiles")

In [20]:
len(range(0,511))

511

In [None]:

tiles_dir = path_to_verified_complete_dataset_tiles
state = []
resolution = []
year = []
capture_date  = []
utm_zone  = []
standard_tile_name = []

six_digit_chip_name = []
NW_coordinates = []
SE_coordinates = []
row_indicies = []
col_indicies = []

for tile_name in os.listdir(path_to_verified_complete_dataset_tiles): #index over the tiles in the tiles_dir 
    file_name, ext = os.path.splitext(tile_name) # File name
    
    item_dim = int(512)          
    tile = cv2.imread(os.path.join(tiles_dir, tile_name)) 
    tile_height,  tile_width,  tile_channels = tile.shape #the size of the tile 

    #divide the tile into 512 by 512 chips (rounding up)
    row_index = math.ceil(tile_height/512) 
    col_index = math.ceil(tile_width/512)

    for x in range(0, col_index):
        for y in range(0, row_index):
            #image characteristics
            six_digit_chip_name.append(file_name+ '_'+ str(count).zfill(6) + '.jpg') # The index is a six-digit number like '000023'.
            NW_coordinates.append([x*item_dim, y*(item_dim)]) #NW (Top Left) 
            SE_coordinates.append([x*item_dim+item_dim-1, y*(item_dim)+item_dim-1]) #SE (Bottom right) 
            row_indicies.append(y)
            col_indicies.append(x)
            #tile characteristics
            standard_tile_name.append(file_name.split("_",4)[4]) #standard_tile_name
            state.append(file_name.split("_",9)[0]) #state
            resolution.append(file_name.split("_",9)[1]) #resolution
            utm_zone.append(file_name.split("_",9)[7]) #utm
            year.append(file_name.split("_",9)[2]) #year
            capture_date.append(file_name.split("_",9)[-1]) #capture date