In [2]:
import os 

import fiona #must be import before geopandas
import geopandas as gpd
import rasterio
import xarray as xr
import re
import rtree
import shapely
import pickle

#from cartopy import crs
import collections
import cv2
import math
from glob import glob
from tqdm.notebook import tqdm_notebook

# Standard packages
import tempfile
import warnings
import urllib
import shutil

# Less standard, but still pip- or conda-installable
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle

import data_eng.az_proc as ap
import data_eng.form_calcs as fc

from lxml.etree import Element,SubElement,tostring
import xml.dom.minidom
from xml.dom.minidom import parseString
import xml.etree.ElementTree as et
from xml.dom import minidom

#import requests
from PIL import Image
from io import BytesIO
import tqdm
from skimage.metrics import structural_similarity as compare_ssim
import imutils

import psutil

## File Paths

In [3]:
# Root directory of the shared dataset. This is a hard-coded, machine-specific
# network path; edit it here when running elsewhere.
parent_directory = "//oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//"

# Arrays saved earlier as .npy files. These paths are relative to the
# notebook's working directory.
tile_names_tile_urls_complete_array = np.load("image_download_azure/tile_name_tile_url_complete_array.npy")

tiles_labeled = "tile_name_tile_url_labeled.npy"
tiles_labeled_from_complete_set = np.load(tiles_labeled)

# Both names are read from the same tracker file. The file is loaded twice on
# purpose so each name owns an independent array (no aliasing if one is
# modified in place later).
tracker_file_path = 'outputs/tile_img_annotation_annotator.npy'
tile_img_annotation = np.load(tracker_file_path)
tile_img_annotation_annotator = np.load(tracker_file_path)

# Keep the file path and the loaded array under separate names so that
# `tiles_errors` is only ever bound to the array.
tiles_errors_path = 'tile_name_tile_url_error_downloading.npy'
tiles_errors = np.load(tiles_errors_path)

#create folder to hold tiles in completed dataset
# Only the "tiles" folder is created here; the "tiles_xml" path is defined but
# its folder is not created in this cell.
tiles_complete_dataset_path = os.path.join(parent_directory,"complete_dataset","tiles")
tiles_xml_complete_dataset_path = os.path.join(parent_directory,"complete_dataset","tiles_xml")
os.makedirs(tiles_complete_dataset_path, exist_ok=True)

# Loaded a single time; a second identical np.load of this file is redundant I/O.
tile_names_tile_urls_complete_array_unique_standard_tile_names = np.load("tile_names_tile_urls_complete_array_unique_standard_tile_names.npy")
image_characteristics = pd.read_csv("image_characteristics.csv")

#NAIP quad map
# Hard-coded absolute path to a local geodatabase (.gdb); machine-specific.
quad_indicies_path = "C:/Users/rapiduser/Box/EPA STAR 2019 (Community Resistance to Environmental Disasters)/Data/AST Datasets/MapIndices_National_GDB/MapIndices_National_GDB.gdb"
#fiona.listlayers(quad_indicies_path)

## Unverified Images and Annotations (Subfolders, Images, and XMLs)

In [4]:
# Root folder of unverified set 1. The two folder names are passed to
# os.path.join as separate components rather than one string containing a
# backslash: "\s" is an invalid escape sequence in a normal string literal, and
# a literal backslash only acts as a path separator on Windows.
unverified_set1_root_path = os.path.join(parent_directory, "unverified_images", "student_reviewed_unverified_images_set1")

# One entry per subfolder. In the loop below, entry[0] is used as the image
# directory and entry[1] as the annotation (XML) directory.
unverified_set1_subfolders_path = ap.img_path_anno_path(ap.list_of_sub_directories(unverified_set1_root_path))

# Flat lists of every .jpg / .xml file found across all subfolders.
unverified_set1_image_paths = []
unverified_set1_xml_paths = []
for directory in tqdm.tqdm(unverified_set1_subfolders_path):
    #print(len(os.listdir(directory[0])),len(os.listdir(directory[1])))
    # NOTE(review): presumably removes thumbnail cache files (e.g. Thumbs.db)
    # from the image folder — confirm in data_eng.form_calcs.
    fc.remove_thumbs(directory[0])
    # The patterns contain no "**", so recursive=True does not change what is
    # matched: only files directly inside each folder are collected.
    unverified_set1_image_paths += glob(directory[0] + "/*.jpg", recursive = True)
    unverified_set1_xml_paths += glob(directory[1] + "/*.xml", recursive = True)

100%|██████████| 40/40 [00:20<00:00,  1.98it/s]


## Verified Images and Annotations (Subfolders, Images, and XMLs)

In [5]:
# Locate every verified-set subfolder, then keep only those belonging to set 1.
verified_sets_path = os.path.join(parent_directory, "verified/verified_sets")
verified_sets_subfolders_path = ap.img_path_anno_path(
    ap.list_of_sub_directories(verified_sets_path)
)

# Accumulators for set 1: image files, annotation files, and the
# (image dir, annotation dir) entries themselves.
verified_set1_image_paths = []
verified_set1_xml_paths = []
verified_set1_subfolders_path = []

for verified_set in tqdm.tqdm(verified_sets_subfolders_path):
    image_dir = verified_set[0]
    annotation_dir = verified_set[1]
    fc.remove_thumbs(image_dir)

    # The set number is taken from the second-to-last "/"-separated component
    # of the image directory: the text after its first underscore.
    # NOTE(review): this assumes the helper returns "/"-separated paths whose
    # parent folder is named like "<prefix>_<number>..." — confirm in az_proc.
    set_folder_name = image_dir.split("/")[-2]
    set_number = set_folder_name.split("_")[1]

    if set_number != "1":
        continue

    verified_set1_image_paths.extend(glob(image_dir + "/*.jpg", recursive=True))
    verified_set1_xml_paths.extend(glob(annotation_dir + "/*.xml", recursive=True))
    verified_set1_subfolders_path.append(verified_set)

# Converted to an array so it can be concatenated with the unverified subfolders.
verified_set1_subfolders_path = np.array(verified_set1_subfolders_path)

100%|██████████| 11/11 [00:24<00:00,  2.25s/it]


## Unverified and Verified Images and Annotations (Subfolders, Images, and XMLs)

In [6]:
# Merge the unverified and verified set-1 listings into the collections that
# the checks below iterate over.
# NOTE(review): presumably clears thumbnail cache files from the tiles folder
# before it is read — confirm in data_eng.form_calcs.
fc.remove_thumbs(tiles_complete_dataset_path)

# All chip image paths: unverified first, then verified.
unverified_verified_set1_image_paths = np.array(
    [*unverified_set1_image_paths, *verified_set1_image_paths]
)

# All (image dir, annotation dir) subfolder entries that need to be checked,
# in the same unverified-then-verified order.
unverified_verified_set1_subfolders_paths = np.concatenate(
    [unverified_set1_subfolders_path, verified_set1_subfolders_path]
)

## Identify labeled images that do not match their corresponding tile chip <br>
(identified by subfolder)

In [None]:
#positive_images = np.zeros((0, 512, 512, 3))
# Scan each subfolder, gather whatever fc.identify_incorrect_images reports for
# it, and write the combined result to a CSV file.
tile_names, xs, ys = [], [], []
incorrect_chip_names, incorrect_chip_paths = [], []

for directory in tqdm.tqdm(unverified_verified_set1_subfolders_paths):
    chip_image_dir = directory[0]

    #identify tiles that have corresponding images in directory
    tiles_in_directory = fc.get_tile_names_from_chip_names(chip_image_dir)
    images_in_directory, images_in_directory_array, image_directory = fc.positive_images_to_array(chip_image_dir)

    per_directory_result = fc.identify_incorrect_images(
        tiles_complete_dataset_path,
        tiles_in_directory,
        images_in_directory,
        images_in_directory_array,
        image_directory,
    )
    # The helper returns five parallel sequences, unpacked in this order.
    found_tile_names, found_xs, found_ys, found_chip_names, found_chip_paths = per_directory_result

    tile_names.extend(found_tile_names)
    xs.extend(found_xs)
    ys.extend(found_ys)
    incorrect_chip_paths.extend(found_chip_paths)
    incorrect_chip_names.extend(found_chip_names)

# Column order of the output table follows the key order of this dict.
d = {
    'tile_names': tile_names,
    'xs': xs,
    'ys': ys,
    'incorrect_chip_paths': incorrect_chip_paths,
    'incorrect_chip_names': incorrect_chip_names,
}

incorrect_labeled_chip_names_by_subfolder = pd.DataFrame(data=d)
incorrect_labeled_chip_names_by_subfolder.to_csv('incorrect_labeled_chip_names_by_subfolder.csv')

## Identify labeled images that do not match their corresponding tile chip <br>
(identified by tile)

In [11]:
# NOTE: everything in this cell is wrapped in a triple-quoted string, so none of
# it executes; the cell only evaluates a string literal.
# The disabled code is the per-tile variant of the check above: it collects the
# unique tile names across all subfolders, calls
# fc.identify_incorrect_images_simultaneous once for the whole set, and writes
# 'incorrect_labeled_chip_names_by_tile.csv'.
# NOTE(review): the progress-bar output stored with this cell presumably comes
# from an earlier run, made while this code was still live — confirm before
# relying on it.
"""
fc.remove_thumbs(tiles_complete_dataset_path)

tiles_in_set1 = np.zeros((0))
for subfolders in tqdm.tqdm(unverified_verified_set1_subfolders_paths):
    tiles_in_set1 = np.concatenate((tiles_in_set1, fc.get_tile_names_from_chip_names(subfolders[0])))
tiles_in_set1 = np.unique(tiles_in_set1)

tile_names, xs, ys, incorrect_chip_paths = fc.identify_incorrect_images_simultaneous(tiles_complete_dataset_path, tiles_in_set1, unverified_verified_set1_image_paths)

d = {'tile_names': tile_names,
     'xs': xs,
     'ys': ys,
     'incorrect_chip_paths': incorrect_chip_paths}

incorrect_labeled_chip_names_by_tile = pd.DataFrame(data = d)
incorrect_labeled_chip_names_by_tile.to_csv('incorrect_labeled_chip_names_by_tile.csv')
"""

100%|██████████| 44/44 [00:03<00:00, 12.92it/s]
100%|██████████| 932/932 [5:12:45<00:00, 20.13s/it]  
