# Read Packages

In [1]:
# Standard library
import copy
import math
import multiprocessing as mp
import os
import shutil
#Parsing/Modifying XML
import xml.dom.minidom
import xml.etree.ElementTree as et
from glob import glob
from multiprocessing import Pool
from xml.dom import minidom
from xml.dom.minidom import parseString

# Third-party
import cv2
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import tqdm
from lxml.etree import Element,SubElement,tostring
from matplotlib.patches import Rectangle
from skimage.metrics import structural_similarity as compare_ssim

# Project-local
import data_eng.az_proc as ap
import data_eng.form_calcs as fc

# Read in Files 

In [2]:
# Root of the shared network drive; every dataset path below is built from it
parent_directory = "//oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//"

# Saved numpy array loaded from the image-download folder (relative to the working directory)
tile_names_tile_urls_complete_array = np.load("image_download_azure/tile_name_tile_url_complete_array.npy")

# Folder that holds the tiles of the complete dataset
tile_dir_path = os.path.join(parent_directory, "complete_dataset", "tiles")

# Working folders under temp/.
# Reference for splitting an image into pieces:
# https://stackoverflow.com/questions/5953373/how-to-split-image-into-multiple-pieces-in-python
(img_anno_directory,
 remaining_chips_path,
 remaining_sets_path,
 incorrectly_chipped_images_path,
 incorrect_named_correctly_chipped_dir) = (
    os.path.join(parent_directory, relative_path)
    for relative_path in ("temp/rechip",
                          "temp/remaining_chips",
                          "temp/remaining_sets",
                          "temp/incorrectly_chipped_images",
                          "temp/incorrectly_named_correct_chipped_images")
)

In [3]:
# Verified sets named with the standard quad image-name format
verified_standard_quad_image_name_format = os.path.join(
    parent_directory, "verified/verified_standard_quad_image_name_format")
standard_quad_subfolders = ap.list_of_sub_directories(verified_standard_quad_image_name_format)
all_verified_standard_quad_subfolders_path = ap.img_path_anno_path(standard_quad_subfolders)

# Verified sets named with the state/year image-name format
verified_state_year_subfolders_path = os.path.join(
    parent_directory, "verified/verified_state_year_image_name_formating")
state_year_subfolders = ap.list_of_sub_directories(verified_state_year_subfolders_path)
all_verified_state_year_subfolders_path = ap.img_path_anno_path(state_year_subfolders)

In [4]:
# Output folders of the compile step, all located under compiled_dataset/
(compiled_by_tile_dir,
 compile_by_tile_state_year_dir,
 images_do_not_match_names_dir,
 correctly_chipped_incorrect_dir) = (
    os.path.join(parent_directory, relative_path)
    for relative_path in ("compiled_dataset/compile_by_tile",
                          "compiled_dataset/compile_by_tile_state_year",
                          "compiled_dataset/images_do_not_match_names_state_year",
                          "compiled_dataset/correctly_chipped_incorrect_data")
)

# Functions

# Compile Verified Images and Annotations by Tile

In [11]:
# Pool the verified sub-folders of both naming schemes and list their image / xml files
all_verified_paths = np.concatenate([all_verified_state_year_subfolders_path,
                                     all_verified_standard_quad_subfolders_path])
all_img_paths, all_xml_paths = fc.get_img_xml_paths(all_verified_paths)

# identify tiles in each folder, then keep every tile name only once
tile_names_with_repeats, all_img_names = fc.get_tile_names(all_img_paths)
all_tile_names = np.unique(tile_names_with_repeats)

# The same listing again, separately for each naming scheme
state_year_img_paths, state_year_xml_paths = fc.get_img_xml_paths(
    all_verified_state_year_subfolders_path)
standard_img_paths, standard_xml_paths = fc.get_img_xml_paths(
    all_verified_standard_quad_subfolders_path)

100%|██████████████████████████████████████████████████████████████████████████████████| 23/23 [00:19<00:00,  1.16it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 16/16 [00:09<00:00,  1.61it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:01<00:00,  4.21it/s]


In [12]:
# Index look-ups computed by form_calcs helpers; their return formats are defined in that module — TODO confirm
# presumably the six-digit chip counter taken from each state/year image name
six_digit_index_list = fc.get_six_digit_index(state_year_img_paths)
# presumably the row (y) and column (x) chip indices taken from each standard-quad image name
ys,xs = fc.get_x_y_index(standard_img_paths)

In [3]:
    # Compile the state/year chips of every tile with a pool of worker processes.
    ys, xs = fc.get_x_y_index(standard_img_paths)

    # One work item per tile. Each item is a single tuple, so the worker is called
    # with the whole tuple as its only argument (same calling convention as before).
    args = [(tile_name, compile_by_tile_state_year_dir, tile_dir_path,
             state_year_img_paths, state_year_xml_paths, six_digit_index_list)
            for tile_name in all_tile_names]

    num_cores = mp.cpu_count()
    # The context manager tears the pool down even if a worker fails.
    with Pool(processes=num_cores) as pool:
        finished_tiles = pool.imap_unordered(
            fc.multi_iterate_over_tile_compare_move_state_year_by_six_digit_index, args)
        # Draining the iterator advances the bar as tiles actually finish and re-raises
        # any exception raised inside a worker. The earlier map_async call threw its
        # AsyncResult away, so worker failures passed silently.
        for _finished_tile in tqdm.tqdm(finished_tiles, total=len(args)):
            pass

functools.partial(<class 'int'>, base=2)

In [None]:
# compile_by_tile_state_year using six-digit indices

In [None]:
import multiprocessing as mp

# Compile the state/year chips of every tile with a pool of worker processes.
# The context manager tears the pool down once all results have been collected.
with mp.Pool(mp.cpu_count()) as pool:
    # Submit every tile up front so the workers run concurrently; the blocking
    # `pool.apply` used before waited for each tile before submitting the next one.
    # The worker is reached through `fc`, the alias form_calcs is imported under.
    # The six values are passed as separate positional arguments, as this cell did before.
    pending_jobs = [
        pool.apply_async(fc.multi_iterate_over_tile_compare_move_state_year_by_six_digit_index,
                         args=(tile_name, compile_by_tile_state_year_dir, tile_dir_path,
                               state_year_img_paths, state_year_xml_paths, six_digit_index_list))
        for tile_name in all_tile_names
    ]
    # .get() waits for each job in submission order and re-raises any worker exception.
    results = [job.get() for job in tqdm.tqdm(pending_jobs)]

  0%|          | 0/2153 [00:00<?, ?it/s]

In [21]:
# One tuple of worker arguments per tile; only the tile name differs between tuples
num_tiles = len(all_tile_names)
args = [
    (tile_name, compile_by_tile_state_year_dir, tile_dir_path,
     state_year_img_paths, state_year_xml_paths, six_digit_index_list)
    for tile_name in all_tile_names
]

In [None]:
# Inspect the per-tile worker arguments.
# NOTE(review): this displays the whole list; `len(args)` would be a much shorter check.
args

In [22]:
from multiprocessing import Pool
import multiprocessing as mp

# Run the per-tile compile step over all work items in `args` with one worker per core.
num_cores = mp.cpu_count()
# The context manager tears the pool down even if a worker fails.
with Pool(processes=num_cores) as pool:
    finished_tiles = pool.imap_unordered(
        fc.multi_iterate_over_tile_compare_move_state_year_by_six_digit_index, args)
    # Draining the iterator shows progress as tiles finish and re-raises any exception
    # raised inside a worker. The earlier map_async call threw its AsyncResult away,
    # so worker failures passed silently.
    for _finished_tile in tqdm.tqdm(finished_tiles, total=len(args)):
        pass

In [12]:
# Show how many worker processes the pool is created with (the value of mp.cpu_count())
num_cores

3

In [61]:
# Serial compile loop: visit every chip position of every tile and hand it to the two compare/move helpers.
# The two counters are threaded through the helpers and printed at the end; what they count is decided there.
img_count_state_year = 0
img_count_standard = 0

# NOTE(review): tile_names, img_paths, xml_paths, img_names and the helper functions called below
# are not defined in the cells visible here — confirm where they come from before a fresh-kernel run.
for tile_name in tqdm.tqdm(tile_names):
    compile_tile_dir = make_by_tile_dirs(compiled_by_tile_dir, tile_name)
    tile, row_index, col_index = read_tile(os.path.join(tile_dir_path, tile_name + ".tif")) #read in tile
    
    # Keep only the entries whose string contains this tile's name (substring match)
    img_in_tile_paths = [string for string in img_paths if tile_name in string]
    xml_in_tile_paths = [string for string in xml_paths if tile_name in string]
    img_in_tile_names = [string for string in img_names if tile_name in string]
    
    assert len(img_in_tile_paths) == len(xml_in_tile_paths) == len(img_in_tile_names), "The same number of images and xmls"
    # 1-based running chip number within the tile; incremented once per (y, x) position
    count = 1
    for y in range(0, row_index): #rows #use row_index to account for the previous errors in state/year naming conventions
        # NOTE(review): the column loop also runs to row_index and col_index is never used in this cell;
        # this looks tied to the naming-convention note on the row loop — confirm it is intentional.
        for x in range(0, row_index): #cols   
            # NOTE(review): the next three values are not passed to the helpers called below;
            # presumably they are read as globals or kept for debugging — verify before removing.
            standard_quad_img_name_wo_ext = tile_name + '_' + f"{y:02}"  + '_' + f"{x:02}" # row_col
            img_name_wo_ext = tile_name + '_'+ str(count).zfill(6) #specify the chip names
            t_2_chip = tile_to_chip_array(tile, x, y, int(512)) #get correct chip from tile
            img_count_state_year = compare_move_imgs_state_year(x, y, tile_name, count, img_count_state_year,
                                                                img_in_tile_paths, xml_in_tile_paths, img_in_tile_names, 
                                                                compile_tile_dir, images_do_not_match_names_dir)
            
            img_count_standard = compare_move_imgs_standard(x, y, tile_name, img_count_standard, img_in_tile_paths, xml_in_tile_paths, img_in_tile_names,
                                                            compile_tile_dir, correctly_chipped_incorrect_dir)
            count += 1  
            
print(img_count_state_year, img_count_standard)

  0%|          | 9/2153 [04:39<15:00:05, 25.19s/it]

fl_60cm_2019_26080_m_2608063_ne_17_060_20191120_000037 /n m_2608063_ne_17_060_20191120_01_12 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_26080_m_2608063_ne_17_060_20191120_000037.jpg
fl_60cm_2019_26080_m_2608063_ne_17_060_20191120_000038 /n m_2608063_ne_17_060_20191120_01_13 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_26080_m_2608063_ne_17_060_20191120_000038.jpg
fl_60cm_2019_26080_m_2608063_ne_17_060_20191120_000039 /n m_2608063_ne_17_060_20191120_01_14 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_26080_m_2608063_ne_17_060_20191120_000039.jpg
fl_60cm_2019_26080_m_2608063_ne_17_060_20191120_000062 /n m_2608063_ne_17_060_20191120_02_13 //oit-nas-fe13dc.oi

  1%|          | 11/2153 [05:19<13:18:33, 22.37s/it]

fl_60cm_2019_26080_m_2608064_nw_17_060_20191120_000001 /n m_2608064_nw_17_060_20191120_00_00 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_26080_m_2608064_nw_17_060_20191120_000001.jpg
fl_60cm_2019_26080_m_2608064_nw_17_060_20191120_000002 /n m_2608064_nw_17_060_20191120_00_01 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_26080_m_2608064_nw_17_060_20191120_000002.jpg
fl_60cm_2019_26080_m_2608064_nw_17_060_20191120_000027 /n m_2608064_nw_17_060_20191120_01_02 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_26080_m_2608064_nw_17_060_20191120_000027.jpg
fl_60cm_2019_26080_m_2608064_nw_17_060_20191120_000030 /n m_2608064_nw_17_060_20191120_01_05 //oit-nas-fe13dc.oi

  1%|          | 12/2153 [05:53<15:31:01, 26.09s/it]

fl_60cm_2019_26080_m_2608064_nw_17_060_20191120_000485 /n m_2608064_nw_17_060_20191120_20_04 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_26080_m_2608064_nw_17_060_20191120_000485.jpg


  1%|          | 16/2153 [07:14<13:13:09, 22.27s/it]

tx_060cm_2018_26098_m_2609838_sw_14_060_20190105_000061 /n m_2609838_sw_14_060_20190105_02_12 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_Cleave_Niculescu_Sunny_4/chips_positive\tx_060cm_2018_26098_m_2609838_sw_14_060_20190105_000077.jpg
tx_060cm_2018_26098_m_2609838_sw_14_060_20190105_000067 /n m_2609838_sw_14_060_20190105_02_18 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_Cleave_Niculescu_Sunny_4/chips_positive\tx_060cm_2018_26098_m_2609838_sw_14_060_20190105_000186.jpg
tx_060cm_2018_26098_m_2609838_sw_14_060_20190105_000077 /n m_2609838_sw_14_060_20190105_03_04 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_Cleave_Niculescu_Sunny_4/chips_positive\tx_060cm_2018_26098_m_2609838_sw_14_060_20190105_000210.jpg
tx_060cm_2018_26098_m_2609838_sw_14_060_20190105_000186 /n m_2609838_sw_14_060_20190105_07_

  1%|          | 20/2153 [09:08<16:11:28, 27.33s/it]

fl_60cm_2019_27082_m_2708205_ne_17_060_20191129_000319 /n m_2708205_ne_17_060_20191129_12_18 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708205_ne_17_060_20191129_000319.jpg
fl_60cm_2019_27082_m_2708205_ne_17_060_20191129_000339 /n m_2708205_ne_17_060_20191129_13_13 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708205_ne_17_060_20191129_000339.jpg
fl_60cm_2019_27082_m_2708205_ne_17_060_20191129_000340 /n m_2708205_ne_17_060_20191129_13_14 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708205_ne_17_060_20191129_000340.jpg
fl_60cm_2019_27082_m_2708205_ne_17_060_20191129_000342 /n m_2708205_ne_17_060_20191129_13_16 //oit-nas-fe13dc.oi

  1%|          | 21/2153 [09:41<17:11:21, 29.03s/it]

fl_60cm_2019_27082_m_2708205_nw_17_060_20191129_000009 /n m_2708205_nw_17_060_20191129_00_08 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708205_nw_17_060_20191129_000009.jpg
fl_60cm_2019_27082_m_2708205_nw_17_060_20191129_000032 /n m_2708205_nw_17_060_20191129_01_06 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708205_nw_17_060_20191129_000032.jpg
fl_60cm_2019_27082_m_2708205_nw_17_060_20191129_000085 /n m_2708205_nw_17_060_20191129_03_09 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708205_nw_17_060_20191129_000085.jpg
fl_60cm_2019_27082_m_2708205_nw_17_060_20191129_000086 /n m_2708205_nw_17_060_20191129_03_10 //oit-nas-fe13dc.oi

  1%|          | 22/2153 [10:13<17:43:04, 29.93s/it]

fl_60cm_2019_27082_m_2708205_se_17_060_20191129_000038 /n m_2708205_se_17_060_20191129_01_12 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708205_se_17_060_20191129_000038.jpg
fl_60cm_2019_27082_m_2708205_se_17_060_20191129_000062 /n m_2708205_se_17_060_20191129_02_11 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708205_se_17_060_20191129_000062.jpg
fl_60cm_2019_27082_m_2708205_se_17_060_20191129_000063 /n m_2708205_se_17_060_20191129_02_12 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708205_se_17_060_20191129_000063.jpg
fl_60cm_2019_27082_m_2708205_se_17_060_20191129_000064 /n m_2708205_se_17_060_20191129_02_13 //oit-nas-fe13dc.oi

  1%|          | 23/2153 [11:01<20:50:46, 35.23s/it]

fl_60cm_2019_27082_m_2708205_sw_17_060_20191129_000147 /n m_2708205_sw_17_060_20191129_05_21 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708205_sw_17_060_20191129_000147.jpg
fl_60cm_2019_27082_m_2708205_sw_17_060_20191129_000148 /n m_2708205_sw_17_060_20191129_05_22 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708205_sw_17_060_20191129_000148.jpg
fl_60cm_2019_27082_m_2708205_sw_17_060_20191129_000172 /n m_2708205_sw_17_060_20191129_06_21 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708205_sw_17_060_20191129_000172.jpg
fl_60cm_2019_27082_m_2708205_sw_17_060_20191129_000173 /n m_2708205_sw_17_060_20191129_06_22 //oit-nas-fe13dc.oi

  1%|          | 24/2153 [11:36<20:53:41, 35.33s/it]

fl_60cm_2019_27082_m_2708212_ne_17_060_20191129_000133 /n m_2708212_ne_17_060_20191129_05_07 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708212_ne_17_060_20191129_000133.jpg
fl_60cm_2019_27082_m_2708212_ne_17_060_20191129_000170 /n m_2708212_ne_17_060_20191129_06_19 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708212_ne_17_060_20191129_000170.jpg
fl_60cm_2019_27082_m_2708212_ne_17_060_20191129_000212 /n m_2708212_ne_17_060_20191129_08_11 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_1_josh_jaewon_amadu/chips_positive\fl_60cm_2019_27082_m_2708212_ne_17_060_20191129_000212.jpg
fl_60cm_2019_27082_m_2708212_ne_17_060_20191129_000213 /n m_2708212_ne_17_060_20191129_08_12 //oit-nas-fe13dc.oi

  1%|▏         | 27/2153 [13:38<22:35:43, 38.26s/it]

tx_060cm_2018_27097_m_2709712_sw_14_060_20181210_000083 /n m_2709712_sw_14_060_20181210_03_10 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_2_cleave_tang_poonacha/chips_positive\tx_060cm_2018_27097_m_2709712_sw_14_060_20181210_000083.jpg
tx_060cm_2018_27097_m_2709712_sw_14_060_20181210_000084 /n m_2709712_sw_14_060_20181210_03_11 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_2_cleave_tang_poonacha/chips_positive\tx_060cm_2018_27097_m_2709712_sw_14_060_20181210_000084.jpg
tx_060cm_2018_27097_m_2709712_sw_14_060_20181210_000491 /n m_2709712_sw_14_060_20181210_20_10 //oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//verified/verified_state_year_image_name_formating/verify_2_cleave_tang_poonacha/chips_positive\tx_060cm_2018_27097_m_2709712_sw_14_060_20181210_000491.jpg


  2%|▏         | 35/2153 [19:43<19:53:29, 33.81s/it]


KeyboardInterrupt: 

In [59]:
# Count the .jpg files found anywhere below the compiled-by-tile directory
jpg_pattern = compiled_by_tile_dir + "/**/*.jpg"
x = glob(jpg_pattern, recursive=True)
x.sort()
len(x)

15549

In [57]:
# Show the per-tile output directory assigned by the most recent iteration of the compile loop
compile_tile_dir

'//oit-nas-fe13dc.oit.duke.edu//data_commons-borsuk//compiled_dataset/compile_by_tile\\m_2908910_nw_16_060_20190707'