### ISIS METADATA GRIDDING - Test 02
working with Roksana's code

In [11]:
import sys
import numpy as np
import cv2
import glob
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
from itertools import chain
import ntpath
from scipy.signal import find_peaks
import traceback
import os

In [12]:
#Sub-directory path
#SD_PATH = gen_ran_subdir("L:/DATA/ISIS/ISIS_101300030772/b*/B1*")

#Image path
#I_PATH = gen_ran_img(SD_PATH, '/*')

#For testing - only use same sub-directory (subdir)
SD_PATH = os.path.realpath("Output-Test-Images")
#Folder containing 30 images for testing (very temporary)
#I_PATH = gen_ran_img(SD_PATH, '/*')
print ("Subdir path:", SD_PATH)
#print ("Random image in subdir path", I_PATH)

Subdir path: C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/Output-Test-Images


In [13]:
### DICTIONARIES
# Labelling of coordinates
LABELS_NUM = [1,2,3, 4, 5, 6, 7, 8, 9]
#LABELS_NUM = [1,2, 4, 8]
#LABELS_CAT_DOT = ['day_1','day_2','day_3','hour_1','hour_2','minute_1','minute_2','second_1', 'second_2','station_code']
#LABELS_CAT_DIGIT = ['satellite_number','year','day_1','day_2','day_3','hour_1','hour_2','minute_1','minute_2',
                    #'second_1', 'second_2', 'station_number_1','station_number_2']
LABELS_CAT_NUM = ['Operating Mode 1','Operating Mode 2','Station Number 1', 'Station Number 2', 'Year 1', 'Year 2', 'Day 1', 'Day 2', 'Day 3',
                    'Hour 1', 'Hour 2', 'Min 1', 'Min 2', 'Sec 1', 'Sec 2']
LABELS_DICT = ['dict_cat_digit','dict_num_digit']#'dict_cat_dot','dict_num_dot',]

#Defaults for dictionary mappings of coordinates to labels
DEFAULT_DICT_CAT_DIGIT = (53,21,661) #mean_dist_default,first_peak_default,last_peak_default
DEFAULT_DICT_NUM_DIGIT = (47,41,20) #mean_dist_default,first_peak_default,dist_btw_peaks for peak detection

#DEFAULT_DICT_CAT_DIGIT_F = (43,23,540) #mean_dist_default,first_peak_default,last_peak_default for those in LIST_DIRECTORY_DOTS 
#DEFAULT_DICT_NUM_DIGIT_F = (40,37,20) #mean_dist_default,first_peak_default,dist_btw_peaks for peak detection for those in LIST_DIRECTORY_DOTS 

#DEFAULT_DICT_CAT_DOT = (59,20,549)##mean_dist_default,first_peak_default,last_peak_default
#DEFAULT_DICT_NUM_DOT = (15,32,10) #mean_dist_default,first_peak_default,dist_btw_peaks for peak detection

##### Base Functions

In [14]:
# From helper_functions.py
def record_loss (df, function_name, subdir_location, columns_to_extract = None, loss_extraction = None):
    """Create dataframe that records loss."""

    if columns_to_extract is None:
        columns_to_extract = ['file_name']
    if loss_extraction is None:
        loss_extraction = []
    if len(loss_extraction) == 0:
        # function should return NA if there an error
        loss_extraction = df.isna().any(axis=1)
    # Record the files whose extraction was not successful
    df_loss_extraction = df[loss_extraction].copy()
    df_loss_extraction = df_loss_extraction[columns_to_extract]
    df_loss_extraction['func_name'] = function_name
    df_loss_extraction[ 'subdir_name'] = subdir_location

    return df_loss_extraction, loss_extraction

# From scan2data > image_segmentation > trim_raw_metadata > connected_components_metadata_location
def metadata_location(meta, min_count = 20, max_count = 2000):
    """"Use connected component algorithm to find the location of the metadata
    :param meta: binarized UTF-8 2D array of values (0 or 1) array containing metadata
    :type meta: class: `numpy.ndarray`
    :param min_count: minimum number of pixels to be considered metadata dot/num, defaults to 50
    :type min_count: int, optional
    :param max_count: maximum number of pixels to be considered metadata dot/num, defaults to 1000
    :type max_count: int, optional
    :return: metadata labelled by the connected component algorithm ie UTF-8 2D array of values where each value correspond to belonging to a metadata group
    :rtype: class: `numpy.ndarray`
    """
    
    #run algorithm on metadata section
    blurred = cv2.GaussianBlur(meta, (3,3), 0)
    _, labelled = cv2.connectedComponents(blurred)

    #Dictionary of label:counts
    unique, counts = np.unique(labelled, return_counts = True)
    dict_components = dict(zip(unique, counts))

    #Remove outliers // Remove pixels not part of metadata
    dict_subset = {}
    dict_outlier = {}
    for k,v, in dict_components.items():
        if v > min_count and v < max_count:
            dict_subset[k] = v
        else:
            dict_outlier[k] = v
    
    if key_list_to_remove := list(dict_outlier.keys()):
        for k in key_list_to_remove:
            labelled[labelled == k] = 0

    return labelled

#From scan2data > image_segmentation > trim_raw_metadata
#test provided values and change if needed
def bottomside_metadata_trimming(connected_meta, opened_meta,
                                 h_window = 100, w_window = 600, starting_y = 10, starting_x = 25, step_size = 20, trim_if_small = 25):
    """Sliding window method to locate and trim bottomside metadata
    :param connected_meta: metadata labelled by the connected component algorithm ie UTF-8 2D array of values where each value correspond to belonging to a metadata group
    :type connected_meta: class: `numpy.ndarray`
    :param opened_meta:  UTF-8 2D array of values representing raw metadata after morphological operations including opening
    :type opened_meta: nclass: `numpy.ndarray`
    :param h_window: height of sliding window, defaults to 100
    :type h_window: int, optional
    :param w_window: width of sliding window, defaults to 700
    :type w_window: int, optional
    :param starting_y: by how many pixels from the top to start windowing process, defaults to 0
    :type starting_y: int, optional
    :param starting_x: by how many pixels from the left to start the windowing process, defaults to 15
    :type starting_x: int, optional
    :param step_size: by how much sliding window moves to the right and/or bottom, defaults to 10
    :type step_size: int, optional
    :param trim_if_small: by how many pixels to trim metadata's height or width if they are smaller than h_window or w_window,defaults to 11
    :type trim_if_small: int, optional
    :return: trimmed metadata i.e.  trimmed UTF-8 2D array of values containing metadata (window with highest mean area)
    :rtype: class: `numpy.ndarray`
    '''"""
    
    def sliding_window(image, starting_y, starting_x, h_window, w_window, step_size):
        """sliding window generator object"""
        h_img, w_img = np.shape(image)
        for y in range(starting_y, h_img - h_window, step_size):
            for x in range(starting_x, w_img - w_window, step_size):
                yield y,x,image[y:y + h_window, x:x + w_window]
    
    h_raw, w_raw = np.shape(opened_meta)
    
    if h_window + step_size  >= h_raw:
        h_window = h_raw - trim_if_small
    if w_window + step_size >= w_raw:
        w_window = w_raw - trim_if_small
    
    s = sliding_window(connected_meta, starting_y, starting_x, h_window, w_window, step_size)
    
    max_window = connected_meta[starting_y:h_window+starting_y,
                 starting_x:w_window+starting_x ]
    max_mean = np.mean(max_window)
    y_max= starting_y
    x_max = starting_x
    
    for y, x, window in s:
        tmp = window
        mean = np.mean(tmp)
        if mean > max_mean:
            max_window = tmp
            max_mean  = mean
            y_max = y
            x_max = x

    trimmed_metadata =  opened_meta[y_max:y_max + h_window, x_max:x_max + w_window]
    return trimmed_metadata

#From scan2data > image_segmentation > trim_raw_metadata
def trimming_metadata(raw_metadata, opening_kernal_size = (3,3), median_kernal_size = 5):
    """"Trim the rectangle containing metadata to smallest workable area."""

    try:
        #Filtering to reduce noise
        #median_filtered_meta = cv2.medianBlur(raw_metadata, median_kernal_size)

        #Opening operation: Eroision + Dilation
        #kernal_opening = np.ones(opening_kernal_size, dtype = np.uint8)
        #opened_meta = cv2.morphologyEx(raw_metadata, cv2.MORPH_OPEN, kernal_opening)

        # Binarization
        metadata_binary = cv2.threshold(raw_metadata, 127, 255, cv2.THRESH_BINARY)[1]

        #Run connected components algorithm
        connected_meta = metadata_location(metadata_binary)

        trimmed_metadata = bottomside_metadata_trimming(connected_meta, metadata_binary)
        #bottomside_metadata_trimming function is from same location

        #Checking
        #cv2.imshow("test", trimmed_metadata)
        #cv2.waitKey(0)
        return (trimmed_metadata)
    except:
        return (np.nan)

In [66]:
#From scan2data > metadata_translation> leftside_metadata_grid_mapping 
def extract_centroids(cut_metadata, file_name, min_pixels = 5, max_pixels = 3000, max_area_dot = 120):
    
    '''Extract the coordinates of the centroid of each metadata number using the connected component algorithm
    
    :param dilated_meta: trimmed metadata (output of image_segmentation.leftside_metadata_trimming) after a rotation and dilation morphological transformation (see translate_leftside_metadata.extract_leftside_metadata )
    :type dilated_meta: class: `numpy.ndarray`
    :param file_name: full path of starting raw image ex:G:/R014207929F/431/Image0399.png
    :type file_name: str
    :param min_num_pixels: minimum number of pixels to be considered metadata dot/num, defaults to 50
    :type min_num_pixels: int, optional
    :param max_number_pixels: maximum number of pixels to be considered metadata dot/num, defaults to 1000
    :type max_number_pixels: int, optional
    :param max_area_dot: maximum median area of a single metadata components to be considered a dot, defaults to 120
    :type max_area_dot: int, optional
    :returns: col_centroids,row_centroids,is_dot : list of col ('x') positions where metadata is detected,list of row ('y') positions where metadata is detected,whether the metadata is dots
    :rtype: class: `list`,class: `list`, bool
    :raises Exception: returns np.nan,np.nan,np.nan if there is an error
    
    '''

    try:
        blurred = cv2.GaussianBlur(cut_metadata, (3,3), 0)
        totalLabels, label_ids, stats, centroids = cv2.connectedComponentsWithStats(blurred)
        area_centroids = stats[:,-1]

        centroids_metadata = centroids[np.logical_and(area_centroids > min_pixels, area_centroids < max_pixels),:]
        #^ consider adjusting min and max range

        zip_centroids = list(zip(*centroids_metadata))
        col_centroids = list((zip_centroids[0]))
        row_centroids = list((zip_centroids[1]))

        #Determine dot type (temp)
        #area_centroids = area_centroids[np.logical_and(area_centroids > min_num_pixels, area_centroids < max_number_pixels)]
        #median_area = np.median(area_centroids)
        #The line below is commented to prevent giving the dot items manually
        #if any([dir_dot in file_name for dir_dot in LIST_DIRECTORY_DOTS]) and median_area < max_area_dot:
        #is_dot = median_area < max_area_dot

        #### CHECK (TEMP) ####
        #output = np.zeros(np.shape(cut_metadata), dtype = 'uint8')
        #for i in range(1, totalLabels):
            #area = stats[i, cv2.CC_STAT_AREA]
            #if area > min_pixels and area < max_pixels:
                #new_img = cut_metadata.copy()
                #x1 = stats[i, cv2.CC_STAT_LEFT]
                #y1 = stats[i, cv2.CC_STAT_TOP]
                #h = stats[i, cv2.CC_STAT_HEIGHT]
                #w = stats[i, cv2.CC_STAT_WIDTH]

                #pt1 = (x1, y1)
                #pt2 = (x1 + w, y1 + h)
                #(X, Y) = centroids[i]

                #cv2.rectangle(new_img, pt1, pt2, (0, 0, 255), 3)
                #cv2.circle(new_img, (int(X), int(Y)), 4, (0, 0, 255), -1)

                #component = np.zeros(np.shape(cut_metadata), dtype= "uint8")
                #componentMask = (label_ids == i).astype('uint8') *255

                #component = cv2.bitwise_or(component, componentMask)
                #output =cv2.bitwise_or(output, componentMask)

                #cv2.imshow("numbers", component)
        #cv2.imshow("Mask", output)
        #cv2.waitKey(0)
                #os.chdir("C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/ISIS_Meeting")
                # Generate input and output file paths 
                #cv2.imwrite(file_name +"Test" + str(i) + ".png", output)

        return col_centroids, row_centroids
    
    except:
        return np.nan, np.nan

##From scan2data > metadata_translation > leftside_metdata_grid_mapping
def indices_highest_bin(list_coords, nbins, peak_threshold, distance_bwtn_peaks):
    """" returns indices of most common values using binning

    :param list_coord: list of positions where metadata is detected
    :type list_coord: class: `list`
    :param nbins: number of bins used for binning operation, defaults to 500
    :type nbins: int, optional
    :param peak_prominence_threshold: the threshold to detect peaks that correspond to the peaks corresponding to the most common values, defaults to 0.2
    :type peak_prominence_threshold: int, optional
    :param distance_between_peaks: the minimum number of samples between subsequent peaks corresponding to the most common values, defaults to 30
    :type distance_between_peaks: int, optional
    :returns: select_peaks,bin_edges,counts i.e. array of the indices of  peaks corresponding to the most common values, array for the bin edges after calling np.histogram, array for counts of the number of elements in each bin after calling np.histogram  
    :rtype: class: `numpy.ndarray`,class: `numpy.ndarray`,class: `numpy.ndarray`
    """
    
    arr_coord= np.array(list_coords)

    mean_coords = np.mean(arr_coord)
    std_coords = np.std(arr_coord)
    no_outlier_coords = arr_coord[np.abs(arr_coord - mean_coords) < 3 * std_coords]

    #Binning
    counts, bin_edges = np.histogram(no_outlier_coords, bins=nbins)

    #Detect peaks
    counts_norm = (counts - np.min(counts)) / (np.max(counts) - np.min(counts)) #Normalization 
    select_peaks,_ = find_peaks(counts_norm, distance = distance_bwtn_peaks, prominence = peak_threshold) 
    #print ("select peaks", select_peaks)  
    return select_peaks, bin_edges, counts

# From scan2data > metadata_translation > leftside_metadata_grid_mapping
def get_leftside_metadata_grid_mapping(list_x_digit,list_y_digit,dir_name,
                      difference_ratio=0.5,use_defaults=True):
    
    """Determines and returns the the mapping between coordinate values on a metadata image 
    and metadata labels in a subdirectory, for metadata of types dot and digits, as well as returns 
    the histogram used to generate each mapping
    
    """
    # Dictionary of dictionaries that map labels to coordinate point in metadata
    all_labels = [LABELS_CAT_NUM, LABELS_NUM]
    all_dict_mapping = {}
    all_dict_hist = {}
    # Different protocols depending on the type of dictionary mappings
    for i, list_coord in (enumerate([list_x_digit,list_y_digit])):
        type_dict = LABELS_DICT[i]
        labels = all_labels[i]
        try:
            if 'cat' in type_dict:
                if type_dict == 'dict_cat_digit':
                    #if any([dir_dot in dir_name for dir_dot in LIST_DIRECTORY_DOTS]):
                        #mean_dist_default,first_peak_default,last_peak_default=DEFAULT_DICT_CAT_DIGIT_F
                    #else:
                    mean_dist_default,first_peak_default,last_peak_default = DEFAULT_DICT_CAT_DIGIT #53, 21, 661
            
                #elif type_dict == 'dict_cat_dot':
                    #mean_dist_default,first_peak_default,last_peak_default = DEFAULT_DICT_CAT_DOT
                try:
                    idx_peaks,bin_edges,counts = indices_highest_bin(list_coord, 500, 0.2, 30)
                    peaks = bin_edges[np.array(idx_peaks)] #coordinate values on a metadata image probably corresponding to metadata
                    
                    n_labels = len(labels)
                    first_peak = peaks[0]
                    last_peak = peaks[-1]

                    if use_defaults and abs(last_peak -last_peak_default)  > difference_ratio*mean_dist_default:
                        last_peak = last_peak_default
                    if use_defaults and abs(first_peak -first_peak_default)  > difference_ratio*mean_dist_default:
                        first_peak = first_peak_default
                    
                    mean_dist_btw_peaks = (last_peak - first_peak)/(n_labels - 1)
                    list_peaks = [int(round(first_peak + i* mean_dist_btw_peaks)) for i in range(0, n_labels)]
                    
                    all_dict_mapping[type_dict] =dict(zip(list_peaks,labels))
                    all_dict_hist[type_dict] = (idx_peaks,bin_edges,counts)
                    print ("[try]dict type", type_dict)
                

                except:
                    last_peak = last_peak_default
                    first_peak = first_peak_default
                    mean_dist_btw_peaks = mean_dist_default
                    n_labels = len(labels)
                    list_peaks = [int(round(first_peak + i* mean_dist_btw_peaks)) for i in range(0, n_labels)]
                    
                    all_dict_mapping[type_dict] =dict(zip(list_peaks,labels))
                    all_dict_hist[type_dict] = {}
                    print ("[except]dict type", type_dict)
                
            elif 'num' in type_dict:
                if  type_dict == 'dict_num_digit':
                    #if any([dir_dot in dir_name for dir_dot in LIST_DIRECTORY_DOTS]):
                        #mean_dist_default,peak_0_default,dist_btw_peaks = DEFAULT_DICT_NUM_DIGIT_F
                    #else:
                    mean_dist_default, peak_0_default, dist_btw_peaks = DEFAULT_DICT_NUM_DIGIT # 47, 41, 20
                #elif type_dict == 'dict_num_dot':
                    #mean_dist_default,peak_0_default,dist_btw_peaks= DEFAULT_DICT_NUM_DOT

                    
                try:
                    idx_peaks, bin_edges, counts = indices_highest_bin(list_coord, 500, 0.2, dist_btw_peaks)
                    arr_idx_peaks = np.array(idx_peaks)
                    print ("arr idx peaks", arr_idx_peaks)
                    print ("bin edges", bin_edges)
                    peaks = bin_edges[arr_idx_peaks]
                    peak_0 = peaks[0]
                    if use_defaults and abs(peak_0 - peak_0_default)  > difference_ratio*mean_dist_default:
                        peak_0 = peak_0_default
                    #print ("peak_0", peak_0)
                    # only first three peaks are deemed relevant
                    if len(peaks) < 3:
                        max_idx = 2
                    else:
                        max_idx = 3
                    #print ("max idx", max_idx)
                    
                    #mean_dist_btw_peaks = np.mean([peaks[i+1]- peaks[i] for i in range(0, max_idx)])
                    mean_dist_btw_peaks = np.mean([peaks[1]-peaks[0]])
                    print ("mean dist", mean_dist_btw_peaks)
                    if use_defaults and abs(mean_dist_btw_peaks - mean_dist_default)  > difference_ratio*dist_btw_peaks:
                        mean_dist_btw_peaks = mean_dist_default

                    list_peaks = [int(round(peak_0 + i* mean_dist_btw_peaks)) for i in range(len(labels))]
                
                    all_dict_mapping[type_dict] =dict(zip(list_peaks,labels))
                    all_dict_hist[type_dict] = (idx_peaks,bin_edges,counts)
                    print ("[try]dict type", type_dict)
                except:
                    peak_0 = peak_0_default
                    mean_dist_btw_peaks = mean_dist_default
                    list_peaks = [int(round(peak_0 + i* mean_dist_btw_peaks)) for i in range(len(labels))]
                    all_dict_mapping[type_dict] =dict(zip(list_peaks,labels))
                    all_dict_hist[type_dict] =  {}
                    print ("[except]dict type", type_dict)
        except:
            all_dict_mapping[type_dict] ={}
            all_dict_hist[type_dict] =  {}
    
    return all_dict_mapping, all_dict_hist

#From scan2data > metadata_translation > translate_leftside_metadata
def map_coord_to_metadata(list_cat_coord,list_num_coord,dict_mapping_cat, dict_mapping_num):
    """Map coordinate of metadata centroids to information
    
    :param list_cat_coord: list of metadata positions to map to categories   
    :type list_cat_coord: list
    :param list_num_coord: list of metadata positions to map to numbers 
    :type list_num_coord: list
    :param dict_mapping_cat: dictionary used to map coordinate positions to categories
    :type dict_mapping_cat: dict
    :param dict_mapping_num: dictionary used to map coordinate positions to numbers
    :type dict_mapping_num: dict
    :returns: dict_metadata
    :rtype: dict
    
    """
    
        
    list_coord = list(zip(list_cat_coord,list_num_coord))
    coord_mapping_cat = dict_mapping_cat.keys()
    coord_mapping_num = dict_mapping_num.keys()
        
    dict_metadata={}
    for cat_coord, num_coord in list_coord:
        cat_key = min(coord_mapping_cat, key=lambda x:abs(x-cat_coord))
        num_key = min(coord_mapping_num, key=lambda x:abs(x-num_coord))
    
        cat = dict_mapping_cat[cat_key]
        num = dict_mapping_num[num_key]
            
        if cat in dict_metadata:
            dict_metadata[cat].append(num)
        else:
            dict_metadata[cat] = [num]
    return dict_metadata
    #except:
        #return np.nan

##### Process Functions

In [67]:
#From scan2data> image_segmentation > segment_images_in_subdir.py
#some variables here are not necessary and can be removed, ie. height, width...
def segment_metadata(subdir_location, regex_img, min_bottom_height = 25, cutoff_width=300, cutoff_height=150):
    """Should only segment metadata. Can be adjusted to include ionogram. """
    regex_raw_image =  SD_PATH + ("/*")
    #print ("the raw images path is:", regex_raw_image)
    list_images = glob.glob(regex_raw_image)
    
    #Dataframe is processing
    df_img = pd.DataFrame(data = {"file_name": list_images})
    #Read each image in a 2D UTF-8 grayscale array
    df_img["raw"] = df_img['file_name'].map(lambda file_name: cv2.imread(file_name, 0))

    # Extract ionogram and coordinates delimiting its limits
    #df_img['limits']= list(zip(df_img['raw'].map(lambda raw_img: extract_ionogram(raw_img)))) 
    # Record the files whose ionogram extraction was not successful
    #df_loss_ion_extraction, loss_ion_extraction = record_loss(df_img,'image_segmentation.extract_ionogram_from_scan.extract_ionogram',subdir_location)
    #df_img = df_img[~loss_ion_extraction]
    #df_img['height'],df_img['width'] = list(zip(df_img['ionogram'].map(lambda array_pixels: array_pixels.shape)))
    
    #Raw metadata
    #df_tmp = (df_img.apply(lambda row: extract_metadata(row['raw'], row['limits']), axis = 1, result_type = 'expand'))
    #df_img = df_img.assign(metadata_type = df_tmp[0])
    #df_img = df_img.assign(raw_metadata = df_tmp[1])
    #extract_metadata is function from extract_metadata_from_scan
    
    # There should be no metadata on left and top, especially after flipping
    #outlier_metadata_location = np.any([df_img['metadata_type'] == 'right',df_img['metadata_type']=='top', df_img['metadata_type'] == 'left'], axis=0)
    #df_outlier_metadata_location ,_ =  record_loss(df_img,'image_segmentation.segment_images_in_subdir.segment_images: metadata not on left or bottom',subdir_location,
                                         #['file_name','metadata_type'],outlier_metadata_location )
    
    #if not df_outlier_metadata_location.empty:
        #df_outlier_metadata_location['details'] = df_outlier_metadata_location.apply(lambda row: str(row['metadata_type']),1)
        #df_outlier_metadata_location = df_outlier_metadata_location[['file_name','func_name','subdir_name','details']]
    #else:
        #df_outlier_metadata_location = df_outlier_metadata_location[['file_name','func_name','subdir_name']]
    
    # Remove loss from detected metadata not being on the left or bottom
    #df_img = df_img[~outlier_metadata_location]

    #Trimmed metadata
    #df_img['trimmed_metadata'] = list(zip(*df_img.apply(lambda row: trimming_metadata(row['raw_metadata'], row['metadata_type']), axis = 1, result_type = 'expand')))
    df_trim_tmp = (df_img.apply(lambda row: trimming_metadata(row["raw"]), axis = 1))
    df_img = df_img.assign(trimmed_metadata = df_trim_tmp)
    df_loss_trim, loss_trim = record_loss(df_img, 'image_segmentation.trim_raw_metadata.trimming_metadata', subdir_location)
    #trimming_metadata is a function from trim_raw_metadata
    #record_loss is a function from helper_functions
    
    df_img = df_img[~loss_trim]
    # Check if metadata too small
    #df_map_tmp = df_img['trimmed_metadata'].map(lambda array_pixels: np.shape(array_pixels), axis = 1)
    df_map_tmp = df_img.apply(lambda row: np.shape(row['trimmed_metadata']), axis = 1, result_type = 'expand')
    df_img = df_img.assign(meta_height = df_map_tmp[0])
    df_img = df_img.assign(meta_width = df_map_tmp[1])

    outlier_size_metadata = (np.logical_and(df_img['meta_height'] < min_bottom_height, df_img['meta_height'] < min_bottom_height))    
        
    df_outlier_metadata_size, _ = record_loss(df_img,'image_segmentation.segment_images_in_subdir.segment_images: metadata size outlier',subdir_location,
                                           ['file_name','meta_height','meta_width'], outlier_size_metadata)

    if not df_outlier_metadata_size.empty:
        df_outlier_metadata_size['details'] = df_outlier_metadata_size.apply(lambda row: str(row['meta_height']) + str(row['meta_width']), 1)
        df_outlier_metadata_size = df_outlier_metadata_size[['file_name','func_name','subdir_name','details']]
        
    else:
        df_outlier_metadata_size = df_outlier_metadata_size[['file_name','func_name','subdir_name']]
    
    # Remove files whose metadata too small
    df_img = df_img[~outlier_size_metadata]
    
    # Dataframe recording loss from programming errors
    df_loss = pd.concat([df_loss_trim])
    
    # Dataframe recording loss from various filters i.e. metadata too small, ionogram too small/big
    df_outlier = pd.concat([df_outlier_metadata_size])

    return df_img,  df_loss, df_outlier

#From scan2data > metadata_translation > translate_leftside_metadata.py
def get_bottomside_metadata (df_img, subdir_location, kernal_size =(1, 1)):
    """Reads the metadata"""

    kernel_dilation = np.ones(kernal_size, np.uint8)

    #df_dilate_tmp = df_img.apply(lambda trimmed_meta: cv2.dilate(trimmed_meta, kernel_dilation))
    df_dilate_tmp = df_img['trimmed_metadata'].map(lambda trimmed_meta: cv2.dilate(trimmed_meta, kernel_dilation))
    df_img = df_img.assign(dilated_metadata = df_dilate_tmp)


    #df_img['x_centroids'], df_img['y_centroids'], df_img['is_dot'] = zip(*df_img.apply(lambda row: extract_centroids(row['dilated_metadata'], row['file_name']), 1))
    df_cent_tmp = df_img.apply(lambda row: extract_centroids(row['dilated_metadata'], row['file_name']), axis = 1, result_type = 'expand')
    #df_cent_tmp.to_csv("C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/output-cent-tmp.csv")
    df_img = df_img.assign(x_centroids = df_cent_tmp[0])
    df_img = df_img.assign(y_centroids = df_cent_tmp[1])

    df_loss_centroids_extraction, loss_centroids_extraction = record_loss(df_img,'metadata_translation.determine_metadata_grid_mapping.extract_centroids_',subdir_location)
    # extract_centroids and record_loss are two other functions 

    # Remove files where the extraction didn't work
    df_img = df_img[~loss_centroids_extraction]
    # ^removes them from the main dataframe
    
    #df_num_subset = df_img[np.invert(np.array(df_img['is_dot']))]
    list_x_digit, list_y_digit = [0], [0]
    list_x_digit = list(chain(*df_img['x_centroids'].tolist()))
    list_y_digit = list(chain(*df_img['y_centroids'].tolist()))
    dict_mapping, dict_hist = get_leftside_metadata_grid_mapping(list_x_digit, list_y_digit, subdir_location)


    # Determine the value of metadata based on the mappings
    #print (dict_mapping)
    df_dict_meta = df_img.apply(lambda row: map_coord_to_metadata(row['x_centroids'], row['y_centroids'],
                                                                      dict_mapping['dict_cat_digit'],
                                                                      dict_mapping['dict_num_digit']), axis = 1)
    df_img = df_img.assign(dict_metadata = df_dict_meta)
    #df_img.to_csv("C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/01-mapping-to-output.csv")
    
    df_loss_mapping, loss_mapping = record_loss(df_img,'map_coord_to_metadata', subdir_location)
    df_img = df_img[~loss_mapping]
    
    
    df_loss = pd.concat([df_loss_centroids_extraction, df_loss_mapping],ignore_index=True)

    return df_img, df_loss, dict_mapping, dict_hist
   

In [73]:
#From process_directory.py
def process_subdir(subdir_path, regex_images):
    """Transform raw scanned images in subdir into information"""

    #Run segment_images on subdir
    df_img, df_loss, df_outlier = segment_metadata(subdir_path, regex_images)
    #df_img.to_csv("C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/segment-output.csv")

    #Translate metadata on bottom
    #df_img_bottom =  df_img.loc[df_img['metadata_type'] == 'bottom']
    df_img_bottom, df_loss_meta_bottom, _, __ = get_bottomside_metadata(df_img, subdir_path) #from metadata_translation.translate_bottomside_metadata
    df_loss = pd.concat([df_loss_meta_bottom], ignore_index=True)

    df_processed = df_img_bottom
    #df_loss = pd.concat([df_loss_coord_bottom], ignore_index=True)

    df_processed.to_csv("C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/01-processed-output.csv")
    df_loss.to_csv("C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/02-loss-output.csv")
    df_outlier.to_csv("C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/03-outlier-output.csv")
    return df_processed, df_loss, df_outlier

#print (process_subdir(SD_PATH, '/*'))
def process_df_bottomside_metadata(df_processed, subdir_name, source_dir):

    df_final_data = df_processed[['file_name', 'dict_metadata']]
    df_final_data['subdir_name'] = subdir_name
    labels = ['Operating Mode 1','Operating Mode 2','Station Number 1', 'Station Number 2', 'Year 1', 'Year 2', 'Day 1', 'Day 2', 'Day 3', 'Hour 1', 'Hour 2', 'Min 1', 'Min 2', 'Sec 1', 'Sec 2']

    for label in labels:
        df_final_data[label] = df_final_data['dict_metadata'].map(
            lambda dict_meta: sum(dict_meta[label]) if label in dict_meta.keys() else 0)

    del df_final_data['dict_metadata']

    #df_final_data['year'] = df_final_data['year'] + 1900
    #df_final_data['day'] = df_final_data['day_1'].astype(str) + df_final_data['day_2'].astype(str) + df_final_data['day_3'].astype(str) 
    #df_final_data['hour'] = df_final_data['hour_1'].astype(str) + df_final_data['hour_2'].astype(str) 
    #df_final_data['minute'] = df_final_data['minute_1'].astype(str) + df_final_data['minute_2'].astype(str) 
    #df_final_data['second'] = df_final_data['second_1'].astype(str) + df_final_data['second_2'].astype(str) 
    #df_final_data['station_number'] = df_final_data['station_number_1'].astype(str) + df_final_data['station_number_2'].astype(str) 
    #df_final_data['day'] = df_final_data['day'].astype(int)
    #df_final_data['hour'] = df_final_data['hour'].astype(int)
    #df_final_data['minute'] = df_final_data['minute'].astype(int)
    df_final_data['Operating_Mode'] = df_final_data["Operating Mode 1"].astype(str) + df_final_data["Operating Mode 2"].astype(str) 
    df_final_data["Station_Number"] = df_final_data['Station Number 1'].astype(str) + df_final_data['Station Number 2'].astype(str)
    df_final_data['Year'] = (df_final_data['Year 1'].astype(str) + df_final_data['Year 2'].astype(str)).astype(int) + 1900
    df_final_data["Day"] = (df_final_data['Day 1'].astype(str) + df_final_data['Day 2'].astype(str) + df_final_data['Day 3'].astype(str)).astype(int)
    df_final_data["Hour"] = (df_final_data['Hour 1'].astype(str) + df_final_data['Hour 2'].astype(str)).astype(int)
    df_final_data["Min"] = (df_final_data['Min 1'].astype(str) + df_final_data['Min 2'].astype(str)).astype(int)
    df_final_data["Sec"] = (df_final_data['Sec 1'].astype(str) + df_final_data['Sec 2'].astype(str)).astype(int)


    #if len(df_final_data) > 0:          
        #code_list_of_station_after1965 = pd.read_csv(source_dir + 'Post_July_1_1965_Code_List_Station.csv')
        #code_list_of_station_before1963 = pd.read_csv(source_dir + 'Pre_1963_Code_List_Station.csv')
        #code_list_of_station_between1963_1964 = pd.read_csv(source_dir + '1963_1964.csv')
        #df_result_after1965 = pd.merge(df_final_data.loc[df_final_data['year'] >= 1965], 
                                       #code_list_of_station_after1965, on='station_number')
        #df_result_before1963 = pd.merge(df_final_data.loc[df_final_data['year'] <= 1963],
                                        #code_list_of_station_before1963, on='station_number')
        #df_result_mid1964 = pd.merge (df_final_data.loc[df_final_data['year'] == 1964],
                                        #code_list_of_station_between1963_1964, on = 'station_number')
        #df_final_result = pd.concat([df_result_before1963, df_result_mid1964, df_result_after1965]).reset_index(drop=True)
        #df_final_result = df_result_before1963.append(df_result_after1965.append(df_result_mid1964, ignore_index=True)) #Why was pd.concat not used?        
    #else:
        #df_final_result = pd.DataFrame()
    
    df_final_data.to_csv("C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/final_processed.csv")
    return df_final_data


In [74]:
df_processed = process_subdir(SD_PATH, '/*')[0]
print (process_df_bottomside_metadata(df_processed, SD_PATH, ""))


[try]dict type dict_cat_digit
arr idx peaks [166 190]
bin edges [ 2.     2.042  2.084  2.126  2.168  2.21   2.252  2.294  2.336  2.378
  2.42   2.462  2.504  2.546  2.588  2.63   2.672  2.714  2.756  2.798
  2.84   2.882  2.924  2.966  3.008  3.05   3.092  3.134  3.176  3.218
  3.26   3.302  3.344  3.386  3.428  3.47   3.512  3.554  3.596  3.638
  3.68   3.722  3.764  3.806  3.848  3.89   3.932  3.974  4.016  4.058
  4.1    4.142  4.184  4.226  4.268  4.31   4.352  4.394  4.436  4.478
  4.52   4.562  4.604  4.646  4.688  4.73   4.772  4.814  4.856  4.898
  4.94   4.982  5.024  5.066  5.108  5.15   5.192  5.234  5.276  5.318
  5.36   5.402  5.444  5.486  5.528  5.57   5.612  5.654  5.696  5.738
  5.78   5.822  5.864  5.906  5.948  5.99   6.032  6.074  6.116  6.158
  6.2    6.242  6.284  6.326  6.368  6.41   6.452  6.494  6.536  6.578
  6.62   6.662  6.704  6.746  6.788  6.83   6.872  6.914  6.956  6.998
  7.04   7.082  7.124  7.166  7.208  7.25   7.292  7.334  7.376  7.418
  7.46   7.50

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_data['subdir_name'] = subdir_name
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_data[label] = df_final_data['dict_metadata'].map(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final_data[label] = df_final_data['dict_metadata'].map(
A value is trying to be set on a copy of a s

In [75]:
"""
def get_peaks(list_coord):
    """
#From a list of coordinates, return a list of the most common values through binning
    
    #:param list_coord: list of positions where metadata is detected
    #:type list_coord: class: `list`
    #:returns: peaks i.e. list of the most common values through binning
    #:rtype: class: `numpy.ndarray`
    
    
    """
    idx_peaks,bin_edges,counts = indices_highest_bin(list_coord)
    return bin_edges[np.array(idx_peaks)]

def get_leftside_metadata_grid_peaks(regex_subdir, regex_images,
                                     min_subset=10):
    """#Generates a dataframe containing information about peaks to generate metadata grids
    
    #:param regex_subdir: regular expression to extract subdirectory ex: 'E:/master/R*/[0-9]*/'
    #:type regex_img: str
    #:param regex_img: regular expression to extract images ex: '*.png'
    #:type regex_img: str
    #:param min_subset: minimum number of items extracted to be considered ,defaults to 10
    #:type min_subset: int, optional
    #:returns: df_summary_left_dot,df_summary_left_num,error_list: Dataframe listing peaks for dot metadata located on the left of ionogram,  Dataframe listing peaks for number metadata located on the left of ionogram, list of filenames leading to errors
    #:rtype: `pandas.core.frame.DataFrame`,`pandas.core.frame.DataFrame`, list
    
    """
    # All the subdirectory i.e. R014207948/1743-9/
    list_all_subdir = glob.glob(regex_subdir)

    df_summary_left_dot = pd.DataFrame(columns=['meta_peaks_x','meta_peaks_y'] )
    df_summary_left_num = pd.DataFrame(columns=['meta_peaks_x','meta_peaks_y'] )
    error_list = []

    for i,subdir_name in enumerate(list_all_subdir):
        try:
            df_img,_,_ =segment_metadata(subdir_name, regex_images)
            df_img_subset = df_img[df_img['metadata_type']=='left']
            if i % 50 == 0:
                print(i)
            
            if len(df_img_subset.index) > min_subset + 1:
            
                '''
                Following Code pasted from translate_leftside_metadata.get_leftside_metadata
                '''
                # Centroids extraction
                df_img_subset['rotated_metadata'] = df_img_subset['trimmed_metadata'].map(lambda trimmed_meta: np.rot90(trimmed_meta,-1))
                kernel_dilation = np.ones((1,1),np.uint8)
                df_img_subset['dilated_metadata'] = df_img_subset['rotated_metadata'].map(lambda rotated_meta: cv2.dilate(rotated_meta,kernel_dilation))
                df_img_subset['x_centroids'],df_img_subset['y_centroids'] = zip(*df_img_subset.apply(lambda row: extract_centroids(row['dilated_metadata'],row['file_name']),1))
                _,loss_centroids_extraction = record_loss(df_img_subset,'metadata_translation.determine_leftside_metadata_grid_mapping.extract_centroids_and_determine_type',subdir_name)
                  
                # Remove files whose centroid metadata extraction was not successful
                df_img_subset = df_img_subset[~loss_centroids_extraction]

                '''
                Above Code pasted from translate_leftside_metadata.get_leftside_metadata
                '''
                # Determine metadata mapping for dot-type metadata and num-type metadata
                #df_dot_subset = df_img_subset[df_img_subset['is_dot'] == True]
                #df_num_subset = df_img_subset[df_img_subset['is_dot'] == False]
                

                #list_x_dot, list_y_dot,
                list_x_digit,list_y_digit = [0],[0]
                #if not df_dot_subset.empty:
                    #list_x_dot = list(chain(*df_dot_subset['x_centroids'].tolist()))
                    #list_y_dot = list(chain(*df_dot_subset['y_centroids'].tolist()))
                    #x_peaks_dot,y_peaks_dot = get_peaks(list_x_dot),get_peaks(list_y_dot )
                    #to_apend_dot = pd.DataFrame({'meta_peaks_x':[x_peaks_dot],'meta_peaks_y':[y_peaks_dot]} )
                    #df_summary_left_dot = df_summary_left_dot.append(to_apend_dot)
                
                if not df_num_subset.empty:
                    list_x_digit = list(chain(*df_num_subset['x_centroids'].tolist()))
                    list_y_digit = list(chain(*df_num_subset['y_centroids'].tolist()))
                    x_peaks_num,y_peaks_num = get_peaks(list_x_digit),get_peaks(list_y_digit )
                    to_apend_num = pd.DataFrame({'meta_peaks_x':[x_peaks_num],'meta_peaks_y':[y_peaks_num]} )
                    df_summary_left_num = df_summary_left_num.append(to_apend_num)

        except Exception:
            traceback.print_exc()
            error_list.append(subdir_name)
            print(subdir_name )
            
    return df_summary_left_num,error_list

def  plot_hist_peaks_grids(*all_df,
                          nbins=500):
    """ #Plots histogram to determine default values for the ionogram grids
    
    #:param *all_df: dataframes (from grid_default_values) whose values are to be plotted
    #:type *all_df: tuple of or single class: `pandas.core.frame.DataFrame`
    #:param nbins: number of bins used for histogram, defaults to 500
    #:type nbins: int, optional 
    """
    nrow = len(*all_df)
    fig,axes = plt.subplots(nrows=nrow,ncols=2)
    ax = axes.ravel()

    for i,df in enumerate(*all_df):
        peaks_x = list(chain.from_iterable(df['meta_peaks_x']))
        select_peaks_idx,bin_edges,counts = indices_highest_bin(peaks_x, 500, 0.2, 30)
        bin_centers = (0.5*(bin_edges[1:] + bin_edges[:-1]))
        peaks = bin_edges[np.array(select_peaks_idx)]
        ax[2*i].plot(bin_centers,counts)
        ax[2*i].plot(peaks,counts[select_peaks_idx], "x")
        print("peaks", peaks)
        ax[2*i].set_title('meta_peaks_x')

        peaks_y = list(chain.from_iterable(df['meta_peaks_y']))
        select_peaks_idx,bin_edges,counts = indices_highest_bin(peaks_y, 500, 0.2, 30)
        bin_centers = (0.5*(bin_edges[1:] + bin_edges[:-1]))
        peaks = bin_edges[np.array(select_peaks_idx)]
        ax[2*i+1].plot(bin_centers,counts)
        ax[2*i+1].plot(peaks,counts[select_peaks_idx], "x")
        print("peaks", peaks)
        ax[2*i+1].set_title('meta_peaks_y')


df_summary_left_num1, error_list1 = get_leftside_metadata_grid_peaks(regex_subdir= SD_PATH, regex_images='*.png')
#df_summary_left_dot2,df_summary_left_num2,error_list2 = get_leftside_metadata_grid_peaks(regex_subdir='G:/AlouetteData/Alouette Data/R*/[0-9]*[0-9]/', regex_images='Image*[0-9].png')
print (plot_hist_peaks_grids((df_summary_left_num1)))
 #plot_hist_peaks_grids((df_summary_left_dot2,df_summary_left_num2)) """
    

IndentationError: unexpected indent (4210310140.py, line 12)