### ISIS Gridding for Metadata
By: Salini Punchiwickrama

This notebook takes the gridding code developed during the Alouette project to now read the metadata for the ISIS ionograms. 
All the code can be found on either the scan2data file on GitHub under the ISIS-working branch or under the Summer of 2022 file on Livelink in the Alouette folder. 

In [1]:
# Import libraries
import sys
import numpy as np
import cv2
import glob
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pandas as pd
from itertools import chain
import ntpath
from scipy.signal import find_peaks
import os

Generates a random sub-directory and image

In [2]:

def gen_ran_subdir (subdir_path):
    """" Generates a random subdirectory
    Requires: 
    subdir_path: name of the path for the subdirectory
     L:/DATA/ISIS/ISIS_101300030772/b*/B1* """

    all_subs = glob.glob(subdir_path) #creates a list of all subdirects 
    selected_sub = all_subs[random.randint(0, len(all_subs)-1)] #picks a random one from list
    return (selected_sub)

def gen_ran_img (subdir_path, img):
    """" Generates a random image """

    all_img = glob.glob(subdir_path + img) #creates list of all images

    selected_img = all_img[random.randint(0, len(all_img) - 1)]
    return (selected_img)

In [3]:
#SD_Path = sub-directory path
#Test folder contains first 30 images from L:/DATA/ISIS/ISIS_101300030772/b3_R014207773/B1-35-31 ISIS B D-1059
SD_PATH = os.path.realpath("Test-Images")

#Image path
I_PATH = gen_ran_img(SD_PATH, '/*')

Subdir path: C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/Test-Images
Random image in subdir path C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/Test-Images\Image0024.png


#### Extracting metadata

In [4]:
### DICTIONARIES
# Labelling of coordinates
LABELS_NUM = [1,2,4,8]
LABELS_CAT_DOT = ['day_1','day_2','day_3','hour_1','hour_2','minute_1','minute_2','second_1', 'second_2','station_code']
#LABELS_CAT_DIGIT = ['satellite_number','year','day_1','day_2','day_3','hour_1','hour_2','minute_1','minute_2',
                    #'second_1', 'second_2', 'station_number_1','station_number_2']
LABELS_CAT_NUM = ['Operating Mode 1','Operating Mode 2','Station Number 1', 'Station Number 2', 'Year 1', 'Year 2', 'Day 1', 'Day 2', 'Day 3',
                    'Hour 1', 'Hour 2', 'Min 1', 'Min 2', 'Sec 1', 'Sec 2']
LABELS_DICT = ['dict_cat_digit','dict_num_digit','dict_cat_dot','dict_num_dot',]

#Defaults for dictionary mappings of coordinates to labels
DEFAULT_DICT_CAT_DIGIT = (53,21,661) #mean_dist_default,first_peak_default,last_peak_default
DEFAULT_DICT_NUM_DIGIT = (47,41,20) #mean_dist_default,first_peak_default,dist_btw_peaks for peak detection

#DEFAULT_DICT_CAT_DIGIT_F = (43,23,540) #mean_dist_default,first_peak_default,last_peak_default for those in LIST_DIRECTORY_DOTS 
#DEFAULT_DICT_NUM_DIGIT_F = (40,37,20) #mean_dist_default,first_peak_default,dist_btw_peaks for peak detection for those in LIST_DIRECTORY_DOTS 

#DEFAULT_DICT_CAT_DOT = (59,20,549)##mean_dist_default,first_peak_default,last_peak_default
#DEFAULT_DICT_NUM_DOT = (15,32,10) #mean_dist_default,first_peak_default,dist_btw_peaks for peak detection

#### Base Functions

In [5]:
# From helper_functions.py
def record_loss (df, function_name, subdir_location, columns_to_extract = None, loss_extraction = None):
    """Create dataframe that records loss."""

    if columns_to_extract is None:
        columns_to_extract = ['file_name']
    if loss_extraction is None:
        loss_extraction = []
    if len(loss_extraction) == 0:
        # function should return NA if there an error
        loss_extraction = df.isna().any(axis=1)
    # Record the files whose extraction was not successful
    df_loss_extraction = df[loss_extraction].copy()
    df_loss_extraction = df_loss_extraction[columns_to_extract]
    df_loss_extraction['func_name'] = function_name
    df_loss_extraction[ 'subdir_name'] = subdir_location

    return df_loss_extraction, loss_extraction


# From scan2data > image_segmentation > trim_raw_metadata > connected_components_metadata_location
def metadata_location(meta, min_count = 50, max_count = 1000):
    """"Use connected component algorithm to find the location of the metadata"""
    
    #run algorithm on metadata section
    _, labelled = cv2.connectedComponents(meta)

    #Dictionary of label:counts
    unique, counts = np.unique(labelled, return_counts = True)
    dict_components = dict(zip(unique, counts))

    #Remove outliers // Remove pixels not part of metadata
    dict_subset = {}
    dict_outlier = {}
    for k,v, in dict_components.items():
        if v > min_count and v < max_count:
            dict_subset[k] = v
        else:
            dict_outlier[k] = v
    
    if key_list_to_remove := list(dict_outlier.keys()):
        for k in key_list_to_remove:
            labelled[labelled == k] = 0
    
    return labelled

#From scan2data > image_segmentation > trim_raw_metadata
#test provided values and change if needed
def bottomside_metadata_trimming(connected_meta, opened_meta,
                                 h_window = 100, w_window = 700, starting_y = 0, starting_x = 15, step_size = 10, trim_if_small = 10):
    """Sliding window method to locate and trim bottomside metadata"""
    
    def sliding_window(image, starting_y, starting_x, h_window, w_window, step_size):
        """sliding window generator object"""
        h_img, w_img = np.shape(image)
        for y in range(starting_y, h_img - h_window, step_size):
            for x in range(starting_x, w_img - w_window, step_size):
                yield y,x,image[y:y + h_window, x:x + w_window]
    
    h_raw, w_raw = np.shape(opened_meta)
    
    if h_window + step_size  >= h_raw:
        h_window = h_raw -trim_if_small
    if w_window + step_size>= w_raw:
        w_window = w_raw -trim_if_small
    
    s = sliding_window(connected_meta, starting_y, starting_x, h_window, w_window, step_size)
    
    max_window = connected_meta[starting_y:h_window+starting_y,
                 starting_x:w_window+starting_x ]
    max_mean = np.mean(max_window)
    y_max= starting_y
    x_max = starting_x
    
    for y, x, window in s:
        tmp = window
        mean = np.mean(tmp)
        if mean > max_mean:
            max_window = tmp
            max_mean  = mean
            y_max = y
            x_max = x

    trimmed_metadata =  opened_meta[y_max:y_max + h_window, x_max:x_max + w_window]

    return trimmed_metadata

#From scan2data > image_segmentation > trim_raw_metadata
def trimming_metadata(raw_metadata,type_metadata, opening_kernal_size = (3,3), median_kernal_size = 5):
    """"Trim the rectangle containing metadata to smallest workable area."""

    try:
        #Filtering to reduce noise
        median_filtered_meta = cv2.medianBlur(raw_metadata, median_kernal_size)
        
        #Opening operation: Eroision + Dilation
        kernal_opening = np.ones(opening_kernal_size, dtype = np.uint8)
        opened_meta = cv2.morphologyEx(median_filtered_meta, cv2.MORPH_OPEN, kernal_opening)
        

        # Binarization
        _, metadata_binary = cv2.threshold(opened_meta, 127, 255, cv2.THRESH_BINARY)

        #Run connected components algorithm
        connected_meta = metadata_location(metadata_binary)

        trimmed_metadata = bottomside_metadata_trimming(connected_meta, metadata_binary)
        #bottomside_metadata_trimming function is from same location

        #Checking
        #cv2.imshow("test", trimmed_metadata)
        #cv2.waitKey(0)
        return (trimmed_metadata)
    except:
        return (np.nan)
    

##From scan2data > metadata_translation > leftside_metdata_grid_mapping
def indices_highest_bin(list_coords, nbins = 500, peak_threshold = 0.2, distance_bwtn_peaks = 30 ):
    """" returns indices of most common values using binning
    list_coords: (np.arrray)"""

    arr_coord= np.array(list_coords)

    mean_coords = np.mean(arr_coord)
    std_coords = np.std(arr_coord)
    no_outlier_coords = arr_coord[np.abs(arr_coord - mean_coords) < 3 * std_coords]

    #Binning
    counts, bin_edges = np.histogram(no_outlier_coords, bins=nbins)

    #Detect peaks
    counts_norm = (counts - np.min(counts)) / (np.map(counts) - np.min(counts))
    select_peaks = find_peaks(counts_norm, distance = distance_bwtn_peaks, promience = peak_threshold)    

    return select_peaks, bin_edges, counts

#From scan2data > metadata_translation> leftside_metadata_grid_mapping 
def extract_centroids(cut_metadata, file_name, min_pixels = 50, max_pixels = 1000, max_area_dot = 120):
    """Takes in cut metadata and extracts centroids
    
    cut_metadata: np.array"""

    try:
        _, __, stats, centroids = cv2.connectedComponentsWithStats(cut_metadata) 
        area_centroids = stats[:,-1]

        centroids_metadata = centroids[np.logical_and(area_centroids > min_pixels, area_centroids < max_pixels),:]
        #^ consider adjusting min and max range

        zip_centroids = list(zip(*centroids_metadata))
        #print ("ZIPPED CENTROIDS", zip_centroids)
        col_centroids = list((zip_centroids[0]))
        #print ("col", col_centroids)
        #for i in zip_centroids:
            #print (i)
        row_centroids = list((zip_centroids[1]))

        #Determine dot type (temp)
        #area_centroids = area_centroids[np.logical_and(area_centroids > min_num_pixels, area_centroids < max_number_pixels)]
        #median_area = np.median(area_centroids)
        #The line below is commented to prevent giving the dot items manually
        #if any([dir_dot in file_name for dir_dot in LIST_DIRECTORY_DOTS]) and median_area < max_area_dot:
        #is_dot = median_area < max_area_dot

        return col_centroids, row_centroids
    
    except:
        return np.nan, np.nan

#From scan2data > image_segmentation > extract_ionogram_from_scan
#For segment_metadata
def limits_ionogram(raw_img, row_or_col, starting_index_col = 15):

    mean_values = np.mean(raw_img, row_or_col)

    #normalize mean
    norm_mean = (mean_values - np.min(mean_values)) / np.max(mean_values)
    thresh = np.mean(norm_mean)

    if row_or_col == 0:
        #Protect against scans that includes cuts from another ionogram
        limits = [i for i, mean in enumerate(norm_mean) if mean > thresh and i > starting_index_col]
    else:
        limits = [i for i, mean in enumerate(norm_mean) if mean > thresh]

    return limits[0], limits[-1]


#From scan2data > image_segmentation > extract_ionogram_from_scan
# For segment_metadata
def extract_ionogram(raw_img_array):
    """"this function is here for now to get limits of ionogram. 
    can later be changed to include ionogram graph"""
    try:

    #Extract coordinate delimiting the graph
        x_left, x_right = limits_ionogram(raw_img_array, 0)
        y_upper, y_lower = limits_ionogram(raw_img_array, 1)

        limits = [x_left, x_right, y_upper, y_lower]
        #ionogram = raw_img_array[y_upper:y_lower,x_left_:x_right]

        #For checking the metadata part of image
        imgMetadataPart = raw_img_array[y_upper:y_lower, 15:(x_left - 1)]
        #cv2.imshow("test Metadata", imgMetadataPart)
        #cv2.waitKey(0)

        return (limits)
    except:
        return (np.nan)


#From scan2data > image_segmentation > extract_metadata_from_scan
def extract_metadata (raw_img, limits_iono):
    """Extract metadata from raw scanned image and return coordinates delimiting its limits"""

    #Limits for ionogram
    #print (limits_iono)
    x_left_lim = limits_iono[0][0]
    x_right_lim = limits_iono[0][1]
    y_upper_lim = limits_iono[0][2] 
    y_lower_lim = limits_iono[0][3]

    #Extract retangular block below** ionogram
    rect_left = raw_img[:,0:x_left_lim]
    rect_right = raw_img[:,x_right_lim::]
    rect_top = raw_img[0:y_upper_lim, :]
    rect_bottom = raw_img[y_lower_lim:: ,:]

    #Assumption: the location of the metadata will correspond to rectangle with the highest area
    rect_list = [rect_left, rect_right, rect_top, rect_bottom]
    rect_areas = [rect.shape[0] * rect.shape[1] for rect in rect_list]
    dict_mapping_meta = {0:'left', 1:"right", 2:"top", 3:'bottom'}

    type_metadata_idx = np.argmax(rect_areas)
    raw_metadata = rect_list[type_metadata_idx]
    type_metadata = dict_mapping_meta[type_metadata_idx]

    
    return (type_metadata, raw_metadata)

#From scan2data > metadata_translation > translate_leftside_metadata
def map_coord_to_metadata(list_cat_coord,list_num_coord,dict_mapping_cat, dict_mapping_num):
    """Map coordinate of metadata centroids to information
    
    :param list_cat_coord: list of metadata positions to map to categories   
    :type list_cat_coord: list
    :param list_num_coord: list of metadata positions to map to numbers 
    :type list_num_coord: list
    :param dict_mapping_cat: dictionary used to map coordinate positions to categories
    :type dict_mapping_cat: dict
    :param dict_mapping_num: dictionary used to map coordinate positions to numbers
    :type dict_mapping_num: dict
    :returns: dict_metadata
    :rtype: dict
    
    """
    
        
    list_coord = list(zip(list_cat_coord,list_num_coord))
    coord_mapping_cat = dict_mapping_cat.keys()
    coord_mapping_num = dict_mapping_num.keys()
        
    dict_metadata={}
    for cat_coord, num_coord in list_coord:
        cat_key = min(coord_mapping_cat, key=lambda x:abs(x-cat_coord))
        num_key = min(coord_mapping_num, key=lambda x:abs(x-num_coord))
            
        cat = dict_mapping_cat[cat_key]
        num = dict_mapping_num[num_key]
            
            # tODO: improve for many num
        if cat in dict_metadata:
            dict_metadata[cat].append(num)
        else:
            dict_metadata[cat] = [num]
    #print (dict_metadata)
    return dict_metadata
    #except:
        #return np.nan
    
# From scan2data > metadata_translation > leftside_metadata_grid_mapping
def get_leftside_metadata_grid_mapping(list_x_digit,list_y_digit,dir_name,
                      difference_ratio=0.75,use_defaults=True):
    
    """Determines and returns the the mapping between coordinate values on a metadata image 
    and metadata labels in a subdirectory, for metadata of types dot and digits, as well as returns 
    the histogram used to generate each mapping
    
    """
    # Dictionary of dictionaries that map labels to coordinate point in metadata
    all_labels = [LABELS_CAT_NUM, LABELS_NUM]
    all_dict_mapping = {}
    all_dict_hist = {}
    # Different protocols depending on the type of dictionary mappings
    for i, list_coord in (enumerate([list_x_digit,list_y_digit])):
        type_dict = LABELS_DICT[i]
        labels = all_labels[i]
        try:
            if 'cat' in type_dict:
                if type_dict == 'dict_cat_digit':
                    #if any([dir_dot in dir_name for dir_dot in LIST_DIRECTORY_DOTS]):
                        #mean_dist_default,first_peak_default,last_peak_default=DEFAULT_DICT_CAT_DIGIT_F
                    #else:
                    mean_dist_default,first_peak_default,last_peak_default = DEFAULT_DICT_CAT_DIGIT
            
                elif type_dict == 'dict_cat_dot':
                    mean_dist_default,first_peak_default,last_peak_default = DEFAULT_DICT_CAT_DOT
                try:
                    idx_peaks,bin_edges,counts = indices_highest_bin(list_coord)
                    peaks = bin_edges[np.array(idx_peaks)] #coordinate values on a metadata image probably corresponding to metadata
                    
                    n_labels = len(labels)
                    first_peak = peaks[0]
                    last_peak = peaks[-1]

                    if use_defaults and abs(last_peak -last_peak_default)  > difference_ratio*mean_dist_default:
                        last_peak = last_peak_default
                    if use_defaults and abs(first_peak -first_peak_default)  > difference_ratio*mean_dist_default:
                        first_peak = first_peak_default
                    
                    mean_dist_btw_peaks = (last_peak - first_peak)/(n_labels -1)
                    list_peaks = [int(round(first_peak + i* mean_dist_btw_peaks)) for i in range(0,n_labels )]
                    
                    all_dict_mapping[type_dict] =dict(zip(list_peaks,labels))
                    all_dict_hist[type_dict] = (idx_peaks,bin_edges,counts)
                

                except:
                    last_peak = last_peak_default
                    first_peak = first_peak_default
                    mean_dist_btw_peaks = mean_dist_default
                    n_labels = len(labels)
                    list_peaks = [int(round(first_peak + i* mean_dist_btw_peaks)) for i in range(0, n_labels)]
                    
                    all_dict_mapping[type_dict] =dict(zip(list_peaks,labels))
                    all_dict_hist[type_dict] = {}
                
            elif 'num' in type_dict:
                if  type_dict == 'dict_num_digit':
                    #if any([dir_dot in dir_name for dir_dot in LIST_DIRECTORY_DOTS]):
                        #mean_dist_default,peak_0_default,dist_btw_peaks = DEFAULT_DICT_NUM_DIGIT_F
                    #else:
                    mean_dist_default,peak_0_default,dist_btw_peaks = DEFAULT_DICT_NUM_DIGIT
                elif type_dict == 'dict_num_dot':
                    mean_dist_default,peak_0_default,dist_btw_peaks= DEFAULT_DICT_NUM_DOT

                    
                try:
                    idx_peaks,bin_edges,counts = indices_highest_bin(list_coord,peak_prominence_threshold=0.3,nbins=100,distance_between_peaks=dist_btw_peaks)
                
                    peaks = bin_edges[np.array(idx_peaks)]                
                    peak_0 = peaks[0]
                    if use_defaults and abs(peak_0 -peak_0_default)  > difference_ratio*mean_dist_default:
                        peak_0 = peak_0_default
                
                    # only first three peaks are deemed relevant
                    if len(peaks) < 3:
                        max_idx = 2
                    else:
                        max_idx = 3
                
                    mean_dist_btw_peaks = np.mean([peaks[i+1]-peaks[i] for i in range(0,max_idx)])
                    if use_defaults and abs(mean_dist_btw_peaks - mean_dist_default)  > difference_ratio*dist_btw_peaks:
                        mean_dist_btw_peaks = mean_dist_default
                    list_peaks = [int(round(peak_0 + i* mean_dist_btw_peaks)) for i in range(0,len(labels))]
                
                    all_dict_mapping[type_dict] =dict(zip(list_peaks,labels))
                    all_dict_hist[type_dict] = (idx_peaks,bin_edges,counts)
                except:
                    peak_0 = peak_0_default
                    mean_dist_btw_peaks = mean_dist_default
                    list_peaks = [int(round(peak_0 + i* mean_dist_btw_peaks)) for i in range(0,len(labels))]
                    all_dict_mapping[type_dict] =dict(zip(list_peaks,labels))
                    all_dict_hist[type_dict] =  {}
        except:
            all_dict_mapping[type_dict] ={}
            all_dict_hist[type_dict] =  {}

    return all_dict_mapping, all_dict_hist

#From scan2data> image_segmentation > segment_images_in_subdir.py
#some variables here are not necessary and can be removed, ie. height, width...
def segment_metadata(subdir_location, regex_img, min_bottom_height = 25, cutoff_width=300, cutoff_height=150):
    """Should only segment metadata. Can be adjusted to include ionogram. """
    regex_raw_image =  SD_PATH + ("/*")
    #print ("the raw images path is:", regex_raw_image)
    list_images = glob.glob(regex_raw_image)
    
    #Dataframe is processing
    df_img = pd.DataFrame(data = {"file_name": list_images})
    #Read each image in a 2D UTF-8 grayscale array
    df_img["raw"] = df_img['file_name'].map(lambda file_name: cv2.imread(file_name, 0))

    # Extract ionogram and coordinates delimiting its limits
    df_img['limits']= list(zip(df_img['raw'].map(lambda raw_img: extract_ionogram(raw_img)))) 
    # Record the files whose ionogram extraction was not successful
    df_loss_ion_extraction, loss_ion_extraction = record_loss(df_img,'image_segmentation.extract_ionogram_from_scan.extract_ionogram',subdir_location)
    df_img = df_img[~loss_ion_extraction]
    #df_img['height'],df_img['width'] = list(zip(df_img['ionogram'].map(lambda array_pixels: array_pixels.shape)))
    
    #Raw metadata
    df_tmp = (df_img.apply(lambda row: extract_metadata(row['raw'], row['limits']), axis = 1, result_type = 'expand'))
    df_img = df_img.assign(metadata_type = df_tmp[0])
    df_img = df_img.assign(raw_metadata = df_tmp[1])
    #extract_metadata is function from extract_metadata_from_scan
    
    # There should be no metadata on left and top, especially after flipping
    outlier_metadata_location = np.any([df_img['metadata_type'] == 'right',df_img['metadata_type']=='top', df_img['metadata_type'] == 'left'], axis=0)
    df_outlier_metadata_location ,_ =  record_loss(df_img,'image_segmentation.segment_images_in_subdir.segment_images: metadata not on left or bottom',subdir_location,
                                         ['file_name','metadata_type'],outlier_metadata_location )
    
    if not df_outlier_metadata_location.empty:
        df_outlier_metadata_location['details'] = df_outlier_metadata_location.apply(lambda row: str(row['metadata_type']),1)
        df_outlier_metadata_location = df_outlier_metadata_location[['file_name','func_name','subdir_name','details']]
    else:
        df_outlier_metadata_location = df_outlier_metadata_location[['file_name','func_name','subdir_name']]
    
    # Remove loss from detected metadata not being on the left or bottom
    df_img = df_img[~outlier_metadata_location]

    #Trimmed metadata
    #df_img['trimmed_metadata'] = list(zip(*df_img.apply(lambda row: trimming_metadata(row['raw_metadata'], row['metadata_type']), axis = 1, result_type = 'expand')))
    df_trim_tmp = (df_img.apply(lambda row: trimming_metadata(row["raw_metadata"], row['metadata_type']), axis = 1))
    df_img = df_img.assign(trimmed_metadata = df_trim_tmp)
    df_loss_trim, loss_trim = record_loss(df_img, 'image_segmentation.trim_raw_metadata.ntrimming_metadata', subdir_location)
    #trimming_metadata is a function from trim_raw_metadata
    #record_loss is a function from helper_functions
    
    df_img = df_img[~loss_trim]

    # Check if metadata too small
    #df_map_tmp = df_img['trimmed_metadata'].map(lambda array_pixels: np.shape(array_pixels), axis = 1)
    df_map_tmp = df_img.apply(lambda row: np.shape(row['trimmed_metadata']), axis = 1, result_type = 'expand')
    print (df_map_tmp)
    df_img = df_img.assign(meta_height = df_map_tmp[0])
    df_img = df_img.assign(meta_width = df_map_tmp[1])

    outlier_size_metadata = (np.logical_and(df_img['metadata_type'] == 'bottom', df_img['meta_height'] < min_bottom_height))    
        
    df_outlier_metadata_size, _ = record_loss(df_img,'image_segmentation.segment_images_in_subdir.segment_images: metadata size outlier',subdir_location,
                                           ['file_name','metadata_type','meta_height','meta_width'],outlier_size_metadata)

    if not df_outlier_metadata_size.empty:
        df_outlier_metadata_size['details'] = df_outlier_metadata_size.apply(lambda row: row['metadata_type'] + '_height: ' + \
                                                    str(row['meta_height'])+',width: ' + str(row['meta_width']),1)
        df_outlier_metadata_size = df_outlier_metadata_size[['file_name','func_name','subdir_name','details']]
        
    else:
        df_outlier_metadata_size = df_outlier_metadata_size[['file_name','func_name','subdir_name']]
    
    # Remove files whose metadata too small
    df_img = df_img[~outlier_size_metadata]
    
    # Dataframe recording loss from programming errors
    df_loss = pd.concat([df_loss_ion_extraction, df_loss_trim])
    
    # Dataframe recording loss from various filters i.e. metadata too small, ionogram too small/big
    df_outlier = pd.concat([df_outlier_metadata_location, df_outlier_metadata_size])

    return df_img,  df_loss, df_outlier
# function can also return df_loss, df_outlier



#### Processing Functions

In [6]:
#From scan2data > metadata_translation > translate_leftside_metadata.py
def get_bottomside_metadata (df_img, subdir_location, kernal_size =(1, 1)):
    """Reads the metadata"""

    kernel_dilation = np.ones(kernal_size, np.uint8)

    #df_dilate_tmp = df_img.apply(lambda trimmed_meta: cv2.dilate(trimmed_meta, kernel_dilation))
    df_dilate_tmp = df_img['trimmed_metadata'].map(lambda trimmed_meta: cv2.dilate(trimmed_meta, kernel_dilation))
    df_img = df_img.assign(dilated_metadata = df_dilate_tmp)


    #df_img['x_centroids'], df_img['y_centroids'], df_img['is_dot'] = zip(*df_img.apply(lambda row: extract_centroids(row['dilated_metadata'], row['file_name']), 1))
    df_cent_tmp = df_img.apply(lambda row: extract_centroids(row['dilated_metadata'], row['file_name']), axis = 1, result_type = 'expand')
    #df_cent_tmp.to_csv("C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/output-cent-tmp.csv")
    df_img = df_img.assign(x_centroids = df_cent_tmp[0])
    df_img = df_img.assign(y_centroids = df_cent_tmp[1])

    df_loss_centroids_extraction, loss_centroids_extraction = record_loss(df_img,'metadata_translation.determine_metadata_grid_mapping.extract_centroids_',subdir_location)
    # extract_centroids and record_loss are two other functions 

    # Remove files where the extraction didn't work
    df_img = df_img[~loss_centroids_extraction]
    # ^removes them from the main dataframe
    
    #df_num_subset = df_img[np.invert(np.array(df_img['is_dot']))]
    list_x_digit, list_y_digit = [0], [0]

    list_x_digit = list(chain(*df_img['x_centroids'].tolist()))
    list_y_digit = list(chain(*df_img['y_centroids'].tolist()))
    dict_mapping, dict_hist = get_leftside_metadata_grid_mapping(list_x_digit, list_y_digit, subdir_location)


    # Determine the value of metadata based on the mappings
    #print (dict_mapping)
    df_dict_meta = df_img.apply(lambda row: map_coord_to_metadata(row['x_centroids'], row['y_centroids'],
                                                                      dict_mapping['dict_cat_digit'],
                                                                      dict_mapping['dict_num_digit']), axis = 1)
    df_img = df_img.assign(dict_metadata = df_dict_meta)
    #df_img.to_csv("C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/01-mapping-to-output.csv")
    
    df_loss_mapping, loss_mapping = record_loss(df_img,'map_coord_to_metadata', subdir_location)
    df_img = df_img[~loss_mapping]
    
    
    df_loss = pd.concat([df_loss_centroids_extraction, df_loss_mapping],ignore_index=True)

    return df_img, df_loss, dict_mapping, dict_hist

#From process_directory.py
def process_subdir(subdir_path, regex_images):
    """Transform raw scanned images in subdir into information"""

    #Run segment_images on subdir
    df_img, df_loss, df_outlier = segment_metadata(subdir_path, regex_images)
    #df_img.to_csv("C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/segment-output.csv")

    #Translate metadata on bottom
    #df_img_bottom =  df_img.loc[df_img['metadata_type'] == 'bottom']
    df_img_bottom, df_loss_meta_bottom, _, __ = get_bottomside_metadata(df_img, subdir_path) #from metadata_translation.translate_bottomside_metadata
    df_img_bottom.to_csv("C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/bottomside-output.csv")
    df_loss = pd.concat([df_loss_meta_bottom], ignore_index=True)

    df_processed = df_img_bottom
    #df_loss = pd.concat([df_loss_coord_bottom], ignore_index=True)
    return df_processed, df_loss, df_outlier

print (process_subdir(SD_PATH, '/*'))

# Currently not being used
def process_df_bottomside_metadata(df_processed, subdir_name, source_dir):

    df_final_data = df_processed[['file_name', 'dict_metadata']]
    df_final_data['subdir_name'] = subdir_name
    labels = ['Operating Mode 1','Operating Mode 2','Station Number 1', 'Station Number 2', 'Year 1', 'Year 2', 'Day 1', 'Day 2', 'Day 3'
                    'Hour 1', 'Hour 2', 'Min 1', 'Min 2', 'Sec 1', 'Sec 2']

    for label in labels:
        df_final_data[label] = df_final_data['dict_metadata'].map(
            lambda dict_meta: sum(dict_meta[label]) if label in dict_meta.keys() else 0)

    del df_final_data['dict_metadata']

    #df_final_data['year'] = df_final_data['year'] + 1900
    #df_final_data['day'] = df_final_data['day_1'].astype(str) + df_final_data['day_2'].astype(str) + df_final_data['day_3'].astype(str) 
    #df_final_data['hour'] = df_final_data['hour_1'].astype(str) + df_final_data['hour_2'].astype(str) 
    #df_final_data['minute'] = df_final_data['minute_1'].astype(str) + df_final_data['minute_2'].astype(str) 
    #df_final_data['second'] = df_final_data['second_1'].astype(str) + df_final_data['second_2'].astype(str) 
    #df_final_data['station_number'] = df_final_data['station_number_1'].astype(str) + df_final_data['station_number_2'].astype(str) 
    #df_final_data['day'] = df_final_data['day'].astype(int)
    #df_final_data['hour'] = df_final_data['hour'].astype(int)
    #df_final_data['minute'] = df_final_data['minute'].astype(int)
    #df_final_data['second'] = df_final_data['second'].astype(int)
    #df_final_data['station_number'] = df_final_data['station_number'].astype(int)

    df_final_data['Operating_Mode'] = df_final_data["Operating Mode 1"].astype(str) + df_final_data["Operating Mode 2"].astype(str) 
    df_final_data["Station_Number"] = df_final_data['Station Number 1'].astype(str) + df_final_data['Station Number 2'].astype(str)
    df_final_data['Year'] = (df_final_data['Year 1'].astype(str) + df_final_data['Year 2'].astype(str)).astype(int) + 1900
    df_final_data["Day"] = (df_final_data['Day 1'].astype(str) + df_final_data['Day 2'].astype(str) + df_final_data['Day 3'].astype(str)).astype(int)
    df_final_data["Hour"] = (df_final_data['Hour 1'].astype(str) + df_final_data['Hour 2'].astype(str)).astype(int)
    df_final_data["Min"] = (df_final_data['Min 1'].astype(str) + df_final_data['Min 2'].astype(str)).astype(int)
    df_final_data["Sec"] = (df_final_data['Sec 1'].astype(str) + df_final_data['Sec 2'].astype(str)).astype(int)


    #if len(df_final_data) > 0:          
        #code_list_of_station_after1965 = pd.read_csv(source_dir + 'Post_July_1_1965_Code_List_Station.csv')
        #code_list_of_station_before1963 = pd.read_csv(source_dir + 'Pre_1963_Code_List_Station.csv')
        #code_list_of_station_between1963_1964 = pd.read_csv(source_dir + '1963_1964.csv')
        #df_result_after1965 = pd.merge(df_final_data.loc[df_final_data['year'] >= 1965], 
                                       #code_list_of_station_after1965, on='station_number')
        #df_result_before1963 = pd.merge(df_final_data.loc[df_final_data['year'] <= 1963],
                                        #code_list_of_station_before1963, on='station_number')
        #df_result_mid1964 = pd.merge (df_final_data.loc[df_final_data['year'] == 1964],
                                        #code_list_of_station_between1963_1964, on = 'station_number')
        #df_final_result = pd.concat([df_result_before1963, df_result_mid1964, df_result_after1965]).reset_index(drop=True)
        #df_final_result = df_result_before1963.append(df_result_after1965.append(df_result_mid1964, ignore_index=True)) #Why was pd.concat not used?        
    #else:
        #df_final_result = pd.DataFrame()
    
    '''if len(df_final_data['year']) != 0:  # and df_final_data['year'] >= 1965:
        code_list_of_station = pd.read_csv(source_dir+'Post_July_1_1965_Code_List_Station.csv')
    #else:
    #    code_list_of_station = pd.read_csv(source_dir + 'Pre_July_1_1965_Code_List_Station.csv')
    df_final_result = pd.merge(df_final_data,code_list_of_station, on='station_number')
    code_list_of_station_after1965 = pd.read_csv(source_dir + 'Post_July_1_1965_Code_List_Station.csv')
    code_list_of_station_before1963 = pd.read_csv(source_dir + 'Pre_1963_Code_List_Station.csv')
    code_list_of_station_between1963_1964 = pd.read_csv(source_dir + '1963_1964.csv')
    df_result_after1965 = pd.merge(df_final_data.loc[df_final_data['year'] >= 1965], code_list_of_station_after1965,
                                   on='station_number')
    df_result_before1963 = pd.merge(df_final_data.loc[df_final_data['year'] <= 1963],
                                    code_list_of_station_before1963, on='station_number')
    df_result_mid1964 = pd.merge (df_final_data.loc[df_final_data['year'] == 1964],
                                    code_list_of_station_between1963_1964, on = 'station_number')
    df_final_result = df_result_before1963.append(df_result_after1965.append(df_result_mid1964, ignore_index=True))'''
    
    return df_final_data

source_dir = "C:/Users/spunchiwickrama/Documents/Projects/ISIS_I/"
#print (process_df_bottomside_metadata(process_subdir(SD_PATH, '/*')[0], SD_PATH, source_dir))

     0    1
0   43  700
1   44  700
2   44  700
3   43  700
4   44  700
5   44  700
6   44  700
7   44  700
8   44  700
9   44  700
10  43  700
11  43  700
12  44  700
13  42  322
17  44  700
18  45  700
19  45  700
20  44  700
21  45  700
22  44  700
23  44  700
24  44  700
25  45  700
26  45  700
27  45  700
28  45  700
29  44  700
(                                            file_name  \
0   C:/Users/spunchiwickrama/Documents/Projects/IS...   
1   C:/Users/spunchiwickrama/Documents/Projects/IS...   
2   C:/Users/spunchiwickrama/Documents/Projects/IS...   
4   C:/Users/spunchiwickrama/Documents/Projects/IS...   
5   C:/Users/spunchiwickrama/Documents/Projects/IS...   
6   C:/Users/spunchiwickrama/Documents/Projects/IS...   
7   C:/Users/spunchiwickrama/Documents/Projects/IS...   
9   C:/Users/spunchiwickrama/Documents/Projects/IS...   
13  C:/Users/spunchiwickrama/Documents/Projects/IS...   
17  C:/Users/spunchiwickrama/Documents/Projects/IS...   
18  C:/Users/spunchiwickrama/Documen