### ISIS Gridding for Metadata (in-progress)

In [None]:
# Import libraries
import glob
import ntpath
import os
import random
import sys
from itertools import chain

import cv2
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.signal import find_peaks

#### Generate a random image
Picks a random image from a randomly chosen subdirectory and displays it.

In [None]:

def gen_ran_subdir(subdir_path):
    """Pick one random path among those matching a glob pattern.

    Requires:
    subdir_path: glob pattern for the candidate subdirectories, e.g.
     L:/DATA/ISIS/ISIS_101300030772/b*/B1*

    Returns: one of the matching paths, chosen at random.

    Raises: ValueError if nothing matches subdir_path.
    """
    all_subs = glob.glob(subdir_path)  # every path matching the pattern
    if not all_subs:
        # Report the offending pattern instead of the opaque "empty range"
        # error the random module would raise on an empty list.
        raise ValueError(
            "No subdirectory matches pattern: {!r}".format(subdir_path))
    return random.choice(all_subs)

def gen_ran_img(subdir_path, img):
    """Pick one random file among those matching subdir_path + img.

    Requires:
    subdir_path: path of the subdirectory to search
    img: glob suffix appended to subdir_path as-is, e.g. "/*"

    Returns: one of the matching paths, chosen at random.

    Raises: ValueError if nothing matches the combined pattern.
    """
    pattern = subdir_path + img
    all_img = glob.glob(pattern)  # every path matching the pattern
    if not all_img:
        # Report the offending pattern instead of the opaque "empty range"
        # error the random module would raise on an empty list.
        raise ValueError("No image matches pattern: {!r}".format(pattern))
    return random.choice(all_img)

# Random subdirectory first, then a random file inside it
img_path = gen_ran_img(
    gen_ran_subdir("L:/DATA/ISIS/ISIS_101300030772/b*/B1*"),
    "/*",
)

# Print the chosen path and show the full image (waits for a key press)
print(img_path)
img = mpimg.imread(img_path)
cv2.imshow("image", img)
cv2.waitKey(0)

# Crop image: keep the rows from 85% of the height down to the bottom edge.
# NOTE(review): the original note asked for the bottom ~20%, but the 0.85
# factor keeps roughly the bottom 15% -- confirm which one is intended.
height, width = img.shape[:2]

y, x = height, width
h = int(y * 0.85)

crop_img = img[h:y, :x]
cv2.imshow("cropped image", crop_img)
cv2.waitKey(0)


#### Extracting metadata

In [None]:
# From helper_functions.py
def record_loss(df, function_name, subdir_location, columns_to_extract=None, loss_extraction=None):
    """Create dataframe that records loss.

    df: dataframe being processed
    function_name: value written to the 'func_name' column of the loss rows
    subdir_location: value written to the 'subdir_name' column of the loss rows
    columns_to_extract: columns of df copied into the loss dataframe
        (defaults to ['file_name'])
    loss_extraction: boolean mask of the rows counted as lost; when omitted
        or empty, every row containing at least one NA value is counted

    Returns (df_loss_extraction, loss_extraction): the loss dataframe and the
    mask that was used to build it.
    """
    # None sentinels instead of mutable list defaults shared across calls
    if columns_to_extract is None:
        columns_to_extract = ['file_name']

    if loss_extraction is None or len(loss_extraction) == 0:
        # Failed steps are expected to leave NA behind
        loss_extraction = df.isna().any(axis=1)

    # Select rows and columns in one step and copy, so the two columns added
    # below are written to an independent frame rather than a slice of df.
    df_loss_extraction = df.loc[loss_extraction, list(columns_to_extract)].copy()
    df_loss_extraction["func_name"] = function_name
    df_loss_extraction["subdir_name"] = subdir_location

    return df_loss_extraction, loss_extraction


# location?
def metadata_location(meta, min_count = 50, max_count = 1000):
    """Use connected component algorithm to find the location of the metadata.

    Labels the connected components of `meta` and resets to 0 every label
    whose pixel count is not strictly between min_count and max_count.

    meta: image handed to cv2.connectedComponents
        (presumably the binarized metadata region -- confirm with caller)
    min_count: labels with this many pixels or fewer are removed
    max_count: labels with this many pixels or more are removed

    Returns the label image with the out-of-range labels set to 0.
    """
    # Run algorithm on metadata section
    _, labelled = cv2.connectedComponents(meta)

    # Pixel count of every label present in the image
    unique, counts = np.unique(labelled, return_counts = True)

    # Remove outliers // Remove pixels not part of metadata.
    # One mask over the whole image replaces a full-image pass per label.
    outlier_labels = unique[(counts <= min_count) | (counts >= max_count)]
    labelled[np.isin(labelled, outlier_labels)] = 0

    return labelled

#From scan2data > image_segmentation > trim_raw_metadata
#test provided values and change if needed
def bottomside_metadata_trimming(connected_meta, opened_meta,
                                 h_window = 100, w_window = 700, starting_y = 0, starting_x = 15, step_size = 10, trim_if_small = 10):
    """Sliding window method to locate and trim bottomside metadata.

    Slides an (h_window, w_window) window over connected_meta and keeps the
    position whose window has the highest mean; the same rectangle is then
    cut out of opened_meta.

    connected_meta: 2D array scored by the sliding window
    opened_meta: 2D array the returned rectangle is cut from
    h_window, w_window: window height and width; each is reduced to the
        corresponding size of opened_meta minus trim_if_small when the
        window plus one step would not fit
    starting_y, starting_x: first window position
    step_size: stride of the window along both axes
    trim_if_small: margin used when a window dimension has to be reduced

    Returns the rectangle of opened_meta at the best-scoring position.
    """

    def sliding_window(image, starting_y, starting_x, h_window, w_window, step_size):
        """Yield (y, x, window) for every window position inside image."""
        h_img, w_img = image.shape
        for y in range(starting_y, h_img - h_window, step_size):
            for x in range(starting_x, w_img - w_window, step_size):
                yield y, x, image[y:y + h_window, x:x + w_window]

    h_raw, w_raw = opened_meta.shape

    # Shrink the window when the image is too small for it
    if h_window + step_size >= h_raw:
        h_window = h_raw - trim_if_small
    if w_window + step_size >= w_raw:
        w_window = w_raw - trim_if_small

    # The starting position is the baseline; a later window only replaces it
    # when its mean is strictly higher, so ties keep the earliest position.
    y_max = starting_y
    x_max = starting_x
    max_mean = np.mean(connected_meta[starting_y:starting_y + h_window,
                                      starting_x:starting_x + w_window])

    for y, x, window in sliding_window(connected_meta, starting_y, starting_x,
                                       h_window, w_window, step_size):
        mean = np.mean(window)
        if mean > max_mean:
            max_mean = mean
            y_max = y
            x_max = x

    trimmed_metadata = opened_meta[y_max:y_max + h_window, x_max:x_max + w_window]

    return trimmed_metadata

#From scan2data > image_segmentation > trim_raw_metadata
def trimming_metadata(raw_metadata, opening_kernal_size = (3,3), median_kernal_size = 5):
    """Trim the rectangle containing metadata to smallest workable area.

    Steps: median blur, morphological opening, binarization at 127,
    connected-component filtering (metadata_location) and sliding-window
    trimming (bottomside_metadata_trimming).

    raw_metadata: image region containing the metadata
    opening_kernal_size: shape of the all-ones kernel used for the opening
    median_kernal_size: aperture size passed to cv2.medianBlur

    Returns the trimmed metadata array, or np.nan when any step raises, so
    that record_loss can pick the failure up as an NA value.
    """

    try:
        # Filtering to reduce noise
        median_filtered_meta = cv2.medianBlur(raw_metadata, median_kernal_size)

        # Opening operation: erosion + dilation
        kernel_opening = np.ones(opening_kernal_size, dtype = np.uint8)
        opened_meta = cv2.morphologyEx(median_filtered_meta, cv2.MORPH_OPEN, kernel_opening, iterations = 1)

        # Binarization
        _, metadata_binary = cv2.threshold(opened_meta, 127, 255, cv2.THRESH_BINARY)

        # Run connected components algorithm
        connected_meta = metadata_location(metadata_binary)

        trimmed_metadata = bottomside_metadata_trimming(connected_meta, metadata_binary)

        # Checking: shows the result and waits for a key press on every call
        cv2.imshow("test", trimmed_metadata)
        cv2.waitKey(0)
        return trimmed_metadata
    except Exception:
        # Best effort: a failed image becomes NA instead of stopping the batch.
        # Exception (not a bare except) lets KeyboardInterrupt through.
        return np.nan
    

#location?
#function is possibly not needed -- can potentially be removed
def indices_highest_bin(list_coords, nbins=50, peak_threshold=0.2, distance_bwtn_peaks=30):
    """Return indices of most common values using binning.

    list_coords: (np.array or list) coordinates to bin
    nbins: number of histogram bins
    peak_threshold: minimum prominence of a peak in the normalized histogram
    distance_bwtn_peaks: minimum distance, in bins, between two peaks

    Returns (select_peaks, bin_edges, counts): indices of the bins detected
    as peaks, the histogram bin edges and the raw count of every bin.
    """
    list_coords = np.asarray(list_coords)

    # Drop values 3 standard deviations or more away from the mean
    mean_coords = np.mean(list_coords)
    std_coords = np.std(list_coords)
    no_outlier_coords = list_coords[np.abs(list_coords - mean_coords) < 3 * std_coords]

    # Binning
    counts, bin_edges = np.histogram(no_outlier_coords, bins=nbins)

    # Min-max normalize the counts; a flat histogram has no peak, so it is
    # mapped to zeros instead of dividing by zero.
    lowest_count = np.min(counts)
    count_span = np.max(counts) - lowest_count
    if count_span == 0:
        counts_norm = np.zeros(counts.shape, dtype=float)
    else:
        counts_norm = (counts - lowest_count) / count_span

    # Detect peaks (find_peaks also returns a properties dict, unused here)
    select_peaks, _ = find_peaks(counts_norm, distance=distance_bwtn_peaks, prominence=peak_threshold)

    return select_peaks, bin_edges, counts

#location? 
def extract_centroids(cut_metadata):
    """Takes in cut metadata and extracts centroids

    cut_metadata: np.array passed to cv2.connectedComponentsWithStats

    Returns (col_centroids, row_centroids): two lists with the rounded
    centroid coordinates of every connected component whose area is strictly
    between 50 and 1000 pixels. Both lists are empty when no component
    passes the size filter.
    """

    min_pixels = 50
    max_pixels = 1000

    # connectivity is passed by keyword: the second positional slot of the
    # Python binding is the `labels` output, not the connectivity.
    _, __, stats, centroids = cv2.connectedComponentsWithStats(cut_metadata, connectivity=8)
    area_centroids = stats[:, -1]  # last stats column holds the area

    size_ok = np.logical_and(area_centroids > min_pixels, area_centroids < max_pixels)
    centroids_metadata = centroids[size_ok]

    # zip(*...) cannot be unpacked into two names when nothing is left
    if len(centroids_metadata) == 0:
        return [], []

    col_centroids, row_centroids = zip(*centroids_metadata)

    # Round to nearest integer
    col_centroids = [round(col) for col in col_centroids]
    row_centroids = [round(row) for row in row_centroids]

    return col_centroids, row_centroids

#location?
#From scan2data > image_segmentation > extract_metadata_from_scan
def extract_metadata(raw_img, limits_iono):
    """Extract the raw metadata block from a scanned image.

    raw_img: array holding the whole scan
    limits_iono: (x_left_lim, x_right_lim, y_upper_lim, y_lower_lim), the
        limits of the ionogram inside raw_img

    Returns (type_metadata, raw_metadata): which side of the ionogram the
    block was taken from ('left', 'right', 'top' or 'bottom') and the pixels
    of that block.
    """

    # Limits for ionogram
    x_left_lim, x_right_lim, y_upper_lim, y_lower_lim = limits_iono

    # Rectangular blocks on each side of the ionogram
    rect_left = raw_img[:, 0:x_left_lim]
    rect_right = raw_img[:, x_right_lim:]
    rect_top = raw_img[0:y_upper_lim, :]
    rect_bottom = raw_img[y_lower_lim:, :]

    # Assumption: the location of the metadata will correspond to rectangle with the highest area
    rect_list = [rect_left, rect_right, rect_top, rect_bottom]
    rect_names = ['left', 'right', 'top', 'bottom']
    rect_areas = [rect.shape[0] * rect.shape[1] for rect in rect_list]

    # argmax keeps the first rectangle on ties (order: left, right, top, bottom)
    type_metadata_idx = int(np.argmax(rect_areas))
    raw_metadata = rect_list[type_metadata_idx]
    type_metadata = rect_names[type_metadata_idx]

    return (type_metadata, raw_metadata)


#From scan2data> image_segmentation > segment_images_in_subdir.py
#some variables here are not necessary and can be removed, ie. height, width...
def segment_metadata(subdir_location, regex_img, min_bottom_height = 25, cutoff_width=300, cutoff_height=150,
                     min_left_width = 50):
    """Should only segment metadata. Can be adjusted to include ionogram.

    subdir_location: path of the subdirectory holding the scans
    regex_img: glob suffix appended to subdir_location to list the images
    min_bottom_height: bottom-side metadata shorter than this is an outlier
    cutoff_width, cutoff_height: unused while the ionogram checks are disabled
    min_left_width: left-side metadata narrower than this is an outlier
        (NOTE(review): default is a placeholder -- confirm the value)

    Returns (df_img, df_loss, df_outlier): the images that passed every
    step, the rows lost to processing errors and the rows removed by the
    location/size filters.
    """
    regex_raw_image = subdir_location + regex_img
    list_images = glob.glob(regex_raw_image)

    # Dataframe for processing
    df_img = pd.DataFrame(data = {"file_name": list_images})

    # Read each image as a 2D grayscale array (imread flag 0)
    df_img["raw"] = df_img['file_name'].map(lambda file_name: cv2.imread(file_name, 0))

    # Extract ionogram and coordinates delimiting its limits
    #df_img['limits'],df_img['ionogram'] = zip(*df_img['raw'].map(lambda raw_img: extract_ionogram(raw_img)))

    # Record the files whose ionogram extraction was not successful
    #df_loss_ion_extraction, loss_ion_extraction = record_loss(df_img,'image_segmentation.extract_ionogram_from_scan.extract_ionogram',subdir_location)

    # Raw metadata
    # NOTE(review): the 'limits' column is only created by the disabled
    # ionogram extraction above, so this lookup raises KeyError until that
    # step (or a replacement that provides the limits) is restored.
    extracted = [extract_metadata(raw_img, limits)
                 for raw_img, limits in zip(df_img['raw'], df_img['limits'])]
    df_img['metadata_type'] = [meta_type for meta_type, _ in extracted]
    df_img['raw_metadata'] = [raw_meta for _, raw_meta in extracted]

    # There should be no metadata on the right or top
    outlier_metadata_location = df_img['metadata_type'].isin(['right', 'top']).values
    df_outlier_metadata_location, _ = record_loss(df_img,'image_segmentation.segment_images_in_subdir.segment_images: metadata not on left or bottom',subdir_location,
                                         ['file_name','metadata_type'],outlier_metadata_location)

    if not df_outlier_metadata_location.empty:
        df_outlier_metadata_location['details'] = df_outlier_metadata_location['metadata_type'].map(str)
        df_outlier_metadata_location = df_outlier_metadata_location[['file_name','func_name','subdir_name','details']]
    else:
        df_outlier_metadata_location = df_outlier_metadata_location[['file_name','func_name','subdir_name']]

    # Remove loss from detected metadata not being on the left or bottom
    # (copy so the columns added below do not write into a slice)
    df_img = df_img[~outlier_metadata_location].copy()

    # Trimmed metadata: only the image is passed, trimming_metadata takes
    # kernel sizes (not the metadata type) as its other arguments
    df_img['trimmed_metadata'] = [trimming_metadata(raw_meta) for raw_meta in df_img['raw_metadata']]
    df_loss_trim, loss_trim = record_loss(df_img, 'image_segmentation.trim_raw_metadata.trimming_metadata', subdir_location)

    df_img = df_img[~loss_trim].copy()

    # Check if metadata too small
    meta_shapes = [array_pixels.shape for array_pixels in df_img['trimmed_metadata']]
    df_img['meta_height'] = [shape[0] for shape in meta_shapes]
    df_img['meta_width'] = [shape[1] for shape in meta_shapes]
    outlier_size_metadata = np.logical_or(np.logical_and(df_img['metadata_type'] == 'left',
                                                      df_img['meta_width'] < min_left_width),
                                       np.logical_and(df_img['metadata_type'] == 'bottom',
                                                      df_img['meta_height'] < min_bottom_height))

    df_outlier_metadata_size, _ = record_loss(df_img,'image_segmentation.segment_images_in_subdir.segment_images: metadata size outlier',subdir_location,
                                           ['file_name','metadata_type','meta_height','meta_width'],outlier_size_metadata)

    if not df_outlier_metadata_size.empty:
        df_outlier_metadata_size['details'] = (df_outlier_metadata_size['metadata_type'] + '_height: '
                                               + df_outlier_metadata_size['meta_height'].map(str) + ',width: '
                                               + df_outlier_metadata_size['meta_width'].map(str))
        df_outlier_metadata_size = df_outlier_metadata_size[['file_name','func_name','subdir_name','details']]
    else:
        df_outlier_metadata_size = df_outlier_metadata_size[['file_name','func_name','subdir_name']]

    # Remove files whose metadata too small
    df_img = df_img[~outlier_size_metadata]

    # Dataframe recording loss from programming errors
    # (only the trimming step records loss while ionogram extraction is disabled)
    df_loss = df_loss_trim

    # Dataframe recording loss from various filters i.e. metadata in the wrong place or too small
    df_outlier = pd.concat([df_outlier_metadata_location, df_outlier_metadata_size])

    return df_img, df_loss, df_outlier
# function can also return df_loss, df_outlier

#From scan2data > metadata_translation > translate_leftside_metadata.py
def get_bottomside_metadata (df_img, subdir_location, kernal_size =(1, 1)):
    """Read the bottom-side metadata of every image in df_img.

    df_img: dataframe with at least the 'trimmed_metadata' and 'file_name'
        columns; gains 'dilated_metadata', 'x_centroids', 'y_centroids' and
        'is_dot'
    subdir_location: path of the subdirectory, recorded with the losses and
        passed on to the grid mapping
    kernal_size: shape of the all-ones kernel used for the dilation

    Returns (df_img, df_loss_centroids_extraction, dict_mapping, dict_hist).
    """

    kernel_dilation = np.ones(kernal_size, np.uint8)

    df_img['dilated_metadata'] = df_img['trimmed_metadata'].map(
        lambda trimmed_meta: cv2.dilate(trimmed_meta, kernel_dilation))
    # NOTE(review): this call passes two arguments and unpacks three results,
    # while extract_centroids as defined in this file takes one argument and
    # returns two lists -- the two need to be reconciled before this runs.
    df_img['x_centroids'], df_img['y_centroids'], df_img['is_dot'] = zip(
        *df_img.apply(lambda row: extract_centroids(row['dilated_metadata'], row['file_name']), 1))
    df_loss_centroids_extraction, loss_centroids_extraction = record_loss(df_img,
                                                                          'metadata_translation.determine_metadata_grid_mapping.extract_centroids_',
                                                                          subdir_location)

    # Remove files where the extraction didn't work
    df_img = df_img[~loss_centroids_extraction]

    # Split the rows on the 'is_dot' flag created above
    # (presumably dots vs digits -- confirm once extract_centroids is settled)
    is_dot = np.array(df_img['is_dot'], dtype=bool)
    df_dot_subset = df_img[is_dot]
    df_num_subset = df_img[~is_dot]

    # Flatten the per-image centroid lists into one list per group and axis
    list_x_dot = list(chain.from_iterable(df_dot_subset['x_centroids']))
    list_y_dot = list(chain.from_iterable(df_dot_subset['y_centroids']))
    list_x_digit = list(chain.from_iterable(df_num_subset['x_centroids']))
    list_y_digit = list(chain.from_iterable(df_num_subset['y_centroids']))

    # NOTE(review): get_leftside_metadata_grid_mapping is not defined in the
    # visible part of this file -- it has to be imported or ported.
    dict_mapping, dict_hist = get_leftside_metadata_grid_mapping(list_x_dot, list_y_dot, list_x_digit, list_y_digit,
                                                                 subdir_location)

    return df_img, df_loss_centroids_extraction, dict_mapping, dict_hist





In [None]:
#### SOME GOOD CODE FOR ONCE 

#From process_directory.py
def process_subdir(subdir_path, regex_images, output_folder_if_pickle,
                   to_pickle=True):
    """Transform raw scanned images in subdir into information

    subdir_path: path of the subdirectory; expected to end with a path
        separator, since its last character is dropped before splitting
    regex_images: glob suffix used to list the images of the subdirectory
    output_folder_if_pickle: currently unused -- the pickle is written under
        os.pardir + '/pickle/'
    to_pickle: when True, the processed dataframe is pickled

    Returns (df_processed, df_all_loss, df_outlier).
    """

    # Segment the metadata of every image in the subdirectory
    df_img, df_loss, df_outlier = segment_metadata(subdir_path, regex_images)

    # Translate metadata on bottom (copy: the translation adds columns)
    df_img_bottom = df_img[df_img['metadata_type'] == 'bottom'].copy()

    df_processed, df_loss_meta, dict_mapping, dict_hist = get_bottomside_metadata(df_img_bottom, subdir_path)

    # Losses from segmentation and from metadata translation
    df_all_loss = pd.concat([df_loss, df_loss_meta])

    # Pickle, named after the last two components of subdir_path
    if to_pickle:
        start, subdir_name = ntpath.split(subdir_path[:-1])
        start, dir_name = ntpath.split(start)
        df_processed.to_pickle(os.pardir + '/pickle/' + str(dir_name) + '_' + str(subdir_name) + '.pkl')

    return df_processed, df_all_loss, df_outlier
    
