In [None]:
%matplotlib inline

# Evaluation of classification methods for automatic phytolith identification

This notebook contains functions and examples to obtain geometric, morphological, and elliptic Fourier features from phytolith images.

Running this notebook on a large number of images can take several hours, depending on the machine on which it is executed.

## Authors
- José-Francisco Díez-Pastor
- Pedro Latorre-Carmona
- Álvar Arnaiz-González
- Javier Ruiz-Pérez
- Débora Zurro

# Exploration and Dataset Generation


In [None]:
'''
Install pyedf, compute elliptic fourier descriptors (if needed)
'''
!pip install pyefd

In [None]:
'''
Import libraries
'''

import ast
import glob
import json
import math
import os

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from ipywidgets import interact, interactive, fixed, interact_manual
from pyefd import elliptic_fourier_descriptors
from skimage import io
from skimage.color import rgb2gray
from skimage.draw import polygon
from skimage.io import imread, imshow
from skimage.measure import regionprops, find_contours, label
from skimage.morphology import convex_hull_image
from skimage.transform import resize, rotate
from skimage.util import montage

In [None]:
'''
Configuration data

csvs_path = Root folder with one subfolder per morphotype; every
subfolder holds one .csv file per image.

imgs_path = Root folder with one subfolder per morphotype; every
subfolder holds the images of that morphotype.

Watch out: the path separator depends on the operating system, so the
paths are assembled with os.path.join.
'''

path = "./phytoliths"

# Folders with the csv files and with the image files
csvs_path = os.path.join(path, "csvs")
imgs_path = os.path.join(path, "imgs")

# Files stored next to those folders
features_file = os.path.join(path, "features_2020.csv")
metadata_file = os.path.join(path, "metadata.csv")

In [None]:
def string_to_dict(dict_string):
    """
    Convert a string to a dictionary

    The text is parsed, in this order, as:

    1. JSON (double-quoted keys and strings).
    2. A Python literal (single quotes and ``u'...'`` prefixes allowed).
    3. JSON after the legacy quote rewriting, kept only for inputs that
       neither of the previous parsers accepts.

    Trying the strict parsers first avoids corrupting values that merely
    contain an apostrophe or end with the letter "u".

    Parameters
    ----------
    dict_string : string
        string containing a dictionary encoded as text

    Returns
    -------
    dictionary : dict
        A dictionary containing the same information

    Raises
    ------
    ValueError
        If the text cannot be parsed by any of the three strategies
        (json.JSONDecodeError is a subclass of ValueError).
    """
    # 1) Already valid JSON: no rewriting needed
    try:
        return json.loads(dict_string)
    except ValueError:
        pass

    # 2) Python-repr text such as "{u'name': 'polygon'}"; literal_eval only
    #    evaluates literals, it never executes code
    try:
        return ast.literal_eval(dict_string)
    except (ValueError, SyntaxError, TypeError):
        pass

    # 3) Legacy behaviour: blind quote replacement followed by JSON parsing
    dict_string = dict_string.replace("'", '"').replace('u"', '"')
    return json.loads(dict_string)

def process_csv(file):
    """
    Read one csv file produced by the image labeler

    Only the first row of the file is used: its region_shape_attributes
    cell is decoded and its filename cell is returned as the image name.

    Parameters
    ----------
    file : string
        string containing the path to the csv file

    Returns
    -------
    data : tuple
        A tuple with three elements: the bounding box as
        ((min_row, min_col), (max_row, max_col)), the list of contour
        points as (row, col) pairs, and the image name
    """
    annotations = pd.read_csv(file)
    image_names = annotations.filename

    # The polygon is stored as text in the first row of the file
    shape = string_to_dict(annotations.region_shape_attributes[0])
    cols = shape["all_points_x"]
    rows = shape["all_points_y"]

    top_left = (min(rows), min(cols))
    bottom_right = (max(rows), max(cols))
    contour = list(zip(rows, cols))

    return (top_left, bottom_right), contour, image_names[0]

In [None]:
'''
Examine and process all csv files
'''

path = csvs_path+os.sep
clases = []
rects = []
coords = []
image_files = []

# Listing all .csv files. glob returns them in an arbitrary,
# filesystem-dependent order, so they are sorted to make the row order of
# the DataFrame (and everything derived from it) reproducible.
files = sorted(glob.glob(path+"**"+os.sep+"*.csv", recursive=True))

for file in files:

    # The class (morphotype) is the name of the folder that holds the file
    clases.append(file.split(os.sep)[-2])

    rect, coord, image_file = process_csv(file)
    rects.append(rect)
    coords.append(coord)
    image_files.append(image_file)

# Initialise the data as a dictionary of lists.
data = {'Image':image_files,
        'Rectangle':rects,
        'Coords': coords,
        'Class':clases}

# Create DataFrame
df = pd.DataFrame(data)[["Image","Rectangle","Coords","Class"]]

# Show first lines
df.head()

In [None]:
# Write the metadata associated with each image to the metadata file
df.to_csv(path_or_buf=metadata_file)

In [None]:
# Count the examples (non-null Coords) available for each class
df.groupby("Class")["Coords"].count()

In [None]:
# Example: dropping a morphotype that has few records
df = df[df.Class != "Trilobate"]

In [None]:
'''
Build a DataFrame with the data of 8 images for each of the 8
studied morphotypes.

This DataFrame is only used for visualizations
'''

classes = df.Class.unique()

# For every class keep only its first 8 rows
classes_8 = [df[df.Class == clase].head(8) for clase in classes]

df_mosaic = pd.concat(classes_8, axis=0)

In [None]:
def get_mask(img,r,c):
    """
    Obtains the image mask

    Parameters
    ----------
    img : ndarray
        The image, either 2-D (rows, cols) or with extra trailing axes
        such as colour channels (rows, cols, channels).
    r : sequence of numbers
        Row coordinates of vertices of polygon.
    c : sequence of numbers
        Column coordinates of vertices of polygon.


    Returns
    -------
    mask : ndarray of type uint8 and shape (rows, cols)
        The mask that corresponds to the input polygon: 1 inside the
        polygon and 0 elsewhere.

    """
    # The first two axes are the spatial ones; unlike shape[:-1] this also
    # works for single-channel (2-D) images
    image_shape = img.shape[:2]
    mask = np.zeros(image_shape, dtype=np.uint8)

    # Passing the shape clips the polygon to the image, so vertices lying
    # outside it neither raise an IndexError nor wrap around
    rr, cc = polygon(r, c, shape=image_shape)
    mask[rr, cc] = 1

    return mask


def get_debug_imgs(register,masks_imgs,gray_imgs):
    """
    Crop one phytolith and its mask and keep both for visualization
    and testing

    Parameters
    ----------
    register : Series
        A pandas Series with the metadata of one image: its file name
        (Image), its morphotype (Class), the polygon that delimits the
        phytolith (Coords) and its bounding box (Rectangle)
    masks_imgs : list
        List to which the cropped mask image is appended.
    gray_imgs : list
        List to which the cropped grayscale image is appended.


    Returns
    -------
    None

    """
    img_name = register.Image
    Class = register.Class

    # <imgs_path>/<Class>/<image name>
    image_file = os.sep.join((imgs_path, Class, img_name))

    rows, cols = zip(*register.Coords)
    (y1, x1), (y2, x2) = register.Rectangle
    bbox = (slice(y1, y2), slice(x1, x2))

    img = imread(image_file)
    img_gray = rgb2gray(img)
    mask = get_mask(img, rows, cols)

    # Copies, so the full-size arrays are not kept alive by the crops
    masks_imgs.append(mask[bbox].copy())
    gray_imgs.append(img_gray[bbox].copy())

In [None]:
masks_imgs, gray_imgs = [], []

# get_debug_imgs returns nothing, so df_none holds no data: the call is made
# only to fill masks_imgs and gray_imgs, one entry per row of df_mosaic
df_none = df_mosaic.apply(get_debug_imgs, axis=1, args=(masks_imgs, gray_imgs))

In [None]:
def show(x=0):
    """
    Display a grayscale crop stacked on top of its mask

    Parameters
    ----------
    x : integer
        Position, in gray_imgs and masks_imgs, of the pair to show.


    Returns
    -------
    None

    """
    # Rows of the mask are appended below the rows of the grayscale crop
    stacked = np.concatenate([gray_imgs[x], masks_imgs[x]], axis=0)
    io.imshow(stacked)

In [None]:
'''
Interactively display the grayscale image along with 
the binary mask that delimits the phytolith
'''
# The upper bound of the slider is inclusive, so the last valid position is
# len(gray_imgs) - 1 (kept at 0 when the list is empty)
interact(show, x = (0, max(len(gray_imgs) - 1, 0), 1));

In [None]:
def get_montage_debug_imgs(gray_imgs,masks_imgs,new_shape=(256,128)):
    """
    Create the figure that groups together the image pairs of each morphotype

    Parameters
    ----------
    gray_imgs : list
        List of gray images

    masks_imgs : list
        List of mask images, in the same order as gray_imgs

    new_shape : tuple of int, optional
        (rows, cols) of the tile that holds each pair. Default (256, 128).


    Returns
    -------
    arr_out : ndarray
        The montage image. Each tile contains one pair of images: the
        grayscale image on top and, below it, the binary mask that
        delimits the shape of the phytolith.

    """
    pair_list = []
    for idx in range(len(gray_imgs)):

        pair = np.concatenate((gray_imgs[idx],masks_imgs[idx]))

        # Largest scale factor that keeps the aspect ratio and still fits
        # the pair inside the tile
        ratio = min(target / current for target, current in zip(new_shape, pair.shape))

        # Built-in int: the np.int alias no longer exists in recent NumPy.
        # At least one pixel per axis so that resize gets a valid shape.
        interm_shape = np.maximum(np.rint([s * ratio for s in pair.shape]), 1).astype(int)
        interm_img = resize(pair, interm_shape, order=1, cval=np.min(pair))

        # The resized pair is placed in the top-left corner of a black tile
        result = np.zeros(new_shape)
        result[:interm_img.shape[0],:interm_img.shape[1]] = interm_img

        pair_list.append(result)


    # 3d array (n_images, rows, cols)
    arr_in = np.array(pair_list)

    # montage
    arr_out = montage(arr_in)

    return arr_out

In [None]:
# Montage with every grayscale/mask pair collected above
montage_img = get_montage_debug_imgs(gray_imgs=gray_imgs, masks_imgs=masks_imgs)

In [None]:
# Draw the montage in grayscale on a single, very wide figure
fig, ax = plt.subplots(figsize=(180, 20))
ax.imshow(montage_img, cmap="gray")
fig.tight_layout()

In [None]:
def get_min_max_feret(mask):
    """
    Compute max_feret and min_feret

    The mask is rotated in steps of one degree and, for each rotation, the
    sides of the bounding box of the first labelled region are measured.

    Parameters
    ----------
    mask : ndarray
        Binary image

    Returns
    -------
    feret_max : integer
        Longest bounding-box side found over all rotations
    feret_min : integer
        Shortest bounding-box side found over all rotations

    """
    longest = 0
    shortest = 999999

    for angle in range(360):
        # resize=True enlarges the canvas so the rotated shape is not cropped
        rotated = rotate(mask, angle, preserve_range=True, resize=True)

        first_region = regionprops(label(rotated))[0]
        top, left, bottom, right = first_region.bbox
        width = right - left
        height = bottom - top

        longest = max(longest, width, height)
        shortest = min(shortest, width, height)

    return longest, shortest


def get_efd(mask, order=10):
    """
    Compute elliptic fourier descriptors

    Parameters
    ----------
    mask : ndarray
        Binary image
    order : integer, optional
        Number of harmonics of the Fourier series. Default 10.

    Returns
    -------
    efds : ndarray
        1-D array with the normalized elliptic fourier descriptors of the
        longest contour of the mask, without the first three coefficients

    Raises
    ------
    ValueError
        If no contour is found in the mask (e.g. the mask is empty).

    """
    contours = find_contours(mask, 0.5)
    if not contours:
        raise ValueError("No contour found in the mask")

    # A mask may yield several contours (holes, detached fragments); the
    # longest one is taken as the outline of the phytolith
    outline = max(contours, key=len)

    coeffs = elliptic_fourier_descriptors(outline,order=order,normalize=True)

    # The first three coefficients are dropped; per the pyefd documentation
    # they are constant (1, 0, 0) after normalization
    return coeffs.flatten()[3:]

In [None]:
def register_to_features(register):
    """
    Compute the feature vector of one annotated phytolith

    The image referenced by the register is read, the annotated polygon is
    turned into a mask, and morphometric measurements plus elliptic
    fourier descriptors are derived from that mask.

    Parameters
    ----------
    register : Series
        A pandas Series with the fields Image (file name), Class
        (morphotype), Coords (polygon as (row, col) pairs) and Rectangle
        (bounding box of the polygon)

    Returns
    -------
    features : Series
        Name and Class, followed by 15 morphometric values and by the
        elliptic fourier descriptors (edf0, edf1, ...)

    """
    img_name = register.Image
    Class = register.Class

    sample_name = Class + "_" + img_name
    image_file = os.sep.join((imgs_path, Class, img_name))

    # polygon vertices: rows in one sequence, columns in another
    rows, cols = zip(*register.Coords)
    (y1, x1), (y2, x2) = register.Rectangle

    img = imread(image_file)
    img_gray = rgb2gray(img)

    mask = get_mask(img, rows, cols)
    efds = get_efd(mask)

    # max feret (Length) and min feret (Width), measured on the cropped mask
    Length, Width = get_min_max_feret(mask[y1:y2, x1:x2])

    # Measurements of the mask itself and of its convex hull
    region = regionprops(mask, intensity_image=img_gray)[0]
    hull_region = regionprops(convex_hull_image(mask).astype(int))[0]

    perimeter = region.perimeter
    area = region.area
    convex_area = region.convex_area
    equiv_diam = region.equivalent_diameter
    hull_perimeter = hull_region.perimeter

    basic_values = pd.Series({"Name": sample_name, "Class": Class})

    # NOTE(review): Roundness here is 4*pi*Area/Length^2; ImageJ documents
    # roundness as 4*Area/(pi*Length^2) - confirm which one is intended
    morfo_values = pd.Series({
        "Perimeter": perimeter,
        "PerimeterHull": hull_perimeter,
        "Area": area,
        "Convex Area": convex_area,
        "Major axis length": region.major_axis_length,
        "Minor axis length": region.minor_axis_length,
        "Equivalent diameter": equiv_diam,
        "Form factor": (4*area*(math.pi))/((perimeter)**2),
        "Length": Length,
        "Width": Width,
        "Convexity": hull_perimeter/perimeter,
        "Solidity": area/convex_area,
        "AspectRatio": Length/Width,
        "Roundness": (4*area*(math.pi))/((Length)**2),
        "Compactness": equiv_diam/Length,
    })

    edf_values = pd.Series(efds, [f"edf{i}" for i in range(len(efds))])

    return pd.concat([basic_values, morfo_values, edf_values])

In [None]:
# Take the first register of the DataFrame as a test example
example1 = df.iloc[0, :]
example1

In [None]:
# Features computed for the single test example
register_to_features(register=example1)

In [None]:
# One row of features per register (the function is applied row by row)
df_morpho = df.apply(register_to_features, axis="columns")
df_morpho.head()

In [None]:
# Write the resulting dataset to the features csv file
df_morpho.to_csv(path_or_buf=features_file)