<a href="https://colab.research.google.com/github/aubricot/computer_vision_with_eol_images/blob/master/object_detection_for_image_cropping/inspect_crop_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Inspect EOL user-generated cropping coordinates file
---
*Last Updated 1 June 2021*   
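The file contains user-determined "best" square cropping coordinates for ~20,000 images from many taxonomic groups. Use this notebook to inspect which taxonomic groups (class, order, family, genus) are represented in the file, and how often, before building new pipelines.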

## Installs & Imports
---

In [None]:
# Optional: Mount Google Drive to import/export files
# Note: You can also skip this cell and run the remaining steps in a Colab
# runtime that gets cleared after each session
from google.colab import drive
# Mount at /content/drive; the working-directory cell checks for this path
drive.mount('/content/drive', force_remount=True)

In [None]:
import os
import csv
import pandas as pd
import numpy as np
# Keep long URLs from being truncated and show all columns when displaying DataFrames
pd.set_option('display.max_colwidth',1000)
pd.set_option('display.max_columns', None)

# Optional: If using mounted drive, set working directory
# (skipped when /content/drive does not exist, i.e. the drive was not mounted)
if os.path.isdir('/content/drive'):
    # TO DO: Type in the path to your working directory in form field to right
    wd = "/content/drive/MyDrive/train/tf2" #@param {type:"string"}
    # IPython magic: change the current working directory to wd
    %cd $wd

## Get EOL user-generated cropping file
---

In [None]:
# Download EOL user generated cropping data

# Uncomment to fetch the zipped cropping file into the temporary runtime location
#!wget --user-agent="Mozilla" https://editors.eol.org/other_files/EOL_v2_files/image_crops_withEOL_pk.txt.zip

# Uncomment to extract the cropping file into your working directory
#!unzip /content/image_crops_withEOL_pk.txt.zip -d .

# Load the tab-separated file into the runtime; the first row supplies the column names
crops_fpath = 'image_crops_withEOL_pk.txt'
df = pd.read_csv(crops_fpath, sep='\t', header=0)

# Report how many rows were read and preview the first few
n_coords = len(df)
preview = df.head()
print("EOL user-generated cropping data: \nNumber of available cropping coordinates: {} \n\n {}".format(n_coords, preview))

## Inspect available taxa from EOL user-generated cropping file
---

In [None]:
# Define functions

# Tidy special characters from crops ancestry column
def tidy_chars(df):
    """Strip bracket/quote characters and 'name:' labels from the ancestry column.

    Every curly brace, parenthesis, single quote and double quote is removed
    from each string in ``df['ancestry']``; afterwards every literal ``name:``
    substring is removed. The first rows of the tidied frame are printed.

    The cleaned column is assigned back to ``df`` explicitly instead of calling
    ``df.ancestry.replace(..., inplace=True)``: an inplace method on a column
    accessor is chained assignment, which pandas warns about and which no
    longer updates ``df`` when Copy-on-Write is enabled.

    Args:
        df: DataFrame with an ``ancestry`` column. The ``.str`` accessor is
            used, so the column is assumed to hold strings (missing values
            stay missing). ``df`` is modified in place.

    Returns:
        The same ``df`` object, now holding the tidied ``ancestry`` column.
    """
    # Strip the punctuation first so that a label such as '"name":' has become
    # 'name:' by the time the label itself is removed.
    tidied = df['ancestry'].str.replace(r"[{}()'\"]", '', regex=True)
    tidied = tidied.str.replace('name:', '', regex=False)

    # Column assignment updates the caller's frame, preserving the original
    # in-place contract.
    df['ancestry'] = tidied
    print("Tidied ancestry column: \n", df.head())

    return df

# Split ancestry column into taxonomic groups
def split_by_ancestry(df, disp_results=False):
    """Pull class, order, family and genus names out of the ancestry column.

    Each ancestry string is split on commas and the pieces are stacked into a
    single Series. For each rank, the pieces that mention the rank label are
    kept, the ``<rank>:`` label is removed, and only the text before the first
    space is retained.

    Args:
        df: DataFrame with a string ``ancestry`` column (accessed as
            ``df.ancestry``).
        disp_results: When True, print the extracted Series for each rank.

    Returns:
        Tuple ``(class, order, family, genus)`` of pandas Series.
    """
    # One element per comma-separated piece of every ancestry string
    entries = df.ancestry.str.split(",", expand=True).stack()

    # Class (but not subclass, superclass or infraclass)
    cla = _extract_rank(entries, 'class', ('sub', 'super', 'infra'))
    if disp_results:
        print("\nSplit by Class: \n", cla)

    # Order (but not suborder, infraorder or superorder)
    # Named `orders` locally so the builtin ord() is not shadowed.
    orders = _extract_rank(entries, 'order', ('sub', 'infra', 'super'))
    if disp_results:
        print("\nSplit by Order: \n", orders)

    # Family (but not superfamily or subfamily)
    fam = _extract_rank(entries, 'family', ('super', 'sub'))
    if disp_results:
        print("\nSplit by Family: \n", fam)

    # Genus
    # NOTE(review): no prefixes are excluded here, so an entry such as
    # 'subgenus:...' would also match 'genus' - confirm this is intended.
    gen = _extract_rank(entries, 'genus')
    if disp_results:
        print("\nSplit by Genus: \n", gen)

    return cla, orders, fam, gen


def _extract_rank(entries, rank, excluded=()):
    """Return the first token of every entry that mentions `rank` but none of `excluded`."""
    # Matching is case-insensitive; na=False makes missing pieces count as
    # "no match" for every test, so they are never selected.
    # NOTE(review): the exclusion tests look at the whole entry, so a taxon
    # name that itself contains e.g. 'sub' is dropped as well - confirm.
    mask = entries.str.contains(rank, case=False, na=False)
    for prefix in excluded:
        mask = mask & ~entries.str.contains(prefix, case=False, na=False)

    # Remove the (case-sensitive) '<rank>:' label without mutating a slice in place
    selected = entries[mask].replace(rank + ':', '', regex=True)

    # Keep only the text before the first space
    return selected.str.split(" ", expand=True)[0]

# Get frequency counts for available taxonomic groups
def count_frequency(cla, ord, fam, gen, disp_results=False):
    """Build a frequency table for each taxonomic rank.

    Each returned table has the columns ``taxon`` and ``freq`` and is sorted by
    decreasing ``freq``.

    Args:
        cla: Series of class names.
        ord: Series of order names. (The parameter name shadows the builtin
            ``ord`` inside this function; it is kept so existing keyword
            callers continue to work.)
        fam: Series of family names.
        gen: Series of genus names.
        disp_results: When True, print the first 10 rows of each table.

    Returns:
        Tuple ``(cla_ct, ord_ct, fam_ct, gen_ct)`` of DataFrames.
    """
    # The combined all-taxa count that used to be computed here was never
    # returned or displayed, so it has been dropped.
    tables = []
    for label, taxa in (("Classes", cla), ("Orders", ord), ("Families", fam), ("Genera", gen)):
        table = _frequency_table(taxa)
        if disp_results:
            print("\nTop 10 most frequent {}: \n".format(label), table[:10])
        tables.append(table)

    return tuple(tables)


def _frequency_table(taxa):
    """Count occurrences of each value in `taxa` as a ['taxon', 'freq'] DataFrame, most frequent first."""
    table = pd.DataFrame(taxa.value_counts()).reset_index()
    table.columns = ["taxon", "freq"]
    # Sort by decreasing frequency; returns a new frame rather than sorting in place
    return table.sort_values(by="freq", axis=0, ascending=False)

In [None]:
# Make new dataframe from ancestry column, see ex entry below
# (.copy() keeps the tidying below from touching the original df)
crops = df[['ancestry']].copy()
print("Sample ancestry entry to be tidied & split: \n", crops.head(1))

# Tidy special characters from ancestry column
# (tidy_chars returns the frame it was given, so `ancestry` and `crops` are the same object)
ancestry = tidy_chars(crops)

# Split ancestry column into taxonomic groups
# The order-level Series is named `orders` so the builtin ord() is not
# overwritten for the rest of the notebook session.
cla, orders, fam, gen = split_by_ancestry(ancestry, disp_results=False)

# Get frequency counts for available taxonomic groups
cla_ct, ord_ct, fam_ct, gen_ct = count_frequency(cla, orders, fam, gen, disp_results=True)

In [None]:
# Optional: Save frequency counts to file
# Each table is written as a tab-separated file with a header row and no index column.

# Class
# (previously pointed at the "order" filename, so the class counts were
# overwritten by the order counts written next)
outfpath = "EOL_cropcoords_freqcounts_class.tsv" #@param {type:"string"}
cla_ct.to_csv(outfpath, sep = '\t', index=False, header=True)

# Order
outfpath = "EOL_cropcoords_freqcounts_order.tsv" #@param {type:"string"}
ord_ct.to_csv(outfpath, sep = '\t', index=False, header=True)

# Family
outfpath = "EOL_cropcoords_freqcounts_family.tsv" #@param {type:"string"}
fam_ct.to_csv(outfpath, sep = '\t', index=False, header=True)

# Genus 
outfpath = "EOL_cropcoords_freqcounts_genus.tsv" #@param {type:"string"}
gen_ct.to_csv(outfpath, sep = '\t', index=False, header=True)