# Get Codes. Author Vinicio Soto, CICIMA

## This notebook lists every specimen code found under a folder. If the folder names contain genus or species information, it builds a dataframe with that information.
## Finally, it saves the data as a .txt file.


In [1]:
#Dependencies

In [2]:
import pandas as pd #Data analysis
import numpy as np  #Array and numeric methods
from matplotlib.backends.backend_pdf import PdfPages #pri*nt PDFS
import matplotlib #pri*nt graphs
import matplotlib.pyplot as plt #pri*nt graphs
import os #operating system
import re #regular expression manipulation
from datetime import datetime #date and time methods
import logging #to log errors

#import spectraltools
#This script requires the file spectraltools.py to work
import sys
# Add the external folder to the system path
current_dir = os.getcwd()
external_folder_path = os.path.abspath(os.path.join(current_dir, '../libraries'))
sys.path.append(external_folder_path)

#This line of code allows us to access data in Colab
#Functionality to reload modules

import importlib
import spectraltools
import metrics
import datapath_selector

# clear the import cache
importlib.reload(metrics)
importlib.reload(spectraltools)
importlib.reload(datapath_selector)
# now you can import my_class and it'll be updated
from metrics import *
from spectraltools import *
from datapath_selector import get_paths
from pathlib import Path

In [3]:
# Folder whose contents are scanned.
# Data folders used in earlier runs, kept for reference:
#   C:\Users\EstebanSoto\Jupyter\escarabajos\L1050_data\CICIMA-2024-05-REFLECTANCE\DORSAL
#   C:\Users\EstebanSoto\Jupyter\escarabajos\L1050_data\CICIMA-2024-05-REFLECTANCE\VENTRAL
#   C:\Users\EstebanSoto\Jupyter\escarabajos\L1050_data\CICIMA-2024-03-REFLECTANCE\without iris nor lens
#   C:\Users\EstebanSoto\Jupyter\escarabajos\L1050_data\2024-04-INBUCR-REFLECTANCE
#   C:\Users\EstebanSoto\Jupyter\escarabajos\L1050_data\2023-03-CICIMAUCR-2-REFLECTANCE
#   C:\Users\esteb\cicima\escarabajos\L1050_data\CICIMA-2024-05-REFLECTANCE\DORSAL
parent_folder_data_path = Path(r"E:\Estudio-espectral-escarabajos")

# Today's date as YYYY-MM-DD; used to name the report subfolder.
date = datetime.today().date().isoformat()

# Reports live next to the data folder: <parent>/reports/<date>/<data folder name>
report_path = parent_folder_data_path.parent.joinpath(
    "reports", date, parent_folder_data_path.name
)
# Create the whole chain of folders; do nothing if it is already there.
report_path.mkdir(parents=True, exist_ok=True)

In [4]:
"""OPTIONS: cicima_laptop, colaboratory, wfh, cicima_desktop
    """
# Mapping of collection data/metadata locations, keyed by name.
collection_paths = get_paths()
#print(collection_paths)

# One Specimen_Collection per known collection. The positional arguments are,
# in order: collection name, data path, metadata path and a "HIGH"/"LOW" label.
# NOTE(review): the last argument looks like the identification confidence
# (see the ANGSOL description below) - confirm against spectraltools.
inbio_2018_2019_collection = Specimen_Collection(
    "INBIO",
    collection_paths["2018_2019_inbio_collection_path"],
    collection_paths["2018_2019_inbio_collection_metadata"],
    "HIGH",
)
angsol_collection = Specimen_Collection(
    "ANGSOL",
    collection_paths["angsol_collection_path"],
    collection_paths["angsol_collection_metadata"],
    "HIGH",
)
# The original literal contained mojibake ("Sol√≠s"); the name is "Solís".
angsol_collection.set_description("ANGSOL collection has specimens that belong to Angel Solís. The confidence that we have about specimen identification is high.")

cicimaucr_collection = Specimen_Collection(
    "CICIMAUCR1",
    collection_paths["cicimaucr_collection_path"],
    collection_paths["cicima_ucr_metadata"],
    "HIGH",
)
cicimaucr_collection_2 = Specimen_Collection(
    "CICIMAUCR2",
    collection_paths["cicimaucr_collection_2_path"],
    collection_paths["cicima_ucr_metadata"],
    "HIGH",
)
cicimaucr_collection_3 = Specimen_Collection(
    "CICIMAUCR3",
    collection_paths["cicimaucr_collection_3_path"],
    collection_paths["cicima_ucr_metadata"],
    "HIGH",
)
inbucr_collection = Specimen_Collection(
    "INBUCR",
    collection_paths["inbucr_collection_path"],
    collection_paths["inbucr_metadata"],
    "HIGH",
)
bioucr_collection = Specimen_Collection(
    "BIOUCR",
    collection_paths["bioucr_collection_path"],
    collection_paths["bioucr_metadata"],
    "LOW",
)

collection_list = [
    inbio_2018_2019_collection,
    angsol_collection,
    cicimaucr_collection,
    cicimaucr_collection_2,
    cicimaucr_collection_3,
    inbucr_collection,
    bioucr_collection,
]

# Flat list with the codes of every collection above; files whose code is in
# this list are skipped when the data folder is scanned.
codes_in_collection = []
for collection in collection_list:
    codes_in_collection.extend(collection.get_codes())
#print(codes_in_collection)

In [5]:
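#The program will go through each folder and check whether it contains any code. If that code already exists in any collection, it is ignored.
#If not, it will try to read the genus, species, polarization, location, locus and color from the folder names and add the file to a dataframe.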

In [25]:
def _last_match(candidates, text):
    """Return the last entry of *candidates* that is a substring of *text*, or None."""
    found = None
    for candidate in candidates:
        if candidate in text:
            found = candidate
    return found


def get_info_from_str(string):
    """Extract specimen metadata from a free-form string such as a folder name.

    Every lookup is a case-sensitive substring test. Within each category the
    tables below are scanned in order and the LAST alias that matches wins.

    Args:
        string: text to inspect (for example one folder title of a path).

    Returns:
        A tuple ``(genus, species, polarization, location, locus, color)``.
        Each element is a string, or ``None`` when nothing matched.
        ``polarization`` is ``"T"`` when the text contains ``"total"``;
        location, locus and color are returned exactly as spelled in the
        alias tables. Any species match forces ``genus`` to ``"Chrysina"``.
    """
    # Canonical genus -> spellings seen in folder names. Calomacraspis must
    # come after Macraspis because "macraspis" is a substring of
    # "Calomacraspis" and the last match wins.
    genus_aliases = (
        ("Chrysina", ("Chrysina", "chrysina")),
        ("Chalcothea", ("chalcothea", "Chalcothea", "chacoltea")),
        ("Macraspis", ("Macraspis", "macraspis")),
        ("Calomacraspis", ("Calomacraspis", "calomacraspis")),
    )
    # Canonical species -> spellings. Every species listed here is a Chrysina.
    species_aliases = (
        ("cupreomarginata", ("cupreo", "cupreomarginata")),
        ("resplendens", ("resplendens",)),
        ("aurigans", ("aurigans",)),
        ("batesi", ("batesi",)),
        ("optima", ("optima",)),
        ("boucardi", ("boucardi", "Boucardi")),
        # "sp337" was missing from this list (only the transposed "sp377" was
        # present), so correctly named folders were never recognised.
        # "sp377" is kept so names that matched before still match.
        ("sp337", ("sp337", "Sp337", "sp377")),
        # This list used to contain "Sp337", which made "Sp337" folders come
        # out as sp320.
        ("sp320", ("sp320", "Sp320")),
    )
    locations = ("talamanca", "tenorio", "CVC", "CG", "Talamanca", "Tenorio")
    # NOTE(review): "R" and "L" match ANY capital R/L in the text (e.g. "DORSAL"),
    # so polarization is only reliable for names such as "L-elytrum".
    polarizations = ("total", "R", "L")
    loci = ("pronotum", "scutellum", "pronoto", "scutelo", "elytrum", "elitro", "ojo")
    colors = ("azul", "blue", "green", "verde", "rojo", "dorado", "plateado")

    genus = None
    for name, aliases in genus_aliases:
        if any(alias in string for alias in aliases):
            genus = name

    species = None
    for name, aliases in species_aliases:
        if any(alias in string for alias in aliases):
            genus = "Chrysina"
            species = name

    polarization = _last_match(polarizations, string)
    if polarization == "total":
        polarization = "T"

    location = _last_match(locations, string)
    locus = _last_match(loci, string)
    color = _last_match(colors, string)

    return (genus, species, polarization, location, locus, color)
        
        

In [26]:

df = pd.DataFrame([])  # placeholder; rebound to the result of list_all_filepaths below


def list_all_filepaths(parent_folder):
    """Tabulate the .csv files under *parent_folder* that are in no known collection.

    The tree is walked with ``os.walk``. A file is skipped when it is not a
    ``.csv``, when its name contains "refscan", "darkscan" or "espejo", or when
    the code returned by ``get_info_from_format`` is already in
    ``codes_in_collection``. For every remaining file the folder names of its
    path are passed to ``get_info_from_str`` and the results are merged:
    a value found in a deeper folder replaces the one found in a shallower
    folder, and a folder that yields nothing for a field leaves it unchanged.

    Args:
        parent_folder: root folder to scan (string or ``Path``).

    Returns:
        A DataFrame with columns ``code, genus, species, polarization,
        location, locus, color, path`` and one row per kept file. Rows are
        indexed 1, 2, 3, ... in discovery order; fields that could not be
        determined are ``None``.
    """
    df = pd.DataFrame(columns = ["code","genus", "species", "polarization", "location", "locus", "color", "path"])

    counter = 0
    for root, _dirs, files in os.walk(parent_folder):
        # Path.parts splits on the separator of the running platform, so this
        # also works where "\\" is not the path separator.
        folder_titles = list(Path(root).parts)

        # Merge the metadata of every folder in the path. Previously only
        # genus and species were merged; the other four fields were
        # overwritten by each folder, so anything found in a parent folder
        # (e.g. a location) was lost when the deepest folder did not repeat it.
        merged = [None] * 6
        for folder_title in folder_titles:
            for position, value in enumerate(get_info_from_str(folder_title)):
                if value:
                    merged[position] = value
        genus, species, polarization, location, locus, color = merged

        for file in files:
            # Only .csv files hold spectra.
            if not file.endswith(".csv"):
                continue

            # Skip reference, dark and mirror ("espejo") scans.
            if "refscan" in file or "darkscan" in file or "espejo" in file:
                continue

            # Strip only the trailing extension; replace() would also remove
            # a ".csv" that appears in the middle of the name.
            basename = file.removesuffix(".csv")

            # Files that already belong to a collection are ignored.
            info = get_info_from_format(file)
            code = info["code"]
            if code in codes_in_collection:
                continue

            if not code:
                code = basename  # no code in the file name: fall back to the base name

            counter = counter + 1

            print(f"{folder_titles=}")
            print(genus, species, polarization, location, locus)

            path = Path(root) / file
            df.loc[counter] = [code, genus, species, polarization, location, locus, color, path]

            # Same text as the former f"{... =}" debug expression, written so
            # that it also parses on Python versions older than 3.12.
            print(f'root + "//" + file ={root + "//" + file!r}')
            print(f"{df=}")
    return df
    
# Build the table of files that belong to no collection and normalise it to
# plain strings (missing values become empty strings).
df = list_all_filepaths(parent_folder_data_path)
df = df.fillna("")
df = df.astype(str)

print(df)

#convert dataframe to numpy
my_numpy_df_data = df.to_numpy()

# The report goes into a "reports" folder next to the data folder.
folder = Path(parent_folder_data_path).parent / "reports"
# Create it (and any missing parents); a no-op when it already exists.
folder.mkdir(parents=True, exist_ok=True)

basename = "no_info_specimens_collections.txt"
#new_archive_name
new_archive_name = folder / basename

# Save as tab-separated text. np.savetxt opens and closes the file itself;
# the previous code also opened the same path with open(..., 'wb') and never
# used that handle, leaving two writers on one file.
np.savetxt(new_archive_name, my_numpy_df_data, delimiter="\t", fmt="%s")

folder_titles=['E:', 'Estudio-espectral-escarabajos', 'Calomacraspis haroldi', 'L-elytrum']
Calomacraspis None L None elytrum
root + "//" + file ='E:\\Estudio-espectral-escarabajos\\Calomacraspis haroldi\\L-elytrum//calomacraspis-elytrum-stdL.csv'
df=                         code          genus species polarization location  \
1  calomacraspis-elytrum-stdL  Calomacraspis    None            L     None   

     locus color                                               path  
1  elytrum  None  E:\Estudio-espectral-escarabajos\Calomacraspis...  
folder_titles=['E:', 'Estudio-espectral-escarabajos', 'Calomacraspis haroldi', 'L-elytrum']
Calomacraspis None L None elytrum
root + "//" + file ='E:\\Estudio-espectral-escarabajos\\Calomacraspis haroldi\\L-elytrum//calomacraspis-elytrumL.csv'
df=                         code          genus species polarization location  \
1  calomacraspis-elytrum-stdL  Calomacraspis    None            L     None   
2      calomacraspis-elytrumL  Calomacraspis    N