In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import os
import math
import re


In [2]:
# This line of code allow us to access data in colab
# open(r"/content/drive/My Drive/CICIMA/escarabajos_files/L1050_data", "r")


### Workplace

In [3]:
"""This section allow the user to choose their workplace location.
This is important if the user has multiple locations and operating systems in which this 
script is run"""

#select location
working_at = "cicima_desktop"

#Training data is used when we are already certain of species and genera for a particular sample
training_data_is_used = False

if working_at == "colaboratory":
  from google.colab import drive
  drive.mount("/content/drive")
  #base folder
  """Select the location for your base folder"""
    
  base_folder = r"/content/drive/My Drive/CICIMA/escarabajos_files"
  
elif working_at == "wfh":

    """Select the location of your base folder"""
    base_folder = r""

elif working_at == "cicima_desktop":
  
    """Select the location of your base folder"""
    base_folder = r"C:\Users\EstebanSoto\Jupyter\escarabajos"

elif working_at == "cicima_laptop":
    
    """Select the location of your base folder"""
    base_folder = r"/home/vinicio/escarabajos"

#define the location of the tables with information about the collections and its parent directory

collection_tables_main_path =  os.path.join(base_folder, "L1050_data","collections")
collection_files_main_path = os.path.join(base_folder, "L1050_data")

# Define report location
report_location = os.path.join(base_folder, "reports","data_analysis")

#collection_descriptor = r"CICIMAUCR and ANGSOL" tododelete

#File location and metadata location for collection 1
angsol_collection_path = os.path.join(collection_files_main_path,"ANGSOL","average") 
angsol_collection_metadata = os.path.join(collection_tables_main_path,"CICIMA-beetles-general-inventory - ANGSOL.txt") 

#File location and metadata location for collection 2
cicimaucr_collection_path = os.path.join(collection_files_main_path,r"TRA_data_CICIMA_INBUCR","CICIMAUCR","reflectance")  #listo
cicimaucr_collection_2_path = os.path.join(collection_files_main_path,r"CICIMA-2024-01-REFLECTANCE","average")
cicimaucr_collection_3_path = os.path.join(collection_files_main_path,r"CICIMA-2024-03-REFLECTANCE","without iris nor lens","average")
cicima_ucr_metadata = os.path.join(collection_tables_main_path,r"CICIMA-beetles-general-inventory - CICIMAUCR.txt") 

#File location and metadata location for collection 3
inbucr_collection_path = os.path.join(collection_files_main_path,r"INBUCR","average") #listo
inbucr_metadata = os.path.join(collection_tables_main_path,r"CICIMA-beetles-general-inventory - INBUCR.txt") 

#File location and metadata location for collection 4
bioucr_collection_path = os.path.join(collection_files_main_path,r"BIOUCR","average")  #listo
bioucr_metadata = os.path.join(collection_tables_main_path,r"CICIMA-beetles-general-inventory - BIOUCR.txt") 

#agregated data location, here averages and std will be saved when training data and retreived when classifying spectra
agregated_data_location = os.path.join(base_folder, "agregated_data")
agregated_data_avg_path = os.path.join(agregated_data_location,"peak_averages_krc.txt" )  #listo
agregated_data_std_path  = os.path.join( agregated_data_location , r"peak_std_krc.txt") #listo

#These variables delimit the thresholds used to determine if a point can be considered a maximum
prominence_threshold_min = 0.15
prominence_threshold_max = 0.60
min_height_threshold_denominator = 3.0
max_height_threshold_denominator = 2.5
min_distance_between_peaks = 125 #nm

#amount of probability that is ok to overlap with other species region
percentile_differentiation_amount = 0.25


In [17]:
class Specimen_Collection:
    """This class represents a physical collection of specimens"""
    def read_collection(self,collection_path):
        with open(collection_path, encoding= "latin1") as f:
          df = pd.read_csv(f, sep="\t", decimal =",", header=0, encoding="iso-8859-1")
          return df

    def __init__(self, name, data_folder_path, metadata_path, quality):
        self.name = name
        self.data_folder_path = data_folder_path
        self.metadata = self.read_collection(metadata_path)
        self.quality = quality
        self.description = "No description"

    def set_description(self, description):
        self.description = description
        
    def get_metadata(self):
        return self.metadata
    
    def get_data_folder_path(self):
        return self.data_folder_path
    
    def get_data_filenames(self):
        
        folder_path = self.get_data_folder_path()
        
        #list files in folder
        file_list = os.listdir(folder_path)

        #file extension
        file_extension = ".txt"

        # filters a list of strings to create a new list containing only the elements that end with file_extension
        def filter_substring_elements(path_strings, substring):
            filtered_paths = [path for path in path_strings if substring in path]
            return filtered_paths

        #full path list
        filtered_list = [os.path.join(folder_path, path) for path in filter_substring_elements(file_list, file_extension)]
        
        return filtered_list
        
    def read_spectrum(self, file_path, collection):

        metadata, df = get_metadata_and_dataframe(file_path)
        #print(metadata)
        #print(df)
        spectrum = Spectrum(metadata["filename"], metadata, df, collection )
        #print(spectrum.data)
        return spectrum


In [6]:
#Instantiate collection
angsol_collection = Specimen_Collection("ANGSOL", angsol_collection_path, angsol_collection_metadata, "HIGH")
angsol_collection.set_description("ANGSOL collection has specimens that belong to Angel Solis. The confidence that we have about specimen identification is high.")

cicimaucr_collection = Specimen_Collection("CICIMAUCR1", cicimaucr_collection_path, cicima_ucr_metadata, "HIGH")
cicimaucr_collection.set_description("CICIMAUCR1 has samples of the elytrum of beetles collected in Monteverde and La Amistad")

cicimaucr_collection_2 = Specimen_Collection("CICIMAUCR2", cicimaucr_collection_2_path, cicima_ucr_metadata, "HIGH")

cicimaucr_collection_3 = Specimen_Collection("CICIMAUCR3", cicimaucr_collection_3_path, cicima_ucr_metadata, "HIGH")
cicimaucr_collection_3.set_description("CICIMAUCR1 has samples of the elytrum of beetles collected in Monteverde and La Amistad")

inbucr_collection = Specimen_Collection("INBUCR", inbucr_collection_path, inbucr_metadata, "MID")

bioucr_collection = Specimen_Collection("BIOUCR", bioucr_collection_path, bioucr_metadata, "LOW")
bioucr_collection.set_description("BIOUCR collection has specimens of the Whittan collection. Most of the specimens were collected in Monteverde")

collection_list = [
                    #angsol_collection,
                    #cicimaucr_collection,
                    #cicimaucr_collection_2,
                    cicimaucr_collection_3,
                    #inbucr_collection,
                    #bioucr_collection,
                    ]
collection_names_set = set([collection.name for collection in collection_list])
collection_names = " ".join(sorted(collection_names_set))
print(collection_names)
#date
from datetime import datetime
current_date = datetime.now().date()

CICIMAUCR3


In [18]:
#test collection class
def test_collection_class():
    angsol_collection = Specimen_Collection("ANGSOL", angsol_collection_path, angsol_collection_metadata, "HIGH")
    #print(f"{angsol_collection.get_metadata()=} \n" )
    print(f"{angsol_collection.get_data_folder_path()=} \n" )   
    print(f"{angsol_collection.get_data_filenames()=} \n" )
test_collection_class()

angsol_collection.get_data_folder_path()='C:\\Users\\EstebanSoto\\Jupyter\\escarabajos\\L1050_data\\ANGSOL\\average' 

angsol_collection.get_data_filenames()=['C:\\Users\\EstebanSoto\\Jupyter\\escarabajos\\L1050_data\\ANGSOL\\average\\ANGSOL0001.txt', 'C:\\Users\\EstebanSoto\\Jupyter\\escarabajos\\L1050_data\\ANGSOL\\average\\ANGSOL0002.txt', 'C:\\Users\\EstebanSoto\\Jupyter\\escarabajos\\L1050_data\\ANGSOL\\average\\ANGSOL0003.txt', 'C:\\Users\\EstebanSoto\\Jupyter\\escarabajos\\L1050_data\\ANGSOL\\average\\ANGSOL0004.txt', 'C:\\Users\\EstebanSoto\\Jupyter\\escarabajos\\L1050_data\\ANGSOL\\average\\ANGSOL0005.txt', 'C:\\Users\\EstebanSoto\\Jupyter\\escarabajos\\L1050_data\\ANGSOL\\average\\ANGSOL0006.txt', 'C:\\Users\\EstebanSoto\\Jupyter\\escarabajos\\L1050_data\\ANGSOL\\average\\ANGSOL0007.txt', 'C:\\Users\\EstebanSoto\\Jupyter\\escarabajos\\L1050_data\\ANGSOL\\average\\ANGSOL0008.txt', 'C:\\Users\\EstebanSoto\\Jupyter\\escarabajos\\L1050_data\\ANGSOL\\average\\ANGSOL0009.txt', 'C:\

In [8]:
def create_path_if_not_exists(path):
        # Check if the path already exists
        if not os.path.exists(path):
            # Create the directory and any missing parent directories
            os.makedirs(path)
            print(f"Directory '{path}' created successfully.")
        else:
            print(f"Directory '{path}' already exists.")

In [26]:
#Spectrum class
class Spectrum:
    """This class represents the data and metadata for a L1050 file"""
    def get_metadata_and_dataframe(self, file_location):
         #definitions
        #Logic to read ASCII data
        import os
        import pandas as pd
        import re

        def get_sample_code_from_filename(row_str, file_location):
            #print("string")
            #print(file_location)
            filename = os.path.basename(file_location)
            re1 = r"([a-zA-Z\d]+)(?:-\d)*(?:.Sample)*.(?:txt)*(?:ASC)*"
            #Names are in the form CODE-MEASUREMENTNUMBER.TXT
            p = re.compile(re1)
            m = p.match(filename)
            # print(f"match filename: {m}")
            if m:
                # print(f"group 1: {m.group(1)}")
                return(m.group(1))
            return get_sample_code(row_str)

        def get_sample_code(row_str):
            #Tries to get the sample code from the file, if it does not match
            #it tries to get it from the filename.
            # print("string")
            # print(row_str)
            re1 = r"([a-zA-Z\d]+)(?:-\d)*(?:.Sample)*.(?:txt)*(?:ASC)*"
            #Names are in the form CODE-MEASUREMENTNUMBER.TXT
            p = re.compile(re1)
            m = p.match(row_str)
            # print(f"match: {m}")
            if m:
                return(m.group(1))
            else:
                ""

        def responses(str):
            re1 = "\d+/(\d+,\d+) \d+,\d+/(\d+,\d+)"
            p = re.compile(re1)
            m= p.match(str)
            if m:
                return m.group(1),m.group(2)
            else:
                return "",""
        def attenuator_settings(str):
            re1 = "S:(\d+,\d+) R:(\d+,\d+)"
            p = re.compile(re1)
            m= p.match(str)
            if m:
                return m.group(1),m.group(2)
            else:
                return "",""
        def slit_pmt_aperture(str):
            re1 = "\d+/servo \d+,\d+/(\d+,\d+)"
            p = re.compile(re1)
            m= p.match(str)
            if m:
                return m.group(1)
            else:
                return ""
        #Initializa metadata dict
        metadata = {}

        #Read header
        lines = []
        with open(file_location, encoding= "latin1") as myfile:
            lines = myfile.readlines()[0:90]
        metadata["header"] = "".join(lines)


        #read_metadata
        f = open(file_location, encoding= "latin1")

        df = pd.DataFrame()
        with f as data_file:
            for index, row in enumerate(data_file): #0-89

                row_str = row.strip()
                if index +1 == 3: #Filename and extension
                    metadata["filename"]= row_str
                    metadata["code"] = get_sample_code_from_filename(row_str, file_location)
                if index + 1 == 4: #date DD/MM/YYYY
                    metadata["date"]= row_str
                if index + 1 == 5:#Time HH:MM:SS.SS
                    metadata["time"]= row_str
                if index + 1 == 8:#user
                    metadata["user"]= row_str
                if index + 1 == 9:#description
                    metadata["description"]= row_str
                if index + 1 == 10:#minimum wavelength
                    metadata["minimum_wavelength"]= row_str
                if index + 1 == 12:#equipment name
                    metadata["equipment"]= row_str
                if index + 1 == 13:#equipment series
                    metadata["series"]= row_str
                if index + 1 == 14:#data visualizer version, equipment version, date and time
                    metadata["software"]= row_str
                if index + 1 == 21:#Operating mode
                    metadata["operating_mode"]= row_str
                if index + 1 == 22: #Number of cycles
                    metadata["cycles"]= row_str
                if index + 1 == 32: #range/servo
                    metadata["slit_pmt"]= slit_pmt_aperture(row_str)
                if index + 1 == 33:
                    metadata["response_ingaas"], metadata["response_pmt"]= responses(row_str)
                if index + 1 == 35: #pmt gain, if 0 is automatic
                    metadata["pmt_gain"]= row_str
                if index + 1 == 36: #InGaAs detector gain
                    metadata["ingaas_gain"]= row_str
                if index + 1 == 42:#monochromator wavelength nm
                    metadata["monochromator_change"]= row_str
                if index + 1 == 43:#lamp change wavelength
                    metadata["lamp_change"]= row_str
                if index + 1 == 44:#pmt wavelength
                    metadata["pmt_change"]= row_str
                if index + 1 == 45:#beam selector
                    metadata["beam_selector"]= row_str
                if index + 1 == 46:
                    metadata["cbm"]= row_str
                if index + 1 == 47: #cbd status, on/off
                    metadata["cbd_status"]= row_str
                if index + 1 == 48: #attenuator percentage
                    metadata["attenuator_sample"], metadata["attenuator_reference"]= attenuator_settings(row_str)
                if index + 1 == 49:
                    metadata["polarizer"]= row_str
                if index + 1 == 80:
                    metadata["units"]= row_str
                if index + 1 == 81:
                    metadata["measuring_mode"]= row_str
                if index + 1 == 84:
                    metadata["maximum_wavelength"]= row_str
                if index + 1 == 85:
                    metadata["step"]= row_str
                if index + 1 == 86:
                    metadata["number_of_datapoints"]= row_str
                if index + 1 == 88:
                    metadata["maximum_measurement"]= row_str
                if index + 1 == 89:
                    metadata["minimum_measurement"]= row_str
                if index +1 == 90:
                    break
            df = pd.read_csv(f, sep="\t", decimal =".", names=["wavelength", metadata["measuring_mode"]]).dropna()
            df = df[df["wavelength"]<2000]
            df["wavelength"],df[metadata["measuring_mode"]] = df["wavelength"].astype(float), df[metadata["measuring_mode"]].astype(float)
            return metadata, df
    
    def __str__(self):
        return self.code

    def __init__(self, file_location, collection):

        import re

        def get_genus(code, collection):
            #print("get_genus")

            specimen= collection.loc[collection["code"]==code]

            if specimen.empty:
                print(f"No data for {code} in collection {collection}")
                return ""
            #print("not mt")
            # print(f"specimen genus {specimen}")
            result = specimen.iloc[0]["genus"]
            #print(f"genus, type{type(result)}")
            
            if isinstance(result,str):

                return result
            else:

                return str(result)

        def get_species(code, collection):
            #print("get_species")
            # print(f"code: {code}")
            specimen=collection.loc[collection["code"]==code]

            if specimen.empty:
                print(f"No data for {code} in collection {collection}")
                result = ""
            #print("not mt")
            #print(f"specimen species {specimen}")
            result = str(specimen.iloc[0]["species"])
            #print(f"species, type{type(result)}")
            if isinstance(result,str):

                return result
            else:

                return str(result)

        #attributes
        self.file_location = file_location
        self.collection = collection
        
        self.metadata, self.data = self.get_metadata_and_dataframe(file_location)
        self.code = self.metadata["code"]
        self.filename =  self.metadata["filename"]
        self.genus = get_genus(self.code, collection.get_metadata())
        self.species = get_species(self.code, collection.get_metadata())
        self.measuring_mode = self.metadata["measuring_mode"]

    def plot(self):
        measuring_mode = self.metadata["measuring_mode"]
        return self.data.plot(x="wavelength", y =self.metadata["measuring_mode"], grid=True, markersize=3, title=f"{measuring_mode} for {self.genus} {self.species}, code {self.code}")


    def get_normalized_spectrum(self):
        df = self.data[["wavelength", self.measuring_mode]]
        max_value = df[self.measuring_mode].max()
        df[self.measuring_mode] = df[self.measuring_mode]/max_value
        return df
####



In [27]:

def test_spectrum_class():
    angsol_collection = Specimen_Collection("ANGSOL", angsol_collection_path, angsol_collection_metadata, "HIGH")
    filenames = angsol_collection.get_data_filenames()
    file1 = filenames[0]
    spectrum = Spectrum(file1, angsol_collection)
    spectrum
test_spectrum_class()

AttributeError: 'Specimen_Collection' object has no attribute 'loc'

In [10]:
def read_spectrum(file_path, collection):

    metadata, df = get_metadata_and_dataframe(file_path)
    #print(metadata)
    #print(df)
    spectrum = Spectrum(metadata["filename"], metadata, df, collection )
    #print(spectrum.data)
    return spectrum

def read_spectra_from_folder(folder_path, collection_metadata):

    #list files in folder
    file_list = os.listdir(folder_path)

    #file extension
    file_extension = ".txt"

# filters a list of strings to create a new list containing only the elements that end with file_extension

    def filter_substring_elements(path_strings, substring):
        filtered_paths = [path for path in path_strings if substring in path]
        return filtered_paths

    #full path list
    filtered_list = [os.path.join(folder_path, path) for path in filter_substring_elements(file_list, file_extension)]

    #read each element of filtered_list
    spectra = []

    for path in filtered_list:
        spectrum = read_spectrum(path, collection_metadata)
        spectra.append(spectrum)

    return spectra

In [13]:
def read_spectra_from_collection(collection):
    data_folder_path = collection.get_data_folder_path()
    collection_metadata = collection.get_metadata()
    
    spectra = read_spectra_from_folder(data_folder_path, collection_metadata)
    print(collection_metadata)

read_spectra_from_collection(angsol_collection)

NameError: name 'get_metadata_and_dataframe' is not defined