# Quadratic difference measurement

#### Author: Vinicio Soto, CICIMA
#### This script works with .txt files with wavelength and transflectance spectra and uses average files and std deviation files to calculate the difference between samples and patterns.

### Dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib
import os
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import itertools

### Data paths

In [2]:
#report path
report_location = r"C:\Users\EstebanSoto\Documents\Estudio Optico Escarabajos\TRANSFLECTANCIA NOV 23\report"

#patterns and samples

patterns_avg_data_path = r"C:\Users\EstebanSoto\Documents\Estudio Optico Escarabajos\TRANSFLECTANCIA NOV 23\datos\corrected_files\average"
patterns_std_dev_data_path = r"C:\Users\EstebanSoto\Documents\Estudio Optico Escarabajos\TRANSFLECTANCIA NOV 23\datos\corrected_files\std_dev"

sample_data_path = r"C:\Users\EstebanSoto\Documents\Estudio Optico Escarabajos\TRANSFLECTANCIA NOV 23\datos\corrected_files"


### Create folder if it does not exist

In [3]:
def create_folder_if_not_exists(folder_path):
    """Make sure ``folder_path`` exists, creating it (with parents) if needed.

    A short status message is printed in both cases.

    Args:
        folder_path: Path of the folder to check or create.
    """
    # Guard clause: nothing to create when the path is already present.
    if os.path.exists(folder_path):
        print(f"Folder '{folder_path}' already exists.")
        return

    os.makedirs(folder_path)
    print(f"Folder '{folder_path}' created successfully.")

### List all files in a folder (recursively)

In [4]:
import os

def list_files_in_folder(folder_path):
    """Return the paths of all files found under ``folder_path``.

    The folder is walked recursively, so files in sub-folders are included.
    Each returned path is the walked directory joined with the file name.

    Args:
        folder_path: Root folder to scan.

    Returns:
        A list of file paths. The list is empty (and a message is printed)
        when ``folder_path`` does not exist.
    """
    if not os.path.exists(folder_path):
        print("Folder path does not exist.")
        return []

    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
    ]


# Recursively collect every file under each data folder (no filtering yet).
avg_file_list_general = list_files_in_folder(patterns_avg_data_path)
std_dev_file_list_general = list_files_in_folder(patterns_std_dev_data_path)
sample_file_list_general = list_files_in_folder(sample_data_path)

#std_dev_file_list
# Bare expression: its value is shown as the cell output when run in a notebook.
sample_file_list_general

Folder path does not exist.
Folder path does not exist.
Folder path does not exist.


[]

#### For this analysis only Platy specimens will be used as control

In [10]:
# filters a list of strings to create a new list containing only the elements that end with ".txt"

def filter_asc_elements(input_list):
    """Return the elements of ``input_list`` that end with ".txt".

    Despite the ``asc`` in the name, the suffix that is checked is ".txt".

    Args:
        input_list: Iterable of strings (e.g. file paths).

    Returns:
        A new list with the matching elements, in their original order.
    """
    kept = []
    for element in input_list:
        if element.endswith(".txt"):
            kept.append(element)
    return kept
def filter_substring_elements(path_strings, substring):
    """Return the paths that contain ``substring`` (case-sensitive).

    Args:
        path_strings: Iterable of path strings to filter.
        substring: Text that must appear somewhere in a path for it to be kept.

    Returns:
        A new list with the matching paths, in their original order.
    """
    return list(filter(lambda path: substring in path, path_strings))

# Substrings used for selection: pattern paths must contain "PLATY",
# sample paths must contain "AVERAGE" (matched anywhere in the full path).
pattern_species = "PLATY"
sample_species = "AVERAGE"
# Keep only the paths that contain the chosen substring
avg_file_list = filter_substring_elements(avg_file_list_general , pattern_species)
#std_dev_file_list = filter_substring_elements(std_dev_file_list_general, "")
sample_file_list = filter_substring_elements(sample_file_list_general, sample_species)

# Displaying the filtered list
#avg_file_list
sample_file_list

['C:\\Users\\EstebanSoto\\Documents\\Estudio Optico Escarabajos\\TRANSFLECTANCIA NOV 23\\datos\\corrected_files\\average\\C. AURIGANS INBIO DESCABEZADO_AVERAGE.txt',
 'C:\\Users\\EstebanSoto\\Documents\\Estudio Optico Escarabajos\\TRANSFLECTANCIA NOV 23\\datos\\corrected_files\\average\\C. CHRYSARGYREA #73 MV23_AVERAGE.txt',
 'C:\\Users\\EstebanSoto\\Documents\\Estudio Optico Escarabajos\\TRANSFLECTANCIA NOV 23\\datos\\corrected_files\\average\\C. CHRYSARGYREA #84 MV23_AVERAGE.txt',
 'C:\\Users\\EstebanSoto\\Documents\\Estudio Optico Escarabajos\\TRANSFLECTANCIA NOV 23\\datos\\corrected_files\\average\\C. CHRYSARGYREA INBIOCRI002426713_AVERAGE.txt',
 'C:\\Users\\EstebanSoto\\Documents\\Estudio Optico Escarabajos\\TRANSFLECTANCIA NOV 23\\datos\\corrected_files\\average\\C. CUPREOMARGINATA MV 23-15_AVERAGE.txt',
 'C:\\Users\\EstebanSoto\\Documents\\Estudio Optico Escarabajos\\TRANSFLECTANCIA NOV 23\\datos\\corrected_files\\average\\CHRYSINA BOUCARDI CORONAD0 (ICP) 2023_AVERAGE.txt',
 'C:

### Read data

In [12]:
# Function to read data from files and filter within wavelength range
def _read_spectrum(file_path, initial_wavelength, final_wavelength):
    # Load one whitespace-separated two-column file and keep the rows whose
    # wavelength lies inside [initial_wavelength, final_wavelength].
    spectrum = pd.read_csv(
        file_path,
        sep=r'\s+',  # raw string: '\s' is not a valid escape in a normal literal
        header=None,
        names=['Wavelength', 'Transflectance'],
    )
    spectrum = spectrum.dropna()
    in_range = ((spectrum['Wavelength'] >= initial_wavelength) &
                (spectrum['Wavelength'] <= final_wavelength))
    return spectrum[in_range]


def read_data(pattern_file, sample_file, initial_wavelength, final_wavelength):
    """Read a pattern file and a sample file, restricted to a wavelength range.

    Both files are parsed as header-less, whitespace-separated text with two
    columns, named 'Wavelength' and 'Transflectance'. Rows containing missing
    values are dropped before the range filter is applied.

    Args:
        pattern_file: Path of the pattern spectrum file.
        sample_file: Path of the sample spectrum file.
        initial_wavelength: Lower bound of the range (inclusive).
        final_wavelength: Upper bound of the range (inclusive).

    Returns:
        A tuple ``(pattern_data, sample_data)`` of DataFrames with columns
        'Wavelength' and 'Transflectance'.
    """
    pattern_data = _read_spectrum(pattern_file, initial_wavelength, final_wavelength)
    sample_data = _read_spectrum(sample_file, initial_wavelength, final_wavelength)
    return pattern_data, sample_data

### calculate differences

In [11]:


# Function to calculate the square root of the sum of squared differences between two datasets
def sqrt_of_sum_of_squares_differences(df1, df2):
    """Return the root of the summed squared differences of two spectra.

    The two frames are inner-joined on 'Wavelength', so only wavelengths
    present in both contribute. For those rows the result is
    ``sqrt(sum((Transflectance_df1 - Transflectance_df2) ** 2))``.

    The differences are squared exactly once. Squaring the already-squared
    column again would yield the square root of the sum of fourth powers,
    which is not what the function name promises.

    Args:
        df1: DataFrame with 'Wavelength' and 'Transflectance' columns.
        df2: DataFrame with 'Wavelength' and 'Transflectance' columns.

    Returns:
        A Series indexed by 'Wavelength' and 'Squared_Differences'. The
        'Squared_Differences' entry holds the distance described above. The
        'Wavelength' entry (root of the summed squared common wavelengths) is
        kept only so the returned Series has the same labels as before.
    """
    # Merge the dataframes on 'Wavelength' to get common wavelengths
    merged = pd.merge(df1, df2, on='Wavelength', suffixes=('_df1', '_df2'), how='inner')

    # Squared differences for the common wavelengths (squared once only)
    squared_differences = (merged['Transflectance_df1'] - merged['Transflectance_df2']) ** 2

    return pd.Series({
        'Wavelength': np.sqrt(np.sum(merged['Wavelength'] ** 2)),
        'Squared_Differences': np.sqrt(np.sum(squared_differences)),
    })

# Function to calculate squared differences for common wavelengths between dataframes
def squared_differences_common_wavelengths(df1, df2):
    """Return the per-wavelength squared transflectance difference.

    Only wavelengths present in both inputs are kept (inner join on
    'Wavelength').

    Args:
        df1: DataFrame with 'Wavelength' and 'Transflectance' columns.
        df2: DataFrame with 'Wavelength' and 'Transflectance' columns.

    Returns:
        A DataFrame with two columns: 'Wavelength' and 'Squared_Differences'.
    """
    shared = pd.merge(df1, df2, on='Wavelength', suffixes=('_df1', '_df2'), how='inner')
    delta = shared['Transflectance_df1'].sub(shared['Transflectance_df2'])
    with_squares = shared.assign(Squared_Differences=delta.pow(2))
    return with_squares[['Wavelength', 'Squared_Differences']]

# Main function to compute differences and return squared differences dataframe
def differences(pattern_file, sample_file, initial_wavelength, final_wavelength):
    """Compare a pattern file with a sample file inside a wavelength range.

    The two files are loaded with ``read_data`` and then passed to
    ``sqrt_of_sum_of_squares_differences`` and
    ``squared_differences_common_wavelengths``.

    Args:
        pattern_file: Path of the pattern spectrum file.
        sample_file: Path of the sample spectrum file.
        initial_wavelength: Lower bound of the wavelength range.
        final_wavelength: Upper bound of the wavelength range.

    Returns:
        A tuple ``(per_wavelength, summary)``: the DataFrame returned by
        ``squared_differences_common_wavelengths`` and the value returned by
        ``sqrt_of_sum_of_squares_differences``.
    """
    pattern_spectrum, sample_spectrum = read_data(
        pattern_file, sample_file, initial_wavelength, final_wavelength
    )

    summary = sqrt_of_sum_of_squares_differences(pattern_spectrum, sample_spectrum)
    per_wavelength = squared_differences_common_wavelengths(pattern_spectrum, sample_spectrum)
    return per_wavelength, summary

### Pdf report definition

In [None]:
def plot_and_save_diff_report(initial_wavelength, final_wavelength, sample_file_list, pattern_file_list, report_location, pdf=None, max_plots=100):
    """Plot the squared differences of every pattern/sample pair into a PDF.

    One page is produced per (pattern, sample) combination, up to
    ``max_plots`` pages in total. Each page shows 'Squared_Differences'
    against 'Wavelength', with the summary value and both file names in the
    title and below the x axis.

    Args:
        initial_wavelength: Lower bound of the wavelength range.
        final_wavelength: Upper bound of the wavelength range.
        sample_file_list: Paths of the sample files.
        pattern_file_list: Paths of the pattern (average) files.
        report_location: Folder for the report; created if it is missing.
        pdf: Open object with a ``savefig`` method (e.g. ``PdfPages``) that
            receives the pages. When omitted, the module-level variable named
            ``pdf`` is used, which is how existing callers provide it.
        max_plots: Maximum number of pages to write.

    Raises:
        NameError: If ``pdf`` is not given and no module-level ``pdf`` exists.
    """
    create_folder_if_not_exists(report_location)

    if pdf is None:
        # Backward compatibility: callers that open
        # ``with PdfPages(...) as pdf:`` at module level and do not pass it in.
        pdf = globals().get("pdf")
        if pdf is None:
            raise NameError(
                "no 'pdf' target available: pass pdf=... or define a "
                "module-level 'pdf' before calling"
            )

    counter = 0
    substring = "_AVERAGE.txt"

    for avg_pattern in pattern_file_list:
        for sample in sample_file_list:

            # Stop completely once the page limit is reached; a plain break
            # would only leave the inner loop and keep the outer one spinning.
            if counter >= max_plots:
                return

            # NOTE: the arguments are passed as (sample, pattern) although
            # differences() names them (pattern_file, sample_file). The
            # squared difference is symmetric, so the order is kept as-is.
            df, sqrt_sum_squares_diff = differences(sample, avg_pattern, initial_wavelength, final_wavelength)

            # Plot using Pandas' built-in plot method
            txt = 'Squared Diff: '+ str(np.round(sqrt_sum_squares_diff["Squared_Differences"],2))   + ' for ' + os.path.basename(sample).replace(substring, "") + " and " + os.path.basename(avg_pattern).replace(substring, "")
            ax = df.plot(x='Wavelength', y='Squared_Differences', label="Dataframe", title=txt)
            ax.set_xlabel('Wavelength' + "\n\n\n" + txt)
            ax.set_ylabel('Squared_Differences')
            ax.legend()
            ax.grid(True)

            # Save and close exactly the figure that was just drawn, instead
            # of relying on matplotlib's notion of the "current" figure.
            fig = ax.figure
            pdf.savefig(fig)
            plt.close(fig)

            counter += 1


### Print reports for each range 

In [None]:
# Write one PDF report per spectral range. The name ``pdf`` bound by each
# ``with`` statement is the module-level variable that
# plot_and_save_diff_report() saves its pages to.
create_folder_if_not_exists(report_location)

# os.path.join is used instead of a hard-coded "\\" so the separator always
# matches the operating system.

# UV: 250 nm to 399 nm
with PdfPages(os.path.join(report_location, 'sqrt_diff_plots_UV.pdf')) as pdf:
    plot_and_save_diff_report(250, 399, sample_file_list, avg_file_list, report_location)

# VIS: 400 nm to 699 nm
with PdfPages(os.path.join(report_location, 'sqrt_diff_plots_VIS.pdf')) as pdf:
    plot_and_save_diff_report(400, 699, sample_file_list, avg_file_list, report_location)

# IR: 699 nm to 2200 nm
# NOTE(review): the earlier comment said 2000 nm while the code uses 2200;
# 699 nm is also included in the VIS range above. Confirm the intended bounds.
with PdfPages(os.path.join(report_location, 'sqrt_diff_plots_IR.pdf')) as pdf:
    plot_and_save_diff_report(699, 2200,  sample_file_list, avg_file_list, report_location)

### Comparison between metallic species

In [None]:
def filter_substring_list_elements(path_strings, substring_list):
    """Return the paths that contain any of the given substrings.

    Results are grouped by substring, in the order of ``substring_list``.
    Within a group the paths keep their order from ``path_strings``. A path
    that contains several of the substrings appears once per match.

    Args:
        path_strings: Path strings to filter.
        substring_list: Substrings to look for (case-sensitive).

    Returns:
        A single flat list with the matching paths.
    """
    return [
        path
        for substring in substring_list
        for path in path_strings
        if substring in path
    ]


In [None]:

# Substrings used to pick the files for this comparison.
substring_1 = ["PLATY","PELID"]
    
# Bare expression: the filtered list is shown as the cell output and is not stored.
filter_substring_list_elements(sample_file_list, substring_1)
