# Mean, Standard Deviation and Moving Average 
### This script averages all measurements for a particular specimen and filters noise using a moving average with a configurable window. It also calculates the standard deviation at each wavelength.
#### It requires a tab-separated text file listing sample code names and sample descriptions. Input data must be jump-corrected and in .txt format.

In [1]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.backends.backend_pdf import PdfPages

# Constants
# Window length (number of consecutive points) of the rolling mean applied to the averaged data.
moving_average_cicles=6
# Marker size passed as `s=` to the standard-deviation scatter plot.
markersize = 2
# Directory holding the jump-corrected transflectance .txt files.
transflectance_directory = r'C:\Users\EstebanSoto\Documents\Estudio Optico Escarabajos\new_data\2023NOV_TRANSFLECTANCE\transflectance\jump_corrected_files'
# Output root for transflectance results: the input path with "\jump_corrected_files" removed.
transflectance_save_directory = transflectance_directory.replace(r"\jump_corrected_files", "")

# Same layout for the transmittance measurements.
transmittance_directory = r'C:\Users\EstebanSoto\Documents\Estudio Optico Escarabajos\new_data\2023NOV_TRANSMITTANCE\transmittance\jump_corrected_files'
transmittance_save_directory = transmittance_directory.replace(r"\jump_corrected_files", "")


# Tab-separated table with one row per sample: its code and its description.
sample_names_path = r"C:\Users\EstebanSoto\Documents\Estudio Optico Escarabajos\CODE NAME SAMPLES.txt"

# header=0 combined with names=[...] discards the file's own header row and labels the columns 'code' and 'sample'.
sample_names_df = pd.read_csv(sample_names_path, sep='\t', header=0, names=['code', 'sample']) 
# Bare expression: displays the table when this is run as a notebook cell.
sample_names_df

Unnamed: 0,code,sample
0,C.KALI.LA22,CHRYSINA KALININI LA AMISTAD 2022
1,C.RESP.MV23,C. RESPLENDENS MV 23-19
2,C.CUPR.MV23,C. CUPREOMARGINATA MV 23-15
3,C.AURI.INBI,C. AURIGANS INBIO DESCABEZADO
4,C.CHRY.6713,C. CHRYSARGYREA INBIOCRI002426713
5,C.OPTI.8.17,C. OPTIMA #81 7 SIGNO PREGUNTA
6,C.CHRY.MV73,C. CHRYSARGYREA #73 MV23
7,C.CHRY.MV84,C. CHRYSARGYREA #84 MV23
8,S.BELT.1063,STRIGIDIA BELTI INBIO0004211063
9,S.GLAB.0620,STRIGIDIA GLABRA INBIOCRI0003420620


In [2]:
def mean_and_moving_avg(directory, save_directory, substring, measurement_type):
    """Average all measurements of one sample and save mean and std-dev tables.

    Every ``.txt`` file in ``directory`` whose name contains ``substring`` is
    read as a tab-separated, two-column table (wavelength, value). The rows of
    all files are pooled and grouped by wavelength to obtain the mean and the
    standard deviation of the value at each wavelength. The mean is then
    smoothed with a rolling mean of ``moving_average_cicles`` points
    (module-level constant).

    Both tables are written tab-separated, without a header row, to
    ``<save_directory>/average/<substring>_AVERAGE.txt`` and
    ``<save_directory>/std_dev/<substring>_STD_DEV.txt``. The output folders
    are created if they are missing.

    Args:
        directory: Folder containing the input ``.txt`` files.
        save_directory: Folder under which ``average`` and ``std_dev`` are created.
        substring: Text that must appear in a file name for the file to be
            included; also used as the prefix of the output file names.
        measurement_type: Column name given to the measured values.

    Returns:
        None. When no file matches ``substring`` nothing is read or written.
    """
    files = [name for name in os.listdir(directory)
             if name.endswith('.txt') and substring in name]
    # Nothing to average for this sample: leave silently, as before.
    if not files:
        return

    dataframes = []
    for file in files:
        file_path = os.path.join(directory, file)
        # header=0 + names=[...] replaces the file's own header with our column names.
        df = pd.read_csv(file_path, sep='\t', header=0, names=['wavelength', measurement_type])

        # Values may be written with a decimal comma; turn it into a dot so
        # the columns can be converted to float.
        df = df.replace(',', '.', regex=True)
        df['wavelength'] = df['wavelength'].astype(float)
        df[measurement_type] = df[measurement_type].astype(float)
        dataframes.append(df)

    # Pool the rows of every file once and reuse the grouping for both statistics.
    grouped = pd.concat(dataframes).groupby('wavelength')
    average_df = grouped.mean().reset_index()
    std_dev_df = grouped.std().reset_index()

    # Moving average. The first `moving_average_cicles - 1` rows have an
    # incomplete window and therefore hold NaN; they are kept and written as
    # 'nan'. (Calling .dropna() on the Series before assigning would not remove
    # them, because column assignment realigns on the frame's index.)
    average_df[measurement_type] = average_df[measurement_type].rolling(moving_average_cicles).mean()

    # Standard-deviation scatter plot. It is currently built and then closed
    # without being saved or shown; close exactly the figure created here
    # rather than whatever the "current" figure happens to be.
    std_plot = std_dev_df.plot(x='wavelength', y=measurement_type, kind='scatter', s=markersize,
                               title=substring + " std dev", figsize=(8, 6))
    plt.close(std_plot.figure)
    # Mean is preferred to median because the results have less noise.

    # Output folders: makedirs also creates missing parents and does not fail
    # when the folder already exists.
    avg_new_path = os.path.join(save_directory, "average")
    std_dev_new_path = os.path.join(save_directory, "std_dev")
    os.makedirs(avg_new_path, exist_ok=True)
    os.makedirs(std_dev_new_path, exist_ok=True)

    new_avg_archive_name = os.path.join(avg_new_path, substring + '_AVERAGE.txt')
    new_std_dev_archive_name = os.path.join(std_dev_new_path, substring + '_STD_DEV.txt')

    # Save as plain tab-separated text (no header, no index column).
    np.savetxt(new_avg_archive_name, average_df.to_numpy(), fmt='%s', delimiter="\t")
    np.savetxt(new_std_dev_archive_name, std_dev_df.to_numpy(), fmt='%s', delimiter="\t")
        

In [3]:
# Transflectance: run the averaging for every sample code in the sample-names table.

# Passed to mean_and_moving_avg as the name of the value column.
measurement_type = "transflectance" #transflectance, transmittance

# Each code is used as the file-name substring that selects one sample's files.
substring_list = sample_names_df["code"]
for substring in substring_list:
    #print(substring)
    mean_and_moving_avg(transflectance_directory, transflectance_save_directory, substring, measurement_type)

In [4]:
# Transmittance: run the averaging for every sample code in the sample-names table.

# Passed to mean_and_moving_avg as the name of the value column.
measurement_type = "transmittance" #transflectance, transmittance

# Each code is used as the file-name substring that selects one sample's files.
substring_list = sample_names_df["code"]
for substring in substring_list:
    # Print the code before processing it.
    print(substring)
    mean_and_moving_avg(transmittance_directory,transmittance_save_directory, substring, measurement_type)

C.KALI.LA22
C.RESP.MV23
C.CUPR.MV23
C.AURI.INBI
C.CHRY.6713
C.OPTI.8.17
C.CHRY.MV73
C.CHRY.MV84
S.BELT.1063
S.GLAB.0620
S.SIGN.PREG
NI#0002
P.GRAT.3610
MACRASPIS.SP
NI#0001
M.HIRT.DIUR
C.BOUC.CO23
PELID.001
PLATY.001
PELID.002
C.BOUC.INB5610
C.BOUC.INB5423
