# Exploratory Analysis Script
#### This script performs a exploratory analysis of every file in a given directory.

In [45]:
import os
from datetime import datetime
import pandas as pd
from pandas.errors import ParserError
import seaborn as sns
import matplotlib.pyplot as plt
import socket
import re
import matplotlib.pyplot as plt

def get_file_info(folder_path):
    file_info = []

    for root, dirs, files in os.walk(folder_path):
        for file in files:
            file_path = os.path.join(root, file)
            file_type = os.path.splitext(file)[1].lower()
            file_year = datetime.fromtimestamp(os.path.getmtime(file_path)).year
            file_info.append({
                'File Name': file,
                'File Type': file_type,
                'File Year': file_year,
                'File Path': file_path
            })

    return file_info

def summarize_file_info(file_info):
    total_files = len(file_info)
    file_types = set([info['File Type'] for info in file_info])
    years = set([info['File Year'] for info in file_info])

    summary = {
        'Total Files': total_files,
        'File Types': file_types,
        'Years': years
    }

    return summary

computer_name  = socket.gethostname()
print(computer_name)
# Ejemplo de uso:

if "ThinkPad-L440" == computer_name:
    base_path = r"/home/vinicio/escarabajos"
    folder_path = r"/home/vinicio/escarabajos/L1050_data/Mediciones Chrysina/Espectros 2018"
elif  "Shannon" == computer_name:
    folder_path = r'C:\Users\esteb\cicima\escarabajos\L1050_data\Mediciones Chrysina\Espectros 2018'
elif  "CICIMA-EVSM" == computer_name:
    folder_path = r'C:\Users\EstebanSoto\Jupyter\escarabajos\L1050_data\Mediciones Chrysina\Espectros 2018'
file_info = get_file_info(folder_path)
summary = summarize_file_info(file_info)

print(f"Total de archivos: {summary['Total Files']}")
print(f"Tipos de archivos: {', '.join(summary['File Types'])}")
print(f"Años de los datos: {', '.join(map(str, summary['Years']))}")

ThinkPad-L440
Total de archivos: 825
Tipos de archivos: , .csv, .ds_store, .asc
Años de los datos: 2024


In [26]:

def split_header_and_data(file_path):
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
        lines = file.readlines()
    
    # Verificar si el archivo tiene menos de 10 líneas
    if len(lines) < 10:
        raise ValueError(f"Error: El archivo '{file_path}' tiene menos de 10 líneas.")

    # Encabezado es hasta la línea 9 (índice 0 a 8) y el dataframe desde la línea 10 (índice 9)
    header = lines[:9]
    data = lines[9:]

    # Convertir datos en un DataFrame
    from io import StringIO
    data_str = StringIO(''.join(data))
    df = pd.read_csv(data_str, sep='\s+', header=None, encoding='utf-8')

    return header, df

def analyze_variables_to_dataframe(directory_path):
    files = [os.path.join(directory_path, file) for file in os.listdir(directory_path) if file.endswith('.csv')]
    
    # Inicializar lista para almacenar la información de cada archivo
    data_summary = []
    
    for file_path in files:
        try:
            header, df = split_header_and_data(file_path)
            #print(f"Análisis del archivo: {file_path}")
            
            # Listar todas las variables (columnas)
            columns = df.columns.tolist()
            
            # Tipo de datos de cada variable
            dtypes = df.dtypes.tolist()
            
            # Agregar la información a la lista
            data_summary.append({
                'File Name': os.path.basename(file_path),
                'Columns': columns,
                'Data Types': dtypes
            })
            
        except Exception as e:
            print(e)
            
    
    # Convertir la lista a un DataFrame
    summary_df = pd.DataFrame(data_summary)
    return summary_df

# Ejemplo de uso:
summary_df = analyze_variables_to_dataframe(folder_path)
print(summary_df)


Error: El archivo '/home/vinicio/escarabajos/L1050_data/Mediciones Chrysina/Espectros 2018/._3720411R3.csv' tiene menos de 10 líneas.
Error: El archivo '/home/vinicio/escarabajos/L1050_data/Mediciones Chrysina/Espectros 2018/._3165411R3.csv' tiene menos de 10 líneas.
Error: El archivo '/home/vinicio/escarabajos/L1050_data/Mediciones Chrysina/Espectros 2018/._1736459L1.csv' tiene menos de 10 líneas.
Error: El archivo '/home/vinicio/escarabajos/L1050_data/Mediciones Chrysina/Espectros 2018/._4298666L2.csv' tiene menos de 10 líneas.
Error: El archivo '/home/vinicio/escarabajos/L1050_data/Mediciones Chrysina/Espectros 2018/._1037299L1.csv' tiene menos de 10 líneas.
Error: El archivo '/home/vinicio/escarabajos/L1050_data/Mediciones Chrysina/Espectros 2018/._2249504L3.csv' tiene menos de 10 líneas.
Error: El archivo '/home/vinicio/escarabajos/L1050_data/Mediciones Chrysina/Espectros 2018/._1037308R2.csv' tiene menos de 10 líneas.
Error: El archivo '/home/vinicio/escarabajos/L1050_data/Medici

In [27]:
import os
import pandas as pd
from pandas.errors import ParserError

def check_data_consistency(data):
    """
    Checks the consistency and coherence of the data.
    
    Arguments:
    data -- Pandas DataFrame with the data to check.
    
    Returns:
    message -- A message indicating the result of the check.
    """
    message = ""  # Initialize the message
    min_value1 = 200.00  # nm
    max_value1 = 2000.00  # nm
    min_value2 = 0.0  # %R
    max_value2 = 110.0  # %R
    
    # Check ranges for numerical variables
    numerical_variables = data.select_dtypes(include=['int', 'float'])
    valid_ranges = {'wavelength': (min_value1, max_value1), '%R': (min_value2, max_value2)}  # Define valid ranges
    for column, (min_value, max_value) in valid_ranges.items():
        valid_data = data[(data[column] >= min_value) & (data[column] <= max_value)]
    if data.equals(valid_data):
        message += "All values are within the valid range."
    else:
        message += "Some values are outside the valid range. Correction is needed."
    
    return message
   
def check_data_consistency_in_directory(directory):
    """
    Checks the consistency and coherence of all files in a directory.
    
    Arguments:
    directory -- Path to the directory containing the files to check.
    
    Returns:
    results -- A DataFrame containing the verification information for each file.
    """
    results = []  # Initialize the list of results
    
    for file in os.listdir(directory):
        if file.endswith('.csv'):  # Only process CSV files, adjust according to the type of files you have
            file_path = os.path.join(directory, file)
            try:
                data = pd.read_csv(file_path, encoding='utf-8', header=9, names=["wavelength", "%R"])
            except UnicodeDecodeError:
                print(f"Encoding error reading the file: {file}")
                continue  # Skip to the next file if there's an encoding error
            except ParserError as e: 
                print(e)
                continue
            verification_result = check_data_consistency(data)
            results.append({'File': file, 'Result': verification_result})
    
    return pd.DataFrame(results)

# Example usage

directory_path = 'your_directory_path_here'  # Replace with your directory path
directory_results = check_data_consistency_in_directory(folder_path)
print(directory_results)


Passed header=9 but only 4 lines in file
Passed header=9 but only 4 lines in file
Passed header=9 but only 6 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 5 lines in file
Passed header=9 but only 5 lines in file
Passed header=9 but only 1 lines in file
Passed header=9 but only 5 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 4 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 4 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 5 lines in file
Passed header=9 but only 5 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 4 lines in file
Passed header=9 but only 5 lines in file
Passed header=9 but only 8 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 

In [28]:
import os
import pandas as pd
from pandas.errors import ParserError

def data_precision(data):
    """
    Calculates the minimum number of decimals for numerical variables.
    
    Arguments:
    data -- Pandas DataFrame containing the data.
    
    Returns:
    precision -- A dictionary where keys are the variables and values are the minimum number of decimals.
    """
    precision = {}  # Initialize the precision dictionary
    
    # Calculate the minimum number of decimals for numerical variables
    numerical_variables = data.select_dtypes(include=['float'])
    for column in numerical_variables.columns:
        precision[column] = data[column].apply(lambda x: len(str(x).split('.')[1]) if '.' in str(x) else 0).min()
    
    return precision

def data_precision_directory(directory):
    """
    Checks the data precision for each file within a directory.
    
    Arguments:
    directory -- Path to the directory containing the files to check.
    
    Returns:
    results -- A DataFrame containing the precision information for each file.
    """
    results = []  # Initialize the list of results
    
    for file in os.listdir(directory):
        if file.endswith('.csv'):  # Only process CSV files, adjust according to the type of files you have
            file_path = os.path.join(directory, file)
            try:
                data = pd.read_csv(file_path, encoding='utf-8', header=8, names=["wavelength", "%R"])
            except pd.errors.EmptyDataError:
                continue  # Skip to the next file if it's empty
            except UnicodeDecodeError:
                continue  # Skip to the next file if there's an encoding error
            except ParserError as e:
                continue
            precision = data_precision(data)
            results.append({'File': file, 'Precision': precision})
    
    return pd.DataFrame(results)

# Example usage
directory_results = data_precision_directory(folder_path)
print(directory_results)


              File                   Precision
0    2234738R2.csv  {'wavelength': 5, '%R': 5}
1     987517R1.csv  {'wavelength': 5, '%R': 5}
2    2375565R1.csv  {'wavelength': 5, '%R': 5}
3    4224249L3.csv  {'wavelength': 5, '%R': 5}
4    4298666R3.csv  {'wavelength': 5, '%R': 5}
..             ...                         ...
406   353553R2.csv  {'wavelength': 5, '%R': 6}
407  4146715R4.csv  {'wavelength': 5, '%R': 3}
408  2388550R4.csv  {'wavelength': 5, '%R': 5}
409   987517L1.csv  {'wavelength': 5, '%R': 5}
410  2249505R1.csv  {'wavelength': 5, '%R': 5}

[411 rows x 2 columns]


In [29]:

import os
import pandas as pd
import matplotlib.pyplot as plt
from pandas.errors import ParserError

def plot_histograms_directory(directory):
    """
    Plots histograms for numerical variables in all files within a directory.
    
    Arguments:
    directory -- Path to the directory containing the data files.
    """
    for file in os.listdir(directory):
        if file.endswith('.csv'):
            file_path = os.path.join(directory, file)
            try:
                data = pd.read_csv(file_path, encoding='utf-8', header=8, names=["wavelength", "%R"])
            except pd.errors.EmptyDataError:
                # print(f"The file {file} is empty. Skipping.")
                continue
            except UnicodeDecodeError:
                # print(f"Encoding error reading the file: {file}")
                continue  # Skip to the next file if there's an encoding error
            except ParserError as e:
                # print(e)
                continue
            
            # Select numerical variables
            numerical_variables = data.select_dtypes(include=['int', 'float'])
            # Plot histograms for numerical variables
            for column in numerical_variables.columns:
                plt.figure(figsize=(8, 6))
                plt.hist(data[column], bins=20, color='skyblue', edgecolor='black')
                plt.title(f'Histogram of {column} - {file}', fontsize=16)
                plt.xlabel(column, fontsize=14)
                plt.ylabel('Frequency', fontsize=14)
                plt.grid(True)
                plt.show()

# Example usage
#plot_histograms_directory(folder_path)



In [30]:
def plot_categorical_distribution_directorio(directorio):
    """
    Grafica la distribución de categorías para variables categóricas en todos los archivos dentro de un directorio.
    
    Argumentos:
    directorio -- Ruta del directorio que contiene los archivos con los datos.
    """
    resultados = []
    for archivo in os.listdir(directorio):
        #print(resultados)
        if archivo.endswith('.csv'):
            ruta_archivo = os.path.join(directorio, archivo)
            try:
                datos = pd.read_csv(ruta_archivo, encoding='utf-8', header = 9 , names=["wavelength", "%R"])
            except pd.errors.EmptyDataError:
                resultados.append( {'Archivo': archivo,"Vacío": True,  'Variables Categóricas': False})
                continue
            except UnicodeDecodeError:
                #print(f"Error de codificación al leer el archivo: {archivo}")
                continue  # Saltar al siguiente archivo si hay un error de codificación
            except ParserError as e:
                resultados.append( {'Archivo': archivo,"Vacío": True,  'Variables Categóricas': False})
                #print(e)
                continue
            # Seleccionar variables categóricas
            categorical_variables = datos.select_dtypes(include=['object'])
            
            if categorical_variables.empty:
                resultados.append(  {'Archivo': archivo,"Vacío": False,  'Variables Categóricas': False})
            else:
                resultados.append(  {'Archivo': archivo, "Vacío": False, 'Variables Categóricas': True})
            
            # Graficar distribución de categorías para variables categóricas
            for columna in categorical_variables.columns:
                plt.figure(figsize=(8, 6))
                datos[columna].value_counts().plot(kind='bar', color='skyblue', edgecolor='black')
                plt.title(f'Distribución de categorías para {columna} - {archivo}', fontsize=16)
                plt.xlabel(columna, fontsize=14)
                plt.ylabel('Frecuencia', fontsize=14)
                plt.xticks(rotation=45)
                plt.grid(True)
                plt.show()
            
            return pd.DataFrame(resultados)


# Ejemplo de uso
plot_categorical_distribution_directorio(folder_path)


Unnamed: 0,Archivo,Vacío,Variables Categóricas
0,2234738R2.csv,False,False


In [31]:
import os
import pandas as pd
from pandas.errors import ParserError

def calculate_correlation_matrix_directory(directory):
    """
    Calculates the correlation matrix for numerical variables in all files within a directory.
    
    Arguments:
    directory -- Path to the directory containing the data files.
    
    Returns:
    results -- Pandas DataFrame containing the correlation matrix for each file.
    """
    results = []  # Initialize the list of results
    
    for file in os.listdir(directory):
        if file.endswith('.csv'):
            file_path = os.path.join(directory, file)
            try:
                data = pd.read_csv(file_path, encoding='utf-8', header=9, names=["wavelength", "%R"])
            except pd.errors.EmptyDataError:
                print(f"The file {file} is empty. Skipping.")
                continue
            except UnicodeDecodeError:
                print(f"Encoding error reading the file: {file}")
                continue
            except ParserError as e:
                print(e)
                continue
            
            # Convert columns to float
            data["wavelength"] = data["wavelength"].astype(float)
            data["%R"] = data["%R"].astype(float)
            
            # Select numerical variables
            numerical_variables = data.select_dtypes(include=['int', 'float'])
            
            # Calculate the correlation matrix
            correlation_matrix = numerical_variables.corr()
            
            # Append results
            results.append({'File': file, 'Correlation between variables': correlation_matrix.loc["wavelength"]["%R"]})
    
    return pd.DataFrame(results)

# Example usage

correlation_results = calculate_correlation_matrix_directory(folder_path)
print(correlation_results)



Passed header=9 but only 4 lines in file
Passed header=9 but only 4 lines in file
Passed header=9 but only 6 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 5 lines in file
Passed header=9 but only 5 lines in file
Passed header=9 but only 1 lines in file
Passed header=9 but only 5 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 4 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 4 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 5 lines in file
Passed header=9 but only 5 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 but only 4 lines in file
Passed header=9 but only 5 lines in file
Passed header=9 but only 8 lines in file
Passed header=9 but only 3 lines in file
Passed header=9 

In [32]:


def detect_anomalies_directory(directory):
    """
    Detects anomalies or outliers in all files within a directory.
    
    Arguments:
    directory -- Path to the directory containing the data files.
    
    Returns:
    results -- Pandas DataFrame containing the results of anomaly detection for each file.
    """
    results = []  # Initialize the list of results
    
    for file in os.listdir(directory):
        if file.endswith('.csv'):
            file_path = os.path.join(directory, file)
            try:
                data = pd.read_csv(file_path, encoding='utf-8', header=9, names=["wavelength", "%R"])
            except pd.errors.EmptyDataError:
                print(f"The file {file} is empty. Skipping.")
                continue
            except UnicodeDecodeError:
                print(f"Encoding error reading the file: {file}")
                continue
            except ParserError as e:
                print(e)
                continue
            
            # Anomaly detection using visualization method
            plt.figure(figsize=(10, 6))
            sns.boxplot(data=data)
            plt.title(f'Anomaly Detection - {file}', fontsize=16)
            plt.xticks(rotation=45)
            plt.grid(True)
            plt.show()
            
            # Numerical anomaly detection using Tukey's method
            Q1 = data.quantile(0.25)
            Q3 = data.quantile(0.75)
            IQR = Q3 - Q1
            outliers = ((data < (Q1 - 1.5 * IQR)) | (data > (Q3 + 1.5 * IQR))).any(axis=0)
            results.append({'File': file, 'Anomalies': outliers})
    
    return pd.DataFrame(results)

# Example usage
#anomaly_detection_results = detect_anomalies_directory(folder_path)
#print(anomaly_detection_results)



In [33]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from pandas.errors import ParserError

def plot_dataframes_in_directory(folder_path):
    """
    Iterates over all files in the directory and plots the dataframe if the file is a .csv file.
    All files have a header at the 8th line with columns "wavelength" and "%R".
    
    Arguments:
    folder_path -- Path to the directory containing the files.
    """
    for file in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file)
        
        if file.endswith('.csv'):
            try:
                data = pd.read_csv(file_path, encoding='utf-8', header=8, names=["wavelength", "%R"])
                
                plt.figure(figsize=(10, 6))
                plt.plot(data["wavelength"], data["%R"], marker='o', linestyle='-', color='b', markersize=1)
                plt.title(f'Data from {file}')
                plt.xlabel('wavelength')
                plt.ylabel('%R')
                plt.ylim(0,100)
                plt.grid(True)
                plt.show()
                
            except pd.errors.EmptyDataError:
                #print(f"The file {file} is empty. Skipping.")
                continue
            except UnicodeDecodeError:
                #print(f"Encoding error reading the file: {file}")
                continue
            except ParserError as e:
                #print(f"Parser error: {e}")
                continue
            except Exception as e:
                #print(f"An error occurred while processing the file {file}: {e}")
                continue
        else:
            #print(f"The file {file} is not a .csv file. Skipping.")
            continue

# Example usage

#plot_dataframes_in_directory(folder_path)


In [34]:
### Notes:
#### I don't trust the values before 420 and after 975. I'd truncate the data to those ranges.

In [35]:
#2249504L2.csv
regex = "(\d+)(L|R)(\d+).csv"

In [36]:
import re
def check_match(regex_pattern, test_string):
    # Use re.match to check if the test_string matches the regex_pattern from the beginning of the string
    match = re.match(regex_pattern, test_string)
    
    if match:
        # Print the entire matched string
        #print(f"String '{test_string}' matches the pattern '{regex_pattern}'")
        print(f"str {test_string}. pattern {regex_pattern}. ok")
        # Print each group found
        for i, group in enumerate(match.groups(), start=1):
            print(f"  Group {i}: {group}")

        return match.groups()
    else:
        print( (Color.RED + f"String '{test_string}' does not match the pattern '{regex_pattern}'" + Color.END) )

def get_code_polarization_and_number(string):
    match = re.match(regex_pattern, test_string)
    
    if match:
        # Print the entire matched string
        print(f"String '{test_string}' matches the pattern '{regex_pattern}'")
        
        # Print each group found
        for i, group in enumerate(match.groups(), start=1):
            print(f"  Group {i}: {group}")
    else:
        print(f"String '{test_string}' does not match the pattern '{regex_pattern}'")
        
    return

In [37]:
filename_list = ["3045302R3.csv","3043302Z3.csv","3115302R3.csv"]
regex_pattern = r"(\d+)(L|R)(\d+).csv"
valid_filenames = [filename for filename in filename_list if re.match(regex_pattern, filename)]
for filename in valid_filenames: 
    matches = re.match(regex_pattern, filename)
    filename_info = {"code":matches[0], "polarization":matches[1], "number":matches[2]}    
    print(filename_info)

filename_test = os.path.join(folder_path, "2388550L3.csv")
def get_archive_info(filename):
    with open(filename) as f: 
        print(f.readlines())
def get_archive_line(filename, n):
    with open(filename) as f: 
        return (f.readlines()[n])
def get_archive_str(filename):
    with open(filename) as f: 
        str = (f.read())
        return str
test_file = get_archive_str(filename_test)

{'code': '3045302R3.csv', 'polarization': '3045302', 'number': 'R'}
{'code': '3115302R3.csv', 'polarization': '3115302', 'number': 'R'}


In [38]:
class Color:
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'

# Example usage
print(Color.RED + 'This is red text' + Color.END)

[91mThis is red text[0m


In [39]:
line0_regex = r"Time1=(\d*)ms:Average1=(\d*):Objective=(\d*)X:Aperture=(\d+): \((\d+/\d+/\d+ \d+:\d+:\d+ \w+)\)\n"
#match = check_match(line0_regex, 'Time1=100ms:Average1=5:Objective=10X:Aperture=1: (11/28/2018 10:45:37 AM)\n')
#line0_info = {"time1": match[0], "average1":match[1], "objective":match[2], "aperture":match[3], "timestamp":match[4]}
#line0_info

line1_regex_1 = r"(\d+/\d+/\d+ \d+:\d+:\d+ \w+)\n"
line1_regex = r"(\d+/\d+/\d+ \d+:\d+:\d+ \w+)\n"
test_line1 = "11/28/2018 10:45:37 AM\n"
#match = check_match(line1_regex, test_line1)

line2_regex = r"(\w+)\n"
test_line2 = 'Reflectance\n'
#match = check_match(line2_regex, test_line2)

line3_regex = r"(\d+)X\n"
test_line4 = '10X\n'
#match = check_match(line4_regex, test_line4)

line4_regex = r'Ap#: (\d+)\n'
test_line3 = 'Ap#: 1\n'
#match = check_match(line3_regex, test_line3)



line5_regex = r"Avg1: (\d+(\.\d+)?)\n"
test_line5 = 'Avg1: 5.000000\n'
#match = check_match(line5_regex, test_line5)

line6_regex = r"Avg2: (\d+(\.\d+)?)\n"
test_line6 = 'Avg2: 0.000000\n'
#match = check_match(line6_regex, test_line6)

line7_regex = r"Int.Time1:(\d+(\.\d+)?)\n"
test_line7 = 'Int.Time1:100.000000\n'
#match = check_match(line7_regex, test_line7)

line8_regex = r"Int.Time2:(\d+(\.\d+)?)\n"
test_line8 = 'Int.Time2:0.000000\n'
#match = check_match(line8_regex, test_line8)

line9_regex = r"((\d+(\.\d+)?),(\d+(\.\d+)?)\n)+$"
test_line9 = '1019.91691385,57.63073639\n1020.31281360,57.42786480'
#match = check_match(line9_regex, test_line9)

craic_complete_file_regex = line0_regex+line1_regex+line2_regex+line3_regex+line4_regex+line5_regex+line6_regex+line7_regex+line8_regex+line9_regex
match = check_match(craic_complete_file_regex, test_file)

str Time1=100ms:Average1=5:Objective=10X:Aperture=1: (11/28/2018 10:45:37 AM)
11/28/2018 10:45:37 AM
Reflectance
10X
Ap#: 1
Avg1: 5.000000
Avg2: 0.000000
Int.Time1:100.000000
Int.Time2:0.000000
350.16613620,76.15888616
350.60193709,75.98051566
351.03772818,76.19826177
351.47350942,75.73056817
351.90928082,75.60828301
352.34504234,75.97277961
352.78079397,76.06002443
353.21653569,76.01191519
353.65226748,76.28171479
354.08798932,76.31994387
354.52370120,76.56153914
354.95940309,76.33219208
355.39509497,76.33365664
355.83077683,76.43997530
356.26644864,76.47578650
356.70211039,76.59668314
357.13776206,76.64336901
357.57340363,76.95180508
358.00903508,76.74087266
358.44465639,76.98830670
358.88026755,76.66339197
359.31586852,76.48838658
359.75145930,76.71513962
360.18703986,76.49946448
360.62261019,76.55831399
361.05817026,76.22840777
361.49372007,76.23797648
361.92925958,76.03822453
362.36478878,76.59460425
362.80030765,76.74232309
363.23581617,76.80631990
363.67131432,76.93674484
364.10

In [46]:
folder_path_2 = os.path.join(base_path ,"L1050_data","2023-03-CICIMAUCR-2-REFLECTANCE")
filename_test_l1050 = os.path.join(folder_path_2, r"CICIMAUCR0001-1.Sample.ASC")
test_file_l1050 = get_archive_str(filename_test_l1050)
print(test_file_l1050)
def get_archive_line(filename, n):
    with open(filename) as f: 
        return (f.readlines()[n])
lines = []

for i in range(0,100):
    line = get_archive_line(filename_test_l1050, i) 
    #print(line)
    lines.append(line)



PE UV       SUBTECH     SPECTRUM    ASCII       PEDS        4.00        
   -1
CICIMAUCR0001-1.Sample.ASC
24/03/08
10:42:50.00
24/03/08
10:42:50.00
Vinicio Soto Monge
C. kalinini
250,000000
1
Lambda 1050
1050L1511233
PerkinElmer UV WinLab 6.3.2.0749 / 2.02.05 Lambda 900 UV/VIS/NIR, Aug  7 2015 09:38:08

0
0
3350/servo 860,8/2
0
0
UV/VIS
1
1
1
DoubleDePol,CommonBeamDepol,RBeamAtt,SBeamAtt,60mm sphere, WB InGaAs Detector
0
0
0
0
Program
Program
3350/servo 860,8/2
3350/0,2 860,8/0,2
3350/0,2 860,8/0,2
0
15.00
 
 
 
0
 
860,8
319,2
860,8
Front
100
on
S:100 R:100
0
0
0
0
 
0
 
 
 
0
 
 
0
0
0
0
0
 
 
 
 
0
0


0
0
#HDR
-1
-1
#GR
nm
%R
1.0
0.0
2500,000000
-1,000000
2251
8
36,425100
2,679800
#DATA
2500,000000	3,463500
2499,000000	3,416200
2498,000000	3,524100
2497,000000	3,399600
2496,000000	3,417400
2495,000000	3,518100
2494,000000	3,499800
2493,000000	3,519300
2492,000000	3,504600
2491,000000	3,465000
2490,000000	3,492000
2489,000000	3,455200
2488,000000	3,486200
2487,000000	3,453200
2486,0

In [48]:
#print(lines)
#basic
float_pattern = r"(-*\d*.*\d*)"
float_pattern_comma = r"(-*\d*,*\d*)"
any_information_pattern = r"(.*)\n"
date_pattern = r"(\d+/\d+/\d+)\n"
hour_pattern = r"(\d+:\d+:\d+.\d+)\n"
#
first_line_pattern = f"PE UV       SUBTECH     SPECTRUM    ASCII       PEDS        {float_pattern}"+r"\n"
second_line_pattern = r"   -1\n"
filename_pattern = any_information_pattern
name_pattern = any_information_pattern
description_pattern = any_information_pattern
model_pattern = any_information_pattern
white_line_pattern = any_information_pattern
software_version_pattern = any_information_pattern
mode_of_operation_pattern = r"(%.*)\n"
nm_pattern = r"nm\n"
gr_pattern = r"#GR\n"
hdr_pattern = r"#HDR\n"
data_header_pattern = r"#DATA\n"
data_points_pattern = r"((-*\d*,*\d*)\t(-*\d*,*\d*)\n)*$"
three_floats_pattern = f"{float_pattern}/{float_pattern}/{float_pattern}"+r"\n"
sensor_configuration_pattern = r"(.*)/(.*)/(.*)\n"
lambda1050_pattern = r"Lambda 1050\n"
sample_reference_pattern = r"S:(\d*) R:(\d*)\n"
l1050_regex = [first_line_pattern, #PE UV       SUBTECH     SPECTRUM    ASCII       PEDS        4.00        
               second_line_pattern,#    -1
               filename_pattern,#CICIMAUCR0001-1.Sample.ASC
               date_pattern, #24/03/08
               hour_pattern,#10:42:50.00
               date_pattern ,#24/03/08
               hour_pattern, #10:42:50.00 
               name_pattern, #Vinicio Soto Monge
               description_pattern,#C. kalinini
#-1
#-1
##GR
#nm
#%R
#1.0
#0.0
#2500,000000
#-1,000000
#2251
#8
#36,425100
#2,679800
##DATA
#2500,000000	3,463500
               float_pattern, #250,000000
               float_pattern, #1
               lambda1050_pattern, #Lambda 1050
               any_information_pattern,#1050L1511233
               model_pattern,#PerkinElmer UV WinLab 6.3.2.0749 / 2.02.05 Lambda 900 UV/VIS/NIR, Aug  7 2015 09:38:08
               software_version_pattern,#
               white_line_pattern, #0
               float_pattern,#0
               sensor_configuration_pattern,#3350/servo 860,8/2
               float_pattern,#0
               float_pattern, #0
               any_information_pattern,#UV/VIS
               float_pattern,#1
               float_pattern,#1
               float_pattern,#1
               any_information_pattern,#DoubleDePol,CommonBeamDepol,RBeamAtt,SBeamAtt,60mm sphere, WB InGaAs Detector
               float_pattern,#0
               float_pattern,#0
               float_pattern,#0
               float_pattern,#0
               any_information_pattern,#Program
               any_information_pattern,#Program
               three_floats_pattern,#3350/servo 860,8/2
               three_floats_pattern,#3350/0,2 860,8/0,2
               three_floats_pattern,#3350/0,2 860,8/0,2
               float_pattern,#0
               float_pattern,#15.00
               any_information_pattern,# 
               any_information_pattern,# 
               any_information_pattern,# 
               float_pattern,#0
               any_information_pattern,# 
               float_pattern_comma,#860,8
               float_pattern_comma,#319,2
               float_pattern_comma, #860,8
               any_information_pattern,#Front
               float_pattern,#100
               any_information_pattern,#on
               sample_reference_pattern,#S:100 R:100
               float_pattern,#0
               float_pattern,#0
               float_pattern,#0
               float_pattern,#0
               any_information_pattern,# 
               float_pattern,#0
               any_information_pattern,# 
               any_information_pattern,# 
               any_information_pattern,#
               float_pattern,#0
               any_information_pattern,#  
               any_information_pattern,#
               float_pattern,#0
               float_pattern,#0
               float_pattern,#0
               float_pattern,#0
               float_pattern,#0
               any_information_pattern,# 
               any_information_pattern,#
               any_information_pattern,#
               any_information_pattern,#
               float_pattern,#0
               float_pattern,#0
               #float_pattern,
               any_information_pattern,#
               any_information_pattern,#
               float_pattern,#0
               float_pattern,#0
               hdr_pattern,#HDR
               float_pattern,
               float_pattern,
               gr_pattern,
               nm_pattern, 
               mode_of_operation_pattern,
               float_pattern,
               float_pattern,
               float_pattern_comma,
               float_pattern_comma,
               float_pattern_comma,
               float_pattern,
               float_pattern,
               float_pattern,
               data_header_pattern,
               data_points_pattern
]
l1050_complete_regex = ""

for element in l1050_regex: 
    l1050_complete_regex += element
    
n=91
for n, element in enumerate(l1050_regex):
    match = check_match(l1050_regex[n], lines[n])

str PE UV       SUBTECH     SPECTRUM    ASCII       PEDS        4.00        
. pattern PE UV       SUBTECH     SPECTRUM    ASCII       PEDS        (-*\d*.*\d*)\n. ok
  Group 1: 4.00        
str    -1
. pattern    -1\n. ok
str CICIMAUCR0001-1.Sample.ASC
. pattern (.*)\n. ok
  Group 1: CICIMAUCR0001-1.Sample.ASC
str 24/03/08
. pattern (\d+/\d+/\d+)\n. ok
  Group 1: 24/03/08
str 10:42:50.00
. pattern (\d+:\d+:\d+.\d+)\n. ok
  Group 1: 10:42:50.00
str 24/03/08
. pattern (\d+/\d+/\d+)\n. ok
  Group 1: 24/03/08
str 10:42:50.00
. pattern (\d+:\d+:\d+.\d+)\n. ok
  Group 1: 10:42:50.00
str Vinicio Soto Monge
. pattern (.*)\n. ok
  Group 1: Vinicio Soto Monge
str C. kalinini
. pattern (.*)\n. ok
  Group 1: C. kalinini
str 250,000000
. pattern (-*\d*.*\d*). ok
  Group 1: 250,000000
str 1
. pattern (-*\d*.*\d*). ok
  Group 1: 1
str Lambda 1050
. pattern Lambda 1050\n. ok
str 1050L1511233
. pattern (.*)\n. ok
  Group 1: 1050L1511233
str PerkinElmer UV WinLab 6.3.2.0749 / 2.02.05 Lambda 900 UV/VIS/N

In [42]:
complete_file_regex_l1050 = ""
for element in l1050_regex:
    complete_file_regex_l1050 += element
match = check_match(complete_file_regex_l1050, test_file_l1050)

NameError: name 'l1050_regex' is not defined

In [43]:
# cuando usamos polarizadores, el rango a los sumo hasta 700 nm. Tiene una capa l/4 para verde. Ya a 800 no funciona. Hay que renormalizarlo. Se normalizó con el polarizador puesto. 

#### Con referencia de spectralon
#### Cupreo hay que renormalizar. 

Escojo un punto: Se mide sin polarizacion, luego con polarización izquierda y derecha. Al final se mide el espectro del estandar con el filtro derecho e izquierdo. Todo lo izquierdo se multiplica por el factor 50%. 
750 nm maximo, y el minimo. Lambda: La incertidumbre está dada por longitud de onda. O trabajar con la sensibilidad del detector. Buscar la sensibilidad. Buscar manual de la esfera integradora. 

In [None]:
# this section checks makes a list of every file in a directory and checks if it has the pattern for a l1050 file or craic file

In [None]:
import os

def list_files_in_folder(folder_path):
    files = []
    for file_name in os.listdir(folder_path):
        # Join the folder path with the file name to get the absolute path
        absolute_path = os.path.join(folder_path, file_name)
        # Check if it's a regular file (not a directory)
        if os.path.isfile(absolute_path):
            files.append(file_name)
    return files

# Example usage:
folder_path = r'C:\Users\esteb\cicima\escarabajos\L1050_data\2023-03-CICIMAUCR-2-REFLECTANCE'
files = list_files_in_folder(folder_path)
#print(files)
files_ending_in_ASC_CSV_TXT = [file for file in files 
                               if ((file.endswith(".ASC") or file.endswith(".csv") or file.endswith(".txt")) and not file.startswith(".") )  ]
#L1050 files 

#CRAIC files
craic_file_regex_pattern = r"(\d+)(L|R)(\d+).csv"
valid_filenames = [filename for filename in filename_list if re.match(regex_pattern, filename)]

#print(files_ending_in_ASC_CSV_TXT)
print(files_ending_in_ASC_CSV_TXT)
l1050_filenames  = []
craic_filenames = []

for filename in files_ending_in_ASC_CSV_TXT:
    with open(os.path.join(folder_path, filename)) as f:
        file_text = f.read()
        #print(f.read())
        if re.match(complete_file_regex_l1050, file_text):
            l1050_filenames.append(filename)
        elif re.match(craic_complete_file_regex, file_text):
            craic_filenames.append(filename)
print(l1050_filenames)
print(craic_filenames)