In [11]:
import os
import re
from typing import List

def read_files(directory_list: List[str]) -> List[str]:
    """
    Reads all files in the given list of directories and returns their contents as a list of strings.
    
    Args:
    directory_list -- The list of directories containing the files to read.
    
    Returns:
    A list of file contents as strings.
    """
    file_contents = []
    for directory in directory_list:
        for filename in os.listdir(directory):
            if filename.endswith('.txt'):  # Assuming the files are text files
                with open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
                    file_contents.append(file.read())
    return file_contents

def find_common_pattern(file_contents: List[str]) -> str:
    """
    Finds a common regex pattern that matches all file contents.
    
    Args:
    file_contents -- A list of file contents as strings.
    
    Returns:
    A common regex pattern as a string.
    """
    if not file_contents:
        return ""

    # Split each file content into lines
    split_contents = [content.splitlines() for content in file_contents]
    
    # Find the minimum length of lines in the files
    min_lines = min(len(lines) for lines in split_contents)

    # Create a list to store common patterns
    common_patterns = []

    for line_idx in range(min_lines):
        # Extract the corresponding lines from each file
        lines = [content[line_idx] for content in split_contents]
        
        # Find the common pattern for the current line index
        common_pattern = lines[0]
        for line in lines[1:]:
            common_pattern = ''.join(['.' if a != b else a for a, b in zip(common_pattern, line)])
        
        common_patterns.append(common_pattern)

    # Combine the common patterns into a single regex
    common_regex = '\n'.join(common_patterns)
    return common_regex

def main(directories: List[str]):
    """
    Main function to read files from directories and find a common regex pattern.
    
    Args:
    directories -- A list of directories containing the files to read.
    """
    file_contents = read_files(directories)
    common_regex = find_common_pattern(file_contents)
    return common_regex

# Usage in Jupyter Notebook:
directories = [
    r"C:\Users\EstebanSoto\Jupyter\escarabajos\L1050_data\CICIMA-2024-03-REFLECTANCE\old_average",
    # Add more directories as needed
]
common_regex = main(directories)
print(f"Common Regex Pattern:\n{common_regex}")



Common Regex Pattern:
PE UV       SUBTECH     SPECTRUM    ASCII       PEDS        4.00        
   -1
CICIMAUCR0...-..Sample.ASC
24/04/..
..:..:...00
24/04/..
..:..:...00
Vinicio Soto Monge
C..........
250,000000
1
Lambda 1050
1050L1511233
PerkinElmer UV WinLab 6.3.2.0749 / 2.02.05 Lambda 900 UV/VIS/NIR, Aug  7 2015 09:38:08

0
0
3350/......8..
0
0
UV/VIS
1
1
1
DoubleDePol,CommonBeamDepol,RBeamAtt,SBeamAtt,60mm sphere, WB InGaAs Detector
0
0
0
0
Program
Program
3350/......8..
3350/0,2 860,8/0,2
3350/0,2 860,8/0,2
0
15.00
 
 
 
0
 
860,8
319,2
860,8
Front
100
on
S:100 R:100
0
0
0
0
 
0
 
 
 
0
 
 
0
0
0
0
0
 
 
 
 
0
0


0
0
#HDR
-1
-1
#GR
nm
%R
1.0
0.0
2.00,000000
-1,000000
2.51
8
........0
.,....00
#DATA
250.000000	........
251.000000	........
252.000000	........
253.000000	........
254.000000	........
255.000000	........
256.000000	........
257.000000	........
258.000000	........
259.000000	........
260.000000	........
261.000000	........
262.000000	........
263.000000	........
264.00

In [12]:
import re

def convert_repetitive_data_to_expression(regex: str, num_lines: int) -> str:
    """
    Converts a regex pattern with repetitive data into an expression that admits multiple lines of that type.
    
    Args:
    regex: The regex pattern with repetitive data.
    num_lines: The number of lines to allow in the expression.
    
    Returns:
    A regex expression that admits multiple lines of the given type.
    """
    # Escape special characters in the regex pattern
    regex = re.escape(regex)
    
    # Replace the repetitive data with a regex expression that allows multiple lines
    # Use \s* to match zero or more whitespace characters between the numbers and dots
    # Use \n to match the newline character
    replacement = f"(?:\\d+\\.\\d+{regex}\\d+\\.\\d+{regex}){{{num_lines - 1}}}"
    
    # Construct the final regex pattern
    pattern = f"\\b(?:\\d+\\.\\d+{regex}){replacement}\\b"
    
    return pattern

# Example usage:
regex = r"\s+\.\.\.\.\.\.\."
num_lines = 6
result = convert_repetitive_data_to_expression(common_regex, num_lines)
print("Regex expression:")
print(result)


Regex expression:
\b(?:\d+\.\d+PE\ UV\ \ \ \ \ \ \ SUBTECH\ \ \ \ \ SPECTRUM\ \ \ \ ASCII\ \ \ \ \ \ \ PEDS\ \ \ \ \ \ \ \ 4\.00\ \ \ \ \ \ \ \ \
\ \ \ \-1\
CICIMAUCR0\.\.\.\-\.\.Sample\.ASC\
24/04/\.\.\
\.\.:\.\.:\.\.\.00\
24/04/\.\.\
\.\.:\.\.:\.\.\.00\
Vinicio\ Soto\ Monge\
C\.\.\.\.\.\.\.\.\.\.\
250,000000\
1\
Lambda\ 1050\
1050L1511233\
PerkinElmer\ UV\ WinLab\ 6\.3\.2\.0749\ /\ 2\.02\.05\ Lambda\ 900\ UV/VIS/NIR,\ Aug\ \ 7\ 2015\ 09:38:08\
\
0\
0\
3350/\.\.\.\.\.\.8\.\.\
0\
0\
UV/VIS\
1\
1\
1\
DoubleDePol,CommonBeamDepol,RBeamAtt,SBeamAtt,60mm\ sphere,\ WB\ InGaAs\ Detector\
0\
0\
0\
0\
Program\
Program\
3350/\.\.\.\.\.\.8\.\.\
3350/0,2\ 860,8/0,2\
3350/0,2\ 860,8/0,2\
0\
15\.00\
\ \
\ \
\ \
0\
\ \
860,8\
319,2\
860,8\
Front\
100\
on\
S:100\ R:100\
0\
0\
0\
0\
\ \
0\
\ \
\ \
\ \
0\
\ \
\ \
0\
0\
0\
0\
0\
\ \
\ \
\ \
\ \
0\
0\
\
\
0\
0\
\#HDR\
\-1\
\-1\
\#GR\
nm\
%R\
1\.0\
0\.0\
2\.00,000000\
\-1,000000\
2\.51\
8\
\.\.\.\.\.\.\.\.0\
\.,\.\.\.\.00\
\#DATA\
250\.000000\	\.\.\.\.\.\.

In [5]:
common_regex

(\d+\.\d+\s+\.{8}\n)*

'<given_name><number><given_name><number><given_name>-<number><given_name><number>-<number><given_name><date><given_name><number>:<number>:<number><given_name><number><given_name><date><given_name><number>:<number>:<number><given_name><number><given_name><number><given_name><number><given_name><number><given_name><number><given_name><number><given_name><number><given_name><number><given_name><number><given_name><number><given_name><number><given_name>/<given_name><number><given_name><number><given_name><number><given_name><number><given_name>/<given_name>/<given_name><number><given_name><number><given_name><number>:<number>:<number><given_name><number><given_name><number><given_name><number>/<number><given_name><number><given_name><number>/<number><given_name><number><given_name><number><given_name>/<given_name><number><given_name><number><given_name><number><given_name><number><given_name><number><given_name><number><given_name><number><given_name><number><given_name><number>/<number>