In [8]:
import exrex
import random
import re
import faker
import json

In [9]:
# Shared default-locale Faker instance; generate_data reads this module-level
# name to produce names, e-mails, ISBNs, IPv6 addresses, floats, passwords and SSNs.
fake = faker.Faker()


In [10]:
# Scratch cell: displays the re.MULTILINE flag that generate_data passes to
# re.finditer so that ^ and $ anchor at every line of the joined data.
re.MULTILINE

re.MULTILINE

In [16]:
# Target regexes as (name, pattern) pairs. They are expanded below into the
# {'pattern': ..., 'name': ...} dicts consumed by generate_data.
# The names are compared verbatim inside generate_data to choose a sample
# generator, so their exact spelling (including the double spaces in the ISBN
# entry) must not change.
_REGEX_TABLE = (
    # ISBN-10 / ISBN-13, with or without separators (unanchored).
    ('isbn  10  and 13',
     r'((978[\-– ])?[0-9][0-9\-– ]{10}[\-– ][0-9xX])|((978)?[0-9]{9}[0-9Xx])'),
    # E-mail address with a two-letter or well-known TLD.
    ('email',
     r"^[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?\.)+(?:[a-zA-Z]{2}|aero|asia|biz|cat|com|coop|edu|gov|info|int|jobs|mil|mobi|museum|name|net|org|pro|tel|travel)$"),
    # 16-digit card number: dash-separated, space-separated or contiguous.
    ('card number',
     r"^(\d{4}-){3}\d{4}$|^(\d{4} ){3}\d{4}$|^\d{16}$"),
    # Letters, spaces, apostrophes and hyphens only.
    ('name',
     r"^([a-zA-Z '-]+)$"),
    # IPv6 address, full or '::'-compressed (unanchored).
    ('IPv6',
     r"(::|(([a-fA-F0-9]{1,4}):){7}(([a-fA-F0-9]{1,4}))|(:(:([a-fA-F0-9]{1,4})){1,6})|((([a-fA-F0-9]{1,4}):){1,6}:)|((([a-fA-F0-9]{1,4}):)(:([a-fA-F0-9]{1,4})){1,6})|((([a-fA-F0-9]{1,4}):){2}(:([a-fA-F0-9]{1,4})){1,5})|((([a-fA-F0-9]{1,4}):){3}(:([a-fA-F0-9]{1,4})){1,4})|((([a-fA-F0-9]{1,4}):){4}(:([a-fA-F0-9]{1,4})){1,3})|((([a-fA-F0-9]{1,4}):){5}(:([a-fA-F0-9]{1,4})){1,2}))"),
    # Unsigned decimal number; a bare integer of ten or more digits is rejected.
    ('float',
     r"^(?!^\d{10,}$)(\d+(\.\d*)?|\.\d+)$"),
    # At least six alphanumerics containing at least one digit and one letter.
    ('password',
     r"^(?=.*[0-9]+.*)(?=.*[a-zA-Z]+.*)[0-9a-zA-Z]{6,}$"),
    # US social security number in AAA-GG-SSSS form.
    ('ssn',
     r"^((?!000)(?!666)(?:[0-6]\d{2}|7[0-2][0-9]|73[0-3]|7[5-6][0-9]|77[0-2]))-((?!00)\d{2})-((?!0000)\d{4})$"),
    # UK postcode; note ^ binds only to the GIR branch and $ only to the general branch.
    ('uk postcode',
     r"(^(((GIR)\s{0,1}((0AA))))|(([A-PR-UWYZ][0-9][0-9]?)|([A-PR-UWYZ][A-HK-Y][0-9][0-9]?)|([A-PR-UWYZ][0-9][A-HJKSTUW])|([A-PR-UWYZ][A-HK-Y][0-9][ABEHMNPRVWXY]))\s{0,1}([0-9][ABD-HJLNP-UW-Z]{2})$)"),
    # 12-hour clock time "HH:MM:SS AM|PM" (the hour alternative may be empty).
    ('time',
     r"^(|(0\d)|(1[0-2])):(([0-5]\d)):(([0-5]\d)) ([AP]M)$"),
)

regexes = [{'pattern': pattern, 'name': name} for name, pattern in _REGEX_TABLE]

In [17]:
def _generate_sample(name, pattern, uk_fake):
    """Return one synthetic sample for the regex target called ``name``.

    Args:
        name: The target's ``"name"`` value; it selects the generator.
        pattern: The target's regex, used when the sample is drawn with exrex.
        uk_fake: An ``en_GB`` Faker instance, only used for ``"uk postcode"``.

    Returns:
        The generated sample (the caller coerces it to ``str``).

    Raises:
        Exception: Whatever the underlying Faker/exrex call raises, except for
            unknown target names, where an exrex failure falls back to a
            generic word-plus-number string.
    """
    if name == "name":
        return fake.name()
    if name == "email":
        if random.choice([True, False]):
            return fake.ascii_email()
        return fake.ascii_company_email()
    if name == "isbn  10  and 13":
        if random.choice([True, False]):
            return fake.isbn10()
        return fake.isbn13()
    if name == "IPv6":
        return fake.ipv6()
    if name == "float":
        return str(fake.pyfloat(left_digits=random.randint(1, 4),
                                right_digits=random.randint(1, 5),
                                positive=True))
    if name == "password":
        length = random.randint(8, 16)
        return fake.password(length=length, special_chars=False, digits=True,
                             upper_case=True, lower_case=True)
    if name == "ssn":
        return fake.ssn()
    if name == "uk postcode":
        return uk_fake.postcode()
    if name in ("card number", "time"):
        # No dedicated generator: draw directly from the target pattern.
        return exrex.getone(pattern)

    # Unknown target: try the pattern itself, otherwise emit a generic token.
    try:
        return exrex.getone(pattern)
    except Exception:
        return fake.word() + str(random.randint(100, 999))


def generate_data(regex_definitions, num_samples=1000, data_filename="data.txt",
                  matches_filename="matches.txt", seed=None):
    """Generate synthetic samples and the per-regex ground-truth matches.

    Samples are produced by cycling through ``regex_definitions`` and written
    one per line to ``data_filename``. Every pattern is then run over the
    newline-joined samples with ``re.MULTILINE``, and one JSON object per
    target (``pattern``, ``name``, ``matches``) is written per line to
    ``matches_filename``.

    Args:
        regex_definitions: Sequence of dicts with ``"pattern"`` and ``"name"`` keys.
        num_samples: Number of generation attempts.
        data_filename: Output path for the raw samples.
        matches_filename: Output path for the JSON-lines match records.
        seed: Optional seed applied to ``random`` and to Faker's shared
            generator, so that a run can be repeated. ``None`` (the default)
            leaves both unseeded.

    Returns:
        None. Results are written to the two files; progress is printed.
    """
    if seed is not None:
        random.seed(seed)        # also drives exrex, presumably -- TODO confirm it uses the global RNG
        faker.Faker.seed(seed)   # class-level seed shared by all Faker instances

    # Build the en_GB instance once instead of once per postcode sample.
    needs_uk = any(d["name"] == "uk postcode" for d in regex_definitions)
    uk_fake = faker.Faker("en_GB") if needs_uk else None

    data = []
    generated_matches_for_targets = []  # dicts destined for matches_filename

    print(f"Generating {num_samples} data samples...")
    for i in range(num_samples):
        # Round-robin over the targets so each gets an equal share of samples.
        current_regex_def = regex_definitions[i % len(regex_definitions)]
        name = current_regex_def["name"]
        pattern_for_exrex = current_regex_def["pattern"]

        try:
            sample = _generate_sample(name, pattern_for_exrex, uk_fake)
            if sample:  # an empty sample is dropped silently
                data.append(str(sample))
        except Exception as e:
            # Keep the line count stable by recording a recognisable placeholder.
            print(f"Warning: Error generating data for '{name}' (sample {i+1}): {e}. "
                  f"Writing a placeholder entry instead.")
            data.append(f"error_generating_for_{name.replace(' ','_')}")

    print(f"Writing {len(data)} items to {data_filename}...")
    with open(data_filename, "w", encoding='utf-8') as fi:
        for item in data:
            fi.write(f"{item}\n")

    # Ground truth: what each original regex matches in the generated data.
    print(f"Processing matches for {matches_filename} based on predefined regexes...")
    # NOTE(review): patterns are run over one joined string, so a pattern that
    # accepts whitespace (e.g. \s) could match across a line break -- confirm
    # this is intended.
    data_join = "\n".join(data)

    for r_def in regex_definitions:
        original_pattern = r_def["pattern"]
        target_name = r_def["name"]
        print(f"Finding matches for: {target_name} (Pattern: {original_pattern[:60]}...)")
        try:
            # re.MULTILINE lets ^ and $ anchor at each sample line.
            current_target_matches = [
                match_obj.group(0)
                for match_obj in re.finditer(original_pattern, data_join, re.MULTILINE)
            ]
        except re.error as e:
            print(f"  Regex error for '{target_name}': {e}. Skipping this pattern for matches.txt.")
            continue  # an invalid regex contributes no record

        # Order-preserving de-duplication: unlike list(set(...)) the output
        # order is identical from run to run.
        unique_matches = list(dict.fromkeys(current_target_matches))

        generated_matches_for_targets.append({
            "pattern": original_pattern,
            "name": target_name,
            "matches": unique_matches,
        })
        print(f"  Found {len(current_target_matches)} raw matches ({len(unique_matches)} unique) for {target_name}.")

    print(f"Writing {len(generated_matches_for_targets)} target definitions to {matches_filename}...")
    with open(matches_filename, "w", encoding='utf-8') as fi:
        for item_dict in generated_matches_for_targets:
            # One JSON object per line (JSON Lines), so the file can be read back line by line.
            fi.write(json.dumps(item_dict) + "\n")
    print("Data generation complete.")

In [18]:
# Build the dataset: 10,000 samples cycled over the ten targets in `regexes`.
# Uses the default output paths, data.txt and matches.txt.
generate_data(regexes, 10000)

Generating 10000 data samples...
Writing 10000 items to data.txt...
Processing matches for matches.txt based on predefined regexes...
Finding matches for: isbn  10  and 13 (Pattern: ((978[\-– ])?[0-9][0-9\-– ]{10}[\-– ][0-9xX])|((978)?[0-9]{9...)
  Found 2000 raw matches (2000 unique) for isbn  10  and 13.
Finding matches for: email (Pattern: ^[a-zA-Z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-zA-Z0-9!#$%&'*+/=?^_...)
  Found 1000 raw matches (1000 unique) for email.
Finding matches for: card number (Pattern: ^(\d{4}-){3}\d{4}$|^(\d{4} ){3}\d{4}$|^\d{16}$...)
  Found 1000 raw matches (1000 unique) for card number.
Finding matches for: name (Pattern: ^([a-zA-Z '-]+)$...)
  Found 982 raw matches (977 unique) for name.
Finding matches for: IPv6 (Pattern: (::|(([a-fA-F0-9]{1,4}):){7}(([a-fA-F0-9]{1,4}))|(:(:([a-fA-...)
  Found 1000 raw matches (1000 unique) for IPv6.
Finding matches for: float (Pattern: ^(?!^\d{10,}$)(\d+(\.\d*)?|\.\d+)$...)
  Found 1000 raw matches (972 unique) for float.
Finding mat

In [65]:
# Scratch check: draw one random string from the 'card number' pattern with exrex.
exrex.getone(r"^(\d{4}-){3}\d{4}$|^(\d{4} ){3}\d{4}$|^\d{16}$")

'9844-5915-6520-1595'

In [None]:
# Scratch check of a simple decimal pattern. The original call omitted the
# string to search, which raises TypeError. With a single capture group,
# re.findall returns that group (the optional fractional part) for each match.
re.findall(r"^\d+(\.\d+)?$", "123.45")

['.45']

In [None]:
# Scratch check: provider methods live on a Faker *instance*, not on the
# `faker` module, so call it through the shared `fake` object.
fake.credit_card_number(card_type="mastercard")

'5121901239721303'