In [120]:
import re

from typing import List, Dict

from langchain_openai import ChatOpenAI
from langchain_experimental.tools import PythonREPLTool
from langchain_core.prompts import ChatPromptTemplate

from extract import (
    extract_function_names,
    extract_function_by_name,
    extract_imports,
    is_valid_python,
)

python_repl = PythonREPLTool()

llm = ChatOpenAI(temperature=0, model="gpt-4o")

In [138]:
class FunctionCodeWithUnitTests:
    def __init__(self, function_code: List[str], unit_tests: Dict[str, List[str]], unit_test_imports: str):
        self.function_code = function_code
        self.unit_tests = unit_tests
        self.unit_test_imports = unit_test_imports
    
    def replace_function_code(self, new_function_code: str) -> None:
        self.function_code.append(new_function_code)

    def replace_unit_test(self, unit_test_name: str, new_unit_test_code: str) -> None:
        self.unit_tests[unit_test_name].append(new_unit_test_code)
    
    def get_function_code(self) -> str:
        return self.function_code[-1]
    
    def get_unit_test_code(self, unit_test_name: str) -> str:
        return self.unit_tests[unit_test_name][-1]
    

In [95]:
unit_test_system_prompt_string = """
You are a specialized assistant designed to create thorough unit tests for functions involving regular expressions (regex). \
Your task is to generate comprehensive, self-contained unit tests based on a natural language \
description of the function's intended behavior.

Instructions:
1. Carefully analyze the given description.
2. Create diverse unit tests that cover all aspects of the function's expected behavior, including:
   - Basic functionality
   - Edge cases
   - Corner cases
   - Boundary conditions
   - Invalid inputs
   - Empty inputs
   - Large inputs
   - Special characters
   - Unicode characters (if applicable)
   - Case sensitivity (if applicable)
   - Multiline inputs (if applicable)
   - Overlapping matches
   - Non-matching scenarios
3. If you need to use a unit test framework, use pytest.
4. Write individual test functions, not a test class.
5. Use descriptive test names that clearly indicate the specific scenario being tested.
6. Include detailed assertions that thoroughly check expected outcomes.
7. For regex functions, consider testing:
   - Pattern matching accuracy
   - Capturing groups
   - Non-capturing groups
   - Lookahead and lookbehind assertions
   - Greedy vs. non-greedy quantifiers
   - Character classes and negated character classes
   - Anchors (start, end, word boundaries)
   - Flags (e.g., re.IGNORECASE, re.MULTILINE, re.DOTALL)
   - Escape sequences
8. Do not write the actual function implementation or code to run the tests.
9. Write the name of the function you are testing as a comment at the top of the code block (do not write a place holder function).
10. Generate at least 15-20 diverse tests to ensure comprehensive coverage.

Provide your response as a Python code block containing only the unit tests. \
Ensure that the tests are varied and cover a wide range of scenarios to thoroughly validate the regex function.

Example input: "Create tests for a function that extracts all valid email addresses from a given text."

Your task is to generate appropriate unit tests based on similar natural language descriptions, \
focusing on comprehensive testing of regex functionality.
"""

unit_test_prompt = ChatPromptTemplate.from_messages(
    [("system", unit_test_system_prompt_string), ("user", "{input}")]
)

unit_test_chain = unit_test_prompt | llm
unit_test_llm_message = unit_test_chain.invoke(
    {
        "input": "Extract emails (including instances of 'name at email dot com') from a text."
    }
)

In [96]:
print(unit_test_llm_message.content)

```python
# Function to be tested: extract_emails

import re
import pytest

def test_extract_emails_basic():
    text = "Contact us at support@example.com for more info."
    expected = ["support@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_multiple():
    text = "Emails: first@example.com, second@example.org, third@example.net"
    expected = ["first@example.com", "second@example.org", "third@example.net"]
    assert extract_emails(text) == expected

def test_extract_emails_with_name_at_email_dot_com():
    text = "Reach out at john.doe at example dot com for details."
    expected = ["john.doe@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_mixed_formats():
    text = "Emails: jane.doe@example.com, jane.doe at example dot com"
    expected = ["jane.doe@example.com", "jane.doe@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_no_emails():
    text = "There are no emails in this te

In [97]:
def extract_python_code(text):
    pattern = r'```python\n(.*?)```'
    code_block = re.findall(pattern, text, re.DOTALL)[0]
    is_valid_python(code_block)
    return code_block

unit_test_code_as_string = extract_python_code(unit_test_llm_message.content)
print(unit_test_code_as_string)

# Function to be tested: extract_emails

import re
import pytest

def test_extract_emails_basic():
    text = "Contact us at support@example.com for more info."
    expected = ["support@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_multiple():
    text = "Emails: first@example.com, second@example.org, third@example.net"
    expected = ["first@example.com", "second@example.org", "third@example.net"]
    assert extract_emails(text) == expected

def test_extract_emails_with_name_at_email_dot_com():
    text = "Reach out at john.doe at example dot com for details."
    expected = ["john.doe@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_mixed_formats():
    text = "Emails: jane.doe@example.com, jane.doe at example dot com"
    expected = ["jane.doe@example.com", "jane.doe@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_no_emails():
    text = "There are no emails in this text."
    e

In [98]:
unit_test_function_names = extract_function_names(code_string=unit_test_code_as_string)
unit_test_function_names

['test_extract_emails_basic',
 'test_extract_emails_multiple',
 'test_extract_emails_with_name_at_email_dot_com',
 'test_extract_emails_mixed_formats',
 'test_extract_emails_no_emails',
 'test_extract_emails_empty_string',
 'test_extract_emails_special_characters',
 'test_extract_emails_unicode_characters',
 'test_extract_emails_case_sensitivity',
 'test_extract_emails_multiline',
 'test_extract_emails_overlapping_matches',
 'test_extract_emails_invalid_emails',
 'test_extract_emails_boundary_conditions',
 'test_extract_emails_with_subdomains',
 'test_extract_emails_with_numbers',
 'test_extract_emails_with_hyphens',
 'test_extract_emails_with_plus_sign',
 'test_extract_emails_with_dots_in_name',
 'test_extract_emails_large_input']

In [99]:
unit_test_imports = extract_imports(unit_test_code_as_string)
unit_test_imports

['import re', 'import pytest']

In [100]:
function_writer_system_prompt_string = """
You are a Python code generation assistant. Your task is to create a Python function that satisfies all the provided pytest unit tests. Follow these guidelines:

1. Analyze the given unit tests carefully to understand the function's required behavior.
2. Write a single Python function that passes all the provided tests.
3. Use type hints for parameters and return values.
4. Include a clear and concise docstring explaining the function's purpose and parameters.
5. Follow Python best practices and PEP 8 style guidelines.
6. Do not include comments within the function body.
7. Ensure the function handles all edge cases and scenarios covered in the tests.
8. If the tests imply the use of regular expressions, import the 're' module and use it appropriately.
9. Provide only the function definition and its implementation, nothing else.

Your response should be a Python code block containing only the requested function.
"""

function_writer_prompt = ChatPromptTemplate.from_messages(
    [("system", function_writer_system_prompt_string), ("user", "{input}")]
)

function_writer_chain = function_writer_prompt | llm
function_writer_llm_message = function_writer_chain.invoke(
    {
        "input": unit_test_code_as_string
    }
)

In [101]:
# print(function_writer_llm_message.content)
target_function_code_as_string = extract_python_code(function_writer_llm_message.content)
print(target_function_code_as_string)

import re
from typing import List

def extract_emails(text: str) -> List[str]:
    """
    Extracts email addresses from the given text. Supports both standard email formats
    and obfuscated formats like 'name at domain dot com'.

    Parameters:
    text (str): The input text from which to extract email addresses.

    Returns:
    List[str]: A list of extracted email addresses.
    """
    standard_email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    obfuscated_email_pattern = r'([a-zA-Z0-9._%+-]+)\s+at\s+([a-zA-Z0-9.-]+)\s+dot\s+([a-zA-Z]{2,})'
    
    standard_emails = re.findall(standard_email_pattern, text)
    obfuscated_emails = re.findall(obfuscated_email_pattern, text)
    
    for match in obfuscated_emails:
        standard_emails.append(f"{match[0]}@{match[1]}.{match[2]}")
    
    return standard_emails



In [149]:
function_code_with_unit_tests = FunctionCodeWithUnitTests(
    function_code=[target_function_code_as_string],
    unit_tests={
        name: [
            extract_function_by_name(
                code_string=unit_test_code_as_string, function_name=name
            )
        ]
        for name in unit_test_function_names
    },
    unit_test_imports="\n".join(unit_test_imports)
)


In [170]:
def find_first_failing_unit_test(
    function_code_with_unit_tests: FunctionCodeWithUnitTests,
) -> str:

    for unit_test_name in function_code_with_unit_tests.unit_tests:
        # Example code to execute
        code_to_run = f"""

{function_code_with_unit_tests.unit_test_imports}

{function_code_with_unit_tests.get_function_code()}

{function_code_with_unit_tests.get_unit_test_code(unit_test_name=unit_test_name)}

try:
    {unit_test_name}()
    message = "test passed"
except AssertionError as e:
    message = "Assertion failed: " + str(e)
except Exception as e:
    message = "Error: " + str(e)

print(message)
        """

        # Execute the code
        # print(code_to_run)
        output = python_repl.run(code_to_run)[:-1]
        if not output.endswith("test passed"):
            return unit_test_name + "(): " + output

    return "All Tests Pass"

In [171]:
find_first_failing_unit_test(function_code_with_unit_tests)

'test_extract_emails_unicode_characters(): Assertion failed: '

In [114]:
# TODO update to take the function and one unit test at a time.

def fixer_llm(
    llm,
    target_function_and_unit_test_code_as_string: str,
    failing_unit_tests_and_error_message: List[str],
) -> str:

    code_fixer_system_prompt_string = """
    You are a Python code generation assistant.
    You will be given a function and many unit tests to test that function.
    You will be told which unit tests have failed.

    Instructions;

    1. Consider carefully the function and unit tests to determine what went wrong. The error could be caused by a bug in the function, or a bug in the unit test.
    2. Rewrite the function and all the unit tests correctly, so that all the unit tests will pass.
    3. Do not write and code to run the tests.

    Your response should be a single Python code block containing only the requested function and unit tests.
    """

    failed_unit_tests = "\n".join(failing_unit_tests_and_error_message)

    code_fixer_input_string = f"""
    {target_function_and_unit_test_code_as_string}

    {failed_unit_tests}
    """

    code_fixer_prompt = ChatPromptTemplate.from_messages(
        [("system", code_fixer_system_prompt_string), ("user", "{input}")]
    )

    code_fixer_chain = code_fixer_prompt | llm
    code_fixer_llm_message = code_fixer_chain.invoke({"input": code_fixer_input_string})
    code_as_string = extract_python_code(code_fixer_llm_message.content)
    return code_as_string

In [115]:
updated_target_function_and_unit_test_code_as_string = fixer_llm(
    llm=llm,
    target_function_and_unit_test_code_as_string=target_function_and_unit_test_code_as_string,
    failing_unit_tests_and_error_message=failing_unit_tests_and_error_message,
)
print(updated_target_function_and_unit_test_code_as_string)

import re
from typing import List

def extract_emails(text: str) -> List[str]:
    """
    Extracts email addresses from the given text. Supports both standard email formats
    and obfuscated formats like 'name at domain dot com'.

    Parameters:
    text (str): The input text from which to extract email addresses.

    Returns:
    List[str]: A list of extracted email addresses.
    """
    standard_email_pattern = r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}\b'
    obfuscated_email_pattern = r'\b([a-zA-Z0-9._%+-]+)\s+at\s+([a-zA-Z0-9.-]+)\s+dot\s+([a-zA-Z]{2,})\b'
    
    standard_emails = re.findall(standard_email_pattern, text)
    obfuscated_emails = re.findall(obfuscated_email_pattern, text)
    
    for match in obfuscated_emails:
        standard_emails.append(f"{match[0]}@{match[1]}.{match[2]}")
    
    return standard_emails


# Function to be tested: extract_emails

import pytest

def test_extract_emails_basic():
    text = "Contact us at support@example.com for mo

In [116]:
failing_unit_tests_and_error_message = failing_unit_tests(updated_target_function_and_unit_test_code_as_string)
failing_unit_tests_and_error_message


['test_extract_emails_unicode_characters(): Assertion failed: ',
 'test_extract_emails_invalid_emails(): Assertion failed: ',
 'test_extract_emails_boundary_conditions(): Assertion failed: ',
 'test_extract_emails_with_subdomains(): Assertion failed: ']

In [113]:

import re
from typing import List

def extract_emails(text: str) -> List[str]:
    """
    Extracts email addresses from the given text. Supports both standard email formats
    and obfuscated formats like 'name at domain dot com'.

    Parameters:
    text (str): The input text from which to extract email addresses.

    Returns:
    List[str]: A list of extracted email addresses.
    """
    standard_email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    obfuscated_email_pattern = r'([a-zA-Z0-9._%+-]+)\s+at\s+([^\s]+(?:\s+dot\s+[^\s]+)*)\s+dot\s+([a-zA-Z]{2,})'
    
    standard_emails = re.findall(standard_email_pattern, text, re.UNICODE)
    obfuscated_emails = re.findall(obfuscated_email_pattern, text, re.UNICODE)
    
    result = standard_emails.copy()
    
    for match in obfuscated_emails:
        domain_parts = re.split(r'\s+dot\s+', match[1])
        domain = '.'.join(domain_parts)
        result.append(f"{match[0]}@{domain}.{match[2]}")
    
    # Filter out invalid email addresses
    valid_email_pattern = r'^[a-zA-Z0-9._%+-]+@[^\s@]+\.[^\s@]{2,}$'
    result = [email for email in result if re.match(valid_email_pattern, email, re.UNICODE)]
    
    return result
# Function to be tested: extract_emails

import re
import pytest

def test_extract_emails_basic():
    text = "Contact us at support@example.com for more info."
    expected = ["support@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_multiple():
    text = "Emails: first@example.com, second@example.org, third@example.net"
    expected = ["first@example.com", "second@example.org", "third@example.net"]
    assert extract_emails(text) == expected

def test_extract_emails_with_name_at_email_dot_com():
    text = "Reach out at john.doe at example dot com for details."
    expected = ["john.doe@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_mixed_formats():
    text = "Emails: jane.doe@example.com, jane.doe at example dot com"
    expected = ["jane.doe@example.com", "jane.doe@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_no_emails():
    text = "There are no emails in this text."
    expected = []
    assert extract_emails(text) == expected

def test_extract_emails_empty_string():
    text = ""
    expected = []
    assert extract_emails(text) == expected

def test_extract_emails_special_characters():
    text = "Special chars: !@#$%^&*()_+=-{}[]|\\:;\"'<>,.?/~`"
    expected = []
    assert extract_emails(text) == expected

def test_extract_emails_unicode_characters():
    text = "Contact: user@exámple.com, user at exámple dot com"
    expected = ["user@exámple.com", "user@exámple.com"]
    assert extract_emails(text) == expected

def test_extract_emails_case_sensitivity():
    text = "Emails: User@Example.Com, user at example dot com"
    expected = ["User@Example.Com", "user@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_multiline():
    text = """Emails:
    first@example.com
    second at example dot com"""
    expected = ["first@example.com", "second@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_overlapping_matches():
    text = "Emails: user@example.com, user at example dot com"
    expected = ["user@example.com", "user@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_invalid_emails():
    text = "Invalid emails: user@@example.com, user@example..com"
    expected = []
    assert extract_emails(text) == expected

def test_extract_emails_boundary_conditions():
    text = "Emails: .user@example.com, user.@example.com"
    expected = []
    assert extract_emails(text) == expected

def test_extract_emails_with_subdomains():
    text = "Emails: user@mail.example.com, user at mail dot example dot com"
    expected = ["user@mail.example.com", "user@mail.example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_with_numbers():
    text = "Emails: user123@example.com, user123 at example dot com"
    expected = ["user123@example.com", "user123@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_with_hyphens():
    text = "Emails: user-name@example.com, user-name at example dot com"
    expected = ["user-name@example.com", "user-name@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_with_plus_sign():
    text = "Emails: user+name@example.com, user+name at example dot com"
    expected = ["user+name@example.com", "user+name@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_with_dots_in_name():
    text = "Emails: first.last@example.com, first.last at example dot com"
    expected = ["first.last@example.com", "first.last@example.com"]
    assert extract_emails(text) == expected

def test_extract_emails_large_input():
    text = "Emails: " + ", ".join([f"user{i}@example.com" for i in range(1000)])
    expected = [f"user{i}@example.com" for i in range(1000)]
    assert extract_emails(text) == expected




test_extract_emails_basic()
test_extract_emails_multiple()
test_extract_emails_with_name_at_email_dot_com()
test_extract_emails_mixed_formats
test_extract_emails_no_emails()
test_extract_emails_empty_string()
test_extract_emails_special_characters()
test_extract_emails_unicode_characters()
test_extract_emails_case_sensitivity()
test_extract_emails_multiline()
test_extract_emails_overlapping_matches()
# test_extract_emails_invalid_emails()
# test_extract_emails_boundary_conditions()
test_extract_emails_with_subdomains()
test_extract_emails_with_numbers()
test_extract_emails_with_hyphens()
test_extract_emails_with_plus_sign()
test_extract_emails_with_dots_in_name()
test_extract_emails_large_input()


AssertionError: 