In [3]:
# Sample unstructured text (could be loaded from a file in practice)
text = """
John Doe, born on 1990-05-15, lives at 123 Main St, Springfield. 
Contact: john.doe@email.com, Phone: (555) 123-4567.
Jane Smith, born on 1985-10-30, lives at 456 Elm St, Shelbyville. 
Contact: jane_smith22@mail.com, Phone: (555) 987-6543.
"""

import re
import pandas as pd

# Define regex patterns for extraction
name_pattern = r"([A-Z][a-z]+ [A-Z][a-z]+)"
dob_pattern = r"born on (\d{4}-\d{2}-\d{2})"
address_pattern = r"lives at ([\d\w\s,]+)\."
email_pattern = r"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
phone_pattern = r"Phone: ([\(\)\d\- ]+)"

# Find all matches with error handling
def safe_findall(pattern, text, expected_len):
    matches = re.findall(pattern, text)
    if len(matches) < expected_len:
        matches += [None] * (expected_len - len(matches))
    return matches

try:
    # Assume number of records is determined by number of names
    names = re.findall(name_pattern, text)
    n_records = len(names)
    dobs = safe_findall(dob_pattern, text, n_records)
    addresses = safe_findall(address_pattern, text, n_records)
    emails = safe_findall(email_pattern, text, n_records)
    phones = safe_findall(phone_pattern, text, n_records)

    # Combine extracted data into a structured format
    data = []
    for i in range(n_records):
        data.append({
            'Name': names[i],
            'DOB': dobs[i],
            'Address': addresses[i],
            'Email': emails[i],
            'Phone': phones[i]
        })

    df = pd.DataFrame(data)
    print(df)
except Exception as e:
    print(f"Error during extraction: {e}")

# Unit tests for extraction logic
import unittest

class TestExtraction(unittest.TestCase):
    def setUp(self):
        self.text = text

    def test_name_extraction(self):
        names = re.findall(name_pattern, self.text)
        self.assertIn("John Doe", names)
        self.assertIn("Jane Smith", names)

    def test_dob_extraction(self):
        dobs = re.findall(dob_pattern, self.text)
        self.assertIn("1990-05-15", dobs)
        self.assertIn("1985-10-30", dobs)

    def test_email_extraction(self):
        emails = re.findall(email_pattern, self.text)
        self.assertIn("john.doe@email.com", emails)
        self.assertIn("jane_smith22@mail.com", emails)

    def test_phone_extraction(self):
        phones = re.findall(phone_pattern, self.text)
        self.assertIn("(555) 123-4567", phones)
        self.assertIn("(555) 987-6543", phones)

if __name__ == "__main__":
    unittest.main(argv=[''], exit=False)

....
----------------------------------------------------------------------
Ran 4 tests in 0.003s

OK


         Name         DOB                   Address                  Email  \
0    John Doe  1990-05-15  123 Main St, Springfield     john.doe@email.com   
1     Main St  1985-10-30   456 Elm St, Shelbyville  jane_smith22@mail.com   
2  Jane Smith        None                      None                   None   
3      Elm St        None                      None                   None   

            Phone  
0  (555) 123-4567  
1  (555) 987-6543  
2            None  
3            None  
