# Regex using Python's `re` library

In [2]:
import re

In [3]:
# Find the first occurrence of a literal substring and show the Match object.
pattern = r"hello"
text = "hello World"
match = re.compile(pattern).search(text)
print(match)

<re.Match object; span=(0, 5), match='hello'>


In [5]:
# Character class: collect every lowercase ASCII letter, one match per character.
pattern = r"[a-z]"
text = "hello"
match = [hit.group() for hit in re.finditer(pattern, text)]
print(match)


['h', 'e', 'l', 'l', 'o']


In [10]:
# \d matches a single digit, so each digit of the order number is returned separately.
pattern = r"\d"
text = "order number 123"
match = re.compile(pattern).findall(text)
print(match)

['1', '2', '3']


In [13]:
# Bounded quantifier: runs of 2 to 4 digits (greedy, so "12345" yields "1234"
# and the leftover single "5" is too short to match).
pattern = r"\d{2,4}"
text = "123 12345 1234 12"
match = [hit.group() for hit in re.finditer(pattern, text)]
print(match)

['123', '1234', '1234', '12']


In [16]:
# Lookahead: match "foo" only when it is immediately followed by "bar".
# The lookahead is zero-width, so "bar" is not part of the match.
pattern = r"foo(?=bar)"
text = "foobar"
match = re.compile(pattern).search(text)
print(match[0])

foo


In [15]:
# Lookbehind: match "bar" only when it is immediately preceded by "foo".
# The lookbehind is zero-width, so "foo" is not part of the match.
pattern = r"(?<=foo)bar"
text = "foobar"
match = re.compile(pattern).search(text)
print(match.group(0))

bar


In [18]:
# Compare the four main entry points on the same pattern and text.
pattern = r"\d+"
text = "there are 123 apples"
digits = re.compile(pattern)

# match() only tries at the start of the string; the text starts with a letter,
# so this prints None.
match = digits.match(text)
print(match)

# search() scans forward and returns the first match anywhere in the string.
match = digits.search(text)
print(match.group())

# findall() returns every non-overlapping match as a list of strings.
matches = digits.findall(text)
print(matches)

# sub() replaces every match with the given replacement text.
replaced = digits.sub('***', text)
print(replaced)

None
123
['123']
there are *** apples


In [3]:
# Sample project-update email used as the input text for the extraction patterns.
# It contains three YYYY-MM-DD dates, three NNN-NNN-NNNN phone numbers and
# two email addresses.
email_text = """
Hi Team,

Please note the following updates for our project:

1. The next meeting is scheduled for 2024-08-15 at our main office. Please confirm your availability.
2. We have received a new batch of feedback from clients. Some of the notable ones include:
   - "The service was excellent and the response time was quick."
   - "Please contact me at john.doe@example.com for further discussions."
3. Our support team can be reached at:
   - Phone: 123-456-7890 (John Doe)
   - Phone: 987-654-3210 (Jane Smith)
4. The project deadline has been moved to 2024-12-31. Ensure all deliverables are completed by then.
5. For any urgent issues, please email support@project.com or call our hotline at 555-123-4567.
6. The previous meeting minutes are available at 2023-07-25. Please review them before the next meeting.

Best regards,
Project Manager
"""

In [6]:
# Extract structured fields from `email_text` with re.findall.
# None of the patterns contain capture groups, so findall returns the full
# matched substrings.

# Dates in YYYY-MM-DD form -> ['2024-08-15', '2024-12-31', '2023-07-25']
date_pattern = r"\d{4}-\d{2}-\d{2}"
date_matches = re.findall(date_pattern, email_text)
print(date_matches)

# Phone numbers in NNN-NNN-NNNN form -> ['123-456-7890', '987-654-3210', '555-123-4567']
phone_pattern = r"\d{3}-\d{3}-\d{4}"
phone_matches = re.findall(phone_pattern, email_text)
print(phone_matches)

# Email addresses -> ['john.doe@example.com', 'support@project.com']
# Fix: the dot before the top-level domain is now escaped. A bare `.` matches
# any character, so the previous pattern also accepted text such as
# "user@host com". The domain part additionally accepts upper-case letters,
# digits, hyphens and multiple labels (e.g. "mail.example.co.uk"), and must
# end in an alphabetic TLD of at least two letters.
email_pattern = r"[A-Za-z0-9._-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.[A-Za-z]{2,}"
email_matches = re.findall(email_pattern, email_text)
print(email_matches)

['2024-08-15', '2024-12-31', '2023-07-25']
['123-456-7890', '987-654-3210', '555-123-4567']
['john.doe@example.com', 'support@project.com']


# Regex using Pandas

In [2]:
import pandas as pd

# Sample customer reviews; several embed an email address, a credit-card
# number, an SSN or a phone number.
reviews = [
    "Great product! Contact me at john.doe@example.com for more details.",
    "My credit card number is 1234-5678-9101-1121. Please keep it safe.",
    "Wonderful service, will buy again! SSN: 987-65-4321.",
    "Loved it! For issues, email support@company.com.",
    "Quick delivery and excellent customer support.",
    "My email is jane.doe@work.net and I had a great experience.",
    "Call me at 555-123-4567 for any further questions.",
    "I lost my credit card, the number was 4321-8765-0987-6543.",
    "SSN: 123-45-6789 should not be shared openly.",
    "Best purchase ever! Reach me at user123@domain.org.",
]

# Single-column frame: one review per row under the "review" column.
data = {"review": reviews}
df = pd.DataFrame(data)
print(df)

                                              review
0  Great product! Contact me at john.doe@example....
1  My credit card number is 1234-5678-9101-1121. ...
2  Wonderful service, will buy again! SSN: 987-65...
3   Loved it! For issues, email support@company.com.
4     Quick delivery and excellent customer support.
5  My email is jane.doe@work.net and I had a grea...
6  Call me at 555-123-4567 for any further questi...
7  I lost my credit card, the number was 4321-876...
8      SSN: 123-45-6789 should not be shared openly.
9  Best purchase ever! Reach me at user123@domain...


In [8]:
# Pull the first email address found in each review into an `email` column;
# rows without a match get a missing value.
# Fix: the dot before the top-level domain is escaped (a bare `.` matches any
# character), and the domain part accepts upper-case letters, digits, hyphens
# and multiple labels.
email_regex = r"(\b[A-Za-z0-9._-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.[A-Za-z]{2,}\b)"
# expand=False returns a Series for the single capture group, which is the
# natural right-hand side for a column assignment.
df['email'] = df['review'].str.extract(email_regex, expand=False)
display(df)

Unnamed: 0,review,email,creditcard
0,Great product! Contact me at john.doe@example....,john.doe@example.com,
1,My credit card number is 1234-5678-9101-1121. ...,,
2,"Wonderful service, will buy again! SSN: 987-65...",,
3,"Loved it! For issues, email support@company.com.",support@company.com,
4,Quick delivery and excellent customer support.,,
5,My email is jane.doe@work.net and I had a grea...,jane.doe@work.net,
6,Call me at 555-123-4567 for any further questi...,,
7,"I lost my credit card, the number was 4321-876...",,
8,SSN: 123-45-6789 should not be shared openly.,,
9,Best purchase ever! Reach me at user123@domain...,user123@domain.org,


In [12]:
# Capture the first card number written as four dash-separated groups of
# four digits; rows without one get a missing value.
cc_regex = r"(\b\d{4}-\d{4}-\d{4}-\d{4}\b)"
review_text = df['review']
df['creditcard'] = review_text.str.extract(cc_regex, expand=False)
display(df)

Unnamed: 0,review,email,creditcard
0,Great product! Contact me at john.doe@example....,john.doe@example.com,
1,My credit card number is 1234-5678-9101-1121. ...,,1234-5678-9101-1121
2,"Wonderful service, will buy again! SSN: 987-65...",,
3,"Loved it! For issues, email support@company.com.",support@company.com,
4,Quick delivery and excellent customer support.,,
5,My email is jane.doe@work.net and I had a grea...,jane.doe@work.net,
6,Call me at 555-123-4567 for any further questi...,,
7,"I lost my credit card, the number was 4321-876...",,4321-8765-0987-6543
8,SSN: 123-45-6789 should not be shared openly.,,
9,Best purchase ever! Reach me at user123@domain...,user123@domain.org,


In [16]:
# Build a `redact` column: the review text with every email address replaced
# by a placeholder.
# Fix: the dot before the top-level domain is escaped. With a bare `.` (any
# character) the pattern could redact text that is not an email address, and
# with a single-label lowercase domain it could leave part of an address such
# as "user@mail.example.co.uk" unredacted.
email_regex = r"\b[A-Za-z0-9._-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.[A-Za-z]{2,}\b"
df['redact'] = df['review'].str.replace(email_regex, "[REDACTED_EMAIL]", regex=True)
display(df)

Unnamed: 0,review,email,creditcard,redact
0,Great product! Contact me at john.doe@example....,john.doe@example.com,,Great product! Contact me at [REDACTED_EMAIL] ...
1,My credit card number is 1234-5678-9101-1121. ...,,1234-5678-9101-1121,My credit card number is 1234-5678-9101-1121. ...
2,"Wonderful service, will buy again! SSN: 987-65...",,,"Wonderful service, will buy again! SSN: 987-65..."
3,"Loved it! For issues, email support@company.com.",support@company.com,,"Loved it! For issues, email [REDACTED_EMAIL]."
4,Quick delivery and excellent customer support.,,,Quick delivery and excellent customer support.
5,My email is jane.doe@work.net and I had a grea...,jane.doe@work.net,,My email is [REDACTED_EMAIL] and I had a great...
6,Call me at 555-123-4567 for any further questi...,,,Call me at 555-123-4567 for any further questi...
7,"I lost my credit card, the number was 4321-876...",,4321-8765-0987-6543,"I lost my credit card, the number was 4321-876..."
8,SSN: 123-45-6789 should not be shared openly.,,,SSN: 123-45-6789 should not be shared openly.
9,Best purchase ever! Reach me at user123@domain...,user123@domain.org,,Best purchase ever! Reach me at [REDACTED_EMAIL].


In [19]:
# Mask credit-card numbers in the already email-redacted text and publish the
# result as a one-column frame named `review`.
cc_regex = r"(\b\d{4}-\d{4}-\d{4}-\d{4}\b)"
fully_redacted = df['redact'].str.replace(cc_regex, "[REDACTED_CC]", regex=True)
output = pd.DataFrame({'review': fully_redacted})
display(output)


Unnamed: 0,review
0,Great product! Contact me at [REDACTED_EMAIL] ...
1,My credit card number is [REDACTED_CC]. Please...
2,"Wonderful service, will buy again! SSN: 987-65..."
3,"Loved it! For issues, email [REDACTED_EMAIL]."
4,Quick delivery and excellent customer support.
5,My email is [REDACTED_EMAIL] and I had a great...
6,Call me at 555-123-4567 for any further questi...
7,"I lost my credit card, the number was [REDACTE..."
8,SSN: 123-45-6789 should not be shared openly.
9,Best purchase ever! Reach me at [REDACTED_EMAIL].


In [21]:
# Write the redacted reviews to disk (without the index column).
output.to_csv('redacted_reviews.csv', index=False)

# DataFrame.to_csv returns None when it is given a path, so assigning that
# call's result left `review_csv` as None. Calling it without a path returns
# the CSV content as a string instead.
review_csv = output.to_csv(index=False)


CSV String:
 None
