In [None]:
import re
import pandas as pd
from datetime import datetime

In [None]:
def mask_nric(text):
    """Mask the first four digits of every NRIC-like token in ``text``.

    A token is one of the letters S, T, F or G, followed by seven digits and
    one uppercase letter. The leading letter, the last three digits and the
    trailing letter are kept; each of the four digits in between is replaced
    by ``X``.

    Args:
        text: String to scan.

    Returns:
        A copy of ``text`` with every matching token masked. Text without any
        match is returned unchanged.
    """
    # Raw string so that ``\d`` reaches the regex engine without triggering
    # Python's invalid-escape-sequence warning.
    nric_regex = re.compile(r'([STFG])\d{4}(\d{3}[A-Z])')
    # One X per masked digit keeps the token length intact. ``sub`` already
    # returns the input untouched when nothing matches, so no pre-check is
    # needed.
    return nric_regex.sub(r'\1XXXX\2', text)

In [None]:
def mask_nric(text):
    """Replace the first four digits of each NRIC-like token with ``XXXX``.

    A token is one of S, T, F or G, then seven digits, then one uppercase
    letter. Only the four digits directly after the leading letter are masked.

    Args:
        text: String to scan.

    Returns:
        ``text`` with every matching token masked; unchanged if none match.
    """
    # Raw string: ``\d`` in a plain string literal is an invalid escape
    # sequence and raises a warning on current Python versions.
    nric_regex = re.compile(r'([STFG])\d{4}(\d{3}[A-Z])')
    return nric_regex.sub(r'\1XXXX\2', text)

In [None]:
# Sample sentence containing two NRIC-like tokens, used to exercise mask_nric.
text = 'S8034567A was paid $100. T9254321E was paid $300'

In [None]:
# Each token keeps its leading letter and last four characters; the four
# digits in between are shown as X.
mask_nric(text)

'SXXXX567A was paid $100. TXXXX321E was paid $300'

In [None]:
# Control input that contains no NRIC-like token.
text2 = 'John was paid $100. Jane was paid $300'

In [None]:
# With nothing to mask, the text comes back unchanged.
mask_nric(text2)

'John was paid $100. Jane was paid $300'

In [None]:
# Regex for dates written with a month name, e.g. "23 Jan 2020".
# Compiled with re.VERBOSE, so the line breaks between the pieces are ignored,
# and re.IGNORECASE, so month names match in any letter case.
# Pieces, in order:
#   1. optional 1-2 digit day (group 1; empty string when absent)
#   2. optional single whitespace or hyphen
#   3. month name in three-letter or full form (group 2)
#   4. optional single word character right after the month
#      (presumably for spellings such as "Sept" -- confirm)
#   5. up to two separators, each an apostrophe, whitespace or hyphen
#   6. year written as 20xx or as any two digits (group 3)
#   7. one trailing character -- whitespace, a letter, one of ( ) . ] - _ / --
#      or end of string; when present, that character is consumed by the match
string_date_pattern = re.compile(r'''
(\d{1,2})?
[\s-]?
((?:(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|Jun(?:e)?|Jul(?:y)?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)))
[\w]?
['\s-]?
['\s-]?
(20\d{2}|\d{2})
(?:\s|[A-Za-z]|[\(\)\.\]]|[-_/]|$)
''', re.VERBOSE | re.IGNORECASE)

In [None]:
# Regex for numeric dates written day first, with "." or "/" as separator.
# Groups, in order: (day, month, year).
#   - day:   one digit, optionally preceded by a digit 0-3
#   - month: one digit, optionally preceded by 0 or 1
#   - year:  20xx or any two digits
# The digit classes only restrict the leading digit, so out-of-range values
# such as day 39 or month 19 still match; range checking is left to the caller.
# The match must be followed by whitespace, a letter, one of ( ) . ] - _ or the
# end of the string; when present, that trailing character is consumed.
day_first_pattern = re.compile(r'''
([0-3]?[0-9])
[./]
([01]?[0-9])
[./]
(20\d{2}|\d{2})
(?:\s|[A-Za-z]|[\(\)\.\]]|[-_]|$)
''', re.VERBOSE | re.IGNORECASE)

In [None]:
# Regex for numeric dates written month first, with "." or "/" as separator.
# Groups, in order: (month, day, year) -- note the month is group 1 here.
#   - month: one digit, optionally preceded by 0 or 1
#   - day:   one digit, optionally preceded by a digit 0-3
#   - year:  20xx or any two digits
# The digit classes only restrict the leading digit, so out-of-range values
# still match; range checking is left to the caller.
# The match must be followed by whitespace, a letter, one of ( ) . ] - _ or the
# end of the string; when present, that trailing character is consumed.
month_first_pattern = re.compile(r'''
([01]?[0-9])
[./]
([0-3]?[0-9])
[./]
(20\d{2}|\d{2})
(?:\s|[A-Za-z]|[\(\)\.\]]|[-_]|$)
''', re.VERBOSE | re.IGNORECASE)

In [None]:
# Regex for quarter/year strings such as "Q1 2022", "Q2-2021" or "Q32020".
# Groups, in order: (quarter, year).
#   - the letter Q in either case (re.IGNORECASE)
#   - quarter: a single digit 1-4
#   - optional single whitespace or hyphen
#   - year: 20xx or any two digits
# Unlike the other date patterns, nothing is required after the year.
q_year_pattern = re.compile(r'''
Q
([1-4])
[\s-]?
(20\d{2}|\d{2})
''', re.VERBOSE | re.IGNORECASE)

In [None]:
def check_string_date(compiled_pattern, text):
    """Extract dates written with a month name from ``text``.

    Args:
        compiled_pattern: Compiled regex whose ``findall`` yields
            ``(day, month, year)`` string tuples; ``day`` may be empty.
        text: String to search.

    Returns:
        List of ``datetime`` objects in match order, or ``[pd.NaT]`` when
        nothing matched.
    """
    parsed_dates = []

    for day_str, month_str, year_str in compiled_pattern.findall(text):
        # A match with only month and year defaults to the 1st of the month.
        if not day_str:
            day_str = "01"
        # Two-digit years are prefixed with "20".
        if len(year_str) == 2:
            year_str = '20' + year_str
        month_str = month_str.capitalize()

        # Three-letter months use the abbreviated-name directive, anything
        # longer uses the full-name directive.
        date_format = "%d %b %Y" if len(month_str) == 3 else "%d %B %Y"

        try:
            parsed = datetime.strptime(' '.join([day_str, month_str, year_str]), date_format)
        except ValueError:
            # Day out of range for the month (e.g. 30 February): fall back to
            # the 1st of that month.
            parsed = datetime.strptime(' '.join(["01", month_str, year_str]), date_format)
        parsed_dates.append(parsed)

    return parsed_dates if parsed_dates else [pd.NaT]

In [None]:
def check_numeric_date(compiled_pattern, text, match_type='day_first'):
    """Extract purely numeric dates from ``text``.

    Args:
        compiled_pattern: Compiled regex whose ``findall`` yields three-string
            tuples. For ``match_type='day_first'`` the tuple order is
            ``(day, month, year)``; for ``'month_first'`` it is
            ``(month, day, year)``.
        text: String to search.
        match_type: ``'day_first'`` (default) or ``'month_first'``. Any other
            value produces no dates.

    Returns:
        List of ``datetime`` objects in match order, or ``[pd.NaT]`` when no
        valid date was found. Matches that do not form a real calendar date
        are reported with ``print`` and skipped.
    """
    list_output_date = []

    for record in compiled_pattern.findall(text):
        # Unpack according to the field order the pattern captures. The
        # month-first case previously reused the day-first order, which
        # swapped day and month.
        if match_type == 'day_first':
            day, month, year = record
        elif match_type == 'month_first':
            month, day, year = record
        else:
            # Unrecognised match types have always been ignored silently.
            continue

        # Two-digit years are prefixed with "20".
        if len(year) == 2:
            year = '20' + year

        try:
            list_output_date.append(datetime(int(year), int(month), int(day)))
        except ValueError:
            # Out-of-range day or month: report and skip the match.
            print(f"Invalid date: {record}")

    return list_output_date if list_output_date else [pd.NaT]

In [None]:
def check_quarter_year(compiled_pattern, text):
    """Convert quarter/year mentions in ``text`` to quarter start dates.

    Args:
        compiled_pattern: Compiled regex whose ``findall`` yields
            ``(quarter, year)`` string tuples.
        text: String to search.

    Returns:
        List of ``datetime`` objects, each the first day of the matched
        quarter, in match order; ``[pd.NaT]`` when nothing matched.
    """
    quarter_starts = []

    for quarter_str, year_str in compiled_pattern.findall(text):
        # Quarters 1-4 begin in months 1, 4, 7 and 10.
        first_month = 3 * (int(quarter_str) - 1) + 1
        # Two-digit years are prefixed with "20".
        full_year = int('20' + year_str) if len(year_str) == 2 else int(year_str)
        quarter_starts.append(datetime(full_year, first_month, 1))

    if not quarter_starts:
        return [pd.NaT]
    return quarter_starts

In [None]:
# Sample with an impossible date (30 February) on either side of a valid one.
text_with_date = '30 February 2021 23 Jan 2020 is earlier than 30 February 2021'

In [None]:
# Raw regex captures as (day, month, year) string tuples, before any parsing.
string_date_pattern.findall(text_with_date)

[('30', 'February', '2021'), ('23', 'Jan', '2020'), ('30', 'February', '2021')]

In [None]:
# 30 February cannot be parsed as given, so those entries fall back to 1 February.
check_string_date(string_date_pattern, text_with_date)

[datetime.datetime(2021, 2, 1, 0, 0),
 datetime.datetime(2020, 1, 23, 0, 0),
 datetime.datetime(2021, 2, 1, 0, 0)]

In [None]:
# No month-name date in the input, so the result is the [NaT] placeholder.
check_string_date(string_date_pattern, 'Hahaha')

[NaT]

In [None]:
# Two parseable numeric dates (slash- and dot-separated) plus one with month 13.
text_numeric_date = '09/08/2020  and 22.12.22 are valid dates, 13/13/2020 is an invalid date'

In [None]:
# Uses the default match_type='day_first'; the month-13 entry is printed as
# invalid and left out of the result.
check_numeric_date(day_first_pattern, text_numeric_date)

Invalid date: ('13', '13', '2020')


[datetime.datetime(2020, 8, 9, 0, 0), datetime.datetime(2022, 12, 22, 0, 0)]

In [None]:
# Quarter strings with a space, a hyphen and no separator before the year.
text_quarter_date = 'Q1 2022, Q2-2021, Q32020 are all valid variations'

In [None]:
# Each quarter maps to the first day of its first month.
check_quarter_year(q_year_pattern, text_quarter_date)

[datetime.datetime(2022, 1, 1, 0, 0),
 datetime.datetime(2021, 4, 1, 0, 0),
 datetime.datetime(2020, 7, 1, 0, 0)]