In [28]:
import re
import pandas as pd
df = pd.read_csv("date_parser_testcases.csv")

In [29]:
# Define a dictionary to map month names and abbreviations to their numerical representations
MONTHS = {
    'january': '01', 'jan': '01',
    'february': '02', 'feb': '02',
    'march': '03', 'mar': '03',
    'april': '04', 'apr': '04',
    'may': '05',
    'june': '06', 'jun': '06',
    'july': '07', 'jul': '07',
    'august': '08', 'aug': '08',
    'september': '09', 'sep': '09', 'sept': '09',
    'october': '10', 'oct': '10',
    'november': '11', 'nov': '11',
    'december': '12', 'dec': '12',
}

In [30]:
# Define a function to parse various date formats from a given text string
def parse_date(text):
    # Convert text to lowercase, remove leading/trailing whitespace, and reduce multiple spaces to single spaces
    text = re.sub(r'\s+', ' ', text.lower().strip())

    # Attempt to match and parse dates in the format DD/MM/YYYY, DD-MM-YYYY, DD.MM.YYYY, or DD_MM_YYYY
    match = re.search(r'\b(\d{1,2})[\/\-\._](\d{1,2})[\/\-\._](\d{2,4})\b', text)
    if match:
        d, m, y = match.groups() # Extract day, month, and year
        # Handle two-digit year formats, assuming 19xx for years > 30 and 20xx for years <= 30
        y = f"19{y}" if len(y) == 2 and int(y) > 30 else f"20{y}" if len(y) == 2 else y
        return f"{int(d):02d}/{int(m):02d}/{y}" # Return date in DD/MM/YYYY format

    # Attempt to match and parse dates in the format YYYY/MM/DD, YYYY-MM-DD, YYYY.MM.DD, or YYYY_MM_DD
    match = re.search(r'\b(\d{4})[\/\-\._](\d{1,2})[\/\-\._](\d{1,2})\b', text)
    if match:
        y, m, d = match.groups() # Extract year, month, and day
        return f"{int(d):02d}/{int(m):02d}/{y}" # Return date in DD/MM/YYYY format

    # Attempt to match and parse dates in the format DD[st|nd|rd|th] [of] Month, YYYY
    match = re.search(r'\b(\d{1,2})(?:st|nd|rd|th)?(?:\s+of)?\s+([a-zA-Z]+),?\s+(\d{4})\b', text)
    if match:
        d, mon, y = match.groups() # Extract day, month name, and year
        m = MONTHS.get(mon[:3], '00') # Get numerical month from MONTHS dictionary (using first 3 letters of month name)
        return f"{int(d):02d}/{m}/{y}" # Return date in DD/MM/YYYY format

    # Attempt to match and parse dates in the format Month DD[st|nd|rd|th], YYYY
    match = re.search(r'\b([a-zA-Z]+)\s+(\d{1,2})(?:st|nd|rd|th)?,?\s+(\d{4})\b', text)
    if match:
        mon, d, y = match.groups() # Extract month name, day, and year
        m = MONTHS.get(mon[:3], '00') # Get numerical month from MONTHS dictionary (using first 3 letters of month name)
        return f"{int(d):02d}/{m}/{y}" # Return date in DD/MM/YYYY format

    # Attempt to match and parse dates in the format DD Month YYYY
    match = re.search(r'\b(\d{1,2})\s+([a-zA-Z]+)\s+(\d{4})\b', text)
    if match:
        d, mon, y = match.groups() # Extract day, month name, and year
        m = MONTHS.get(mon[:3], '00') # Get numerical month from MONTHS dictionary (using first 3 letters of month name)
        return f"{int(d):02d}/{m}/{y}" # Return date in DD/MM/YYYY format

    # Attempt to match and parse dates in the format "the DD[st|nd|rd|th] day of Month, YYYY"
    match = re.search(r'\bthe\s+(\d{1,2})(?:st|nd|rd|th)?\s+day\s+of\s+([a-zA-Z]+),?\s+(\d{4})\b', text)
    if match:
        d, mon, y = match.groups() # Extract day, month name, and year
        m = MONTHS.get(mon[:3], '00') # Get numerical month from MONTHS dictionary (using first 3 letters of month name)
        return f"{int(d):02d}/{m}/{y}" # Return date in DD/MM/YYYY format

    return "Invalid Format"

In [31]:
# Apply the parse_date function to the 'Input' column of the DataFrame
# and store the results in a new column called 'Parsed Output'
df['Parsed Output'] = df['Input'].apply(parse_date)

In [32]:
# Compare the 'Parsed Output' with the 'Expected Output' and store the result in a new 'Correct' column
df['Correct'] = df['Parsed Output'] == df['Expected Output']
# Calculate the number of correct parses
correct = df['Correct'].sum()
# Get the total number of test cases
total = len(df)
# Calculate the accuracy percentage
accuracy = correct / total * 100
# Print the accuracy results
print(f"\n Accuracy: {correct}/{total} correct ({accuracy:.2f}%)")


 Accuracy: 89/100 correct (89.00%)


In [33]:
# Filter the DataFrame to show only the rows where the 'Correct' column is False (mismatches)
mismatches = df[df['Correct'] == False]
# Display the filtered DataFrame containing the mismatches
display(mismatches)

Unnamed: 0,Input,Expected Output,Parsed Output,Correct
15,Submit your report by 08/31/2021.,31/08/2021,08/31/2021,False
20,The seminar is on 03/14/2022.,14/03/2022,03/14/2022,False
25,They moved in on 12/25/2019.,25/12/2019,12/25/2019,False
39,Vacation starts on 07/15/2021.,15/07/2021,07/15/2021,False
50,Input,Expected Output,Invalid Format,False
59,"We celebrate Christmas every year on 25th Dec,...",25/12/2024,Invalid Format,False
65,"The submission deadline, noted as 08/31/2021, ...",31/08/2021,08/31/2021,False
67,We celebrate Independence Day on the 4th of Ju...,04/07/2022,Invalid Format,False
70,"We scheduled the seminar for 03/14/2022, don't...",14/03/2022,03/14/2022,False
75,They officially moved in on 12/25/2019.,25/12/2019,12/25/2019,False
