In [66]:
import spacy
import pandas as pd

nlp = spacy.load('nl_core_news_lg')

In [67]:
# Sample text for testing
sample_text = """
11/1/22 of ma 11 jan 2022 of van 20 t/m 25 feb 22
Voor de tijdsaanduiding wordt gebruikt: 9u50 en van 14 tot 15u. In een tabel mag het afgekort worden tot 14.50 u.
"""

# Define the days and relevant connectors
days_of_week = {"ma", "di", "wo", "do", "vr", "za", "zo", "woe", "wo"}
connectors = {"van", "tot", "t/m", "tot en met"}

# Function to identify and label date entities
def label_dates(doc):
    entities = []
    spans = []
    
    for token in doc:
        # Check if the token is a connector or day of the week
        if token.text in connectors or token.text in days_of_week:
            spans.append((token.text, token.pos_))
        elif token.ent_type_ == "DATE" or (token.pos_ == "NUM" and len(spans) > 0):
            # If the token is a date or a number, create the date entity
            date_span = []
            date_span += spans
            date_span.append((token.text, token.pos_))
            
            # Check the next token to see if it continues the date range
            next_token = doc[token.i + 1] if token.i + 1 < len(doc) else None
            
            if next_token and next_token.text in {"tot", "t/m", "tot en met"}:
                # If the next token indicates a range, continue capturing
                spans.append((next_token.text, next_token.pos_))
                if len(doc) > token.i + 2:  # Check for a following date or number
                    following_date = doc[token.i + 2]
                    if following_date.ent_type_ == "DATE" or following_date.pos_ == "NUM":
                        spans.append((following_date.text, following_date.pos_))
                
            combined_date = " ".join([span[0] for span in spans])
            entities.append((combined_date, "DATE"))
            spans = []  # Reset spans after capturing a date entity
        else:
            spans.append((token.text, token.pos_))

    return entities

doc = nlp(text=sample_text)

labeled_dates = label_dates(doc)

df_dates = pd.DataFrame(labeled_dates, columns=["Entity", "Label"])

In [68]:
df_dates

Unnamed: 0,Entity,Label
0,\n,DATE
1,of ma,DATE
2,,DATE
3,,DATE
4,of van t/m 25,DATE
5,t/m,DATE
6,feb,DATE
7,\n Voor de tijdsaanduiding wordt gebruikt :,DATE
8,en van tot,DATE
9,tot 15u . In een tabel mag het afgekort worden...,DATE


In [69]:
import re
import spacy
from datetime import datetime
from dateutil import parser

# Load the Dutch language model in spaCy
nlp = spacy.blank("nl")

# Sample text with various Dutch-written dates
text = """
11 januari 2022
maandag 11 januari 2022
van 20 tot en met 25 februari 2022
ma 11 jan 2022
van 20 t/m 25 feb
van 20 t/m 25 feb 22
van 20 tot en met 25 feb
van 20 tot en met 25 feb 22
"""

# Regular expression to match date ranges
range_pattern = re.compile(r"""
    van\s+(\d{1,2})\s+(tot\s+en\s+met|t/m)\s+(\d{1,2})\s*                               # "van...tot en met/t/m" range
    (januari|februari|maart|april|mei|juni|juli|augustus|september|oktober|november|december| # Full month names
    jan|feb|mrt|apr|mei|jun|jul|aug|sep|okt|nov|dec)?\s*                                 # Optional month names
    (\d{2,4})?                                                                          # Optional year
""", re.VERBOSE | re.IGNORECASE)

# Helper function to parse Dutch month names into month numbers
def parse_month(month_str):
    return parser.parse(month_str, fuzzy=False).month

# Function to convert text and regex matches into datetime objects
def extract_dates(text):
    doc = nlp(text)
    
    date_objects = []
    current_year = datetime.now().year

    # Step 1: Extract date entities using spaCy NER
    for ent in doc.ents:
        if ent.label_ == "DATE":
            date_str = ent.text
            try:
                # Use dateutil.parser to convert the entity into a datetime object
                date_obj = parser.parse(date_str, dayfirst=True, fuzzy=False)
                date_objects.append({
                    "original": date_str,
                    "date_object": date_obj
                })
            except ValueError:
                pass  # Ignore if parsing fails
    
    # Step 2: Use regex to find date ranges (van ... tot en met/t/m ...)
    matches = range_pattern.finditer(text)
    for match in matches:
        start_day = int(match.group(1))
        end_day = int(match.group(3))
        month = match.group(4) or ""  # Month can be optional
        year = match.group(5) or current_year  # Default to current year if not present

        if isinstance(year, str) and len(year) == 2:
            year = int("20" + year)
        else:
            year = int(year)

        if month:
            month_num = parse_month(month)
        else:
            month_num = datetime.now().month  # If no month is provided, assume the current month
        
        # Create start and end date objects
        start_date = datetime(year, month_num, start_day)
        end_date = datetime(year, month_num, end_day)
        date_objects.append({
            "original": match.group(0).strip(),
            "date_range": (start_date, end_date)
        })

    return date_objects

# Extract dates from the text
extracted_dates = extract_dates(text)

# Output the extracted date objects
for date in extracted_dates:
    if "date_object" in date:
        print(f"Original: {date['original']} -> Date Object: {date['date_object']}")
    elif "date_range" in date:
        print(f"Original: {date['original']} -> Date Range: {date['date_range'][0]} to {date['date_range'][1]}")

ParserError: Unknown string format: februari