In [54]:
import spacy
import pandas as pd

nlp = spacy.load('nl_core_news_lg')

In [55]:
# Sample text for testing
sample_text = """
11/1/22 of ma 11 jan 2022 of van 20 t/m 25 feb 22
Voor de tijdsaanduiding wordt gebruikt: 9u50 en van 14 tot 15u. In een tabel mag het afgekort worden tot 14.50 u.
"""

# Define the days and relevant connectors
days_of_week = {"ma", "di", "wo", "do", "vr", "za", "zo", "woe", "wo"}
connectors = {"van", "tot", "t/m", "tot en met"}

# Function to identify and label date entities
def label_dates(doc):
    entities = []
    spans = []
    
    for token in doc:
        # Check if the token is a connector or day of the week
        if token.text in connectors or token.text in days_of_week:
            spans.append((token.text, token.pos_))
        elif token.ent_type_ == "DATE" or (token.pos_ == "NUM" and len(spans) > 0):
            # If the token is a date or a number, create the date entity
            date_span = []
            date_span += spans
            date_span.append((token.text, token.pos_))
            
            # Check the next token to see if it continues the date range
            next_token = doc[token.i + 1] if token.i + 1 < len(doc) else None
            
            if next_token and next_token.text in {"tot", "t/m", "tot en met"}:
                # If the next token indicates a range, continue capturing
                spans.append((next_token.text, next_token.pos_))
                if len(doc) > token.i + 2:  # Check for a following date or number
                    following_date = doc[token.i + 2]
                    if following_date.ent_type_ == "DATE" or following_date.pos_ == "NUM":
                        spans.append((following_date.text, following_date.pos_))
                
            combined_date = " ".join([span[0] for span in spans])
            entities.append((combined_date, "DATE"))
            spans = []  # Reset spans after capturing a date entity
        else:
            spans.append((token.text, token.pos_))

    return entities

doc = nlp(text=sample_text)

labeled_dates = label_dates(doc)

df_dates = pd.DataFrame(labeled_dates, columns=["Entity", "Label"])

In [56]:
df_dates

Unnamed: 0,Entity,Label
0,\n,DATE
1,of ma,DATE
2,,DATE
3,,DATE
4,of van t/m 25,DATE
5,t/m,DATE
6,feb,DATE
7,\n Voor de tijdsaanduiding wordt gebruikt :,DATE
8,en van tot,DATE
9,tot 15u . In een tabel mag het afgekort worden...,DATE
