## Generate the Dataset

In [2]:
import csv
from datetime import datetime, timedelta
import random

In [3]:
# Define lists of values
verbs = ["create", "add", "organize", "schedule", "set up", "plan", "book", "reserve", "arrange", "fix", "hold"]
relative_dates = [
    "next Monday", "tomorrow", "next Friday", "first Monday of July", "2nd August", "this Saturday", 
    "next Sunday", "next weekend", "weekend evening", "this Friday", "coming weekend", "next month", 
    "the day after tomorrow", "this Wednesday", "next Thursday", "next year", "this afternoon", 
    "next morning", "coming Friday", "this evening"
]

In [4]:
# Function to get a future weekend date
def get_future_weekend_dates():
    weekend_dates = []
    today = datetime.now()
    for i in range(30):  # Generate dates for the next 30 days
        future_date = today + timedelta(days=i)
        if future_date.weekday() >= 5:  # Saturday (5) or Sunday (6)
            weekend_dates.append(future_date.strftime("%Y-%m-%d"))
    return weekend_dates


In [5]:
# Generate a list of specific dates including weekends
specific_dates = [(datetime.now() + timedelta(days=random.randint(1, 30))).strftime("%Y-%m-%d") for _ in range(10)] + get_future_weekend_dates()
times = [
    "09:00", "10:00", "afternoon", "09:00 AM", "evening", "morning", "14:00", "12:00", "weekend evening", 
    "07:30", "18:45", "midnight", "noon", "early morning", "late evening", "03:15 PM", "10:30 PM", 
    "04:00", "21:00", "11:00 PM", "02:00 AM", "08:45", "16:20", "06:00", "23:59"
]

In [6]:
countries = ["Afghanistan","Albania","Algeria","Andorra","Angola","Anguilla","Antigua &amp; Barbuda","Argentina","Armenia","Aruba","Australia","Austria","Azerbaijan","Bahamas","Bahrain","Bangladesh","Barbados","Belarus","Belgium","Belize","Benin","Bermuda","Bhutan","Bolivia","Bosnia &amp; Herzegovina","Botswana","Brazil","British Virgin Islands","Brunei","Bulgaria","Burkina Faso","Burundi","Cambodia","Cameroon","Cape Verde","Cayman Islands","Chad","Chile","China","Colombia","Congo","Cook Islands","Costa Rica","Cote D Ivoire","Croatia","Cruise Ship","Cuba","Cyprus","Czech Republic","Denmark","Djibouti","Dominica","Dominican Republic","Ecuador","Egypt","El Salvador","Equatorial Guinea","Estonia","Ethiopia","Falkland Islands","Faroe Islands","Fiji","Finland","France","French Polynesia","French West Indies","Gabon","Gambia","Georgia","Germany","Ghana","Gibraltar","Greece","Greenland","Grenada","Guam","Guatemala","Guernsey","Guinea","Guinea Bissau","Guyana","Haiti","Honduras","Hong Kong","Hungary","Iceland","India","Indonesia","Iran","Iraq","Ireland","Isle of Man","Israel","Italy","Jamaica","Japan","Jersey","Jordan","Kazakhstan","Kenya","Kuwait","Kyrgyz Republic","Laos","Latvia","Lebanon","Lesotho","Liberia","Libya","Liechtenstein","Lithuania","Luxembourg","Macau","Macedonia","Madagascar","Malawi","Malaysia","Maldives","Mali","Malta","Mauritania","Mauritius","Mexico","Moldova","Monaco","Mongolia","Montenegro","Montserrat","Morocco","Mozambique","Namibia","Nepal","Netherlands","Netherlands Antilles","New Caledonia","New Zealand","Nicaragua","Niger","Nigeria","Norway","Oman","Pakistan","Palestine","Panama","Papua New Guinea","Paraguay","Peru","Philippines","Poland","Portugal","Puerto Rico","Qatar","Reunion","Romania","Russia","Rwanda","Saint Pierre &amp; Miquelon","Samoa","San Marino","Satellite","Saudi Arabia","Senegal","Serbia","Seychelles","Sierra Leone","Singapore","Slovakia","Slovenia","South Africa","South Korea","Spain","Sri Lanka","St Kitts &amp; Nevis","St Lucia","St Vincent","St. Lucia","Sudan","Suriname","Swaziland","Sweden","Switzerland","Syria","Taiwan","Tajikistan","Tanzania","Thailand","Timor L'Este","Togo","Tonga","Trinidad &amp; Tobago","Tunisia","Turkey","Turkmenistan","Turks &amp; Caicos","Uganda","Ukraine","United Arab Emirates","United Kingdom","Uruguay","Uzbekistan","Venezuela","Vietnam","Virgin Islands (US)","Yemen","Zambia","Zimbabwe"]


In [7]:
rows = []

# Generate rows with relative and specific dates, including weekends
for _ in range(5000):  # Number of rows in the dataset
    row = [
        random.choice(verbs),
        random.choice(relative_dates + specific_dates),
        random.choice(times),
        random.choice(countries)
    ]
    rows.append(row)

In [8]:
# Save to CSV
with open('event_data.csv', mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(["Verb", "Date", "Time", "Country"])
    writer.writerows(rows)

Read and Parse the CSV Data

In [8]:
# !pip install transformers==4.28.0

In [9]:
import csv
import spacy
from spacy.training import offsets_to_biluo_tags, Example
from spacy.tokens import DocBin
from spacy.util import filter_spans
import random
from spacy.util import minibatch, compounding
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

In [10]:
# Load the CSV file
with open('event_data.csv', mode='r') as file:
    reader = csv.reader(file)
    next(reader)  # Skip the header
    rows = [row for row in reader]

In [11]:
# Load a blank English model
nlp = spacy.blank("en")

def has_overlap(entity1, entity2):
    return not (entity1[1] <= entity2[0] or entity2[1] <= entity1[0])


In [12]:
# Annotate and check for overlapping entities
annotated_data = []

for row in rows:
    verb, date, time, country = row
    sentence = f"{verb} an event on {date} at {time} in {country}"
    start_date = sentence.find(date)
    end_date = start_date + len(date)
    start_time = sentence.find(time)
    end_time = start_time + len(time)
    
    entities = [(start_date, end_date, "DATE"), (start_time, end_time, "TIME")]
    
    # Check for overlaps
    valid_entities = True
    for i, ent1 in enumerate(entities):
        for ent2 in entities[i+1:]:
            if has_overlap(ent1, ent2):
                valid_entities = False
                break
        if not valid_entities:
            break
    
    if not valid_entities:
        print(f"Skipping overlapping entities in: {sentence}")
        continue
    
    annotated_data.append((sentence, entities))


Skipping overlapping entities in: hold an event on weekend evening at weekend evening in St Vincent
Skipping overlapping entities in: fix an event on weekend evening at evening in Bulgaria


In [13]:
# Annotate with BILUO tags and filter out misaligned entities
aligned_data = []

for sentence, entities in annotated_data:
    doc = nlp.make_doc(sentence)
    tags = offsets_to_biluo_tags(doc, entities)
    
    if '-' in tags:
        print(f"Skipping misaligned entities in: {sentence}")
        continue
    
    aligned_data.append((sentence, {"entities": [(start, end, label) for start, end, label in entities]}))


In [14]:

# Create a DocBin to hold the examples
doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"], store_user_data=True)


In [15]:
# Convert to spaCy Examples
for text, annotations in aligned_data:
    doc = nlp.make_doc(text)
    entities = annotations["entities"]
    
    # Filter out any None spans and add to doc.ents
    spans = [doc.char_span(start, end, label=label) for start, end, label in entities]
    spans = [span for span in spans if span is not None]
    filtered_spans = filter_spans(spans)
    
    doc.ents = filtered_spans
    example = Example.from_dict(doc, {"entities": [(span.start_char, span.end_char, span.label_) for span in filtered_spans]})
    doc_bin.add(doc)


In [16]:
# Save the DocBin to disk
doc_bin.to_disk("train_data.spacy")


In [17]:

# Load a blank model
nlp = spacy.blank("en")

# Add the NER pipeline component
ner = nlp.add_pipe("ner", last=True)


In [18]:
# Add labels
ner.add_label("DATE")
ner.add_label("TIME")


1

In [19]:
# Load the DocBin
doc_bin = DocBin().from_disk("train_data.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))


In [20]:

# Create Examples from the Docs
examples = [Example.from_dict(doc, {"entities": [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]}) for doc in docs]
# print(examples)
subset_examples = examples[:50]
# print(subset_examples)

In [21]:
from spacy.util import minibatch, compounding

In [22]:
# Training loop
optimizer = nlp.begin_training()

for itn in range(20):  # Number of iterations
    random.shuffle(examples)
    losses = {}
    batches = minibatch(examples, size=compounding(4.0, 32.0, 1.001))
    # batches = minibatch(subset_examples, size=4)
    for batch in batches:
        nlp.update(batch, drop=0.5, losses=losses)
    print(f"Iteration {itn}, Loss: {losses}")

# # Reduce the number of training iterations
# for itn in range(10):  # Lower the number of iterations
#     random.shuffle(subset_examples)
#     losses = {}
#     batches = minibatch(subset_examples, size=compounding(2.0, 16.0, 1.001))
#     for batch in batches:
#         nlp.update(batch, drop=0.5, losses=losses)
#     print(f"Iteration {itn}, Loss: {losses}")



Iteration 0, Loss: {'ner': 1325.7439578518347}
Iteration 1, Loss: {'ner': 32.80667736957226}
Iteration 2, Loss: {'ner': 7.950145536365379}
Iteration 3, Loss: {'ner': 10.956873692909708}
Iteration 4, Loss: {'ner': 13.37251582275271}
Iteration 5, Loss: {'ner': 14.134979961108288}
Iteration 6, Loss: {'ner': 5.351147739555626}
Iteration 7, Loss: {'ner': 4.650054430943933}
Iteration 8, Loss: {'ner': 0.009476099322786553}
Iteration 9, Loss: {'ner': 0.3401773981752194}
Iteration 10, Loss: {'ner': 0.0005184109806227835}
Iteration 11, Loss: {'ner': 5.927365525646753}
Iteration 12, Loss: {'ner': 18.236063249840424}
Iteration 13, Loss: {'ner': 12.005968177637772}
Iteration 14, Loss: {'ner': 10.703453215710304}
Iteration 15, Loss: {'ner': 18.69195145987872}
Iteration 16, Loss: {'ner': 7.797556806981088}
Iteration 17, Loss: {'ner': 11.687306719701766}
Iteration 18, Loss: {'ner': 6.048206476929498}
Iteration 19, Loss: {'ner': 4.130986206203999e-05}


In [23]:
# Save the trained model
nlp.to_disk("event_date_time_model_with_alignment")