In [23]:
import torch
import os
import re
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, EncoderDecoderModel
from nltk.tokenize import sent_tokenize
from temporal_taggers.evaluation import clean_predictions

In [2]:

def find_timex_in_text(timex_preds, input_text, model_type):
    if model_type == "bert":
        original_paragraph = input_text.lower()
    else:
        original_paragraph = input_text
    end_previous_timex = 0
    previous_timex_cleaned_text = ""
    new_text = ""
    index = 0
    for timex in timex_preds:
        cleaned_text = timex.text.replace("<", "").replace(">", "").replace("\"", "").strip()
        # sometimes the cleaned text has "leftovers"
        if cleaned_text.startswith("- "):
            cleaned_text = cleaned_text[2:]

        if len(cleaned_text) < 2:
            continue

        beginning_timex = original_paragraph[end_previous_timex:].find(cleaned_text)
        if cleaned_text == "day" and beginning_timex != -1 and \
                original_paragraph[beginning_timex - 2:beginning_timex] == "to":
            cleaned_text = "today"
            beginning_timex = original_paragraph[end_previous_timex:].find(cleaned_text)

        # if the model predicted a full year instead of the last two digits
        if beginning_timex == -1 and len(cleaned_text) == 4 and cleaned_text.isdigit():
            beginning_timex = original_paragraph[end_previous_timex:].find(cleaned_text[2:])
            cleaned_text = cleaned_text[2:].strip()

        # if the model predicted full year with an extra repetition
        if beginning_timex == -1 and len(cleaned_text) == 6 and cleaned_text.isdigit():
            beginning_timex = original_paragraph[end_previous_timex:].find(cleaned_text[:-2])
            cleaned_text = cleaned_text[:-2].strip()

        # if the first word is repeating
        elif beginning_timex == -1 and len(cleaned_text.split(" ")) > 1 and \
                cleaned_text.split(" ")[0] == cleaned_text.split(" ")[1]:
            cleaned_text = ' '.join(cleaned_text.split(" ")[:-1])
            beginning_timex = original_paragraph[end_previous_timex:].find(cleaned_text)

        # if the first and last word is repeating
        elif beginning_timex == -1 and len(cleaned_text.split(" ")) > 1 and \
                cleaned_text.split(" ")[0] == cleaned_text.split(" ")[-1]:
            cleaned_text = ' '.join(cleaned_text.split(" ")[1:])
            beginning_timex = original_paragraph[end_previous_timex:].find(cleaned_text)
        # if its single word separated by "-"
        elif beginning_timex == -1 and len(cleaned_text.split(" ")) < 2 and len(cleaned_text.split("-")) > 1:
            for word in cleaned_text.split("-"):
                if word in original_paragraph[end_previous_timex:]:
                    cleaned_text = word
                    beginning_timex = original_paragraph[end_previous_timex:].find(cleaned_text)
                    break
        # more than one words the first one is a digit
        elif beginning_timex == -1 and len(cleaned_text.split(" ")) < 2 and len(cleaned_text) > 2 and \
                not cleaned_text[:1].isdigit() and cleaned_text[-1].isdigit():
            word = cleaned_text[:-1]
            if word.lower() in original_paragraph[end_previous_timex:].lower():
                cleaned_text = word
                beginning_timex = original_paragraph[end_previous_timex:].lower().find(cleaned_text.lower())
                break;
        # if its just a single word
        elif beginning_timex == -1 and len(cleaned_text.split(" ")) < 2 and len(cleaned_text) > 2 and \
                not cleaned_text[0].isdigit() and cleaned_text[-1].isdigit():
            for i in range(2, len(cleaned_text)):
                word = cleaned_text[:i]
                if " " + word + " " in original_paragraph[end_previous_timex:] or \
                        " " + word + "." in original_paragraph[end_previous_timex:] or \
                        " " + word + "," in original_paragraph[end_previous_timex:]:
                    cleaned_text = word
                    beginning_timex = original_paragraph[end_previous_timex:].find(cleaned_text)
                    break;

        # if its just a single word ending with digits
        if beginning_timex == -1 and len(cleaned_text.split(" ")) < 2:
            for i in range(2, len(cleaned_text)):
                word = cleaned_text[:i]
                if " " + word + " " in original_paragraph[end_previous_timex:] or \
                        " " + word + "." in original_paragraph[end_previous_timex:] or \
                        " " + word + "," in original_paragraph[end_previous_timex:]:
                    cleaned_text = word
                    beginning_timex = original_paragraph[end_previous_timex:].find(cleaned_text)
                    break;
        # if you can not find it, see if you can match the first word in the multi word one
        if beginning_timex == -1 and len(cleaned_text.split(" ")) > 1:
            for word in cleaned_text.split(" "):
                if word in original_paragraph[end_previous_timex:] and word not in ["a", "-", ".", "the",
                                                                                    "in", "then", "'s",
                                                                                    "have", "at", "be"]:
                    cleaned_text = word
                    beginning_timex = original_paragraph[end_previous_timex:].find(cleaned_text)
                    break

        if beginning_timex == -1 and cleaned_text.lower() in original_paragraph[
                                                             end_previous_timex:].lower():
            beginning_timex = original_paragraph[end_previous_timex:].lower().find(cleaned_text.lower())

        # avoid tag repetition
        if cleaned_text == previous_timex_cleaned_text:
            continue

        previous_timex_cleaned_text = cleaned_text

        # if there is still no match, just forget it.
        if beginning_timex == -1:
            continue

        index = index + 1
        beginning_timex = beginning_timex + end_previous_timex
        # if the word ended with one of these symbols do not put a space after timex tag
        if original_paragraph[beginning_timex - 1:beginning_timex] in ["\n", "'", "-", ",", "\"", "("] or \
                original_paragraph[beginning_timex - 1:beginning_timex].isdigit():
            new_text += f'{input_text[end_previous_timex:beginning_timex]}<TIMEX3 tid="t{index + 1}" ' \
                        f'type="{timex.attrs["type"].upper()}" ' \
                        f'value="{timex.attrs["value"].strip().replace("</timex3>", "").replace("<", "").replace(">", "").replace(" ", "").upper()}">{input_text[beginning_timex:beginning_timex + len(cleaned_text)]}' \
                        f'</TIMEX3>'

        else:  # otherwise put a space
            new_text += f'{input_text[end_previous_timex:beginning_timex]} <TIMEX3 tid="t{index + 1}" ' \
                        f'type="{timex.attrs["type"].upper()}" ' \
                        f'value="{timex.attrs["value"].strip().replace("</timex3>", "").replace("<", "").replace(">", "").replace(" ", "").upper()}">{input_text[beginning_timex:beginning_timex + len(cleaned_text)]}' \
                        f'</TIMEX3>'

        end_previous_timex = beginning_timex + len(cleaned_text)

    new_text += input_text[end_previous_timex:]
    return new_text

In [3]:
db_output = {
  "pageId": "18160671",
  "title": "1995 NCAA Division I Men's Lacrosse Championship",
  "text": "The 1995 NCAA Division I men's lacrosse tournament was the 25th annual Division I NCAA Men's Lacrosse Championship tournament. Twelve NCAA Division I college men's lacrosse teams met after having played their way through a regular season, and for some, a conference tournament.\nThe championship game was played at Maryland's home field, Byrd Stadium, in front of 26,229 fans. The game saw Syracuse University defeat University of Maryland by the score of 13–9. Despite the loss, Maryland goalie Brian Dougherty was named the tournament's Most Outstanding Player. Dougherty was outstanding in the semi-finals, showing why he earned the award as Division I goalie of the year, making 23 saves on 59 shots. In the first quarter, Hopkins' attack took 19 shots with Dougherty making 12 saves, allowing Maryland to take a 4-1 first-quarter lead.\nIn the finals, Maryland led 4 to 2 with a minute left in the first half before the Orange exploded to three straight goals to claim a 5-4 half time lead and that momentum led to the victory. Dougherty again had 23 saves in the finals.\nDespite being on the losing side, Maryland goalie Brian Dougherty was named most\noutstanding player. Also honored on the All-Tournament team were Syracuse’s Mark Fietta, Nick Licameli, Rob Kavovit, Ric Beardsley and Casey Powell, as well as Dan Radebaugh, Matt Hahn, Peter Hilgartner and Rob Chomo for the Terrapins.\nFor the 25th Division I tournament, a 25th anniversary All-time team was selected. This team was nominated and selected based on the voting of all current and past Division I head coaches and all current and past members of the NCAA Men’s Lacrosse Committee. The members of the team, followed by their last year of competition included: Scott Bacigalupo Princeton 1994, Tom Cafaro Army 1971, John DeTomasso Johns Hopkins 1986, Del Dressel Johns Hopkins 1986, Mike Federico Johns Hopkins 1980, Mike French Cornell 1976, Gary Gait Syracuse 1990, Paul Gait Syracuse 1990, Mark Greenberg Johns Hopkins 1980, Tom Haus North Carolina 1980, Chris Kane Cornell 1979, Brad Kotz Syracuse 1985, Richard Kowalchuk Johns Hopkins 1974, Dan MacKesey Cornell 1977, Eamon McEneaney Cornell 1977, David Morrow Princeton 1993, Tim Nelson Syracuse 1985, Mike O’Neill Johns Hopkins 1978, Dave Pietramala Johns Hopkins 1989, Larry Quinn Johns Hopkins 1985, Jonathan Reese Yale 1990, Brendan Schneck Johns Hopkins 1981, Tom Sears North Carolina 1983, Jack Thomas Johns Hopkins 1974, Frank Urso Maryland 1976.\n\n\n== Bracket ==\n*  =  Overtime\n\n\n=== Box scores ===\nTournament Finals\n\nTournament Semi-finals\n\nTournament Quarterfinals\n\nTournament First Round\n\n\n== All-Tournament Team ==\nBrian Dougherty, Maryland (Named the tournament's Most Outstanding Player)\nMark Fietta, Syracuse\nNick Licameli, Syracuse\nRob Kavovit, Syracuse\nRic Beardsley, Syracuse\nCasey Powell, Syracuse\nDan Radebaugh, Maryland\nMatt Hahn, Maryland\nPeter Hilgartner, Maryland\nRob Chomo, Maryland\n\n\n== See also ==\n1995 NCAA Division I Women's Lacrosse Championship\n1995 NCAA Division II Lacrosse Championship\n\n\n== References =="
}

In [4]:
model_type = "roberta"
tokenizer = AutoTokenizer.from_pretrained("satyaalmasian/temporal_tagger_roberta2roberta")
model = EncoderDecoderModel.from_pretrained("satyaalmasian/temporal_tagger_roberta2roberta")


In [5]:

# --- if you want to use the bert model, uncomment the following lines
# model_type="bert"
# tokenizer = AutoTokenizer.from_pretrained("satyaalmasian/temporal_tagger_bert2bert")
# model = EncoderDecoderModel.from_pretrained("satyaalmasian/temporal_tagger_bert2bert")

input_texts = ["I lived in New York for 10 years."]
input_texts += ["Cumbre Vieja last erupted in 1971 and in 1949."]
input_texts += ["The club's founding date, 15 January, was intentional."]
input_texts += ["Police were first called to the scene just after 7.25am this morning, Sunday, September 19, "
                "and have confirmed they will continue to remain in the area for some time."]


In [6]:

for input_text in input_texts:
    model_inputs = tokenizer(input_text, truncation=True, return_tensors="pt")
    out = model.generate(**model_inputs)
    decoded_preds = tokenizer.batch_decode(out, skip_special_tokens=True)
    pred_soup = BeautifulSoup(clean_predictions(decoded_preds[0]), "lxml")
    timex_preds = pred_soup.findAll("timex3")
    new_text = find_timex_in_text(timex_preds, input_text, model_type)
    print(new_text)



I lived in New York for  <TIMEX3 tid="t2" type="DURATION" value="P10Y">10 years</TIMEX3>.
Cumbre Vieja last erupted in  <TIMEX3 tid="t2" type="DATE" value="1971-XX">1971</TIMEX3> and in 1949.
The club's founding date,  <TIMEX3 tid="t2" type="DATE" value="1999-01-15">15</TIMEX3> January, was intentional.
Police were first called to the scene just after  <TIMEX3 tid="t2" type="TIME" value="1998-09-21T19:00">7.25am this morning, Sunday, September 19</TIMEX3>, and have confirmed they will continue to remain in the area for some time.


In [12]:
# split the text into sentences
sentences = sent_tokenize(db_output["text"], )


In [18]:
for sentence in sentences:
    model_inputs = tokenizer(sentence, truncation=True, return_tensors="pt")
    out = model.generate(**model_inputs)
    decoded_preds = tokenizer.batch_decode(out, skip_special_tokens=True)
    pred_soup = BeautifulSoup(clean_predictions(decoded_preds[0]), "lxml")
    timex_preds = pred_soup.findAll("timex3")
    new_text = find_timex_in_text(timex_preds, sentence, model_type)
    print(new_text)




The  <TIMEX3 tid="t2" type="DATE" value="1995">1995</TIMEX3> NCAA Division I men's lacrosse tournament was the 25th annual Division I NCAA Men's Lacrosse Championship tournament.
Twelve NCAA Division I college men's lacrosse teams met after having played their way through a regular season, and for some, a conference tournament.
The championship game was played at Maryland's home field, Byrd Stadium, in front of 26,229 fans.
The game saw Syracuse University defeat University of Maryland by the score of 13–9.
 <TIMEX3 tid="t2" type="DATE" value="FUTURE_REF">Despite</TIMEX3> the loss, Maryland goalie Brian Dougherty was named the tournament's Most Outstanding Player.
Dougherty was outstanding in the semi-finals, showing why he earned the award as Division I goalie of the  <TIMEX3 tid="t2" type="DATE" value="1989-10-25">year</TIMEX3>, making 23 saves on 59 shots.
In  <TIMEX3 tid="t2" type="DATE" value="1989-Q1">the first quarter</TIMEX3>, Hopkins' attack took 19 shots with Dougherty making

In [14]:
timex_preds

[<timex3 type="TIME" value="2014-08-30TNI"> 5 = 8 Overtime </timex3>]

In [15]:
# split the text into pieces less than 512 tokens

def split_text(text, max_length=512):
    """
    Split the text into pieces less than max_length
    """
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]



In [16]:
x = split_text(db_output["text"])

In [17]:
x

["The 1995 NCAA Division I men's lacrosse tournament was the 25th annual Division I NCAA Men's Lacrosse Championship tournament. Twelve NCAA Division I college men's lacrosse teams met after having played their way through a regular season, and for some, a conference tournament.\nThe championship game was played at Maryland's home field, Byrd Stadium, in front of 26,229 fans. The game saw Syracuse University defeat University of Maryland by the score of 13–9. Despite the loss, Maryland goalie Brian Dougherty w",
 "as named the tournament's Most Outstanding Player. Dougherty was outstanding in the semi-finals, showing why he earned the award as Division I goalie of the year, making 23 saves on 59 shots. In the first quarter, Hopkins' attack took 19 shots with Dougherty making 12 saves, allowing Maryland to take a 4-1 first-quarter lead.\nIn the finals, Maryland led 4 to 2 with a minute left in the first half before the Orange exploded to three straight goals to claim a 5-4 half time lea

In [19]:
# split the text into 500 word chunks

def split_text(text, max_length=10):
    """
    Split the text into pieces less than max_length
    """
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]


In [21]:
chunks = split_text(db_output["text"])

In [22]:
for chunk in chunks:
    model_inputs = tokenizer(chunk, truncation=True, return_tensors="pt")
    out = model.generate(**model_inputs)
    decoded_preds = tokenizer.batch_decode(out, skip_special_tokens=True)
    pred_soup = BeautifulSoup(clean_predictions(decoded_preds[0]), "lxml")
    timex_preds = pred_soup.findAll("timex3")
    new_text = find_timex_in_text(timex_preds, chunk, model_type)
    print(new_text)



The  <TIMEX3 tid="t2" type="DATE" value="1995">1995</TIMEX3> NCAA Division I men's lacrosse tournament was the 25th annual Division I NCAA Men's Lacrosse Championship tournament. Twelve NCAA Division I college men's lacrosse teams met after having played their way through a regular season, and for some, a conference tournament.
The championship game was played at Maryland's home field, Byrd Stadium, in front of 26,229 fans. The game saw Syracuse University defeat University of Maryland by the score of 13–9. Despite the loss, Maryland goalie Brian
 Dougherty was named the tournament's Most Outstanding Player. Dougherty was outstanding in the semi-finals, showing why he earned the award as Division I goalie of the year, making 23 saves on 59 shots. In  <TIMEX3 tid="t2" type="DATE" value="1989-Q1">the first quarter</TIMEX3>, Hopkins' attack took 19 shots with Dougherty making 12 saves, allowing Maryland to take a 4-1 first-quarter lead.
In the finals, Maryland led 4 to 2 with a minute lef

In [29]:
# split text by punctuation and delete the punctuation

def split_on_punctuation(text):
    """
    Split the text into pieces on punctuation
    """
    full_list = re.split(r"([,.!?])", text)
    return [x for x in full_list if x not in [",", ".", "!", "?"]]


In [31]:
shorts = split_on_punctuation(db_output["text"])

In [32]:
for short in shorts:
    model_inputs = tokenizer(short, truncation=True, return_tensors="pt")
    out = model.generate(**model_inputs)
    decoded_preds = tokenizer.batch_decode(out, skip_special_tokens=True)
    pred_soup = BeautifulSoup(clean_predictions(decoded_preds[0]), "lxml")
    timex_preds = pred_soup.findAll("timex3")
    new_text = find_timex_in_text(timex_preds, short, model_type)
    print(new_text)



The  <TIMEX3 tid="t2" type="DATE" value="1995">1995</TIMEX3> NCAA Division I men's lacrosse tournament was the 25th annual Division I NCAA Men's Lacrosse Championship tournament
 Twelve NCAA Division I college men's lacrosse teams met after having played their way through  <TIMEX3 tid="t2" type="DURATION" value="P1C">a regular season</TIMEX3>
 and for some
 a conference  <TIMEX3 tid="t2" type="DATE" value="2015-FA">tournament</TIMEX3>

The championship game was played at Maryland's home field
  <TIMEX3 tid="t2" type="DATE" value="1793-11">Byrd Stadium</TIMEX3>
 in front of 26
 <TIMEX3 tid="t2" type="DATE" value="1998-09-23">229</TIMEX3> fans
 The game saw Syracuse University defeat University of Maryland by the score of 13–9
  <TIMEX3 tid="t2" type="DATE" value="2014-10-25">Despite the loss</TIMEX3>
 Maryland goalie Brian Dougherty was named the tournament's Most Outstanding Player
 Dougherty was outstanding in the  <TIMEX3 tid="t2" type="DURATION" value="P5S">semi-finals</TIMEX3>
 sho

In [33]:
all_timex = []
for short in shorts:
    model_inputs = tokenizer(short, truncation=True, return_tensors="pt")
    out = model.generate(**model_inputs)
    decoded_preds = tokenizer.batch_decode(out, skip_special_tokens=True)
    pred_soup = BeautifulSoup(clean_predictions(decoded_preds[0]), "lxml")
    timex_preds = pred_soup.findAll("timex3")
    all_timex += timex_preds



In [34]:
all_timex

[<timex3 type="DATE" value="1995"> 1995 </timex3>,
 <timex3 type="DURATION" value="P1C"> a regular season </timex3>,
 <timex3 type="DATE" value="2014-05-23"> </timex3>,
 <timex3 type="DATE" value="2015-FA"> tournament </timex3>,
 <timex3 type="DATE" value="2015-09-23"> </timex3>,
 <timex3 type="DATE" value="1793-11"> Byrd Stadium </timex3>,
 <timex3 type="DATE" value="1998-09-23"> 229  fans&gt;... <duration> fans  fan fans )&gt;..</duration></timex3>,
 <timex3 type="DURATION" value="P9Y"> 9 </timex3>,
 <timex3 type="DATE" value="2014-10-25"> Despite the loss </timex3>,
 <timex3 type="DURATION" value="P5S"> semi-finals . &gt; "finals "<time sort="SET" value="XXXX-XX-3" x3=""> Semi-months </time></timex3>,
 <timex3 type="DURATION" value="P1Y-#1"> the year  </timex3>,
 <timex3 type="DURATION" value="PT59M"> </timex3>,
 <timex3 type="DATE" value="2009-Q1"> the first quarter </timex3>,
 <timex3 type="DATE" value="2014-12-25"> 12 </timex3>,
 <timex3 type="DURATION" value="P4Q"> 4-1 first-qua

In [35]:
pred_soup

<html><body><p>Maryland, e== See also ==-1995 NCAA Division I Women's Lacrosse Championship ( <timex3 type="DATE" value="1995">1995 </timex3> MC Division II Lacesis Championship-HK== References == <timesx0497></timesx0497></p></body></html>

In [36]:
decoded_preds

[' Maryland, e== See also ==-1995 NCAA Division I Women\'s Lacrosse Championship ( <timex3 type="DATE" value="1995">1995 </timeX3>  MC Division II Lacesis Championship-HK== References ==  <timesx0497">']

In [39]:
all_timex = []
for short in shorts:
    model_inputs = tokenizer(short, truncation=True, return_tensors="pt")
    out = model.generate(**model_inputs)
    decoded_preds = tokenizer.batch_decode(out, skip_special_tokens=True, )
    # extract the timex tags
    for pred in decoded_preds:
        timex_preds = pred.split("<timex3")[1]
        print(timex_preds)
        break

    # pred_soup = BeautifulSoup(clean_predictions(decoded_preds[0]), "lxml")
    # timex_preds = pred_soup.findAll("timex3")
    # all_timex += timex_preds



 type="DATE" value="1995"> 1995 </timeX3>  NCAA Division I men's lacrosse tournament was the 25th annual DivisionI NCAA Men's Lacrosse Championship tournament  "<timeux="REF"></timeXX">
 type="DURATION" value="P1C"> a regular season </timeX3>  >.  regular Division  "<time x3 quant="E
 type="DATE" value="2014-05-23"> </timeX3>.  <DURATION" Value="PXM"> Some for SOME </D"> and FOR some </> >>...  </dd
 type="DATE" value="2015-FA"> tournament </time x3> >.  conference tournament  this conference tournaments  </>> " <D conferenceTD  a Conference tournament</timeq="TAST_REF">
 type="DATE" value="2015-09-23"> </time x3>  at home's homes field field</timeX>'s home Field field <at'>>
 type="DATE" value="1793-11"> Byrd Stadium </time x3>   <DAST_REF"> Stadium Stadium</time03>. <TIME" valuation="2015-03-23TNI"> Night Stadium> )


IndexError: list index out of range