In [1]:
import spacy
nlp = spacy.load("en_core_web_trf")

  from .autonotebook import tqdm as notebook_tqdm


In [32]:
text = "Although Poland, which had been debating the proposal for months despite mounting public pressure and increasingly contradictory expert testimony, finally reached a tentative agreement late Tuesday evening, several members of community—concerned about the long-term economic repercussions that critics argued had been deliberately minimized in earlier reports—insisted that a final vote be postponed until a more comprehensive analysis could be completed." # The conditions of the injured have not been confirmed. A suspect is still on the loose and police say they believe the shooting may have been \"targeted\". The city's deputy mayor earlier said the shooting had occurred \"at a children's birthday party\". Police have not confirmed the type of event at which the shooting occurred, beyond it being a family gathering. \"[A] birthday party should never be a place where families fear for their lives,\" Deputy Mayor Jason Lee wrote on social media. The San Joaquin County Sheriff's Office said the shooting happened shortly before 18:00 local time (02:00 GMT on Sunday), and is appealing to anyone with \"information, video footage, or who may have witnessed any part of the incident\" to come forward. Spokeswoman Heather Brent described the incident as \"unfathomable\", adding: \"This is a very active and ongoing investigation, and information remains limited. \"Early indications suggest this may be a targeted incident, and investigators are exploring all possibilities.\" Stockton's Mayor Christina Fugazi called the shooting \"unacceptable\". \"Families should be together instead of at the hospital, standing next to their loved one, praying that they survive.\" California has some of the strictest firearm laws in the US, and has in recent years faced challenges to it. In 2021, a federal judge overturned the state's ban on assault weapons, such as the AR-15 rifle. The following year, the US Supreme Court expanded gun rights as it struck down a New York law restricting gun-carrying rights, jeopardising similar regulation in California."
text = "Chancellor Rachel Reeves says she can be trusted with the country's finances and has been \"clear\" about reasons for her decisions, following claims she misled the public in the run-up to her Budget."
text = "US President Donald Trump's overseas envoy will travel to Germany this weekend to meet Ukrainian President Volodymyr Zelensky and European leaders for more talks on ending the war."
text = "DiCaprio's One Battle After Another leads Golden Globe nominations Leonardo DiCaprio's latest film One Battle After Another is leading the charge into Hollywood's award season, after receiving nine nominations for this year's Golden Globes on Monday. The movie, about the kidnap of a former revolutionary's daughter, is nominated for best musical/comedy film - while DiCaprio and his co-stars Sean Penn, Teyana Taylor, Benicio Del Toro and Chase Infiniti are all up for acting awards."
doc = nlp(text)

In [28]:
def extract_claim(doc):
    pairs = []

    SUBJECT_DEPS = {
        'nsubj', 'nsubjpass', 'csubj', 'csubjpass', 'expl', 'agent'
    }

    OBJECT_DEPS = {
        'obj', 'iobj', 'attr', 'dobj', 'pobj', 'dative', 'oprd'
    }

    MODIFIER_DEPS = {
        'det', 'amod', 'compound', 'nummod', 'quantmod'
    }

    MODIFIER_DEEP_DEPS = {
        'appos', 'prep', 'poss', 'acl', 'relcl'
    }

    VERB_AUX = {'aux', 'auxpass', 'cop', 'advcl', 'prt'}

    LOCATION_ENTS = {"GPE", "LOC", "FAC", "ORG"}
    TIME_ENTS = {"DATE", "TIME", "EVENT"}


    for token in doc:
        if token.dep_ not in SUBJECT_DEPS:
            continue

        # =========================
        # ACTOR
        # =========================
        actor_tokens = {token}

        for child in token.children:
            if child.dep_ in MODIFIER_DEPS:
                actor_tokens.add(child)

            elif child.dep_ in MODIFIER_DEEP_DEPS:
                for t in child.subtree:
                    actor_tokens.add(t)

        actor_tokens = sorted(actor_tokens, key=lambda x: x.i)
        full_actor = " ".join(t.text for t in actor_tokens)

        # =========================
        # ACTION
        # =========================
        actions = []
        head = token.head

        # Verbal predicate
        if head.pos_ == "VERB":
            actions.append(head)

        # Copular / adjectival predicate
        elif head.pos_ in ("ADJ", "NOUN"):
            if any(c.dep_ == "cop" for c in head.children):
                actions.append(head)

        for action in actions:

            # -------------------------
            # VERB PHRASE
            # -------------------------
            verb_tokens = []
            verb_extra = []
            for child in action.children:
                if child.dep_ in VERB_AUX:
                    verb_tokens.append(child)
            for subverb in verb_tokens:
                for child in subverb.children:
                  if child.dep_ in VERB_AUX:
                      verb_tokens.append(child)
            
            verb_tokens.append(action)
            for subverb in verb_tokens:
              for child in subverb.children:
                if child.dep_ == 'prep':
                    verb_extra += [t for t in child.subtree]



            all_verb_tokens = sorted(verb_tokens + verb_extra, key=lambda x: x.i)
            full_verb = " ".join(t.text for t in all_verb_tokens)

            # -------------------------
            # OBJECT
            # -------------------------
            obj_tokens = set()
            objects = []
            for verb in verb_tokens:
                for child in verb.children:

                    if child.dep_ in OBJECT_DEPS:
                        for t in child.subtree:
                            obj_tokens.add(t)

                    elif child.dep_ in ('ccomp', 'xcomp'):
                        clause = sorted(child.subtree, key=lambda x: x.i)
                        objects.append(" ".join(t.text for t in clause))

            if obj_tokens:
                obj_tokens = sorted(obj_tokens, key=lambda x: x.i)
                objects.append(" ".join(t.text for t in obj_tokens))

            full_object = " ".join(objects).strip()

            # -------------------------
            # LOCATION & TIME
            # -------------------------
            locations = []
            times = []

            for child in action.children:

                # Prepositional phrases
                if child.dep_ == "prep":
                    subtree = list(child.subtree)

                    if any(t.ent_type_ in LOCATION_ENTS for t in subtree):
                        locations.append(" ".join(t.text for t in subtree))

                    elif any(t.ent_type_ in TIME_ENTS for t in subtree):
                        times.append(" ".join(t.text for t in subtree))

                # Temporal adverbials
                elif child.dep_ in ("npadvmod", "tmod"):
                    times.append(" ".join(t.text for t in child.subtree))

            pairs.append({
                'actor': full_actor,
                'action': full_verb,
                'object': full_object,
                'location': " | ".join(locations),
                'time': " | ".join(times)
            })

    return pairs


In [33]:
[extract_claim(sent) for sent in doc.sents]

[[{'actor': "DiCaprio 's One Battle After Another",
   'action': 'leads',
   'object': 'Golden Globe nominations',
   'location': '',
   'time': ''}],
 [{'actor': "Leonardo DiCaprio 's latest film One Battle After Another",
   'action': "is leading into Hollywood 's award season after receiving nine nominations for this year 's Golden Globes on Monday",
   'object': 'the charge',
   'location': "into Hollywood 's award season",
   'time': "after receiving nine nominations for this year 's Golden Globes on Monday"}],
 [{'actor': "The movie about the kidnap of a former revolutionary 's daughter",
   'action': 'is nominated for best musical / comedy film are up for acting awards',
   'object': '',
   'location': '',
   'time': ''}]]