In [10]:
import spacy
import spacy_transformers
from spacy.matcher import Matcher
from spacy.tokens import Span
import pandas as pd
import numpy as np
import csv
import datetime

In [11]:
# Load the "en_core_web_trf" spaCy pipeline with its "textcat" component disabled.
# `nlp` is reused by later cells to parse both the sample text and the CSV rows.
nlp = spacy.load("en_core_web_trf", disable=["textcat"])

In [12]:
# Sample passage (news text about layoffs) for trying out the extractor.
# The f-string prefix is inert here: the text contains no {placeholders}.
# NOTE(review): the following cell assigns `sentence` again, so in a
# top-to-bottom run this value is replaced before anything parses it.
sentence = f"""Alphabet's Google might soon layoff nearly 6 per cent or 10,000 of its ‘poor performing' employees starting early 2023.
According to a report by ‘The Information’ Google’s managers have been asked to analyse and rank the 'poor performing' employees. 
Alphabet currently employs around 1,87,000 employees. 10.0 of them are laid off.
Google will use a ranking system and the lowest-ranked employees are expected to be fired from the company. 
Google had earlier announced that it will be slowing down the hiring process in the fourth quarter of the year. 
With this, Google will join other big tech companies, including Meta, Twitter, Amazon, etc, that have announced layoffs in the recent weeks."""

In [13]:
# Sample passage about Amsterdam; this is the text the next cell passes to `nlp`.
# NOTE(review): "Netherlands.Amsterdam" has no space after the full stop.
sentence = "Amsterdam is the capital and largest city in the European country of the Netherlands.Amsterdam is famous for its canals and dikes. Unlike in capitals of most other countries, the national government, parliament, government ministries, supreme court, royal family and embassies are not in Amsterdam, but in The Hague. Located in the Dutch province of North Holland, Amsterdam is colloquially referred to as the 'Venice of the North'. The only diplomatic offices present in Amsterdam are consulates. The city hosts two universities (the University of Amsterdam and the Free University Amsterdam) and an international airport 'Schiphol Airport' built in 1923."

In [14]:
# Parse the sample text, then materialise the sentences it was split into.
doc = nlp(sentence)
sentences = list(doc.sents)

In [15]:
# Candidate token patterns for relation phrases, written in spaCy Matcher syntax.
# Each pattern is a list containing one sequence of per-token specifications.

# pattern1: the dependency ROOT token, optionally followed by a particle, an
# adverb, and a closing particle/adposition. Defined here but not registered
# with the matcher below.
pattern1 = [[
#     {"POS": "VERB"},
#     {"TAG": {"IN": ["MD", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]}},
    {"DEP": "ROOT"},
    {"POS": "PART", "OP": "?"},
    {"POS": "ADV",  "OP": "?"},
    {"POS": {"IN": ["PART", "ADP"]}, "OP": "?"}
]]

# pattern3: a verb, optionally followed by a particle and an adverb, then any
# run (zero or more) of ADJ/ADV/NOUN/PRON/DET tokens, and finally an optional
# closing particle/adposition.
pattern3 = [[
    {"POS": "VERB"},
    {"POS": "PART", "OP": "?"},
    {"POS": "ADV",  "OP": "?"},
    {"POS": {"IN": ["ADJ", "ADV", "NOUN", "PRON", "DET"]}, "OP": "*"},
    {"POS": {"IN": ["PART", "ADP"]}, "OP": "?"}    
]]

# Earlier dependency-based attempt, kept for reference:
# pattern = [{'DEP':'ROOT'}, 
#         {'DEP':'prep','OP':"?"},
#         {'DEP':'agent','OP':"?"},  
#         {'POS':'ADJ','OP':"?"}] 

# Only pattern3 is registered. `matcher` is read as a global by get_relations().
matcher = Matcher(nlp.vocab)
matcher.add("pattern3", pattern3)

In [16]:
# Print every sentence followed by its named entities as (text, label) pairs.
# NOTE: the loop rebinds the notebook-level name `sentence` (until now the raw
# input string) to each element of `sentences`; once the loop finishes it
# refers to the last one. The index `i` from enumerate is not used in the body.
for i, sentence in enumerate(sentences):
    print(f'{sentence}\n')
    print([(e.text, e.label_) for e in sentence.ents])

Amsterdam is the capital and largest city in the European country of the Netherlands.

[('Amsterdam', 'GPE'), ('European', 'NORP'), ('Netherlands', 'GPE')]
Amsterdam is famous for its canals and dikes.

[('Amsterdam', 'GPE')]
Unlike in capitals of most other countries, the national government, parliament, government ministries, supreme court, royal family and embassies are not in Amsterdam, but in The Hague.

[('Amsterdam', 'GPE'), ('The Hague', 'GPE')]
Located in the Dutch province of North Holland, Amsterdam is colloquially referred to as the 'Venice of the North'.

[('Dutch', 'NORP'), ('North Holland', 'GPE'), ('Amsterdam', 'GPE'), ("the 'Venice of the North", 'GPE')]
The only diplomatic offices present in Amsterdam are consulates.

[('Amsterdam', 'GPE')]
The city hosts two universities (the University of Amsterdam and the Free University Amsterdam) and an international airport 'Schiphol Airport' built in 1923.

[('two', 'CARDINAL'), ('the University of Amsterdam', 'ORG'), ('the Fre

In [17]:
def get_relations(sentence):
    """Return the non-overlapping relation-phrase candidates found in `sentence`.

    The notebook-level `matcher` is run over `sentence`, every match is sliced
    out of `sentence` by its (start, end) token offsets, and the candidates are
    passed through `spacy.util.filter_spans` to drop overlapping ones.

    Args:
        sentence: the parsed text to search; it must be accepted by `matcher`
            and support slicing by token index (the notebook passes the
            sentence spans of a parsed document).

    Returns:
        The list of spans kept by `spacy.util.filter_spans`.
    """
    candidates = [sentence[start:end] for _, start, end in matcher(sentence)]
    return spacy.util.filter_spans(candidates)

In [18]:
# Coarse POS tags a named entity must contain to be usable as a relation argument.
ner_pos = {"PROPN", "NOUN", "NUM"}


def extract_relations(s):
    """Pair every relation phrase in sentence `s` with its nearest entities.

    For each phrase returned by `get_relations(s)`, the closest named entity
    starting before the phrase and the closest one starting at or after it
    are selected. Distance is measured between start token indices.

    Args:
        s: a parsed sentence exposing `.ents`; each entity must expose
            `.start` and be iterable over tokens that carry `.pos_`.

    Returns:
        A list of `(left_entity, relation_text_lowercased, right_entity)`
        tuples. The list is empty when fewer than two usable entities exist
        or when no phrase has an entity on both sides.
    """
    relations = []

    # Keep only named entities containing at least one NOUN/PROPN/NUM token.
    # The entity's own tokens are inspected directly, so the result depends
    # only on `s` and never on a notebook-level variable.
    ents = [e for e in s.ents if any(t.pos_ in ner_pos for t in e)]

    # A relation needs an entity on each side; stop if there are fewer than 2.
    if len(ents) < 2:
        return relations

    # Extract all candidate relation phrases in the sentence.
    for span in get_relations(s):
        rs = span.start

        # Split the entities by which side of the phrase they start on,
        # remembering their distance from the phrase start.
        left = []
        right = []
        for e in ents:
            offset = e.start - rs
            if offset < 0:
                left.append((-offset, e))
            else:
                right.append((offset, e))

        if left and right:
            # Only the closest entity on each side is needed.
            e1 = min(left, key=lambda pair: pair[0])[1]
            e2 = min(right, key=lambda pair: pair[0])[1]
            relations.append((e1, span.text.lower(), e2))
    return relations

In [19]:
# Run the extractor on the current `sentence`. The function defined above is
# `extract_relations` (plural).
extract_relations(sentence)

[(Schiphol Airport, 'built in', 1923)]

In [20]:
def get_pos(sentence):
    """Tabulate the linguistic annotations of the tokens in `sentence`.

    Tokens whose `pos_` is empty are left out. The result has one row per
    remaining token and the columns text, pos, tag, dep and lemma.
    """
    columns = ['text', 'pos', 'tag', 'dep', 'lemma']
    records = []
    for token in sentence:
        if not token.pos_:
            continue
        records.append(
            (token.text, token.pos_, token.tag_, token.dep_, token.lemma_)
        )
    return pd.DataFrame(records, columns=columns)

In [21]:
# Run relation extraction over every document in the pre-processed CSV.
# For each row, the last column is parsed as the document text and the first
# column is carried into the output as its key. `csv` is already imported in
# the notebook's first cell.
filename = "pre-proc/warcs-20221210-141217.csv"
rows = []

# The encoding is explicit so the read does not depend on the platform default.
# NOTE(review): assumes the input is UTF-8, like the file written by the
# export cell — confirm against the pre-processing step.
with open(filename, newline='', encoding='UTF-8') as file:
    csv_reader = csv.reader(file, quoting=csv.QUOTE_NONE, escapechar='\\')
    for c, row in enumerate(csv_reader, start=1):
        if row:  # blank lines come back as [], which has no row[-1]
            document = nlp(row[-1])
            sentences = list(document.sents)
            for sentence in sentences:
                relations = extract_relations(sentence)
                for e1, r, e2 in relations:
                    # Keep plain text rather than the span so the parsed
                    # document can be freed once this row is done.
                    rows.append((row[0], sentence.text, f"{e1}-{r}-{e2}"))
        # Progress report every 20 input rows.
        if c % 20 == 0:
            print(c)

[E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument
[E thread_pool.cpp:113] Exception in thread pool task: mutex lock failed: Invalid argument


KeyboardInterrupt: 

In [125]:
# Dump the collected relation rows to a timestamped CSV in the results folder.
res_directory = "relations"
filename = f'relation-reverb-{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}'

with open(f"{res_directory}/{filename}.csv", 'w', newline='', encoding='UTF-8') as file:
    writer = csv.writer(file)
    writer.writerows(rows)

In [129]:
# pos = {'NOUN', "PROPN", "NUM"}

# s = sentences[3]
# s_idx = s.start
# ents = s.ents
# spans, relations = get_relations(s)

# span1 = spans[1]
# span2 = spans[3]

# span = span1

# # Get entities that are noun / proper noun / or number
# ents = [ent for ent in ents 
#         if {t.pos_ for t in s[ent.start - s_idx : ent.end - s_idx]} & pos]

# # make sure there are at least 2 entities left
# assert len(ents) >= 2

# # Get the two entities closest to the relation (?? what about "A, B relation C")
# offsets = [abs(ent.start - span.start) for ent in ents]
# sorted_ents = [x for _, x in sorted(zip(offsets, ents))]
# e1, e2 = sorted_ents[0], sorted_ents[1]
# print(e1, e2)

# # Check if one sided and comma between entities
# if (e1.start - span.start < 0) == (e2.start - span.start < 0) and any(t.text == ',' for t in s[e1.end-s.start:e2.start-s.start]):
#     e1, e2 = e2, e1

# print(f'{e1} - {span.text.lower()} - {e2}')

# # get one closest entity from either side of the relation
# left = []
# right = []
# for ent in ents:
#     offset = ent.start - span.start
#     if  offset < 0:
#         left.append((-offset, ent))
#     else:
#         right.append((offset, ent))

# sorted_left = [x for _, x in sorted(left)]
# sorted_right = [x for _, x in sorted(right)]

# e1, e2 = sorted_left[0], sorted_right[0]

# print(f'{e1} - {span.text.lower()} - {e2}')