In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 5.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 46.8 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 26.0 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch

In [None]:
# Load model and tokenizer
# Both names are module-level globals; from_text_to_kb() reads them directly
# instead of taking them as parameters, so this cell must run first.
# from_pretrained() fetches the "Babelscape/rebel-large" files when they are
# not cached yet (the recorded output shows a ~1.63 GB file being downloaded).
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

Downloading:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/344 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [None]:
def extract_relations_from_model_output(text):
    """Parse one decoded generation into a list of relation dicts.

    The text is read as a whitespace-separated token stream in which three
    marker tokens switch what the following words belong to:

    * ``<triplet>`` starts a new head entity,
    * ``<subj>`` starts a tail entity for the current head,
    * ``<obj>`` starts the relation type linking head and tail.

    A single head may be followed by several ``<subj> ... <obj> ...`` pairs.
    ``<s>``, ``<pad>`` and ``</s>`` are removed before parsing, and words seen
    before the first marker are ignored.

    Only complete triplets (non-empty head, type and tail) are emitted.
    Parser state is reset at every ``<triplet>`` / ``<subj>`` marker, so a
    malformed or truncated generation cannot combine a new head or tail with
    a relation or tail left over from the previous triplet.

    Args:
        text: Decoded model output, special tokens included.

    Returns:
        A list of ``{'head': str, 'type': str, 'tail': str}`` dicts in the
        order the triplets appear in ``text``.
    """
    relations = []
    subject, relation, object_ = '', '', ''
    current = 'x'  # no marker seen yet: plain words are dropped

    def flush():
        # Emit the triplet collected so far, but only if it is complete.
        if subject != '' and relation != '' and object_ != '':
            relations.append({
                'head': subject.strip(),
                'type': relation.strip(),
                'tail': object_.strip()
            })

    text_replaced = (text.strip()
                     .replace("<s>", "")
                     .replace("<pad>", "")
                     .replace("</s>", ""))
    for token in text_replaced.split():
        if token == "<triplet>":
            flush()
            current = 't'
            # New head: nothing from the previous triplet may carry over.
            subject, relation, object_ = '', '', ''
        elif token == "<subj>":
            flush()
            current = 's'
            # Same head, new tail: the old relation belongs to the old tail.
            relation, object_ = '', ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token
    flush()
    return relations

In [None]:
class KB():
    """In-memory knowledge base: a list of de-duplicated relation dicts.

    Each relation is a dict with the keys ``"head"``, ``"type"`` and
    ``"tail"``; merging additionally reads ``relation["meta"]["spans"]``.
    """

    def __init__(self):
        # Stored relations, in insertion order.
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share head, type and tail."""
        for key in ("head", "type", "tail"):
            if not (r1[key] == r2[key]):
                return False
        return True

    def exists_relation(self, r1):
        """Return True when a relation equal to ``r1`` is already stored."""
        for stored in self.relations:
            if self.are_relations_equal(r1, stored):
                return True
        return False

    def merge_relations(self, r1):
        """Add the spans of ``r1`` to the first stored relation equal to it.

        Spans already present on the stored relation are skipped. Raises
        ``IndexError`` when no stored relation equals ``r1``.
        """
        matches = []
        for stored in self.relations:
            if self.are_relations_equal(r1, stored):
                matches.append(stored)
        target = matches[0]
        # Decide what is new before mutating the stored span list.
        new_spans = [candidate for candidate in r1["meta"]["spans"]
                     if candidate not in target["meta"]["spans"]]
        target["meta"]["spans"] += new_spans

    def add_relation(self, r):
        """Store ``r``, or merge its spans into the equal stored relation."""
        if self.exists_relation(r):
            self.merge_relations(r)
        else:
            self.relations.append(r)

    def print(self):
        """Print a ``Relations:`` header followed by one relation per line."""
        print("Relations:")
        for stored in self.relations:
            print(f"  {stored}")

In [41]:
def _compute_span_boundaries(num_tokens, span_length, num_spans):
    """Return ``num_spans`` ``[start, end]`` windows of width ``span_length``.

    Window starts are spread evenly between 0 and ``num_tokens - span_length``
    so that the last window ends exactly at ``num_tokens`` and no trailing
    tokens are left out, whatever the remainder of the division is.
    """
    if num_spans <= 1:
        # A single window keeps the nominal width; slicing clamps it to the
        # real token count when the input is shorter than span_length.
        return [[0, span_length]]
    last_start = num_tokens - span_length
    boundaries = []
    for i in range(num_spans):
        start = (i * last_start) // (num_spans - 1)
        boundaries.append([start, start + span_length])
    return boundaries


def from_text_to_kb(text, span_length=128, verbose=False):
    """Extract relations from ``text`` and collect them in a ``KB``.

    The text is tokenized once with the module-level ``tokenizer``, cut into
    overlapping windows of ``span_length`` tokens, and every window is passed
    to the module-level ``model`` with beam search. Each returned sequence is
    parsed with ``extract_relations_from_model_output`` and the relations are
    added to the knowledge base, tagged with the window they came from.

    Args:
        text: Input passage.
        span_length: Window width in tokens. Must be positive.
        verbose: When True, print the token count, the number of spans and
            the span boundaries.

    Returns:
        A ``KB`` whose relations carry ``meta["spans"]``, the list of
        ``[start, end]`` token windows in which the relation was generated.

    Raises:
        ValueError: If ``span_length`` is not positive.
    """
    if span_length <= 0:
        raise ValueError(f"span_length must be positive, got {span_length}")

    # tokenize whole text
    inputs = tokenizer([text], return_tensors="pt")

    # compute span boundaries
    num_tokens = len(inputs["input_ids"][0])
    if verbose:
        print(f"Input has {num_tokens} tokens")
    num_spans = math.ceil(num_tokens / span_length)
    if verbose:
        print(f"Input has {num_spans} spans")
    spans_boundaries = _compute_span_boundaries(num_tokens, span_length,
                                                num_spans)
    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    # transform input with spans
    # NOTE(review): windows are raw slices of the token ids, so interior
    # windows do not begin/end with the tokenizer's special tokens.
    tensor_ids = [inputs["input_ids"][0][boundary[0]:boundary[1]]
                  for boundary in spans_boundaries]
    tensor_masks = [inputs["attention_mask"][0][boundary[0]:boundary[1]]
                    for boundary in spans_boundaries]
    inputs = {
        "input_ids": torch.stack(tensor_ids),
        "attention_mask": torch.stack(tensor_masks)
    }

    # generate relations
    num_return_sequences = 3
    gen_kwargs = {
        "max_length": 256,
        "length_penalty": 0,
        "num_beams": 3,
        "num_return_sequences": num_return_sequences
    }
    generated_tokens = model.generate(
        **inputs,
        **gen_kwargs,
    )

    # decode relations; special tokens are kept because the parser relies on
    # the <triplet>/<subj>/<obj> markers
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    # create kb
    kb = KB()
    for i, sentence_pred in enumerate(decoded_preds):
        # assumes generate() returns num_return_sequences consecutive rows
        # per input window, in window order
        current_span_index = i // num_return_sequences
        relations = extract_relations_from_model_output(sentence_pred)
        for relation in relations:
            relation["meta"] = {
                "spans": [spans_boundaries[current_span_index]]
            }
            kb.add_relation(relation)

    return kb

## Bonds

In [42]:
# Sample passage about bonds used as input to the extraction pipeline.
text = """
Bonds are a basic form of investment that typically include a straightforward financial agreement between issuer and purchaser. Nevertheless, the terminology surrounding bonds is unique and rather extensive. Much of the specialized vocabulary surrounding bonds is designed to convey the concept that a bond is similar to other financial instruments in that it is an investment that can be bought and sold. Much of this unique terminology will be covered later in this chapter, but we can set out some of the basics here with an example.
"""

# verbose=True also prints the token count, span count and span boundaries.
kb = from_text_to_kb(text, verbose=True)
kb.print()

Input has 98 tokens
Input has 1 spans
Span boundaries are [[0, 128]]
Relations:
  {'head': 'Bond', 'type': 'subclass of', 'tail': 'investment', 'meta': {'spans': [[0, 128]]}}
  {'head': 'Investment', 'type': 'subclass of', 'tail': 'financial instrument', 'meta': {'spans': [[0, 128]]}}
  {'head': 'bond', 'type': 'subclass of', 'tail': 'financial instrument', 'meta': {'spans': [[0, 128]]}}


## Corporate Structure

In [None]:
# Sample passage about investor relations; note that `text` and `kb` are
# reassigned by every example cell, so only the latest result is kept.
text = """
Within the general field of corporate public relations is a specific subdivision referred to as investor relations (IR). IR involves elements of communication, marketing, and finance and is designed to control the flow of information from the management of a public corporation to its investors and stakeholders. Because the investment community plays such a critical role in the overall growth and success of any corporation, it is imperative that firms maintain strong and open relationships with their shareholder or potential investor audience. IR was developed to take responsibility for achieving and maintaining these crucial relationships.
"""

kb = from_text_to_kb(text, verbose=True)
kb.print()

Input has 107 tokens
Input has 1 spans
Span boundaries are [[0, 128]]
Relations:
  {'head': 'marketing', 'type': 'subclass of', 'tail': 'communication', 'meta': {'spans': [[0, 128]]}}
  {'head': 'marketing', 'type': 'facet of', 'tail': 'communication', 'meta': {'spans': [[0, 128]]}}
  {'head': 'investor relations', 'type': 'subclass of', 'tail': 'corporate public relations', 'meta': {'spans': [[0, 128]]}}


## Economics

In [None]:
# Sample passage about interest rates. In the recorded run it tokenizes to
# more than the default span_length of 128, so it is processed as two
# overlapping spans.
text = """
An interest rate is the rental price of money. The concepts of supply, demand and equilibrium apply in this market just as they do in other markets. This market is referred to as the market for loanable funds. In the market for loanable funds, the suppliers of funds are economic entities that currently have a surplus in their budget. In other words, they have more income than they currently want to spend; they would like to save some of their money and spend it in future time periods. Instead of just putting these savings in a box on a shelf for safekeeping until they want to spend it, they can let someone else borrow that money. In essence, they are renting that money to someone else, who pays a rental price called the interest rate.
"""

kb = from_text_to_kb(text, verbose=True)
kb.print()

Input has 156 tokens
Input has 2 spans
Span boundaries are [[0, 128], [28, 156]]
Relations:
  {'head': 'supply', 'type': 'opposite of', 'tail': 'demand', 'meta': {'spans': [[0, 128]]}}
  {'head': 'equilibrium', 'type': 'has part', 'tail': 'supply', 'meta': {'spans': [[0, 128]]}}
  {'head': 'equilibrium', 'type': 'has part', 'tail': 'demand', 'meta': {'spans': [[0, 128]]}}
  {'head': 'budget', 'type': 'subclass of', 'tail': 'income', 'meta': {'spans': [[28, 156]]}}
  {'head': 'surplus', 'type': 'subclass of', 'tail': 'income', 'meta': {'spans': [[28, 156]]}}
  {'head': 'budget', 'type': 'facet of', 'tail': 'economic entities', 'meta': {'spans': [[28, 156]]}}


## Accounting

In [None]:
# Sample passage contrasting cash-basis and accrual-basis accounting.
text = """
Cash-basis accounting can be more efficient and well-suited for certain types of businesses, such as farming or professional services provided by lawyers and doctors. However, the accrual basis of accounting is theoretically preferable to the cash basis of accounting because it takes into account the timing of the transactions (when goods and services are provided and when the cash involved in the transactions is received). Cash can often be received a significant amount of time after the initial transaction. Considering this amount allows accountants to provide, in a timely manner, relevant and complete information to stakeholders.
"""

kb = from_text_to_kb(text, verbose=True)
kb.print()

Input has 118 tokens
Input has 1 spans
Span boundaries are [[0, 128]]
Relations:
  {'head': 'accrual basis of accounting', 'type': 'opposite of', 'tail': 'Cash-basis accounting', 'meta': {'spans': [[0, 128]]}}
  {'head': 'Cash-basis accounting', 'type': 'opposite of', 'tail': 'accrual basis of accounting', 'meta': {'spans': [[0, 128]]}}
  {'head': 'accrual basis of accounting', 'type': 'opposite of', 'tail': 'cash basis of accounting', 'meta': {'spans': [[0, 128]]}}


## Financial Statement

In [None]:
# Sample passage about the four key financial statements. The text contains
# typographic quotes/apostrophes, which are passed to the tokenizer as-is.
text = """
People say that accounting is the “language of business.” Using the language of business, accountants are able to communicate the financial performance and health of a firm via four key financial statements. These statements are the income statement, balance sheet, statement of owner’s equity, and statement of cash flows. Each statement provides different insights into a firm’s performance and financial health. Though some users may favor one or two statements over another, they are best used together to get a full picture.
"""

kb = from_text_to_kb(text, verbose=True)
kb.print()

Input has 107 tokens
Input has 1 spans
Span boundaries are [[0, 128]]
Relations:
  {'head': 'income statement', 'type': 'subclass of', 'tail': 'financial statement', 'meta': {'spans': [[0, 128]]}}
  {'head': 'statement of owner’s equity', 'type': 'subclass of', 'tail': 'financial statement', 'meta': {'spans': [[0, 128]]}}
  {'head': 'statement of cash flows', 'type': 'subclass of', 'tail': 'financial statement', 'meta': {'spans': [[0, 128]]}}
  {'head': 'income statement', 'type': 'instance of', 'tail': 'financial statement', 'meta': {'spans': [[0, 128]]}}
  {'head': 'statement of owner’s equity', 'type': 'instance of', 'tail': 'financial statement', 'meta': {'spans': [[0, 128]]}}
  {'head': 'statement of cash flows', 'type': 'instance of', 'tail': 'financial statement', 'meta': {'spans': [[0, 128]]}}
  {'head': 'income statement', 'type': 'subclass of', 'tail': 'financial statements', 'meta': {'spans': [[0, 128]]}}
  {'head': 'statement of owner’s equity', 'type': 'subclass of', 'tail

## Crypto

In [36]:
# Sample passage about the identity of bitcoin's creator, processed unmodified
# (baseline for the punctuation-stripped variant in the next cell).
text = """
Most books on bitcoin feature a lengthy chapter about who Mr Nakamoto may be. Each has its own theory, often based on the same sources. Some locate him in Britain (because of his use of Britishisms, such as “bloody hard”). Others reckon he is somewhere in the eastern parts of the Americas (because of the timestamps on his e-mails). He has been variously identified as a Finnish sociologist, a Japanese mathematician and an Irish student. The names mentioned most often are Nick Szabo and Hal Finney, two American cryptographers, but the former denies it and the latter died in 2014. In March last year Newsweek, a magazine, identified a man living in California, named Dorian Satoshi Nakamoto, as the real Nakamoto—which turned out to be an embarrassing (and predictable) canard. Then there is the argument that Mr Nakamoto's bitcoin code is so good that it must have been written by more than one person.
"""

kb = from_text_to_kb(text, verbose=True)
kb.print()

Input has 203 tokens
Input has 2 spans
Span boundaries are [[0, 128], [75, 203]]
Relations:
  {'head': 'Nick Szabo', 'type': 'field of work', 'tail': 'cryptographer', 'meta': {'spans': [[0, 128], [75, 203]]}}
  {'head': 'Hal Finney', 'type': 'field of work', 'tail': 'cryptographer', 'meta': {'spans': [[0, 128], [75, 203]]}}
  {'head': 'Nick Szabo', 'type': 'field of work', 'tail': 'cryptographers', 'meta': {'spans': [[0, 128]]}}
  {'head': 'Hal Finney', 'type': 'field of work', 'tail': 'cryptographers', 'meta': {'spans': [[0, 128]]}}
  {'head': 'Hal Finney', 'type': 'date of death', 'tail': '2014', 'meta': {'spans': [[75, 203]]}}


In [35]:
# Same passage as the previous example; here it is stripped of punctuation
# before extraction to compare the resulting relations.
text = """
Most books on bitcoin feature a lengthy chapter about who Mr Nakamoto may be. Each has its own theory, often based on the same sources. Some locate him in Britain (because of his use of Britishisms, such as “bloody hard”). Others reckon he is somewhere in the eastern parts of the Americas (because of the timestamps on his e-mails). He has been variously identified as a Finnish sociologist, a Japanese mathematician and an Irish student. The names mentioned most often are Nick Szabo and Hal Finney, two American cryptographers, but the former denies it and the latter died in 2014. In March last year Newsweek, a magazine, identified a man living in California, named Dorian Satoshi Nakamoto, as the real Nakamoto—which turned out to be an embarrassing (and predictable) canard. Then there is the argument that Mr Nakamoto's bitcoin code is so good that it must have been written by more than one person.
"""

## Pre-Processing
# NOTE(review): this import sits mid-notebook; it conventionally belongs with
# the other imports in the import cell at the top.
import string

# Delete every character listed in string.punctuation. That constant only
# contains ASCII punctuation, so the typographic quotes and the em dash in the
# passage are kept, while sentence-ending periods, commas, parentheses, the
# hyphen in "e-mails" and the apostrophe in "Nakamoto's" are removed.
text_punc_removed = text.translate(str.maketrans('', '', string.punctuation))

kb = from_text_to_kb(text_punc_removed, verbose=True)
kb.print()

Input has 180 tokens
Input has 2 spans
Span boundaries are [[0, 128], [52, 180]]
Relations:
  {'head': 'Nick Szabo', 'type': 'date of death', 'tail': '2014', 'meta': {'spans': [[0, 128], [52, 180]]}}
  {'head': 'Hal Finney', 'type': 'date of death', 'tail': '2014', 'meta': {'spans': [[0, 128], [52, 180]]}}
  {'head': 'Nick Szabo', 'type': 'field of work', 'tail': 'American cryptographers', 'meta': {'spans': [[52, 180]]}}
