In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 5.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 38.1 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 49.8 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [3]:
import itertools
import math

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

In [4]:
# Load model and tokenizer.
# Both objects must come from the same checkpoint, so the id is named once
# instead of being repeated as a string literal.
REBEL_CHECKPOINT = "Babelscape/rebel-large"
tokenizer = AutoTokenizer.from_pretrained(REBEL_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(REBEL_CHECKPOINT)

Downloading:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/123 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/344 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

In [5]:
def extract_relations_from_model_output(text):
    """Parse a decoded model output string into a list of relation dicts.

    The string is read as a stream of whitespace-separated tokens driven by
    three markers:

    * ``<triplet>`` -- the tokens that follow form the head entity,
    * ``<subj>``    -- the tokens that follow form the tail entity,
    * ``<obj>``     -- the tokens that follow form the relation type.

    A head can be followed by several ``<subj> ... <obj> ...`` pairs; each
    pair yields one relation sharing that head.

    Args:
        text: Decoded generation output. ``<s>``, ``<pad>`` and ``</s>`` are
            removed before parsing.

    Returns:
        A list of ``{'head': str, 'type': str, 'tail': str}`` dicts in the
        order they appear in ``text``. Empty when nothing can be parsed.
    """
    relations = []
    subject, relation, object_ = '', '', ''
    current = 'x'  # 'x' = no marker seen yet, so leading tokens are ignored

    cleaned = text.strip()
    for special in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(special, "")
    # Pad every marker with spaces so it is split off as its own token even
    # when the decoder emitted it glued to a neighbouring word; without this
    # "<triplet>Foo" would be treated as ordinary text and silently dropped.
    for marker in ("<triplet>", "<subj>", "<obj>"):
        cleaned = cleaned.replace(marker, f" {marker} ")

    def flush():
        # Record the relation accumulated so far.
        relations.append({
            'head': subject.strip(),
            'type': relation.strip(),
            'tail': object_.strip()
        })

    for token in cleaned.split():
        if token == "<triplet>":
            # A new head starts: close the pending relation, if any.
            current = 't'
            if relation != '':
                flush()
                relation = ''
            subject = ''
        elif token == "<subj>":
            # Another tail for the same head: close the pending relation.
            current = 's'
            if relation != '':
                flush()
            object_ = ''
        elif token == "<obj>":
            current = 'o'
            relation = ''
        else:
            if current == 't':
                subject += ' ' + token
            elif current == 's':
                object_ += ' ' + token
            elif current == 'o':
                relation += ' ' + token

    # The last relation has no closing marker, so emit it here if complete.
    if subject != '' and relation != '' and object_ != '':
        flush()
    return relations

In [6]:
class KB():
    """A small in-memory knowledge base of relation dicts.

    Each stored relation is a dict with the keys ``head``, ``type`` and
    ``tail``; merging and printing additionally read ``meta["spans"]``.
    Two relations are the same entry when head, type and tail all match.
    """

    def __init__(self):
        # Stored relations, in insertion order.
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when ``r1`` and ``r2`` share head, type and tail."""
        for key in ("head", "type", "tail"):
            if r1[key] != r2[key]:
                return False
        return True

    def exists_relation(self, r1):
        """Return True when a relation equal to ``r1`` is already stored."""
        for known in self.relations:
            if self.are_relations_equal(r1, known):
                return True
        return False

    def merge_relations(self, r1):
        """Add the spans of ``r1`` to the first stored relation equal to it.

        Only spans the stored relation does not already list are appended.
        Raises ``IndexError`` when no equal relation is stored.
        """
        matching = list(filter(
            lambda known: self.are_relations_equal(r1, known),
            self.relations,
        ))
        existing = matching[0]
        new_spans = []
        for span in r1["meta"]["spans"]:
            if span not in existing["meta"]["spans"]:
                new_spans.append(span)
        existing["meta"]["spans"] += new_spans

    def add_relation(self, r):
        """Store ``r``, or merge its spans into the equal stored relation."""
        if self.exists_relation(r):
            self.merge_relations(r)
            return
        self.relations.append(r)

    def print(self):
        """Print one line per stored relation: the triple and its spans."""
        for rel in self.relations:
            triple = (rel['head'], rel['type'], rel['tail'])
            print(f"  {triple} spans:{rel['meta']['spans']}")

In [7]:
def _compute_span_boundaries(num_tokens, span_length):
    """Return ``[start, end]`` token windows of width ``span_length``.

    With more than one window, the starts are spread evenly so that the first
    window begins at 0 and the last one ends exactly at ``num_tokens``. Every
    window then has the same length and no trailing token is left out.
    Neighbouring windows overlap whenever ``num_tokens`` is not a multiple of
    ``span_length``. A single window is reported as ``[0, span_length]`` even
    when the input is shorter.
    """
    num_spans = math.ceil(num_tokens / span_length)
    if num_spans <= 1:
        # One window (or none for an empty input).
        return [[0, span_length] for _ in range(num_spans)]
    last_start = num_tokens - span_length
    boundaries = []
    for i in range(num_spans):
        # Integer arithmetic keeps the final start at exactly last_start.
        start = (i * last_start) // (num_spans - 1)
        boundaries.append([start, start + span_length])
    return boundaries


def from_text_to_kb(text, gen_kwargs, span_length=128, verbose=False):
    """Extract relations from ``text`` and collect them in a ``KB``.

    The text is tokenized once with the module-level ``tokenizer``, cut into
    equally sized, possibly overlapping token windows, and all windows are
    passed to the module-level ``model`` in a single ``generate`` call. Each
    decoded sequence is parsed into relations, and every relation is tagged
    with the boundaries of the window it came from.

    Args:
        text: Input text.
        gen_kwargs: Keyword arguments forwarded to ``model.generate``.
            ``num_return_sequences`` is also read here to map each generated
            sequence back to its window; it is taken as 1 when absent.
        span_length: Window width in tokens.
        verbose: When true, print the token count, window count and window
            boundaries.

    Returns:
        A ``KB`` whose relations carry ``meta["spans"]``, the list of
        ``[start, end]`` windows in which the relation was found.
    """
    # tokenize whole text
    inputs = tokenizer([text], return_tensors="pt")
    input_ids = inputs["input_ids"][0]
    attention_mask = inputs["attention_mask"][0]

    # compute span boundaries
    num_tokens = len(input_ids)
    if verbose:
        print(f"Input has {num_tokens} tokens")
    spans_boundaries = _compute_span_boundaries(num_tokens, span_length)
    if verbose:
        print(f"Input has {len(spans_boundaries)} spans")
        print(f"Span boundaries are {spans_boundaries}")

    # transform input with spans: one batch row per window
    batch = {
        "input_ids": torch.stack(
            [input_ids[start:end] for start, end in spans_boundaries]),
        "attention_mask": torch.stack(
            [attention_mask[start:end] for start, end in spans_boundaries]),
    }

    # generate relations
    # Fall back to 1 so callers relying on the default do not hit a KeyError.
    num_return_sequences = gen_kwargs.get("num_return_sequences") or 1
    generated_tokens = model.generate(
        **batch,
        **gen_kwargs,
    )

    # decode relations (special tokens are kept: the parser needs the markers)
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    # create kb
    kb = KB()
    for pred_index, sentence_pred in enumerate(decoded_preds):
        # The index arithmetic assumes the sequences generated for one window
        # are returned next to each other, in window order.
        span = spans_boundaries[pred_index // num_return_sequences]
        for relation in extract_relations_from_model_output(sentence_pred):
            relation["meta"] = {
                "spans": [span]
            }
            kb.add_relation(relation)

    return kb

## Hyperparameter Tuning

In [11]:
# Sample passages used as input for the tuning runs, keyed by a short topic
# label. Each value is passed unchanged to from_text_to_kb, so the text is
# kept exactly as collected.
documents = {
    'corporate_structure': 'Within the general field of corporate public relations is a specific subdivision referred to as investor relations (IR). IR involves elements of communication, marketing, and finance and is designed to control the flow of information from the management of a public corporation to its investors and stakeholders. Because the investment community plays such a critical role in the overall growth and success of any corporation, it is imperative that firms maintain strong and open relationships with their shareholder or potential investor audience. IR was developed to take responsibility for achieving and maintaining these crucial relationships.',
    'economics': 'An interest rate is the rental price of money. The concepts of supply, demand and equilibrium apply in this market just as they do in other markets. This market is referred to as the market for loanable funds. In the market for loanable funds, the suppliers of funds are economic entities that currently have a surplus in their budget. In other words, they have more income than they currently want to spend; they would like to save some of their money and spend it in future time periods. Instead of just putting these savings in a box on a shelf for safekeeping until they want to spend it, they can let someone else borrow that money. In essence, they are renting that money to someone else, who pays a rental price called the interest rate.',
    'accounting': 'Cash-basis accounting can be more efficient and well-suited for certain types of businesses, such as farming or professional services provided by lawyers and doctors. However, the accrual basis of accounting is theoretically preferable to the cash basis of accounting because it takes into account the timing of the transactions (when goods and services are provided and when the cash involved in the transactions is received). Cash can often be received a significant amount of time after the initial transaction. Considering this amount allows accountants to provide, in a timely manner, relevant and complete information to stakeholders.',
    'financial_statement': 'People say that accounting is the “language of business.” Using the language of business, accountants are able to communicate the financial performance and health of a firm via four key financial statements. These statements are the income statement, balance sheet, statement of owner’s equity, and statement of cash flows. Each statement provides different insights into a firm’s performance and financial health. Though some users may favor one or two statements over another, they are best used together to get a full picture.',
    'crypto': '''Most books on bitcoin feature a lengthy chapter about who Mr Nakamoto may be. Each has its own theory, often based on the same sources. Some locate him in Britain (because of his use of Britishisms, such as “bloody hard”). Others reckon he is somewhere in the eastern parts of the Americas (because of the timestamps on his e-mails). He has been variously identified as a Finnish sociologist, a Japanese mathematician and an Irish student. The names mentioned most often are Nick Szabo and Hal Finney, two American cryptographers, but the former denies it and the latter died in 2014. In March last year Newsweek, a magazine, identified a man living in California, named Dorian Satoshi Nakamoto, as the real Nakamoto—which turned out to be an embarrassing (and predictable) canard. Then there is the argument that Mr Nakamoto's bitcoin code is so good that it must have been written by more than one person.''',
    'jpmc': 'JPMorgan Chase & Co. is an American multinational investment bank and financial services holding company headquartered in New York City and incorporated in Delaware. As of 2022, JPMorgan Chase is the largest bank in the United States, the world’s largest bank by market capitalization, and the fifth largest bank in the world in terms of total assets with total assets if US$3.954 trillion. Additionally, JPMorgan Chase is ranked 24th on the Fortune 500 list of the largest United States corporations by total revenue.',
    'google': 'Sundar Pichai, CEO of Alphabet and Google, said: “In the second quarter of 2022 our performance was driven by Search and Cloud. The investments we’ve made over the years in AI and computing are helping to make our services particularly valuable for consumers, and highly effective for businesses of all sizes. As we sharpen our focus, we’ll continue to invest responsibly in deep computer science for the long-term.” Ruth Porat, CFO of Alphabet and Google, said: “Our consistent investments to support long-term growth are reflected in our solid performance in the second quarter of 2022, with revenues of $69.7 billion in the quarter, up 13% versus last year or 16% on a constant currency basis.”',
    'apple': '''Who Owns the Most Apple Stock?The biggest individual insider shareholder of Apple is Arthur Levinson, who has been the company's chair of the board since 2011. As of Feb. 2, 2021, Levinson owns 4.5 million shares of Apple stock.The biggest institutional shareholder of Apple is The Vanguard Group, which owns 1.3 billion shares, representing 7.8% of total shares outstanding, according to the company's most recent proxy filing, reflecting the number of shares as of Jan. 5, 2021.'''
}

# Hyperparameter grid: every combination of the values below is run on each
# document. Add values to a list to widen the search.
BEAM_SIZES = [5]
LENGTH_PENALTIES = [5]
NUM_RETURN_SEQUENCES = [1]
SPAN_LENGTHS = [128]

for doc_key, doc_text in documents.items():
    print(f"{doc_key}")
    # product() iterates in the same order as nested loops would, with the
    # last grid varying fastest.
    for beam_size, length_penalty, num_return_sequences, span_length in itertools.product(
            BEAM_SIZES, LENGTH_PENALTIES, NUM_RETURN_SEQUENCES, SPAN_LENGTHS):
        print(f" beam_size: {beam_size}, length_penalty: {length_penalty}, num_return_sequences: {num_return_sequences}")
        gen_kwargs = {
            "max_length": 256,
            "length_penalty": length_penalty,
            "num_beams": beam_size,
            "num_return_sequences": num_return_sequences
        }
        kb = from_text_to_kb(doc_text, gen_kwargs, span_length)
        kb.print()


corporate_structure
 beam_size: 5, length_penalty: 5, num_return_sequences: 1
  ('investor relations', 'subclass of', 'corporate public relations') spans:[[0, 128]]
  ('marketing', 'subclass of', 'communication') spans:[[0, 128]]
  ('shareholder', 'subclass of', 'stakeholders') spans:[[0, 128]]
  ('potential investor', 'subclass of', 'investors') spans:[[0, 128]]
  ('stakeholders', 'subclass of', 'communication') spans:[[0, 128]]
  ('stakeholders', 'has part', 'potential investor') spans:[[0, 128]]
  ('IR', 'subclass of', 'corporate public relations') spans:[[0, 128]]
  ('IR', 'subclass of', 'investor relations') spans:[[0, 128]]
  ('shareholder', 'subclass of', 'investors') spans:[[0, 128]]
  ('IR', 'subclass of', 'investor') spans:[[0, 128]]
economics
 beam_size: 5, length_penalty: 5, num_return_sequences: 1
  ('supply', 'opposite of', 'demand') spans:[[0, 128]]
  ('equilibrium', 'has part', 'supply') spans:[[0, 128]]
  ('equilibrium', 'has part', 'demand') spans:[[0, 128]]
  ('marke