# 🚞 Zero-shot RE Training

In [1]:
# Reload imported modules before each cell executes, so edits to library
# source files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# # If you're running this in a Colab notebook, uncomment and run this cell to install the necessary dependencies
# pip install glirel
# !python -m spacy download en_core_web_sm

In [3]:
from glirel import GLiREL

# Local checkpoint directory (relative to the current working directory);
# presumably written by a training run -- adjust to your own checkpoint.
save_path = 'logs/zero_rel/zero_rel-2024-06-10__23-00-15/model_75000'
model = GLiREL.from_pretrained(save_path)
# Alternative: load by model identifier instead of a local path.
# model = GLiREL.from_pretrained('jackboyla/glirel_beta')

config.json not found in /home/jackboylan/GLiREL/logs/zero_rel/zero_rel-2024-06-10__23-00-15/model_75000


# Inference

To run inference, the model needs the tokenized text (`tokens`), the entity spans (`ner`), and a list of zero-shot relation `labels`.

### Eval data

In [4]:
import json

# Each non-blank line of the JSON Lines file is parsed as one example.
# Read explicitly as UTF-8 so decoding does not depend on the platform's
# default locale encoding, and skip blank lines (e.g. a trailing newline).
with open('./data/few_rel_all.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

i = 0  # index of the example to inspect

tokens = data[i]['tokenized_text']
ner = data[i]['ner']
# De-duplicate the example's relation labels while keeping first-seen order,
# so the label list is the same on every run (the iteration order of a set of
# strings can change between interpreter sessions).
labels = list(dict.fromkeys(r['relation_text'] for r in data[i]['relations']))
print(tokens)
print()
print(ner)
print(labels)

['Derren', 'Nesbitt', 'had', 'a', 'history', 'of', 'being', 'cast', 'in', '"', 'Doctor', 'Who', '"', ',', 'having', 'played', 'villainous', 'warlord', 'Tegana', 'in', 'the', '1964', 'First', 'Doctor', 'serial', '"', 'Marco', 'Polo', '"', '.']

[[26, 27, 'Q2989881', 'Marco Polo'], [22, 23, 'Q2989412', 'First Doctor']]
['characters']


In [5]:
# Additional candidate labels placed in front of the example's own labels.
# The list is rebuilt so that re-running this cell does not keep prepending
# the same labels, and a label present in both lists is not passed twice.
extra_labels = ['country of origin', 'licensed to broadcast to', 'father', 'followed by']
labels = extra_labels + [label for label in labels if label not in extra_labels]
print(labels)

['country of origin', 'licensed to broadcast to', 'father', 'followed by', 'characters']


In [10]:
# Score the candidate labels for the example's entity pairs.
relations = model.predict_relations(
    tokens,
    labels,
    threshold=0.0,
    ner=ner,
)

# Expected count: num entity pairs (both directions) * num classes,
# restricted to predictions that are over the threshold.
print('Number of relations:', len(relations))

# Highest-scoring predictions first.
sorted_data_desc = sorted(
    relations,
    key=lambda rel: rel['score'],
    reverse=True,
)
print("\nDescending Order by Score:")
for rel in sorted_data_desc:
    print(rel)

Number of relations: 18

Descending Order by Score:
{'head_pos': [26, 28], 'tail_pos': [22, 24], 'head_text': ['Marco', 'Polo'], 'tail_text': ['First', 'Doctor'], 'label': 'no relation', 'score': 0.9923334121704102}
{'head_pos': [22, 24], 'tail_pos': [26, 28], 'head_text': ['First', 'Doctor'], 'tail_text': ['Marco', 'Polo'], 'label': 'no relation', 'score': 0.9915636777877808}
{'head_pos': [26, 28], 'tail_pos': [22, 24], 'head_text': ['Marco', 'Polo'], 'tail_text': ['First', 'Doctor'], 'label': 'followed by', 'score': 0.00021728052524849772}
{'head_pos': [22, 24], 'tail_pos': [26, 28], 'head_text': ['First', 'Doctor'], 'tail_text': ['Marco', 'Polo'], 'label': 'followed by', 'score': 0.00017046951688826084}
{'head_pos': [26, 28], 'tail_pos': [22, 24], 'head_text': ['Marco', 'Polo'], 'tail_text': ['First', 'Doctor'], 'label': 'licensed to broadcast to', 'score': 0.00016753433737903833}
{'head_pos': [26, 28], 'tail_pos': [22, 24], 'head_text': ['Marco', 'Polo'], 'tail_text': ['First', 'Do

### Real-world example

Constrain the entity types that can be associated with a relationship.
e.g:

`co-founder` can only have a head `PERSON` entity and a tail `ORG` entity.

In [7]:
# Real-world example
import spacy
from glirel.modules.utils import constrain_relations_by_entity_type

# spaCy pipeline used by `predict_and_show` for tokenization and entity detection.
nlp = spacy.load('en_core_web_sm')


text = (
    "Apple Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976. "
    "The company is headquartered in Cupertino, California."
)

# text = "Jack Dorsey's father, Tim Dorsey, is a licensed pilot. Jack met his wife Sarah Paulson in New York in 2003. They have one son, Edward."

# Relation labels keyed under "glirel_labels". Each label maps to a dict that
# may list the entity types accepted for the head ("allowed_head") and for the
# tail ("allowed_tail") of the relation; some entries omit one or both keys.
labels = {
    "glirel_labels": {
        "co-founder": {
            "allowed_head": ["PERSON"],
            "allowed_tail": ["ORG"],
        },
        "country of origin": {
            "allowed_head": ["PERSON", "ORG"],
            "allowed_tail": ["LOC", "GPE"],
        },
        "licensed to broadcast to": {
            "allowed_head": ["ORG"],
        },
        "no relation": {},
        "parent": {
            "allowed_head": ["PERSON"],
            "allowed_tail": ["PERSON"],
        },
        "followed by": {
            "allowed_head": ["PERSON", "ORG"],
            "allowed_tail": ["PERSON", "ORG"],
        },
        "located in or next to body of water": {
            "allowed_head": ["LOC", "GPE", "FAC"],
            "allowed_tail": ["LOC", "GPE"],
        },
        "spouse": {
            "allowed_head": ["PERSON"],
            "allowed_tail": ["PERSON"],
        },
        "child": {
            "allowed_head": ["PERSON"],
            "allowed_tail": ["PERSON"],
        },
        "founder": {
            "allowed_head": ["PERSON"],
            "allowed_tail": ["ORG"],
        },
        "founded on date": {
            "allowed_head": ["ORG"],
            "allowed_tail": ["DATE"],
        },
        "headquartered in": {
            "allowed_head": ["ORG"],
            "allowed_tail": ["LOC", "GPE", "FAC"],
        },
        "acquired by": {
            "allowed_head": ["ORG"],
            "allowed_tail": ["ORG", "PERSON"],
        },
        "subsidiary of": {
            "allowed_head": ["ORG"],
            "allowed_tail": ["ORG", "PERSON"],
        },
    }
}


def predict_and_show(text, labels, threshold=0.0, top_k=1):
    """Detect entities in `text`, predict relations between them, and print the results.

    Relies on the notebook-level `nlp` (spaCy pipeline) and `model` (GLiREL)
    objects being defined before the function is called.

    Args:
        text: Raw input string.
        labels: Either a list of relation label strings, or a dict of the form
            ``{"glirel_labels": {label: constraints, ...}}``. In the dict case
            the inner keys are used as the label list and the predictions are
            post-filtered with `constrain_relations_by_entity_type`.
        threshold: Forwarded as `threshold=` to `model.predict_relations`.
            Defaults to 0.0, the value that was previously hard-coded.
        top_k: Forwarded as `top_k=` to `model.predict_relations`.
            Defaults to 1, the value that was previously hard-coded.

    Returns:
        None. Everything is printed, so calling this as the last statement of
        a cell produces no extra cell output.
    """
    doc = nlp(text)
    print(f"Text: {text}")

    tokens = [token.text for token in doc]

    # NOTE: the end index should be inclusive, hence `ent.end - 1`.
    ner = [[ent.start, ent.end - 1, ent.label_, ent.text] for ent in doc.ents]
    print(f"Entities detected: {ner}")

    # Separate the plain label names from the optional per-label constraints
    # instead of rebinding the `labels` argument to several different things.
    constraints = None
    if isinstance(labels, dict):
        constraints = labels["glirel_labels"]
        label_names = list(constraints.keys())
    else:
        label_names = labels

    relations = model.predict_relations(
        tokens, label_names, threshold=threshold, ner=ner, top_k=top_k
    )

    if constraints is not None:
        print('Constraining relations by entity type')
        relations = constrain_relations_by_entity_type(doc.ents, constraints, relations)

    print('Number of relations:', len(relations))

    # Highest-scoring relations first.
    sorted_data_desc = sorted(relations, key=lambda x: x['score'], reverse=True)
    print("\nDescending Order by Score:")
    for item in sorted_data_desc:
        print(f"{item['head_text']} --> {item['label']} --> {item['tail_text']} | score: {item['score']}")

# Run the example using the label dict that carries entity-type constraints.
predict_and_show(text, labels)

Text: Apple Inc. was founded by Steve Jobs, Steve Wozniak, and Ronald Wayne in April 1976. The company is headquartered in Cupertino, California.
Entities detected: [[0, 1, 'ORG', 'Apple Inc.'], [5, 6, 'PERSON', 'Steve Jobs'], [8, 9, 'PERSON', 'Steve Wozniak'], [12, 13, 'PERSON', 'Ronald Wayne'], [15, 16, 'DATE', 'April 1976'], [23, 23, 'GPE', 'Cupertino'], [25, 25, 'GPE', 'California']]
Constraining relations by entity type
Number of relations: 6

Descending Order by Score:
['Steve', 'Wozniak'] --> founder --> ['Apple', 'Inc.'] | score: 0.7134989500045776
['Steve', 'Jobs'] --> founder --> ['Apple', 'Inc.'] | score: 0.7015261054039001
['Ronald', 'Wayne'] --> founder --> ['Apple', 'Inc.'] | score: 0.6583943963050842
['Apple', 'Inc.'] --> headquartered in --> ['California'] | score: 0.6483288407325745
['Apple', 'Inc.'] --> headquartered in --> ['Cupertino'] | score: 0.6232856512069702
['Apple', 'Inc.'] --> founded on date --> ['April', '1976'] | score: 0.42589667439460754


A simple list of relation types can also be passed, although this generally produces noisier predictions.

In [11]:
# Same helper, this time with a plain list of labels (no entity-type constraints).
text = "Jack knows Gill. They live in the same house in London. They are not related."
labels = [
    'family relation',
    'knows',
    'lives with',
    'loves',
    'licensed to broadcast to',
    'father',
    'followed by',
    'no relation',
    'lives in',
]
predict_and_show(text, labels)

Text: Jack knows Gill. They live in the same house in London. They are not related.
Entities detected: [[0, 0, 'PERSON', 'Jack'], [2, 2, 'PERSON', 'Gill'], [11, 11, 'GPE', 'London']]
Number of relations: 6

Descending Order by Score:
['Jack'] --> lives in --> ['London'] | score: 0.8882972002029419
['Gill'] --> lives in --> ['London'] | score: 0.8862284421920776
['London'] --> lives in --> ['Jack'] | score: 0.8350544571876526
['London'] --> lives in --> ['Gill'] | score: 0.818295955657959
['Jack'] --> knows --> ['Gill'] | score: 0.7788900136947632
['Gill'] --> knows --> ['Jack'] | score: 0.7585681676864624


In [9]:
# model.save_pretrained(
#     './release_model/glirel_beta', 
#     push_to_hub=True, 
#     repo_id='jackboyla/glirel_beta'
# )