# REBEL MODEL 

## Import Packages

In [None]:
import pandas as pd
import ast
import re

from tqdm import tqdm
tqdm.pandas()

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

## Triplet Extraction Pipeline - https://huggingface.co/Babelscape/rebel-large

In [None]:
def extract_triplets(text):
    """Parse one decoded REBEL output string into relation triplets.

    The string is read token by token (whitespace-separated). The markers
    ``<triplet>``, ``<subj>`` and ``<obj>`` switch which field the following
    tokens are appended to: head, tail and relation type respectively.
    Tokens seen before the first marker are ignored.

    Args:
        text: Decoded model output; ``<s>``, ``<pad>`` and ``</s>`` are
            removed before parsing.

    Returns:
        A list of dicts with the keys ``'head'``, ``'type'`` and ``'tail'``.
    """
    def as_record(head, kind, tail):
        return {'head': head.strip(), 'type': kind.strip(), 'tail': tail.strip()}

    records = []
    head = kind = tail = ''
    mode = 'x'  # no field selected yet

    stream = text.strip()
    for special in ("<s>", "<pad>", "</s>"):
        stream = stream.replace(special, "")

    for piece in stream.split():
        if piece == "<triplet>":
            # A new head starts: flush the triplet in progress, if any.
            mode = 't'
            if kind:
                records.append(as_record(head, kind, tail))
                kind = ''
            head = ''
            continue
        if piece == "<subj>":
            # Another tail for the current head: flush the previous one first.
            mode = 's'
            if kind:
                records.append(as_record(head, kind, tail))
            tail = ''
            continue
        if piece == "<obj>":
            mode = 'o'
            kind = ''
            continue

        # Ordinary token: extend whichever field the last marker selected.
        if mode == 't':
            head = f'{head} {piece}'
        elif mode == 's':
            tail = f'{tail} {piece}'
        elif mode == 'o':
            kind = f'{kind} {piece}'

    # Emit the last triplet only when all three parts were filled in.
    if head and kind and tail:
        records.append(as_record(head, kind, tail))
    return records

## Define the Model Parameters

In [None]:
# Load the pretrained REBEL tokenizer and seq2seq model by their Hugging Face hub id.
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")
# Keyword arguments forwarded to model.generate() inside triplets_list().
gen_kwargs = {
    "max_length": 256,
    "length_penalty": 0,
    "num_beams": 20,
    # Three sequences are returned per input, so triplets_list() yields three
    # candidate triplet lists for every text.
    "num_return_sequences": 3,
}

## Extract Triplets Function

In [None]:
def triplets_list(text):
    """Run REBEL on ``text`` and parse every returned sequence into triplets.

    Args:
        text: Passage to extract relations from. The tokenizer truncates the
            input to 256 tokens. ``None`` and NaN are treated as missing.

    Returns:
        A list with one entry per sequence returned by ``model.generate``
        (the count is set by ``num_return_sequences`` in ``gen_kwargs``).
        Each entry is the list of ``{'head', 'type', 'tail'}`` dicts produced
        by ``extract_triplets``. Missing input yields ``[]``.
    """
    # DataFrame.explode() turns empty lists into NaN; without this guard a
    # single missing cell would abort an hours-long progress_apply run inside
    # the tokenizer.
    if text is None or (isinstance(text, float) and text != text):
        return []

    model_inputs = tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors='pt')

    generated_tokens = model.generate(
        model_inputs["input_ids"].to(model.device),
        attention_mask=model_inputs["attention_mask"].to(model.device),
        **gen_kwargs,
    )

    # Special tokens are kept on purpose: extract_triplets() relies on the
    # <triplet>/<subj>/<obj> markers to split the output.
    decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

    return [extract_triplets(prediction) for prediction in decoded_preds]

In [None]:
# #Text to extract triplets from
# text = cleaned_sentences.iloc[10,4]

# # Tokenizer text
# model_inputs = tokenizer(text, max_length=256, padding=True, truncation=True, return_tensors = 'pt')

# # Generate
# generated_tokens = model.generate(
#     model_inputs["input_ids"].to(model.device),
#     attention_mask=model_inputs["attention_mask"].to(model.device),
#     **gen_kwargs,
# )

# # Extract text
# decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=False)

# # Extract triplets
# for idx, sentence in enumerate(decoded_preds):
#     print(f'Prediction triplets sentence {idx}')
#     print(extract_triplets(sentence))

In [None]:
# Smoke test on a single cleaned sentence (row 10, column 4 = 'coreference_sentence_clean').
# NOTE: `cleaned_sentences` is only built further down in this notebook, so
# those cells must have been run before this one.
triplets_list(cleaned_sentences.iloc[10,4])

[[{'head': 'lease', 'type': 'subclass of', 'tail': 'rental'}],
 [{'head': 'lease', 'type': 'subclass of', 'tail': 'usufruct'}],
 [{'head': 'lease', 'type': 'subclass of', 'tail': 'lease'}]]

In [None]:
# Preview the sentence-level frame (it is created further down in this notebook).
cleaned_sentences.head()

Unnamed: 0,Title,Articles,coreference_list_ready,coreference_paragraph_clean,coreference_sentence_clean,coreference_sentence
0,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 1 Subject matter,this regulation lays down the rules for the es...,this regulation lays down the rules for the es...,this regulation lays down the rules for the es...,this regulation lays down the rules for the es...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,‘applicant’ means a natural person or an entit...,applicant means a natural person or an entity ...,applicant means a natural person or an entity ...,‘applicant’ means a natural person or an entit...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,"‘application document’ means a tender, a reque...",application document means a tender a request ...,application document means a tender a request ...,"‘application document’ means a tender, a reque..."
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,‘award procedure’ means a procurement procedur...,award procedure means a procurement procedure ...,award procedure means a procurement procedure ...,‘award procedure’ means a procurement procedur...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,"‘basic act’ means a legal act, other than a re...",basic act means a legal act other than a recom...,basic act means a legal act other than a recom...,"‘basic act’ means a legal act, other than a re..."


# Apply REBEL to Extract Triplets from the Coreferenced Text

- First, apply REBEL to paragraphs
- Second, apply REBEL to sentences

In [None]:
# Load the coreference-resolved articles.
# NOTE(review): the absolute '/content/...' path looks like a Google Colab
# working directory; adjust it when running elsewhere.
coreferenced_text = pd.read_csv(r'/content/coreferenced_final_dataframe.csv')

In [None]:
def convert_to_list(column):
    """Parse a Series of stringified Python literals back into objects.

    Values that are already lists are returned unchanged, so re-running the
    conversion on an already-parsed column is a no-op instead of an error.

    Args:
        column: pandas Series whose values are list literals stored as text
            (for example ``"['a', 'b']"``) or already-parsed lists.

    Returns:
        A Series of the same length with every string replaced by the result
        of ``ast.literal_eval``.

    Raises:
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when a
            value is not a valid Python literal.
    """
    def parse(value):
        # literal_eval rejects list objects, so pass parsed values through.
        return value if isinstance(value, list) else ast.literal_eval(value)

    return column.apply(parse)

# These columns hold lists that were presumably serialised as text in the CSV;
# turn them back into real Python lists so they can be exploded later.
coreferenced_text['Content'] = convert_to_list(coreferenced_text['Content'])
coreferenced_text['coreference_list_ready'] = convert_to_list(coreferenced_text['coreference_list_ready'])

In [None]:
# Preview the loaded frame after the list columns have been parsed.
coreferenced_text.head()

Unnamed: 0,Title,Articles,Content,Cleaned_Content,whole_text,coreference_text,coreference_list_ready
0,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 1 Subject matter,[\nThis Regulation lays down the rules for the...,['this regulation lays down the rules for the ...,this regulation lays down the rules for the es...,this regulation lays down the rules for the es...,[this regulation lays down the rules for the e...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,[ ‘applicant’ means a natural person or an ent...,['‘applicant’ means a natural person or an ent...,‘applicant’ means a natural person or an entit...,‘applicant’ means a natural person or an entit...,[‘applicant’ means a natural person or an enti...
2,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 3 Compliance of secondary legislation...,[ \nProvisions concerning the implementation o...,['provisions concerning the implementation of ...,provisions concerning the implementation of th...,provisions concerning the implementation of th...,[provisions concerning the implementation of t...
3,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...","Article 4 Periods, dates and time limits",[\nUnless otherwise provided in this Regulatio...,['unless otherwise provided in this regulation...,"unless otherwise provided in this regulation, ...","unless otherwise provided in this regulation, ...","[unless otherwise provided in this regulation,..."
4,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 5 Protection of personal data,[\nThis Regulation is without prejudice to Reg...,['this regulation is without prejudice to regu...,this regulation is without prejudice to regula...,this regulation is without prejudice to regula...,[this regulation is without prejudice to regul...


In [None]:
# Keep only the columns used for paragraph-level extraction.
coreferenced_text_paragraph = coreferenced_text[['Title','Articles','coreference_list_ready']]

In [None]:
# Preview the reduced frame (one row per article at this point).
coreferenced_text_paragraph.head()

Unnamed: 0,Title,Articles,coreference_list_ready
0,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 1 Subject matter,[this regulation lays down the rules for the e...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,[‘applicant’ means a natural person or an enti...
2,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 3 Compliance of secondary legislation...,[provisions concerning the implementation of t...
3,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...","Article 4 Periods, dates and time limits","[unless otherwise provided in this regulation,..."
4,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 5 Protection of personal data,[this regulation is without prejudice to regul...


## Remove Punctuation From each Paragraph of each Article

In [None]:
def remove_punct(text):
    """Strip enumeration markers and selected punctuation, then tidy whitespace.

    Steps, in order:
      1. Drop single-letter enumeration markers such as `` (a)`` (one
         whitespace character followed by a lowercase letter in parentheses).
      2. Delete every character listed in ``punctuation`` below. Full stops,
         colons and parentheses are not in that set, so ``'.'`` remains
         available as a sentence boundary for later splitting.
      3. Collapse newlines and runs of whitespace into a single space. They
         are not deleted outright, because that would fuse neighbouring
         words (``"foo  bar"`` must become ``"foo bar"``, not ``"foobar"``).

    Args:
        text: The paragraph to clean.

    Returns:
        The cleaned text with leading and trailing whitespace removed.
    """
    punctuation = '!@#$%^&*_-+={}[];"\'|<>,?/~`’‘'
    text = re.sub(r'\s\([a-z]\)', '', text)
    # One C-level pass instead of rebuilding the string character by character.
    text = text.translate(str.maketrans('', '', punctuation))
    # Done after punctuation removal so gaps left by deleted characters
    # (e.g. "a , b") are normalised as well.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# One row per paragraph: explode() emits a row for every element of the list
# column and repeats Title/Articles (and the index label) for each of them.
coreferenced_text_paragraph = coreferenced_text_paragraph.explode('coreference_list_ready')
# Store the cleaned paragraph next to the original text.
coreferenced_text_paragraph['coreference_paragraph_clean'] = coreferenced_text_paragraph['coreference_list_ready'].apply(remove_punct)

In [None]:
# Preview the exploded frame; index labels now repeat within an article.
coreferenced_text_paragraph.head()

Unnamed: 0,Title,Articles,coreference_list_ready,coreference_paragraph_clean
0,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 1 Subject matter,this regulation lays down the rules for the es...,this regulation lays down the rules for the es...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,‘applicant’ means a natural person or an entit...,applicant means a natural person or an entity ...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,"‘application document’ means a tender, a reque...",application document means a tender a request ...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,‘award procedure’ means a procurement procedur...,award procedure means a procurement procedure ...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,"‘basic act’ means a legal act, other than a re...",basic act means a legal act other than a recom...


## Apply REBEL to the Paragraphs of each Article

In [None]:
# Run REBEL once per cleaned paragraph. progress_apply is the tqdm-instrumented
# apply registered by tqdm.pandas(); the recorded run below took about 1h19m
# for 1106 paragraphs.
coreferenced_text_paragraph["triplets_clean_parag"] = coreferenced_text_paragraph["coreference_paragraph_clean"].progress_apply(triplets_list)

100%|██████████| 1106/1106 [1:18:44<00:00,  4.27s/it]


In [None]:
# Show the first cleaned paragraph (column 3 = 'coreference_paragraph_clean').
coreferenced_text_paragraph.iloc[0,3]

'this regulation lays down the rules for the establishment and the implementation of the general budget of the european union and of the european atomic energy community (the budget) and the presentation and auditing of the european union and of the european atomic energy community accounts.'

In [None]:
# Spot-check the extraction on that first cleaned paragraph.
triplets_list(coreferenced_text_paragraph.iloc[0,3])

[[{'head': 'european atomic energy community',
   'type': 'part of',
   'tail': 'european union'}],
 [{'head': 'european atomic energy community',
   'type': 'parent organization',
   'tail': 'european union'}],
 [{'head': 'european atomic energy community accounts',
   'type': 'country',
   'tail': 'european union'}]]

In [None]:
# Display the paragraph-level frame including the new triplet column.
coreferenced_text_paragraph

Unnamed: 0,Title,Articles,coreference_list_ready,coreference_paragraph_clean,triplets_clean_parag
0,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 1 Subject matter,this regulation lays down the rules for the es...,this regulation lays down the rules for the es...,"[[{'head': 'european atomic energy community',..."
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,‘applicant’ means a natural person or an entit...,applicant means a natural person or an entity ...,"[[{'head': 'natural person', 'type': 'subclass..."
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,"‘application document’ means a tender, a reque...",application document means a tender a request ...,"[[{'head': 'grant application', 'type': 'subcl..."
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,‘award procedure’ means a procurement procedur...,award procedure means a procurement procedure ...,"[[{'head': 'procurement', 'type': 'part of', '..."
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,"‘basic act’ means a legal act, other than a re...",basic act means a legal act other than a recom...,"[[{'head': 'euratom treaty', 'type': 'followed..."
...,...,...,...,...,...
280,TITLE XVI \nINFORMATION REQUESTS AND DELEGATED...,Article 281 Repeal,"without prejudice to article 279(3), the commi...",without prejudice to article 279(3) the commis...,[[{'head': 'delegated regulation(eu) no 126820...
280,TITLE XVI \nINFORMATION REQUESTS AND DELEGATED...,Article 281 Repeal,references to the repealed regulation shall be...,references to the repealed regulation shall be...,"[[{'head': 'correlation table', 'type': 'part ..."
281,TITLE XVI \nINFORMATION REQUESTS AND DELEGATED...,Article 282 Entry into force and application,this regulation shall enter into force on the ...,this regulation shall enter into force on the ...,"[[{'head': 'official journal', 'type': 'publis..."
281,TITLE XVI \nINFORMATION REQUESTS AND DELEGATED...,Article 282 Entry into force and application,this regulation shall apply from 2 august 2018.,this regulation shall apply from 2 august 2018.,"[[{'head': '2 august 2018', 'type': 'point in ..."


## Save as CSV - REBEL_triples_per_paragraph.csv

In [None]:
# Persist the paragraph-level results. index=False leaves the index (which
# repeats after explode) out of the file.
coreferenced_text_paragraph.to_csv('REBEL_triples_per_paragraph.csv', index=False)

## Apply REBEL to Clean and Not Clean Sentences


In [None]:
# Take an explicit copy: new columns are assigned below, and doing that on a
# bare column selection of `coreferenced_text_paragraph` triggers pandas'
# SettingWithCopyWarning because the selection may be treated as a slice.
coreferenced_text_paragraph_sent = coreferenced_text_paragraph[['Title','Articles','coreference_list_ready','coreference_paragraph_clean']].copy()

# Naive sentence split: every '.' is treated as a boundary. The pieces (including
# the empty string left after a trailing '.') are filtered later by
# keep_text_elements().
coreferenced_text_paragraph_sent['coreference_sentence_clean'] = coreferenced_text_paragraph_sent['coreference_paragraph_clean'].apply(lambda x: x.split('.'))
coreferenced_text_paragraph_sent['coreference_sentence'] = coreferenced_text_paragraph_sent['coreference_list_ready'].apply(lambda x: x.split('.'))


In [None]:
# Preview the paragraph-level frame (not the sentence-level frame created in
# the previous cell).
coreferenced_text_paragraph.head()

Unnamed: 0,Title,Articles,coreference_list_ready,coreference_paragraph_clean
0,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 1 Subject matter,this regulation lays down the rules for the es...,this regulation lays down the rules for the es...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,‘applicant’ means a natural person or an entit...,applicant means a natural person or an entity ...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,"‘application document’ means a tender, a reque...",application document means a tender a request ...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,‘award procedure’ means a procurement procedur...,award procedure means a procurement procedure ...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,"‘basic act’ means a legal act, other than a re...",basic act means a legal act other than a recom...


In [None]:
# Preview the sentence-level frame; the two new columns hold lists of '.'-split pieces.
coreferenced_text_paragraph_sent.head()

Unnamed: 0,Title,Articles,coreference_list_ready,coreference_paragraph_clean,coreference_sentence_clean,coreference_sentence
0,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 1 Subject matter,this regulation lays down the rules for the es...,this regulation lays down the rules for the es...,[this regulation lays down the rules for the e...,[this regulation lays down the rules for the e...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,‘applicant’ means a natural person or an entit...,applicant means a natural person or an entity ...,[applicant means a natural person or an entity...,[‘applicant’ means a natural person or an enti...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,"‘application document’ means a tender, a reque...",application document means a tender a request ...,[application document means a tender a request...,"[‘application document’ means a tender, a requ..."
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,‘award procedure’ means a procurement procedur...,award procedure means a procurement procedure ...,[award procedure means a procurement procedure...,[‘award procedure’ means a procurement procedu...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,"‘basic act’ means a legal act, other than a re...",basic act means a legal act other than a recom...,[basic act means a legal act other than a reco...,"[‘basic act’ means a legal act, other than a r..."


## Preserve only Sentence Fragments that contain text (more than one letter)

In [None]:
def keep_text_elements(text_list):
    """Return the stripped elements of ``text_list`` that contain real text.

    An element counts as text when it has more than one ASCII letter, which
    drops empty strings and fragments made only of digits or punctuation.

    Args:
        text_list: List of strings. Any non-list value yields an empty list.

    Returns:
        A new list with the surviving elements, whitespace-stripped.
    """
    if not isinstance(text_list, list):
        return []
    return [
        fragment.strip()
        for fragment in text_list
        if len(re.findall(r'[a-zA-Z]', fragment)) > 1
    ]

# Drop empty or letter-less fragments from both sentence lists.
# NOTE(review): the cleaned and raw lists are filtered independently, so a
# fragment can survive in one list and not in the other; the later row-wise
# pairing of the two columns assumes this does not happen — confirm.
coreferenced_text_paragraph_sent['coreference_sentence_clean'] = coreferenced_text_paragraph_sent['coreference_sentence_clean'].apply(keep_text_elements)
coreferenced_text_paragraph_sent['coreference_sentence'] = coreferenced_text_paragraph_sent['coreference_sentence'].apply(keep_text_elements)

In [None]:
# Build the one-row-per-cleaned-sentence frame.
cleaned_sentences = coreferenced_text_paragraph_sent[['Title', 'Articles', 'coreference_list_ready','coreference_paragraph_clean','coreference_sentence_clean']]
# explode() repeats the other columns (and the index label) for every sentence.
cleaned_sentences = cleaned_sentences.explode('coreference_sentence_clean')
# Row count to compare against the raw-sentence frame built in the next cell.
cleaned_sentences.shape

(2396, 5)

In [None]:
# Same construction for the raw (uncleaned) sentences.
sentences_not_cleaned = coreferenced_text_paragraph_sent[['Title', 'Articles', 'coreference_list_ready','coreference_paragraph_clean','coreference_sentence']]
sentences_not_cleaned = sentences_not_cleaned.explode('coreference_sentence')
# Should match the shape of `cleaned_sentences` for the two to be paired row by row.
sentences_not_cleaned.shape

(2396, 5)

In [None]:
# Attach the raw sentence next to its cleaned counterpart.
# NOTE(review): both frames come from explode() and carry repeated index labels,
# so rows are only paired correctly if the two frames have the same index in the
# same order, i.e. every paragraph produced the same number of cleaned and raw
# sentences. Equal shapes are necessary but not sufficient — confirm.
cleaned_sentences["coreference_sentence"] = sentences_not_cleaned["coreference_sentence"]


In [None]:
# Preview the cleaned and raw sentences side by side.
cleaned_sentences.head()

Unnamed: 0,Title,Articles,coreference_list_ready,coreference_paragraph_clean,coreference_sentence_clean,coreference_sentence
0,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 1 Subject matter,this regulation lays down the rules for the es...,this regulation lays down the rules for the es...,this regulation lays down the rules for the es...,this regulation lays down the rules for the es...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,‘applicant’ means a natural person or an entit...,applicant means a natural person or an entity ...,applicant means a natural person or an entity ...,‘applicant’ means a natural person or an entit...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,"‘application document’ means a tender, a reque...",application document means a tender a request ...,application document means a tender a request ...,"‘application document’ means a tender, a reque..."
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,‘award procedure’ means a procurement procedur...,award procedure means a procurement procedure ...,award procedure means a procurement procedure ...,‘award procedure’ means a procurement procedur...
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,"‘basic act’ means a legal act, other than a re...",basic act means a legal act other than a recom...,basic act means a legal act other than a recom...,"‘basic act’ means a legal act, other than a re..."


In [None]:
# Show one raw sentence (row 3, column 5 = 'coreference_sentence').
cleaned_sentences.iloc[3,5]

'‘award procedure’ means a procurement procedure, a grant award procedure, a contest for prizes, or a procedure for the selection of experts or persons or entities implementing the budget pursuant to point (c) of the first subparagraph of article 62(1);'

In [None]:
# Run REBEL once per cleaned sentence. progress_apply is the tqdm-instrumented
# apply registered by tqdm.pandas(); the recorded run below took about 2h37m
# for 2396 sentences.
cleaned_sentences["triplets_clean_sent"] = cleaned_sentences["coreference_sentence_clean"].progress_apply(triplets_list)

100%|██████████| 2396/2396 [2:36:53<00:00,  3.93s/it]


Unnamed: 0,triplets_clean_sent
0,"[[{'head': 'european atomic energy community',..."
1,"[[{'head': 'natural person', 'type': 'subclass..."
1,"[[{'head': 'grant application', 'type': 'subcl..."
1,"[[{'head': 'procurement', 'type': 'part of', '..."
1,"[[{'head': 'euratom treaty', 'type': 'followed..."
...,...
281,"[[{'head': 'member states', 'type': 'subclass ..."
281,"[[{'head': 'brussels', 'type': 'point in time'..."
281,"[[{'head': 'european parliament', 'type': 'off..."
281,"[[{'head': 'president', 'type': 'part of', 'ta..."


In [None]:
# Preview the sentence-level frame including the extracted triplets.
cleaned_sentences.head()

Unnamed: 0,Title,Articles,coreference_list_ready,coreference_paragraph_clean,coreference_sentence_clean,coreference_sentence,triplets_clean_sent
0,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 1 Subject matter,this regulation lays down the rules for the es...,this regulation lays down the rules for the es...,this regulation lays down the rules for the es...,this regulation lays down the rules for the es...,"[[{'head': 'european atomic energy community',..."
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,‘applicant’ means a natural person or an entit...,applicant means a natural person or an entity ...,applicant means a natural person or an entity ...,‘applicant’ means a natural person or an entit...,"[[{'head': 'natural person', 'type': 'subclass..."
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,"‘application document’ means a tender, a reque...",application document means a tender a request ...,application document means a tender a request ...,"‘application document’ means a tender, a reque...","[[{'head': 'grant application', 'type': 'subcl..."
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,‘award procedure’ means a procurement procedur...,award procedure means a procurement procedure ...,award procedure means a procurement procedure ...,‘award procedure’ means a procurement procedur...,"[[{'head': 'procurement', 'type': 'part of', '..."
1,"TITLE I \nSUBJECT MATTER, DEFINITIONS AND GENE...",Article 2 Definitions,"‘basic act’ means a legal act, other than a re...",basic act means a legal act other than a recom...,basic act means a legal act other than a recom...,"‘basic act’ means a legal act, other than a re...","[[{'head': 'euratom treaty', 'type': 'followed..."


## Save CSV - REBEL_triples_per_sentence.csv

In [None]:
# Persist the sentence-level results. index=False leaves the index (which
# repeats after explode) out of the file.
cleaned_sentences.to_csv('REBEL_triples_per_sentence.csv', index=False)