# KeyBERT for keyword extraction from sentences

## Installation

In [None]:
!pip install keybert

### Usage example

In [None]:
from keybert import KeyBERT

# Sample passage used to try out KeyBERT; the indentation inside the literal is
# part of the string that gets passed to the model.
doc = """
         Supervised learning is the machine learning task of learning a function that
         maps an input to an output based on example input-output pairs. It infers a
         function from labeled training data consisting of a set of training examples.
         In supervised learning, each example is a pair consisting of an input object
         (typically a vector) and a desired output value (also called the supervisory signal).
         A supervised learning algorithm analyzes the training data and produces an inferred function,
         which can be used for mapping new examples. An optimal scenario will allow for the
         algorithm to correctly determine the class labels for unseen instances. This requires
         the learning algorithm to generalize from the training data to unseen situations in a
         'reasonable' way (see inductive bias).
      """
# Build an extractor with KeyBERT's default constructor arguments.
kw_model = KeyBERT()
# Extract keywords using extract_keywords' default parameters; `keywords` is
# not referenced by any of the cells shown below.
keywords = kw_model.extract_keywords(doc)

In [None]:
# Extract again, restricted to single-word keyphrases (n-gram range 1..1) and
# asking for 3 results; stop_words=None presumably disables stop-word
# filtering (confirm against the KeyBERT docs).
extracted_keywords = kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 1), stop_words=None, top_n=3)

In [None]:
# Display the (keyword, score) pairs produced by the previous cell.
extracted_keywords

[('supervised', 0.6676), ('labeled', 0.4896), ('learning', 0.4813)]

In [None]:
# Each item is a (keyword, score) pair: print only the keywords whose score
# is strictly greater than 0.5.
for i in extracted_keywords:
    if i[1]>0.5:
        print(i[0])

supervised


## SAMSum implementation

In [3]:
# Mount Google Drive at /content/drive so the files under MyDrive used by the
# cells below can be opened (this import is only available on Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from keybert import KeyBERT
# Module-level extractor read as a global by extract_keywords() further down;
# the progress output that follows shows model files being downloaded.
kw_model = KeyBERT()

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [5]:
import re

def remove_between_tags(sentence):
    r"""
        Strip every emoji span from the input string.

        A span opens with <E> and closes with <\E> (a backslash, not a forward
        slash). The two tags and everything between them are deleted. Matching
        is non-greedy, so several spans in one string are removed one by one,
        and text that sits between two spans is kept.
    """
    emoji_span = re.compile(r'<E>.*?<\\E>')
    return emoji_span.sub('', sentence)

def extract_text_after_said(sentence):
    """
        Return the quoted utterance that follows the word 'said'.

        The first occurrence of the word said, a space, and a double-quoted
        text is located; the text inside the quotes (up to the nearest closing
        double quote) is returned. When the string has no such part, it is
        returned unchanged.
    """
    found = re.search(r'said "(.*?)"', sentence)
    return found.group(1) if found else sentence

def order_keywords(phrase, keys):
    """
        Sort extracted keywords by where they first appear in the phrase.

        phrase: the text the keywords were extracted from.
        keys:   list of keyword strings (e.g. the ones returned by KeyBERT).

        Returns a new list; `keys` itself is not modified. Each keyword is
        located as a plain substring. An exact (case-sensitive) match is tried
        first; if that fails the search is repeated ignoring case, because the
        extractor may hand back lower-cased keywords for words that are
        capitalised in the phrase. Keywords that cannot be found at all get
        position -1 and therefore come first, keeping their relative order.
    """
    lowered_phrase = phrase.lower()

    def position(key):
        exact = phrase.find(key)
        if exact != -1:
            return exact
        # Fall back to a case-insensitive lookup so that e.g. "hello" is
        # located in "Hello world" instead of being treated as missing.
        return lowered_phrase.find(key.lower())

    return sorted(keys, key=position)

In [6]:
def extract_keywords(sentence, thr):
    r"""
        Build the keyword tag for one sentence.

        sentence: raw sentence; emoji spans are removed and, when present,
                  only the quoted text after 'said' is analysed.
        thr:      score threshold; a keyword is kept only if its score is
                  strictly greater than this value.

        Single-word keywords are requested from the module-level `kw_model`
        (English stop words, use_maxsum=True), filtered by `thr` and put in
        the order in which they occur in the analysed text.

        Returns the kept keywords joined by spaces and wrapped as
        <K>...<\K>, or an empty string when no keyword passes the threshold.
    """
    without_emoji = remove_between_tags(sentence)
    utterance = extract_text_after_said(without_emoji)

    scored = kw_model.extract_keywords(utterance, keyphrase_ngram_range=(1, 1), stop_words='english', use_maxsum=True)
    selected = [word for word, score in scored if score > thr]
    selected = order_keywords(utterance, selected)

    if not selected:
        return ""
    # The closing tag is written with an escaped backslash: it yields the same
    # text as the former "<\K>" literal without relying on an invalid escape
    # sequence, which newer Python versions warn about.
    return "<K>" + ' '.join(selected) + "<\\K>"

In [7]:
def append_after_said(original_string, value_to_append):
    """
    Insert the keywords right after the opening quote that follows 'said'.

    The string is split at the first occurrence of the marker made of the
    word said, a space and a double quote. A single space and
    `value_to_append` are placed directly after that marker, followed by the
    rest of the string. When the marker is absent the input is returned
    unchanged.
    """
    marker = "said \""
    head, found, tail = original_string.partition(marker)

    if not found:
        return original_string
    return head + found + " " + value_to_append + tail

In [22]:
def remove_inside_e_tags(sentence):
    # Delete each emoji span: an opening <E>, the shortest run of text that
    # follows it, and the closing <\E> tag (the pattern matches a backslash in
    # the closing tag, not a forward slash).
    return re.sub(r'<E>.*?<\\E>', '', sentence)

In [34]:
def get_keyword_no_emoji_from_json_store_new_file(split):
    """
        Write an emoji-free copy of the keyword-annotated file of one split,
        so the model can be trained with keywords but without emojis.

        split: split name interpolated into the input and output file names.

        The input JSON is read from the samsum folder on Drive. It is walked
        as a dict of dicts of dicts; in the innermost dicts the value stored
        under "sentence" has its emoji spans removed, every other entry is
        copied as is. The result is written, indented by 2, to a file in the
        current working directory.
    """

    split_str = f'preprocessed_keywords_dialog_{split}_split5_collated.json'

    with open(f"/content/drive/MyDrive/NLP-project/COMET_data/paracomet/dialogue/samsum/{split_str}", 'r') as file:
        loaded = json.load(file)

    def strip_entry(entry):
        # Only the "sentence" field is rewritten; all other fields pass through.
        return {
            field: remove_inside_e_tags(content) if field == "sentence" else content
            for field, content in entry.items()
        }

    cleaned = {
        outer_key: {inner_key: strip_entry(entry) for inner_key, entry in group.items()}
        for outer_key, group in loaded.items()
    }

    new_json_file_path = f'preprocessed_keywords_no_emoji_dialog_{split}_split5_collated.json'
    with open(new_json_file_path, 'w') as new_file:
        json.dump(cleaned, new_file, indent=2)

In [None]:
import json
def get_keyword_from_json_store_new_file(split):
    """
        Write a keyword-annotated copy of the preprocessed file of one split.

        split: split name interpolated into the input and output file names.

        The input JSON is read from the samsum folder on Drive. It is walked
        as a dict of dicts of dicts; in the innermost dicts the value stored
        under "sentence" gets the keyword tag (threshold 0.35) inserted right
        after the opening quote that follows 'said', every other entry is
        copied as is. The result is written, indented by 2, to a file in the
        current working directory.
    """

    THR = 0.35
    split_str = f'preprocessed_dialog_{split}_split5_collated.json'

    with open(f"/content/drive/MyDrive/COMET_data/paracomet/dialogue/samsum/{split_str}", 'r') as file:
        loaded = json.load(file)

    def annotate_entry(entry):
        # Only the "sentence" field is rewritten; all other fields pass through.
        return {
            field: append_after_said(content, extract_keywords(content, THR)) if field == "sentence" else content
            for field, content in entry.items()
        }

    annotated = {
        outer_key: {inner_key: annotate_entry(entry) for inner_key, entry in group.items()}
        for outer_key, group in loaded.items()
    }

    new_json_file_path = f'preprocessed_keywords_dialog_{split}_split5_collated.json'
    with open(new_json_file_path, 'w') as new_file:
        json.dump(annotated, new_file, indent=2)

In [None]:
# Generate the keyword-annotated file for a single split.
# NOTE(review): a later cell lists "validation" rather than "valid" - confirm
# which spelling matches the file names on Drive.
split = "valid"
get_keyword_from_json_store_new_file(split)

In [38]:
# Write the emoji-free copy of the keyword file for the split currently held
# in the global `split` (expected to be a split-name string at this point).
get_keyword_no_emoji_from_json_store_new_file(split)

In [None]:
# Annotate several splits in one go. The list has its own name so that the
# global string `split` is not overwritten with a list: a later cell
# interpolates `split` into a file name and needs it to stay a split name.
# NOTE(review): "validation" differs from the "valid" used in an earlier
# cell - confirm which spelling matches the file names on Drive.
all_splits = ["train", "test", "validation"]
for s in all_splits:
    get_keyword_from_json_store_new_file(s)

In [39]:
# Copy the generated no-emoji file of the current split from /content to the
# root of the mounted Drive.
import shutil
dest_folder = "/content/drive/MyDrive/"
# Despite its name, source_folder is the path of a single JSON file; `split`
# must hold a split-name string here for the file name to be valid.
source_folder = f"/content/preprocessed_keywords_no_emoji_dialog_{split}_split5_collated.json"
shutil.copy(source_folder, dest_folder)

'/content/drive/MyDrive/preprocessed_keywords_no_emoji_dialog_test_split5_collated.json'

In [None]:
# Alternative JSON representation suitable to be used as input: every entry
# of the no-emoji keyword file (train split) becomes a record with the fields
# "sentence", "relation" (always "Keywords") and "out" (the extracted keywords).
import json
import re

input_file_path = "/content/drive/MyDrive/COMET_data/paracomet/dialogue/samsum/preprocessed_keywords_no_emoji_dialog_train_split5_collated.json"
output_file_path = "/content/drive/MyDrive/COMET_data/paracomet/dialogue/samsum/cristante.json"

with open(input_file_path, "r") as file:
    data = json.load(file)

output_data = {}

# The input is walked as a dict of dicts; each innermost value must provide a
# "sentence" entry, any other entries it holds are not carried over.
for key, value in data.items():
    output_data[key] = {}
    for sub_key, sub_value in value.items():
        sentence = sub_value["sentence"]
        # Look for the keyword tag <K>...<\K> (closing tag written with a backslash).
        match = re.search(r'<K>(.*?)<\\K>', sentence)
        if match:
            # The text inside the first tag becomes the "out" value; the
            # substitution then removes every such tag from the sentence.
            out_value = match.group(1)
            sentence = re.sub(r'<K>.*?<\\K>', '', sentence)
        else:
            # No keyword tag: keep the sentence untouched and emit an empty "out".
            out_value = ""
        output_data[key][sub_key] = {
            "sentence": sentence,
            "relation": "Keywords",
            "out": out_value
        }

# Save the updated data to a new JSON file
with open(output_file_path, "w") as output_file:
    json.dump(output_data, output_file, indent=2)

print(f"Updated data saved to {output_file_path}")