# KeyBERT for keyword extraction from sentences

## Installation

In [None]:
# Notebook shell escape: install the KeyBERT package into the current runtime.
!pip install keybert

## SAMSum implementation

In [2]:
# Mount Google Drive at /content/drive; the dataset paths used further down
# in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Module-level KeyBERT instance built with its default settings; it is created
# once here and reused by extract_keywords() for every sentence.
from keybert import KeyBERT
kw_model = KeyBERT()

In [4]:
import re

def remove_between_tags(sentence):
    r"""
        Strip every emoji span from the input string.

        A span opens with ``<E>`` and closes at the nearest ``<\E>`` (the
        closing tag is written with a backslash, not a forward slash). Both
        tags and everything between them are deleted; the rest of the string
        is returned untouched. The match does not cross line breaks.
    """
    emoji_span = re.compile(r'<E>.*?<\\E>')
    return emoji_span.sub('', sentence)

def extract_text_after_said(sentence):
    """
        Return the quoted utterance that follows the word ``said``.

        The first occurrence of ``said "<text>"`` is located and only
        ``<text>`` (up to the next double quote) is returned. If the input
        contains no such occurrence it is returned unchanged.
    """
    found = re.search(r'said "(.*?)"', sentence)
    return sentence if found is None else found.group(1)

def order_keywords(phrase, keys):
    """
        Sort the extracted keywords by where they first appear in the phrase.

        The lookup is case-insensitive: keyword extractors commonly return
        lowercased terms, so a case-sensitive search would fail to locate any
        word that is capitalised in the phrase and would push it to the front.

        phrase: the original text the keywords were extracted from.
        keys:   iterable of keyword strings.

        Returns a new list with the keywords in phrase order. A keyword that
        does not occur in the phrase at all gets position -1 and therefore
        sorts before the ones that do; ties keep their input order.
    """
    lowered_phrase = phrase.lower()
    return sorted(keys, key=lambda key: lowered_phrase.find(key.lower()))

In [5]:
def extract_keywords(sentence, thr):
    """
        Build the keyword tag for one dialogue sentence.

        The sentence is first cleaned with remove_between_tags() and
        extract_text_after_said(); the module-level KeyBERT model then proposes
        single-word keywords (English stop words excluded, max-sum selection).
        Keywords whose score is strictly greater than ``thr`` are kept and
        re-ordered with order_keywords() to follow the cleaned text.

        sentence: raw sentence string.
        thr:      score threshold; keywords scoring <= thr are discarded.

        Returns ``<K>kw1 kw2 ...<\\K>`` (closing tag written with a backslash),
        or an empty string when no keyword passes the threshold.
    """
    cleaned = extract_text_after_said(remove_between_tags(sentence))

    extracted_keywords = kw_model.extract_keywords(cleaned, keyphrase_ngram_range=(1, 1), stop_words='english', use_maxsum=True)
    # The model yields (keyword, score) pairs.
    keywords_over_thresh = [keyword for keyword, score in extracted_keywords if score > thr]
    keywords_over_thresh = order_keywords(cleaned, keywords_over_thresh)

    if not keywords_over_thresh:
        return ""
    # "\\K" spells the same backslash-K the original "\K" produced, without
    # relying on an invalid escape sequence.
    return "<K>" + ' '.join(keywords_over_thresh) + "<\\K>"

In [6]:
import json
def get_keyword_from_json_store_new_file(split, thr=0.35):
    """
        Prefix every "sentence" field of one dataset split with its keyword tag
        and save the result as a new JSON file.

        The input file is read from the Drive dataset folder. It is walked as a
        three-level nested dict; at the innermost level the value stored under
        "sentence" is replaced by ``extract_keywords(value, thr) + value`` and
        every other entry is copied as is.

        split: split name inserted into the input and output file names.
        thr:   keyword score threshold passed to extract_keywords()
               (defaults to 0.35).

        The output is written to
        ``preprocessed_keywords_dialog_{split}_split5_collated.json`` in the
        current working directory. Nothing is returned.
    """
    split_str = f'preprocessed_dialog_{split}_split5_collated.json'

    with open(f"/content/drive/MyDrive/NLP-project/COMET_data/paracomet/dialogue/samsum/{split_str}", 'r', encoding='utf-8') as file:
        data = json.load(file)

    # The input file is already closed here; keyword extraction is slow and
    # does not need the handle.
    new_data = {}
    for key, value in data.items():
        new_d = {}
        for k, v in value.items():
            new_d[k] = {
                inner_key: (extract_keywords(inner_value, thr) + inner_value
                            if inner_key == "sentence" else inner_value)
                for inner_key, inner_value in v.items()
            }
        new_data[key] = new_d

    new_json_file_path = f'preprocessed_keywords_dialog_{split}_split5_collated.json'
    with open(new_json_file_path, 'w', encoding='utf-8') as new_file:
        json.dump(new_data, new_file, indent=2)


In [None]:
# Process a single split only (here: "train").
split = "train"
get_keyword_from_json_store_new_file(split)

In [None]:
# Process all three splits in turn.
# NOTE: after this cell `split` is a list of names, not a single string.
split = ["train", "test", "validation"]
for s in split:
    get_keyword_from_json_store_new_file(s)

In [None]:
# move file to drive!
# Copies every generated keyword file from the Colab working directory to the
# Drive root. `split` may be a single split name or a list of names depending
# on which of the cells above was run last, so both cases are handled.
import shutil
dest_folder = "/content/drive/MyDrive/"
for s in ([split] if isinstance(split, str) else split):
    # The file name must match the one written by
    # get_keyword_from_json_store_new_file ("preprocessed_keywords_...").
    # `source_folder` actually holds a file path; the name is kept as is.
    source_folder = f"/content/preprocessed_keywords_dialog_{s}_split5_collated.json"
    shutil.copy(source_folder, dest_folder)