# core

> Functions for extracting keywords from text

In [19]:
#| default_exp core

In [20]:
#| hide
from nbdev.showdoc import *

In [21]:
#| export
import pandas as pd
import numpy as np
import gradio as gr
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
from functools import partial

In [22]:
#| export
def read_in_text(file_path:str): # path of text file
    "Return the entire contents of the text file at `file_path` as a single string"
    with open(file_path, 'r') as text_file:
        contents = text_file.read()
    return contents

In [23]:
#| export
def is_word_in_range(word:str, # input word
                     min_len:int, # Min length of word
                     max_len:int): # Max length of word
    "Report whether the length of `word` lies between `min_len` and `max_len`, both ends inclusive"
    n_chars = len(word)
    if n_chars > max_len: return False # longer than the upper bound
    return n_chars >= min_len

Example usage of `is_word_in_range`

In [24]:
from functools import partial

In [25]:
# Rebind the name to a partial with min_len=4 and max_len=16 pre-filled, so the
# example calls below only need to pass the word itself.
is_word_in_range = partial(is_word_in_range, min_len=4, max_len=16)

In [26]:
# For single word
is_word_in_range("Elephant")

True

In [27]:
is_word_in_range("Elephant Rhinoceros Octopus")

False

In [28]:
# For list of words
words = ['new moon', 'half moon', 'moon', 'lunar month', 'waxing crescent', 'distinct phases', 'crazy red moon in space']
[is_word_in_range(word) for word in words]

[True, True, True, True, True, True, False]

In [29]:
#| export
def remove_singular_words(word_list:list): # List of words
    """Removes singular words when they have a corresponding plural word in a list of words.

    A word counts as having a plural only when the same word with a trailing 's'
    is also present in `word_list`. All other words are returned in a new list,
    in their original order; `word_list` itself is not modified.
    """
    # Build the lookup once so each membership test is a set lookup rather than a list scan
    known_words = set(word_list)
    # Keep a word unless its 's'-suffixed form is also in the input
    return [word for word in word_list if word + 's' not in known_words]

Example showing how singular words are removed when both the singular and the plural form are present in the list

In [30]:
word_list = ['cat', 'dog', 'cats', 'apples', 'apple']
result = remove_singular_words(word_list)
print(result)  # Output: ['dog', 'cats', 'apples']

['dog', 'cats', 'apples']


In [31]:
#| export
def extract_keywords(text:str, # input text
                     n:int, # number of keywords
                     min_len:int, # minimum length of word
                     max_len:int): # maximum length of word
    """Extract n keywords from text in range (min_len, max_len) both inclusive

    Returns a list of at most `n` keyword strings. Fewer may be returned when the
    singular/plural de-duplication and the length filter discard candidates.
    """
    kw_extractor = KeyBERT('valurank/MiniLM-L6-Keyword-Extraction')
    # Request twice as many candidates as needed, since the filters below drop some of them
    candidates = kw_extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=n*2)
    keywords = [phrase for phrase, _score in candidates] # keep the phrase, drop the confidence score
    keywords = remove_singular_words(keywords)
    # Filter with the caller-supplied bounds rather than fixed values
    keywords = [word for word in keywords if is_word_in_range(word, min_len=min_len, max_len=max_len)]
    return keywords[:n]

Sample text for extracting keywords

In [32]:
text = """
The moon is Earth's only natural satellite, and it plays a
significant role in our lives. One of the fascinating things about
the moon is the way it changes shape throughout the month.
These changes are called the phases of the moon. There are
eight distinct phases, and each phase has its own unique
appearance and name.
The first phase of the moon is called the New Moon. During
this phase, the moon is not visible from Earth because the side
of the moon that faces us is not illuminated by the sun. It is a
dark circle in the sky. The New Moon marks the beginning of
the lunar month.
The second phase is the Waxing Crescent. It occurs a few days after the New Moon.
During this phase, a small sliver of the moon becomes visible. The illuminated part of the
moon slowly increases each night.
Next comes the First Quarter, also known as the Half Moon. This phase occurs about a
week after the New Moon. During this phase, half of the moon is visible from Earth. It
looks like a semicircle in the sky.
"""

In [33]:
keywords = extract_keywords(text=text, n=10, min_len=4, max_len=16)
keywords

['new moon',
 'half moon',
 'moon',
 'lunar month',
 'waxing crescent',
 'distinct phases',
 'phases',
 'second phase',
 'first phase',
 'illuminated part']

In [34]:
#| hide
import nbdev; nbdev.nbdev_export()