# core

> contains functions to extract keywords

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT
from keyphrase_vectorizers import KeyphraseCountVectorizer
from functools import partial

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#| export
def read_in_text(file_path:str): # path of text file
    "Open the text file at `file_path` and return its entire contents as a single string"
    # Text mode with the platform's default encoding; the handle is closed on exit from the `with`.
    with open(file_path, 'r') as handle:
        contents = handle.read()
    return contents

In [None]:
#| export
def is_word_in_range(word:str, # input word
                     min_len:int, # Min length of word
                     max_len:int): # Max length of word
    "Return True when the character count of `word` lies between `min_len` and `max_len`, both ends inclusive"
    # Spaces count too, since the length is taken over the whole string.
    length = len(word)
    return min_len <= length <= max_len

Example usage of `is_word_in_range`

In [None]:
from functools import partial

In [None]:
is_word_in_range = partial(is_word_in_range, min_len=4, max_len=16)

In [None]:
# For single word
is_word_in_range("Elephant")

True

In [None]:
is_word_in_range("Elephant Rhinoceros Octopus")

False

In [None]:
# For list of words
words = ['new moon', 'half moon', 'moon', 'lunar month', 'waxing crescent', 'distinct phases', 'crazy red moon in space']
[is_word_in_range(word) for word in words]

[True, True, True, True, True, True, False]

In [None]:
#| export
def remove_singular_words(word_list:list): # List of words
    """Removes singular words when they have a corresponding plural word in a list of words.

    A word counts as "singular with a plural present" only when the same word with
    a trailing 's' also appears in `word_list`; irregular plurals such as
    'box'/'boxes' are not recognised. The order of the remaining words, and any
    repeated words, are kept as they were.
    """
    # Build the lookup once so each membership test is O(1) instead of a list scan.
    known_words = set(word_list)
    return [word for word in word_list if word + 's' not in known_words]

Example showing how singular words are removed when both the singular and the plural form are present in the list

In [None]:
word_list = ['cat', 'dog', 'cats', 'apples', 'apple']
result = remove_singular_words(word_list)
print(result)  # Output: ['dog', 'cats', 'apples']

['dog', 'cats', 'apples']


In [None]:
#| export
def word_contains_numbers(word:str): # input word
    "Return True as soon as any character of `word` is a digit; False if none is (including for an empty string)."
    for character in word:
        if character.isdigit(): return True
    return False

Example use of `word_contains_numbers` function

In [None]:
word = "Hello123"
word_contains_numbers(word)

True

In [None]:
mixed_list = ["apple", "banana", "cherry", "grape", "watermelon", "2% milk", "orange juice", "3 eggs", "strawberry jam", "4 potatoes"]
[word for word in mixed_list if not word_contains_numbers(word)]

['apple',
 'banana',
 'cherry',
 'grape',
 'watermelon',
 'orange juice',
 'strawberry jam']

In [None]:
#| export
def remove_duplicates_preserve_order(input_list:list):
    """removes duplicate items from a list while preserving order

    Returns a new list that keeps only the first occurrence of each item;
    `input_list` itself is not modified.
    """
    try:
        # dicts keep insertion order, so this de-duplicates in a single O(n) pass.
        return list(dict.fromkeys(input_list))
    except TypeError:
        # Unhashable items (e.g. nested lists) cannot be dict keys: fall back to
        # the equality-based scan, which is quadratic but works for any item.
        return [x for i, x in enumerate(input_list) if x not in input_list[:i]]

In [None]:
# Example usage:
my_list = [3, 2, 1, 2, 3, 4, 5, 4, 6, 'apple', 'fruit', 'apple', 'banana']
remove_duplicates_preserve_order(my_list)

[3, 2, 1, 4, 5, 6, 'apple', 'fruit', 'banana']

In [None]:
#| export
def split_compound_words(input_list:list): # list of words
    "Break every entry of `input_list` on whitespace and return the resulting single words, first occurrences only, in their original order."
    # Flatten all phrases into one token stream before de-duplicating.
    tokens = [token for phrase in input_list for token in phrase.split()]
    return remove_duplicates_preserve_order(tokens)

In [None]:
# Example usage:
mixed_list = ["apple", "banana", "2% milk", "orange juice", "strawberry jam", "4 potatoes", "internet", "speedy internet", "byke jam"]
split_compound_words(mixed_list)

['apple',
 'banana',
 '2%',
 'milk',
 'orange',
 'juice',
 'strawberry',
 'jam',
 '4',
 'potatoes',
 'internet',
 'speedy',
 'byke']

In [None]:
#| export
def generate_keywords(text:str, # input text
                      max_len:int, # maximum length of word
                      n:int=10, # number of keywords
                      min_len:int=3, # minimum length of word
                      compound_words=True, # include combination of words
                      keywords_with_numbers=True): # include those keywords that contain number
    "Return at most n keywords extracted from `text` whose length is between min_len and max_len (both inclusive)"
    extractor = KeyBERT('valurank/MiniLM-L6-Keyword-Extraction')
    # Ask for twice as many candidates as requested; presumably so that enough
    # remain after the filters below.
    scored_phrases = extractor.extract_keywords(text, vectorizer=KeyphraseCountVectorizer(), stop_words=None, top_n=n*2)
    # Keep only the phrase from each (phrase, score) pair.
    candidates = [phrase for phrase, _score in scored_phrases]
    if compound_words is False:
        # Split multi-word phrases into their individual words.
        candidates = split_compound_words(candidates)
    if keywords_with_numbers is False:
        # Drop every candidate that contains a digit.
        candidates = [candidate for candidate in candidates if not word_contains_numbers(candidate)]
    # When both a word and its 's' plural are present, keep only the plural.
    candidates = remove_singular_words(candidates)
    # Discard candidates whose length falls outside [min_len, max_len].
    within_range = [candidate for candidate in candidates if is_word_in_range(candidate, min_len=min_len, max_len=max_len)]
    return within_range[:n]

Sample text for extracting keywords

In [None]:
text = "In the 21st century, technology has rapidly advanced, bringing about significant changes in various industries. The digital revolution has led to the creation of smartphones, which have become an integral part of our daily lives. These devices offer a wide range of capabilities, from high-resolution cameras to powerful processors. Additionally, the internet has connected billions of people worldwide, enabling instant communication and access to a vast repository of information. Data analytics, driven by algorithms and machine learning, has revolutionized decision-making processes in business and science. With the advent of 5G technology, internet speeds have soared, paving the way for innovations like autonomous vehicles and the Internet of Things (IoT). As we continue into the 21st century, these technological advancements will shape our future in ways we can only begin to imagine."

In [None]:
# include keywords that contain numbers
generate_keywords(text=text, n=10, min_len=4, max_len=15, keywords_with_numbers=True)

['smartphones',
 'internet speeds',
 'internet',
 'data analytics',
 'devices',
 'technology',
 '21st century',
 'information',
 'future',
 'algorithms']

In [None]:
# don't include keywords that contain numbers
generate_keywords(text=text, n=10, min_len=4, max_len=15, keywords_with_numbers=False)

['smartphones',
 'internet speeds',
 'internet',
 'data analytics',
 'devices',
 'technology',
 'information',
 'future',
 'algorithms',
 'innovations']

In [None]:
# split compound words and don't include keywords that contain numbers
generate_keywords(text=text, n=10, min_len=4, max_len=15, keywords_with_numbers=False, compound_words=False)

['smartphones',
 'internet',
 'speeds',
 'digital',
 'revolution',
 'machine',
 'learning',
 'data',
 'analytics',
 'technological']

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()