In [None]:
# Environment setup (shell escapes): installs pke straight from its GitHub
# repository, matplotlib, the spaCy English model `en_core_web_sm`, and
# scipy pinned to 1.8.1.
# NOTE(review): the pke and matplotlib installs are unpinned, so a fresh run
# may pull different versions. The reason for the scipy==1.8.1 pin is not
# recorded in this notebook -- TODO confirm and document it.
!pip install git+https://github.com/boudinfl/pke.git
!pip install matplotlib
!python -m spacy download en_core_web_sm
!pip install --user scipy==1.8.1

In [28]:
import pke
import collections


In [34]:
# Sample English passage (a description of supervised learning) that is passed
# as input to every keyphrase extractor in this notebook.
doc = """
         Supervised learning is the machine learning task of learning a function that
         maps an input to an output based on example input-output pairs. It infers a
         function from labeled training data consisting of a set of training examples.
         In supervised learning, each example is a pair consisting of an input object
         (typically a vector) and a desired output value (also called the supervisory signal).
         A supervised learning algorithm analyzes the training data and produces an inferred function,
         which can be used for mapping new examples. An optimal scenario will allow for the
         algorithm to correctly determine the class labels for unseen instances. This requires
         the learning algorithm to generalize from the training data to unseen situations in a
         'reasonable' way (see inductive bias).
      """

## TopicRank

In [35]:
def topicRank(text, n=10):
  """Extract the best-scoring keyphrases from ``text`` with pke's TopicRank.

  Args:
    text: Raw document text. It is handed to ``load_document`` with
      ``language='en'``, so preprocessing assumes English.
    n: Maximum number of keyphrases to request from ``get_n_best``.
      Defaults to 10, the value that used to be hard-coded.

  Returns:
    A list of keyphrase strings (scores are dropped), in the order returned
    by ``get_n_best``.
  """
  # initialize keyphrase extraction model, here TopicRank
  extractor = pke.unsupervised.TopicRank()

  # load the raw text; preprocessing is carried out using spacy
  extractor.load_document(input=text, language='en')

  # keyphrase candidate selection, in the case of TopicRank: sequences of nouns
  # and adjectives (i.e. `(Noun|Adj)*`)
  extractor.candidate_selection()

  # candidate weighting, in the case of TopicRank: using a random walk algorithm
  extractor.candidate_weighting()

  # N-best selection yields (keyphrase, score) tuples; keep only the phrases
  return [scored[0] for scored in extractor.get_n_best(n=n)]

In [36]:
# Run TopicRank on the sample document; as the cell's last expression, the
# returned keyphrase list is displayed as the cell output.
topicRank(doc)

['training examples',
 'training data',
 'supervised learning',
 'function',
 'example input-output pairs',
 'unseen instances',
 'input',
 'output',
 'algorithm',
 'set']

## YAKE

In [37]:
def yake(text, n=10):
  """Extract the best-scoring keyphrases from ``text`` with pke's YAKE model.

  Args:
    text: Raw document text. It is handed to ``load_document`` with
      ``language='en'``, so preprocessing assumes English.
    n: Maximum number of keyphrases to request from ``get_n_best``.
      Defaults to 10, the value that used to be hard-coded.

  Returns:
    A list of keyphrase strings (scores are dropped), in the order returned
    by ``get_n_best``.
  """
  # initialize keyphrase extraction model, here YAKE
  extractor = pke.unsupervised.YAKE()

  # load the raw text; preprocessing is carried out using spacy
  extractor.load_document(input=text, language='en')

  # candidate selection and weighting both run with the model's default
  # settings (no arguments are passed)
  extractor.candidate_selection()
  extractor.candidate_weighting()

  # N-best selection yields (keyphrase, score) tuples; keep only the phrases
  return [scored[0] for scored in extractor.get_n_best(n=n)]


In [38]:
# Run YAKE on the sample document and print the resulting keyphrase list.
yakeKeyWords = yake(doc)
print(yakeKeyWords)

['example input-output pairs', 'machine learning task', 'supervised learning', 'input-output pairs', 'learning', 'training data', 'supervised', 'output based', 'example input-output', 'function']


## MultipartiteRank

In [39]:

def multipartiteRank(text, n=10):
  """Extract the best-scoring keyphrases from ``text`` with pke's MultipartiteRank.

  Args:
    text: Raw document text. It is handed to ``load_document`` with
      ``language='en'``, so preprocessing assumes English.
    n: Maximum number of keyphrases to request from ``get_n_best``.
      Defaults to 10, the value that used to be hard-coded.

  Returns:
    A list of keyphrase strings (scores are dropped), in the order returned
    by ``get_n_best``.
  """
  # initialize keyphrase extraction model, here MultipartiteRank
  extractor = pke.unsupervised.MultipartiteRank()

  # load the raw text; preprocessing is carried out using spacy
  extractor.load_document(input=text, language='en')

  # candidate selection and weighting both run with the model's default
  # settings (no arguments are passed)
  extractor.candidate_selection()
  extractor.candidate_weighting()

  # N-best selection yields (keyphrase, score) tuples; keep only the phrases
  return [scored[0] for scored in extractor.get_n_best(n=n)]

In [40]:
# Run MultipartiteRank on the sample document and print the resulting
# keyphrase list.
multipartiteRankKeyWords = multipartiteRank(doc)
print(multipartiteRankKeyWords)

['supervised learning', 'training examples', 'function', 'training data', 'example input-output pairs', 'output', 'input', 'unseen instances', 'set', 'algorithm']


## TfIdf

In [41]:
def TfIdf(text, n=10):
  """Extract the best-scoring keyphrases from ``text`` with pke's TfIdf model.

  Args:
    text: Raw document text. It is handed to ``load_document`` with
      ``language='en'``, so preprocessing assumes English.
    n: Maximum number of keyphrases to request from ``get_n_best``.
      Defaults to 10, the value that used to be hard-coded.

  Returns:
    A list of keyphrase strings (scores are dropped), in the order returned
    by ``get_n_best``.
  """
  # initialize keyphrase extraction model, here TfIdf
  extractor = pke.unsupervised.TfIdf()

  # load the raw text; preprocessing is carried out using spacy
  extractor.load_document(input=text, language='en')

  # candidate selection and weighting both run with the model's default
  # settings (no arguments are passed, so no custom document-frequency
  # counts are supplied here)
  extractor.candidate_selection()
  extractor.candidate_weighting()

  # N-best selection yields (keyphrase, score) tuples; keep only the phrases
  return [scored[0] for scored in extractor.get_n_best(n=n)]

In [42]:
# Run the TfIdf extractor on the sample document and print the resulting
# keyphrase list.
tfidfKeyWords = TfIdf(doc)
print(tfidfKeyWords)



['supervised learning', 'supervised', 'training data', 'unseen', 'training', 'learning', 'output based', 'example input-output', 'example input-output pairs', 'input-output']


## KeyBERT

In [None]:
# Install KeyBERT plus its optional extras (flair, gensim, spacy, use) via
# shell escapes.
# NOTE(review): none of these installs are version-pinned, so a fresh run may
# pull different releases. The extras brackets are unquoted, which some shells
# treat as glob patterns -- consider quoting, e.g. "keybert[flair]".
# NOTE(review): the visible cells only call KeyBERT() with its defaults;
# whether every extra is actually required is unclear -- TODO confirm.
!pip install keybert
!pip install keybert[flair]
!pip install keybert[gensim]
!pip install keybert[spacy]
!pip install keybert[use]

In [43]:
from keybert import KeyBERT

# Build a KeyBERT model with its default configuration and extract keywords
# from the sample document using the default extraction settings.
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(doc)


In [44]:
# Display the (keyword, score) pairs produced by the default extraction.
keywords

[('supervised', 0.6676),
 ('labeled', 0.4896),
 ('learning', 0.4813),
 ('training', 0.4134),
 ('labels', 0.3947)]

In [45]:
# Extract keyphrases of one or two words from the sample document.
# The result is stored under its own name: binding it to `KeyBERT` would shadow
# the imported KeyBERT class and make any later `KeyBERT()` call fail.
keybert_phrases = kw_model.extract_keywords(doc, keyphrase_ngram_range=(1, 2))

# Each result item is a (keyphrase, score) pair; keep only the keyphrase text.
KeyBERT_ans = [scored[0] for scored in keybert_phrases]
print(KeyBERT_ans)

['supervised learning', 'supervised', 'signal supervised', 'examples supervised', 'labeled training']
