In [None]:
# Sample passage describing supervised learning. It is the single input
# document that the vectorizer and the sentence encoder in later cells consume.
doc = """
         Supervised learning is the machine learning task of
         learning a function that maps an input to an output based
         on example input-output pairs.[1] It infers a function
         from labeled training data consisting of a set of
         training examples.[2] In supervised learning, each
         example is a pair consisting of an input object
         (typically a vector) and a desired output value (also
         called the supervisory signal). A supervised learning
         algorithm analyzes the training data and produces an
         inferred function, which can be used for mapping new
         examples. An optimal scenario will allow for the algorithm
         to correctly determine the class labels for unseen
         instances. This requires the learning algorithm to
         generalize from the training data to unseen situations
         in a 'reasonable' way (see inductive bias).
      """

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer settings: single words only, using the built-in English stop-word list
n_gram_range = (1, 1)
stopwords = "english"

# Learn the document's vocabulary; its entries are the candidate keywords
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stopwords)
count.fit([doc])
candidates = count.get_feature_names_out()

In [None]:
# Show the single-word candidates extracted from the document
candidates

array(['algorithm', 'allow', 'analyzes', 'based', 'bias', 'called',
       'class', 'consisting', 'correctly', 'data', 'desired', 'determine',
       'example', 'examples', 'function', 'generalize', 'inductive',
       'inferred', 'infers', 'input', 'instances', 'labeled', 'labels',
       'learning', 'machine', 'mapping', 'maps', 'new', 'object',
       'optimal', 'output', 'pair', 'pairs', 'produces', 'reasonable',
       'requires', 'scenario', 'set', 'signal', 'situations',
       'supervised', 'supervisory', 'task', 'training', 'typically',
       'unseen', 'used', 'value', 'vector', 'way'], dtype=object)

In [None]:
# Switch to three-word phrases (trigrams); the stop-word setting is reused as-is
n_gram_range = (3, 3)

# Learn the trigram vocabulary of the document as candidate keyphrases
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stopwords)
count.fit([doc])
candidates_1 = count.get_feature_names_out()

In [None]:
# Show the three-word candidate phrases
candidates_1

array(['algorithm analyzes training', 'algorithm correctly determine',
       'algorithm generalize training', 'allow algorithm correctly',
       'analyzes training data', 'based example input',
       'called supervisory signal', 'class labels unseen',
       'consisting input object', 'consisting set training',
       'correctly determine class', 'data consisting set',
       'data produces inferred', 'data unseen situations',
       'desired output value', 'determine class labels',
       'example input output', 'example pair consisting',
       'examples optimal scenario', 'examples supervised learning',
       'function labeled training', 'function maps input',
       'function used mapping', 'generalize training data',
       'inferred function used', 'infers function labeled',
       'input object typically', 'input output based',
       'input output pairs', 'instances requires learning',
       'labeled training data', 'labels unseen instances',
       'learning algorithm ana

In [None]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m58.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence_transformers)
  Downloading huggingface_hub-0.15.1-py3-

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model identified by this name
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# Embed the document; it is wrapped in a list so it is encoded as a batch of one
doc_embedding = model.encode([doc])
# Embed every single-word candidate with the same model
candidate_embeddings = model.encode(candidates)

Downloading (…)925a9/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)1a515925a9/README.md: 0.00B [00:00, ?B/s]

Downloading (…)515925a9/config.json:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)925a9/tokenizer.json: 0.00B [00:00, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading (…)1a515925a9/vocab.txt: 0.00B [00:00, ?B/s]

Downloading (…)15925a9/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Number of keywords to keep
top_n = 5

# Despite the name, `distances` holds cosine *similarities* (higher = closer to the document)
distances = cosine_similarity(doc_embedding, candidate_embeddings)

# argsort is ascending, so the last top_n positions are the most similar candidates
keywords = [candidates[index] for index in distances[0].argsort()[-top_n:]]

In [None]:
# Show the selected keywords, ordered from least to most similar to the document
keywords

['mapping', 'class', 'training', 'algorithm', 'learning']

In [None]:
# Show the cosine similarity between the document and each candidate (one row, one column per candidate)
distances

array([[0.45560035, 0.13360128, 0.3146087 , 0.13824822, 0.09117439,
        0.11441341, 0.40869993, 0.09669923, 0.13623556, 0.20845458,
        0.1888457 , 0.19503796, 0.16670373, 0.20907964, 0.23956464,
        0.21112515, 0.23319238, 0.20896894, 0.20128581, 0.27665883,
        0.12842578, 0.20875168, 0.18689685, 0.4604835 , 0.22884902,
        0.37005842, 0.2597358 , 0.15053248, 0.11785361, 0.1989449 ,
        0.19428237, 0.1733672 , 0.1812557 , 0.15908216, 0.12776855,
        0.14821349, 0.19003049, 0.15302044, 0.2340126 , 0.14074662,
        0.35380444, 0.22788446, 0.2615527 , 0.44877675, 0.14492646,
        0.0524976 , 0.14756116, 0.10999655, 0.12863496, 0.06691315]],
      dtype=float32)

In [None]:
# Show the candidates again; position i here corresponds to column i of `distances`
candidates

array(['algorithm', 'allow', 'analyzes', 'based', 'bias', 'called',
       'class', 'consisting', 'correctly', 'data', 'desired', 'determine',
       'example', 'examples', 'function', 'generalize', 'inductive',
       'inferred', 'infers', 'input', 'instances', 'labeled', 'labels',
       'learning', 'machine', 'mapping', 'maps', 'new', 'object',
       'optimal', 'output', 'pair', 'pairs', 'produces', 'reasonable',
       'requires', 'scenario', 'set', 'signal', 'situations',
       'supervised', 'supervisory', 'task', 'training', 'typically',
       'unseen', 'used', 'value', 'vector', 'way'], dtype=object)

## **Max Sum Similarity**

In [None]:
import numpy as np
import itertools

In [None]:
def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
  """Select `top_n` keywords that are relevant to the document but dissimilar to each other.

  The `nr_candidates` words most similar to the document form a candidate
  pool. Every combination of `top_n` words from that pool is scored by the sum
  of pairwise cosine similarities between its members, and the combination
  with the smallest sum (the most diverse one) is returned.

  Args:
    doc_embedding: 2-D array holding the document embedding (one row).
    word_embeddings: 2-D array with one embedding row per entry of `words`.
    words: Sequence of candidate words/phrases, aligned with `word_embeddings`.
    top_n: Number of keywords to return.
    nr_candidates: Size of the candidate pool taken from the words most
      similar to the document.

  Returns:
    A list with `top_n` entries of `words`.

  Raises:
    ValueError: If `top_n` is larger than the candidate pool, so no
      combination of that size exists.
  """
  # Similarity of every word to the document, and of every word to every other word
  distances = cosine_similarity(doc_embedding, word_embeddings)
  distances_candidates = cosine_similarity(word_embeddings, word_embeddings)

  # Keep the nr_candidates words most similar to the document (argsort is ascending)
  words_idx = list(distances.argsort()[0][-nr_candidates:])
  words_vals = [words[index] for index in words_idx]
  distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

  if top_n > len(words_idx):
    raise ValueError(
        "top_n (%d) exceeds the number of available candidates (%d)"
        % (top_n, len(words_idx))
    )

  # Find the combination of words that are the least similar to each other
  min_sim = np.inf
  candidate = None
  for combination in itertools.combinations(range(len(words_idx)), top_n):
    sim = sum(distances_candidates[i][j] for i in combination for j in combination if i != j)
    if sim < min_sim:
      candidate = combination
      min_sim = sim

  return [words_vals[idx] for idx in candidate]

## **Maximum Marginal Relevance**

In [None]:
def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
  """Select keywords with Maximal Marginal Relevance.

  Starts from the word most similar to the document, then repeatedly adds the
  remaining word with the highest score
  `(1 - diversity) * similarity_to_document - diversity * max_similarity_to_selected`.

  Args:
    doc_embedding: 2-D array holding the document embedding (one row).
    word_embeddings: 2-D array with one embedding row per entry of `words`.
    words: Sequence of candidate words/phrases, aligned with `word_embeddings`.
    top_n: Maximum number of keywords to return.
    diversity: Weight of the redundancy penalty; 0 ranks purely by
      similarity to the document, larger values favour dissimilar keywords.

  Returns:
    A list of at most `top_n` entries of `words`, in selection order. Fewer
    are returned when `words` has fewer than `top_n` entries; an empty list
    is returned when `top_n <= 0` or `words` is empty.
  """
  # Nothing to select: avoid running argmax on empty input
  if top_n <= 0 or len(words) == 0:
    return []

  # Similarity between words, and between each word and the document
  word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
  word_similarity = cosine_similarity(word_embeddings)

  # Seed the selection with the word closest to the document
  keywords_idx = [np.argmax(word_doc_similarity)]
  candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

  # Stop early once every word has been selected
  for _ in range(min(top_n - 1, len(candidates_idx))):
    # Relevance of each remaining word, and its closest match among the selected keywords
    candidate_similarities = word_doc_similarity[candidates_idx, :]
    target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

    # MMR score: reward relevance, penalise redundancy
    mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
    mmr_idx = candidates_idx[np.argmax(mmr_scores)]

    # Move the winner from the remaining pool to the selection
    keywords_idx.append(mmr_idx)
    candidates_idx.remove(mmr_idx)

  return [words[idx] for idx in keywords_idx]