In [1]:
# Alternative sample document (currently disabled): a longer free-text report.
# To try it, assign the text below to `doc` instead of the short sample.
#
#   “Two weeks after the flu shot given to my son's right arm he told me that he felt his arm sleepy
#   and he couldn't life his arm on it's own.  I thought is was a muscle tension so tried heating pad and
#   massage the area. When I didn't see any improvement in 2 days I made an appointment  with a hand and
#   upper extremity specialist and also neurologustbecause the area affected was his brachial plexus
#   of his C5 and C6 with the supraspinatus and infradpinstus muscles where affected. Currently on
#   occupational therapist and had a EMG done to check his nerve signals.”

# Active sample document. The text starts with a newline character, exactly as
# the original triple-quoted literal did.
doc = (
    "\n"
    "I went to get my second dose of the covid vaccine yesterday. "
    "My arm is now sore, and I have a slight headache."
)



In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Length, in words, of each candidate keyword/keyphrase as (min_n, max_n).
# (1, 1) yields single words; use e.g. (2, 2) for two-word keyphrases.
n_gram_range = (1, 1)

# Stop words are very common words ("a", "the", "is", "are", "and", ...) that
# carry little meaning and are filtered out before processing the text.
# "english" selects scikit-learn's built-in English stop-word list.
stop_words = "english"

# Extract candidate words/phrases.
# CountVectorizer converts a collection of text documents into term counts;
# fit() builds the vocabulary (by default, tokens are words separated by
# whitespace and punctuation). Only that vocabulary is needed here: every
# n-gram found in the document becomes a keyword candidate.
count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old method only
# on releases that predate the new one. The new method returns a NumPy array,
# so convert it to a plain list of str to keep `candidates` the same type as
# before for the cells that index into it or pass it to the embedding model.
if hasattr(count, "get_feature_names_out"):
    candidates = count.get_feature_names_out().tolist()
else:
    candidates = count.get_feature_names()

In [6]:
# Embed the document and the candidate keywords with the same model, so that
# the two sets of vectors can be compared with cosine similarity afterwards.
# NOTE(review): the output recorded for this cell is
# "ModuleNotFoundError: No module named 'torch._C'", which presumably points at
# a broken PyTorch installation in the environment rather than at this code --
# confirm by reinstalling torch and re-running from a fresh kernel.

from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model by name.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
# The document is wrapped in a one-element list before encoding.
doc_embedding = model.encode([doc])
# Encode the whole candidate list in one call
# (presumably one embedding per candidate -- TODO confirm the returned shape).
candidate_embeddings = model.encode(candidates)

ModuleNotFoundError: No module named 'torch._C'

In [4]:
#finding the cosine similarity between the document and the candidates
#smaller the angle, higher the cosine similarity.

from sklearn.metrics.pairwise import cosine_similarity

# Number of keywords to keep.
top_n = 5

# Despite its name, `distances` holds cosine *similarities* between the
# document and every candidate (higher means closer to the document).
distances = cosine_similarity(doc_embedding, candidate_embeddings)

# argsort is ascending, so the tail of the first row's ordering holds the
# top_n most similar candidates, least similar of them first.
keywords = [
    candidates[position]
    for position in distances[0].argsort()[-top_n:]
]

NameError: name 'doc_embedding' is not defined

In [137]:
# Show the selected keywords. They are listed in ascending order of similarity
# to the document (most similar last), since they are taken from the tail of
# an ascending argsort.
# NOTE(review): this cell's execution count (137) is far out of sequence with
# the cells around it, so the recorded output presumably comes from an earlier
# kernel state -- confirm with Restart Kernel -> Run All.
print(keywords)

['covid', 'sore', 'yesterday', 'vaccine', 'headache']


In [None]:
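# The cells below apply two diversification methods (Max Sum Similarity and
# Maximal Marginal Relevance) so that the selected keywords are less redundant.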

In [128]:
#Max Sum Similarity

#The maximum sum distance between pairs of data is defined as 
#the pairs of data for which the distance between them is maximized. 
#In our case, we want to maximize the candidate similarity to the document whilst minimizing the similarity between candidates.
#To do this, we select the top 20 keywords/keyphrases, and from those 20, select the 5 that are the least similar to each other

import numpy as np
import itertools

def max_sum_sim(doc_embedding, word_embeddings, words, top_n, nr_candidates):
    """Select ``top_n`` keywords with the Max Sum Similarity strategy.

    The ``nr_candidates`` words most similar to the document are shortlisted.
    Among them, the combination of ``top_n`` words whose summed pairwise
    cosine similarity is smallest (i.e. the most mutually diverse one) is
    returned.

    Parameters
    ----------
    doc_embedding : array-like
        Embedding of the document; only its first row is used for ranking.
    word_embeddings : array-like
        One embedding per entry of ``words``, in the same order.
    words : sequence of str
        Candidate keywords/keyphrases.
    top_n : int
        Number of keywords to return. Clamped to the shortlist size.
    nr_candidates : int
        Size of the shortlist. Clamped to ``len(words)``.

    Returns
    -------
    list of str
        The selected keywords, or an empty list when ``top_n`` or
        ``nr_candidates`` is not positive or ``words`` is empty.

    Notes
    -----
    Every ``top_n``-sized combination of the shortlist is evaluated, so the
    cost grows combinatorially with ``nr_candidates``.
    """
    # Clamp the sizes: previously top_n > nr_candidates produced no
    # combination at all and crashed on the final return, and
    # nr_candidates == 0 silently selected *all* words through `[-0:]`.
    nr_candidates = min(nr_candidates, len(words))
    top_n = min(top_n, nr_candidates)
    if top_n <= 0:
        return []

    # Similarity of each word to the document, and of the words to each other.
    # Use the function's own arguments rather than notebook-level globals.
    distances = cosine_similarity(doc_embedding, word_embeddings)
    distances_candidates = cosine_similarity(word_embeddings,
                                             word_embeddings)

    # Shortlist the nr_candidates words most similar to the document
    # (argsort is ascending, so they sit at the tail of the first row).
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [words[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Find the combination of shortlisted words that are least similar to
    # each other. Each unordered pair is counted twice (i, j and j, i), which
    # does not affect which combination is the minimum.
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(distances_candidates[i][j]
                  for i in combination for j in combination if i != j)
        # `candidate is None` guarantees a result even if a score is NaN.
        if candidate is None or sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

# Shortlist up to 20 candidates and keep the 5 least similar to each other;
# as the last expression of the cell, the returned list is displayed.
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=20)

['slight', 'covid', 'yesterday', 'vaccine', 'headache']

In [129]:
#Maximal Marginal Relevance

#We start by selecting the keyword/keyphrase that is the most similar to the document. 
#Then, we iteratively select new candidates that are both similar to the document 
#and not similar to the already selected keywords/keyphrases

import numpy as np

def mmr(doc_embedding, word_embeddings, words, top_n, diversity):
    """Select ``top_n`` keywords with Maximal Marginal Relevance (MMR).

    The word most similar to the document is chosen first. Each following
    pick maximizes
    ``(1 - diversity) * sim(word, doc) - diversity * max sim(word, chosen)``,
    trading relevance to the document against redundancy with the keywords
    already selected.

    Parameters
    ----------
    doc_embedding : array-like
        Embedding of the document.
        NOTE(review): the index arithmetic below assumes a single document
        row -- confirm against callers.
    word_embeddings : array-like
        One embedding per entry of ``words``, in the same order.
    words : sequence of str
        Candidate keywords/keyphrases.
    top_n : int
        Number of keywords to return. Clamped to ``len(words)``.
    diversity : float
        Weight of the redundancy penalty; 0 ranks purely by similarity to
        the document, larger values favor more diverse keywords.

    Returns
    -------
    list of str
        The selected keywords in selection order, or an empty list when
        ``top_n`` is not positive or ``words`` is empty.
    """
    # Clamp top_n: asking for more keywords than there are words used to
    # crash on an empty argmax, and top_n <= 0 used to return one keyword.
    top_n = min(top_n, len(words))
    if top_n <= 0:
        return []

    # Extract similarity within words, and between words and the document
    word_doc_similarity = cosine_similarity(word_embeddings, doc_embedding)
    word_similarity = cosine_similarity(word_embeddings)

    # Initialize candidates and already choose the best keyword/keyphrase
    keywords_idx = [int(np.argmax(word_doc_similarity))]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    for _ in range(top_n - 1):
        # Similarity of the remaining candidates to the document, and their
        # highest similarity to any keyword selected so far
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # Calculate the MMR score of every remaining candidate. The result is
        # named `mmr_scores` so it does not shadow the function's own name.
        mmr_scores = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr_scores)]

        # Update keywords & candidates
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

# Select 5 keywords with a diversity weight of 0.7; as the last expression of
# the cell, the returned list is displayed.
mmr(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.7)

['headache', 'yesterday', 'vaccine', 'covid', 'second']