SpaCy keyword extraction

In [1]:
# !python -m spacy download en_core_web_sm

In [2]:
import spacy 

# Load the Spacy model and create a new document 
nlp = spacy.load("en_core_web_sm") 

nlp

<spacy.lang.en.English at 0x1e079457bb0>

In [3]:
doc = nlp("This is a sample text for keyword extraction.") 

doc

This is a sample text for keyword extraction.

In [4]:
doc.noun_chunks

<generator at 0x1e07935dbc0>

In [5]:
# Use the noun_chunks property of the document to identify the noun phrases in the text 
noun_phrases = [chunk.text for chunk in doc.noun_chunks] 

noun_phrases

['This', 'a sample text', 'keyword extraction']

In [6]:
# Use term frequency-inverse document frequency (TF-IDF) analysis to rank the noun phrases 
from sklearn.feature_extraction.text import TfidfVectorizer 
vectorizer = TfidfVectorizer() 
tfidf = vectorizer.fit_transform([doc.text]) 

tfidf

<1x7 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [7]:
vectorizer.vocabulary_

{'this': 6,
 'is': 2,
 'sample': 4,
 'text': 5,
 'for': 1,
 'keyword': 3,
 'extraction': 0}

In [8]:
# If your corpus only contains one document, the IDF part of the TF-IDF will be the same for all words 
#    (since every word appears in 100% of the documents), so you’re essentially only looking at term frequency. 
#    This could lead to many words having the same TF-IDF score.

[tfidf[0, vectorizer.vocabulary_[word]] for word in vectorizer.vocabulary_.keys()]

[0.3779644730092272,
 0.3779644730092272,
 0.3779644730092272,
 0.3779644730092272,
 0.3779644730092272,
 0.3779644730092272,
 0.3779644730092272]

In [9]:
# Get the top 3 most important noun phrases 
top_phrases = sorted(vectorizer.vocabulary_, key=lambda x: tfidf[0, vectorizer.vocabulary_[x]], reverse=True)[:3] 

top_phrases

['this', 'is', 'sample']

NLTK keyword extraction

In [10]:
import nltk
nltk.download('averaged_perceptron_tagger')

# Preprocess the text by removing punctuation and converting to lowercase 
text = "This is a sample text for keyword extraction." 
text = text.lower().replace(".", "") 

text

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Alienware\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


'this is a sample text for keyword extraction'

In [11]:
# Tokenize the text into words 
tokens = nltk.word_tokenize(text) 

tokens

['this', 'is', 'a', 'sample', 'text', 'for', 'keyword', 'extraction']

In [12]:
# Use part-of-speech tagging to identify the nouns in the text 
tags = nltk.pos_tag(tokens) 

tags

[('this', 'DT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('sample', 'JJ'),
 ('text', 'NN'),
 ('for', 'IN'),
 ('keyword', 'NN'),
 ('extraction', 'NN')]

In [13]:
nouns = [word for (word, tag) in tags if tag == "NN"] 

nouns

['text', 'keyword', 'extraction']

In [14]:
# Use term frequency-inverse document frequency (TF-IDF) analysis to rank the nouns 
from sklearn.feature_extraction.text import TfidfVectorizer 
vectorizer = TfidfVectorizer() 
tfidf = vectorizer.fit_transform([text]) 

vectorizer.vocabulary_

{'this': 6,
 'is': 2,
 'sample': 4,
 'text': 5,
 'for': 1,
 'keyword': 3,
 'extraction': 0}

In [15]:
# If your corpus only contains one document, the IDF part of the TF-IDF will be the same for all words 
#    (since every word appears in 100% of the documents), so you’re essentially only looking at term frequency. 
#    This could lead to many words having the same TF-IDF score.

words_import = [tfidf[0, vectorizer.vocabulary_[word]] for word in vectorizer.vocabulary_.keys()]

words_import

[0.3779644730092272,
 0.3779644730092272,
 0.3779644730092272,
 0.3779644730092272,
 0.3779644730092272,
 0.3779644730092272,
 0.3779644730092272]

In [16]:
# Get the top 3 most important nouns 
top_nouns = sorted(vectorizer.vocabulary_, key=lambda x: tfidf[0, vectorizer.vocabulary_[x]], reverse=True)[:3] 

# Print the top 3 keywords 
print(top_nouns)

['this', 'is', 'sample']


BERT keyword extraction

In [17]:
import transformers 

# Load the BERT model and create a new tokenizer 
model = transformers.BertModel.from_pretrained("bert-base-uncased")  

model

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [18]:
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-uncased")

tokenizer

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [19]:
# Tokenize and encode the text 
input_ids = tokenizer.encode("This is a sample text for keyword extraction.", add_special_tokens=True) 

input_ids

[101, 2023, 2003, 1037, 7099, 3793, 2005, 3145, 18351, 14676, 1012, 102]

In [20]:
import torch

# Use BERT to encode the meaning and context of the words and phrases in the text 
outputs = model(torch.tensor([input_ids]), output_hidden_states=True, output_attentions=True) 


The outputs variable contains the output from the BERT model. Specifically, it’s a tuple containing the following elements:

Remember that hidden_size is 768 for BERT Base and 1024 for BERT Large. The batch size and sequence length depend on your input data.

In [21]:
# Sequence Output (each token encoding): 

# This is a tensor of shape (batch_size, sequence_length, hidden_size). 
# It represents the hidden states in the last layer for each token in the input sequence. 
# In other words, it’s a contextualized embedding for each token in the input.

print(outputs[0].shape)
outputs[0]

torch.Size([1, 12, 768])


tensor([[[-0.3476, -0.4094, -0.2909,  ..., -0.4571, -0.0713,  0.8328],
         [-0.3861, -0.6360, -0.2580,  ..., -0.2864,  0.7689,  0.4315],
         [-0.3096, -0.5142,  0.2317,  ..., -0.1392, -0.0091,  0.9189],
         ...,
         [-0.1831, -0.0702,  0.1542,  ..., -0.6890, -0.2608,  0.3044],
         [ 0.6661,  0.0375, -0.6640,  ...,  0.2270, -0.4533, -0.3437],
         [ 0.1989, -0.1620, -0.3995,  ...,  0.5692, -0.7531, -0.0771]]],
       grad_fn=<NativeLayerNormBackward0>)

In [22]:
# Pooled Output (all text encoding) :  

# This is a tensor of shape (batch_size, hidden_size). 
# It’s derived from the hidden state of the special [CLS] token in the last layer, 
# after being passed through a linear layer and a tanh activation function. 
# This output is often used for classification tasks.

print(outputs[1].shape)
outputs[1]

torch.Size([1, 768])


tensor([[-0.8658, -0.4209, -0.6395,  0.6057,  0.6033, -0.3086,  0.7101,  0.2981,
         -0.5178, -1.0000, -0.3377,  0.6333,  0.9709,  0.0152,  0.8384, -0.5312,
          0.0909, -0.5789,  0.4099, -0.1241,  0.4825,  0.9999,  0.3598,  0.4204,
          0.5166,  0.8455, -0.5856,  0.8948,  0.9299,  0.7788, -0.5277,  0.2765,
         -0.9860, -0.2951, -0.8095, -0.9860,  0.4320, -0.7537, -0.0907, -0.1106,
         -0.8812,  0.4814,  1.0000, -0.4155,  0.4344, -0.3932, -1.0000,  0.3595,
         -0.8635,  0.6884,  0.5974,  0.6493,  0.2502,  0.5112,  0.5414, -0.1730,
         -0.0789,  0.2362, -0.2428, -0.6733, -0.5817,  0.3656, -0.4781, -0.9050,
          0.6503,  0.4120, -0.2417, -0.3463, -0.1857,  0.0710,  0.7616,  0.2537,
         -0.0180, -0.7102,  0.3489,  0.3787, -0.6414,  1.0000, -0.2656, -0.9651,
          0.5244,  0.4501,  0.6277,  0.1057,  0.0520, -1.0000,  0.5785, -0.3051,
         -0.9830,  0.2097,  0.5389, -0.2841,  0.3116,  0.6470, -0.3391, -0.3244,
         -0.3486, -0.7008, -

In [23]:
# Hidden States (each token encod. in each BERT block) : 

# If you set output_hidden_states=True when loading the model, 
# outputs will also include the hidden states from all layers of the model. 
# This is a tuple of length num_hidden_layers + 1 (for the initial embeddings and all transformer layers), 
# where each element is a tensor of shape (batch_size, sequence_length, hidden_size).

print([outputs[2][i].shape for i in range(len(outputs[2]))])
outputs[2]

[torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768]), torch.Size([1, 12, 768])]


(tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
          [-0.6485,  0.6739, -0.0932,  ...,  0.4475,  0.6696,  0.1820],
          [-0.6270, -0.0633, -0.3143,  ...,  0.3427,  0.4636,  0.4594],
          ...,
          [-1.2840, -0.5970,  0.3023,  ...,  0.3650,  0.8621, -0.6559],
          [-0.3585,  0.2777, -0.1210,  ...,  0.5949,  0.6856,  0.7453],
          [-0.4771,  0.0871, -0.0770,  ..., -0.2191,  0.3020,  0.0196]]],
        grad_fn=<NativeLayerNormBackward0>),
 tensor([[[ 0.1370,  0.0181, -0.1411,  ...,  0.2278, -0.0924, -0.0353],
          [-0.6273,  0.3076,  0.2524,  ...,  0.1998,  0.2308,  0.0458],
          [-1.1002, -0.5674, -0.3320,  ...,  0.4697,  0.1516,  0.4446],
          ...,
          [-1.0307, -0.5239,  0.4133,  ...,  0.1205,  0.6193, -0.5466],
          [-0.4261, -0.0376,  0.0413,  ...,  0.3959,  0.3216,  0.5345],
          [-0.3650, -0.0494,  0.0122,  ..., -0.0882,  0.3893,  0.1930]]],
        grad_fn=<NativeLayerNormBackward0>),
 tensor([[[ 

In [24]:
# Attention Weights  (each token dependencies with all other tokens: measure of how much each token in a sentence is influenced by others) : 

# If you set output_attentions=True when loading the model,
# outputs will also include the attention weights from all layers of the model.
# This is a tuple of length num_hidden_layers, where each element is a tensor of shape 
# (batch_size, num_attention_heads, sequence_length, sequence_length).

print([outputs[3][i].shape for i in range(len(outputs[3]))])
# outputs[3]

[torch.Size([1, 12, 12, 12]), torch.Size([1, 12, 12, 12]), torch.Size([1, 12, 12, 12]), torch.Size([1, 12, 12, 12]), torch.Size([1, 12, 12, 12]), torch.Size([1, 12, 12, 12]), torch.Size([1, 12, 12, 12]), torch.Size([1, 12, 12, 12]), torch.Size([1, 12, 12, 12]), torch.Size([1, 12, 12, 12]), torch.Size([1, 12, 12, 12]), torch.Size([1, 12, 12, 12])]


In [25]:
outputs[3][0] # == attention_weights[0] below 

tensor([[[[5.1807e-02, 9.1523e-02, 3.6972e-02,  ..., 4.2061e-02,
           1.0813e-01, 2.7775e-01],
          [1.8288e-01, 3.7304e-02, 1.3999e-01,  ..., 4.8936e-02,
           8.8809e-02, 7.1775e-02],
          [1.2814e-01, 5.2379e-02, 1.1465e-01,  ..., 6.7462e-02,
           5.1583e-02, 4.8481e-02],
          ...,
          [2.9144e-02, 4.3075e-02, 4.1256e-02,  ..., 5.7146e-02,
           3.3228e-02, 9.3269e-02],
          [7.9261e-02, 8.3060e-02, 7.5335e-02,  ..., 6.6719e-02,
           1.3202e-01, 9.0765e-02],
          [8.8118e-02, 1.2103e-01, 6.8236e-02,  ..., 5.6765e-02,
           1.5303e-01, 1.2386e-01]],

         [[6.0402e-01, 6.7658e-03, 8.2498e-03,  ..., 1.3216e-02,
           3.3096e-02, 5.9370e-03],
          [9.8433e-03, 1.6769e-02, 1.3894e-01,  ..., 9.9515e-02,
           2.4098e-02, 4.1944e-02],
          [1.1230e-02, 3.0699e-02, 3.7648e-02,  ..., 1.4532e-01,
           2.0109e-02, 2.8752e-02],
          ...,
          [7.4355e-02, 4.2029e-02, 7.6803e-02,  ..., 6.3780

In [38]:
# Use the attention weights of the tokens to identify the most important words and phrases 
'''
The line top_tokens = sorted(attention_weights[0], key=lambda x: x[1], reverse=True)[:3] 
sorts these tuples based on the attention weight (the second element of each tuple) (?? to check ??)
'''

attention_weights = outputs[-1] 
top_tokens = sorted(attention_weights[0], key=lambda x: x[1], reverse=True)[:3] 

top_tokens

[tensor([[[5.1807e-02, 9.1523e-02, 3.6972e-02,  ..., 4.2061e-02,
           1.0813e-01, 2.7775e-01],
          [1.8288e-01, 3.7304e-02, 1.3999e-01,  ..., 4.8936e-02,
           8.8809e-02, 7.1775e-02],
          [1.2814e-01, 5.2379e-02, 1.1465e-01,  ..., 6.7462e-02,
           5.1583e-02, 4.8481e-02],
          ...,
          [2.9144e-02, 4.3075e-02, 4.1256e-02,  ..., 5.7146e-02,
           3.3228e-02, 9.3269e-02],
          [7.9261e-02, 8.3060e-02, 7.5335e-02,  ..., 6.6719e-02,
           1.3202e-01, 9.0765e-02],
          [8.8118e-02, 1.2103e-01, 6.8236e-02,  ..., 5.6765e-02,
           1.5303e-01, 1.2386e-01]],
 
         [[6.0402e-01, 6.7658e-03, 8.2498e-03,  ..., 1.3216e-02,
           3.3096e-02, 5.9370e-03],
          [9.8433e-03, 1.6769e-02, 1.3894e-01,  ..., 9.9515e-02,
           2.4098e-02, 4.1944e-02],
          [1.1230e-02, 3.0699e-02, 3.7648e-02,  ..., 1.4532e-01,
           2.0109e-02, 2.8752e-02],
          ...,
          [7.4355e-02, 4.2029e-02, 7.6803e-02,  ..., 6.378

In [39]:
# Decode the top tokens and print the top 3 keywords 
# top_keywords = [tokenizer.decode([token[0]]) for token in top_tokens] ----------------- to check ------------------
top_keywords = [tokenizer.decode([token[0][0][5]]) for token in top_tokens] 
print(top_keywords)



['[PAD]']
