## Keyword Extractor

[KE](https://github.com/protonx-tf-05-projects/vn-extract-keywords/blob/main/generate_keywords.py)

In [None]:
!pip3 install keyphrase-vectorizers

In [None]:
!pip3 install py_vncorenlp sentence-transformers

In [None]:
import py_vncorenlp
import os
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

stop_words = []
with open('../data/vietnamese-stopwords.txt', encoding='utf8') as f:
    for line in f:
        stop_words.append(line.strip())

doc = 'Túi xách là một trong những vật dụng cần thiết của hầu hết chị em phụ nữ mỗi khi ra đường.'

def removeStopWords(o_sen):
    words = [word for word in o_sen.split() if word not in stop_words]
    return " ".join(words)

py_vncorenlp.download_model(save_dir=os.path.abspath('./vncorenlp'))

# Load the word and sentence segmentation component
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir=os.path.abspath('./vncorenlp'))

doc_segmented = rdrsegmenter.word_segment(doc)
# Extract candidate words/phrases

count = CountVectorizer(ngram_range=(1,1)).fit([removeStopWords(doc_segmented[0])])
candidates = count.get_feature_names()

model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)


top_n = 10
distances = cosine_similarity(doc_embedding, candidate_embeddings)
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

print(keywords)

In [None]:
import underthesea
import os
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

stop_words = []
with open('../data/vietnamese-stopwords.txt', encoding='utf8') as f:
  for line in f:
    stop_words.append(line.strip().replace(' ', '_'))

def remove_vie_stop_words(o_sen):
  o_sen = o_sen.lower()
  words = [word for word in o_sen.split() if word not in stop_words]
  # print(words)
  return " ".join(words)


In [179]:
doc = '''Một nền tảng chung quy chính bản phần mềm dựa trên đám mây cho thị trường giáo dục chuyên nghiệp. 
Công ty khởi nghiệp này đang xây dựng cơ sở dữ liệu biểu đồ để phân tích dữ liệu, dữ liệu không gian địa lý 
và mô hình ngữ nghĩa. Họ cho biết nó sẽ cho phép các giảng viên phân tích sinh viên tốt hơn và cung cấp 
các công cụ giảng dạy tốt hơn.'''

In [None]:
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from keyphrase_vectorizers import KeyphraseCountVectorizer # No Vietnamese Spacy Pipeline

doc_segmented_list = [underthesea.word_tokenize(sentence, format='text') for sentence in doc.split('.')]
# doc_segmented = underthesea.word_tokenize(doc, format='text').split(' ')
# Extract candidate words/phrases

count = CountVectorizer(ngram_range=(1,1)).fit([remove_vie_stop_words(doc_segmented) for doc_segmented in doc_segmented_list])
candidates = count.get_feature_names_out()
# print(candidates)

# model = SentenceTransformer('keepitreal/vietnamese-sbert')

doc_embedding = model.encode(doc.split('.'))
candidate_embeddings = model.encode(candidates)

top_n = 10
distances = cosine_similarity(doc_embedding, candidate_embeddings)

#ndarray
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]

print(keywords)
# ['cơ_sở_dữ_liệu', 'không_gian', 'giảng_dạy', 'dựa', 'nền_tảng', 'đám', 'phần_mềm', 'mô_hình_ngữ', 'giáo_dục', 'mây']

In [None]:
from transformers import AutoTokenizer, AutoModel

# https://huggingface.co/keepitreal/vietnamese-sbert
# Load model from HuggingFace Hub
# tokenizer = AutoTokenizer.from_pretrained('keepitreal/vietnamese-sbert')
# model = AutoModel.from_pretrained('keepitreal/vietnamese-sbert')
model = SentenceTransformer('keepitreal/vietnamese-sbert')

In [None]:
import torch
from transformers import AutoModel, AutoTokenizer

phobert = AutoModel.from_pretrained("vinai/phobert-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

In [212]:
from underthesea import word_tokenize
import numpy as np

def phobert_sentence_embedding(raw_sent):
  # Word Segmented raw_sent
  text = word_tokenize(raw_sent, format="text")
  input_ids = torch.tensor([tokenizer.encode(line)])

  #disables gradient calculation.
  with torch.no_grad():
    features = phobert(input_ids).last_hidden_state[:,0,:] # Models outputs are now tuples

  # print(features.shape)
  # torch.Size([1, 768])
  return np.to_array(features)

In [213]:
phobert_sentence_embedding("Chào em")

tensor([[-4.0119e-01, -5.8289e-01,  3.3760e-01, -6.3421e-01, -1.0363e-01,
         -3.9937e-01,  4.5424e-01, -8.2677e-02,  2.8026e-01,  7.9030e-02,
         -3.0269e-02, -5.0870e-01,  7.3913e-02,  6.9086e-01, -2.5018e-01,
         -7.8047e-02, -7.4242e-01,  4.2725e-01, -4.2356e-01,  6.8896e-01,
          3.6071e-02, -2.9507e-01, -1.6285e-01, -2.6498e-01,  3.6117e-01,
         -5.6791e-01, -4.0655e-01, -1.7059e-03,  8.4485e-02, -1.2337e-01,
          2.1489e-03,  7.7132e-02,  1.1698e-01, -1.9466e-01,  2.1360e-01,
          2.5631e-01, -8.1675e-02, -3.5028e-02,  3.1785e-01,  1.0026e+00,
          3.6948e-01, -1.1465e-01, -7.6079e-02, -3.4069e-01,  2.1288e-01,
         -1.8685e-02,  4.0988e-01, -2.2896e-01, -6.5315e-02,  1.8802e-01,
          1.2441e-01, -1.1021e-01,  4.3317e-02, -3.3649e-02,  9.2148e-01,
          2.0216e-01,  4.9418e-02,  5.3662e-02, -3.9019e-01,  4.2857e-02,
         -4.1044e-01,  3.6343e-01, -5.7905e-02,  3.5092e-02,  1.8555e-01,
         -9.8274e-02,  2.7504e-01,  2.

In [209]:
def get_features_from_doc(raw_doc, word2vec) -> list:
  doc_segmented_list = [underthesea.word_tokenize(sentence, format='text') for sentence in raw_doc.split('.')]
  count = CountVectorizer(ngram_range=(1,1))
  matrix = count.fit_transform([remove_vie_stop_words(doc_segmented) for doc_segmented in doc_segmented_list])
  candidates = count.get_feature_names_out()

  words_embed = []
  for candidate in candidates:
    # try:
    #   word_embedding = word2vec[candidate]
    # except:
      word_embedding = phobert_sentence_embedding(candidate)[0]

      words_embed.append(word_embedding)


  return candidates, torch.tensor(words_embed)
  # return candidates

In [None]:
# https://github.com/sonvx/word2vecVN?fbclid=IwAR3oRcIyVSajJrolyQ2wJXvj5p1AfxGETZgJtkv0QpJTiZY6QvX3Y6tkhrk

from gensim.models import KeyedVectors
from gensim import models

word2vec_path='word2vec/baomoi.model.bin'
word_vectors = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [203]:
word_vectors["văn_bản"]

array([-3.6082964 ,  0.82996655,  0.739181  , -3.0137672 ,  0.9386643 ,
        0.5729815 ,  3.1587455 ,  1.1358377 , -3.1577187 , -0.3795338 ,
        0.8614265 , -3.7936888 , -0.6206701 , -1.395244  ,  0.72786427,
        0.49032864,  0.0818934 , -4.7055054 ,  0.66912216, -0.97138435,
        2.9170582 ,  3.6437726 , -1.7249912 ,  1.0940584 , -2.3356736 ,
        2.7478673 ,  2.5569503 , -0.9863114 ,  2.8252892 , -1.513991  ,
       -2.8017695 , -0.05285136,  0.20108384, -3.9713352 , -1.5073471 ,
        1.5789729 ,  2.6081314 ,  1.1301067 ,  2.474019  , -0.27810496,
       -3.5028772 ,  1.7702115 ,  1.039428  , -0.08655253,  3.1031399 ,
        0.19271071, -1.7972337 , -2.8442361 ,  1.023537  ,  1.3212758 ,
       -0.81874555, -0.99070144, -0.11126587,  0.2582027 ,  0.9283447 ,
        1.3093848 ,  0.25714013,  0.4114246 ,  0.5528492 , -0.7850197 ,
       -2.4544306 ,  1.0792232 ,  1.4879901 , -3.5585694 , -0.92100114,
        3.013928  ,  3.0351725 ,  0.8748696 ,  0.18200335,  4.12

In [None]:
torch.tensor(word_vectors["chào"])
# torch.Size([400])

In [200]:
from underthesea import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity

# https://medium.com/@eskandar.sahel/exploring-feature-extraction-techniques-for-natural-language-processing-46052ee6514
def keyword_extractor(raw_doc, word2vec, top_n = 5):
  candidates, candidates_embedding = get_features_from_doc(raw_doc, word2vec)
  sentences = raw_doc.split('.')
  keywords = set([])
  for idx in range(len(sentences)):
    raw_sentence = sentences[idx]
    sentence_embedding = phobert_sentence_embedding(raw_sentence)

    print(sentence_embedding.shape)
    print('---\n', candidates_embedding.shape)
    cosine_similarity(sentence_embedding, candidates_embedding)
    list_cand = [candidates[index] for index in distances.argsort()[idx][-top_n:]]
    for cand in list_cand:
      keywords.add(cand)

  return keywords

In [211]:
from sklearn.feature_extraction.text import CountVectorizer
from keyphrase_vectorizers import KeyphraseCountVectorizer # No Vietnamese Spacy Pipeline
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

doc_segmented_list = [underthesea.word_tokenize(sentence, format='text') for sentence in doc.split('.')]
# doc_segmented = underthesea.word_tokenize(doc, format='text').split(' ')
# Extract candidate words/phrases

count = CountVectorizer(ngram_range=(1,1))
matrix = count.fit_transform([remove_vie_stop_words(doc_segmented) for doc_segmented in doc_segmented_list])
candidates = count.get_feature_names_out()
# print("Vocabulary: ", count.vocabulary_)
print("Feature: ", candidates)

# print(counts)

# model = SentenceTransformer('keepitreal/vietnamese-sbert')

doc_embedding = model.encode(doc.split('.'))
# candidate_embeddings = model.encode(candidates)

# doc_embedding = [vie_sentence_embedding(sent) for sent in doc.split('.')]
candidate_embeddings = model.encode(candidates)
print(doc_embedding)
top_n = 5
distances = cosine_similarity(doc_embedding, candidate_embeddings)
# print(distances)
# [ 0.26308233 -0.07585391 -0.18924099 ... -0.1127905  -0.17521721 0.08920351]
#ndarray
keywords = set([])

for num in range(len(doc_embedding)):
  list_cand = [candidates[index] for index in distances.argsort()[num][-top_n:]]
  for x in list_cand:
    keywords.add(x)

print(keywords)
# ['cơ_sở_dữ_liệu', 'không_gian', 'giảng_dạy', 'dựa', 'nền_tảng', 'đám', 'phần_mềm', 'mô_hình_ngữ', 'giáo_dục', 'mây']

Feature:  ['biểu_đồ' 'chuyên_nghiệp' 'cung_cấp' 'công_cụ' 'công_ty' 'cơ_sở_dữ_liệu'
 'dữ_liệu' 'dựa' 'giáo_dục' 'giảng_dạy' 'giảng_viên' 'không_gian'
 'khởi_nghiệp' 'mây' 'mô_hình_ngữ' 'nghĩa' 'nền_tảng' 'phân_tích'
 'phần_mềm' 'quy_chính' 'sinh_viên' 'thị_trường' 'xây_dựng' 'đám' 'địa_lý']
[[ 0.30408776 -0.1469394  -0.11775875 ... -0.24302202 -0.21559511
  -0.04005393]
 [ 0.20555344 -0.2437994  -0.10414701 ... -0.14379044 -0.47291678
   0.00733215]
 [ 0.28972203  0.0674792  -0.02028078 ...  0.41467595 -0.13666882
  -0.48374814]
 [-0.56419903 -0.2203982   0.25754747 ...  0.02115586  0.16262352
  -0.10238987]]
{'dữ_liệu', 'mô_hình_ngữ', 'địa_lý', 'đám', 'chuyên_nghiệp', 'phần_mềm', 'mây', 'giảng_dạy', 'dựa', 'giáo_dục', 'cơ_sở_dữ_liệu', 'không_gian', 'nền_tảng', 'giảng_viên'}


In [205]:
candidate_embeddings = model.encode(candidates)
print(candidate_embeddings)

[[ 0.34032217 -0.04999039 -0.2946525  ...  0.24886976 -0.12449721
  -0.45401725]
 [ 0.17048645  0.21563308 -0.18239711 ... -0.449849    0.03005989
  -0.23290817]
 [ 0.01932761 -0.16716848  0.32516825 ... -0.26481768  0.19191629
  -0.193913  ]
 ...
 [-0.07969067  0.38819042  0.0575594  ... -0.04781376 -0.27206686
  -0.44541565]
 [-0.23443066  0.31187168  0.14679874 ... -0.3631808   0.37211597
  -0.49685523]
 [ 0.09350439 -0.09177337 -0.2073117  ... -0.09563017  0.21218896
   0.24592485]]


In [214]:
keyword_extractor(doc, word2vec=word_vectors)

ValueError: only one element tensors can be converted to Python scalars

In [None]:
def get_candidate_embedding(sentence):
  phobert_sentence_embedding(sentence)