### TF-IDF

In [1]:
import numpy as np
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# from sklearn.datasets import fetch_20newsgroups # test dataset

In [2]:
# Load the small English spaCy pipeline once; it is shared by every cell below.
nlp = spacy.load('en_core_web_sm')
# Pipeline components that `spacy_tokenizer` switches off while tokenizing.
unwanted_pipes = ["ner", "parser"]

In [3]:
# Location of the source spreadsheet and how many leading rows to sample.
ABSTRACTS_XLSX = 'C:/JupyterLab/github_repos/NLP_project/Dados/df_completo.xlsx'
MAX_ROWS = 1000

table = pd.read_excel(ABSTRACTS_XLSX)
# Slice first, then drop missing abstracts: the result can therefore hold
# fewer than MAX_ROWS entries and keeps the original row labels.
unprocessed_data = table['Abstract'][:MAX_ROWS].dropna()

In [4]:
def spacy_tokenizer(doc):
    """Tokenize `doc` with spaCy and return the lemmas of its alphabetic tokens.

    The pipeline components listed in `unwanted_pipes` are switched off for
    the duration of the call.
    """
    lemmas = []
    with nlp.disable_pipes(*unwanted_pipes):
        for token in nlp(doc):
            # Skip punctuation, whitespace and anything not purely alphabetic.
            if token.is_punct or token.is_space or not token.is_alpha:
                continue
            lemmas.append(token.lemma_)
    return lemmas

def preprocess_text(series):
    """Lower-case each text and strip stop words and non-alphabetic tokens.

    Parameters
    ----------
    series : iterable of str
        Raw texts to clean (for example a pandas Series of abstracts).

    Returns
    -------
    list of str
        One string per input text, in input order, made of the kept tokens
        (lower-cased) joined by single spaces.
    """
    processed_texts = []
    # Only the lexical flags `is_stop` and `is_alpha` are read here, so the
    # components in `unwanted_pipes` are skipped, and `nlp.pipe` processes the
    # texts in batches instead of one `nlp(text)` call per document.
    for doc in nlp.pipe(series, disable=unwanted_pipes):
        kept_words = [
            token.text.lower()
            for token in doc
            if not token.is_stop and token.is_alpha
        ]
        processed_texts.append(' '.join(kept_words))
    return processed_texts

In [5]:
# Clean every abstract, then keep an independent list copy as the corpus
# that gets vectorized.
data = preprocess_text(unprocessed_data)
corpus = [*data]

In [13]:
# Preview only the first few cleaned texts; printing the whole corpus floods
# the notebook output.
print(data[:5])



In [6]:
%%time
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer)
features = vectorizer.fit_transform(corpus)



CPU times: total: 2.05 s
Wall time: 6.62 s


In [7]:
# Matrix dimensions first, then the non-zero TF-IDF weights of the first document.
print(features.shape, features[0], sep="\n")

(878, 7596)
  (0, 5928)	0.05724450790675197
  (0, 6717)	0.03156833834249365
  (0, 6674)	0.0411963981638496
  (0, 1696)	0.03878217928356582
  (0, 2354)	0.036997194167933616
  (0, 5653)	0.03800157703462785
  (0, 1315)	0.06580594505203696
  (0, 7114)	0.15703985085856204
  (0, 5615)	0.06918904361292316
  (0, 3590)	0.10410573465652666
  (0, 4865)	0.03631110342834046
  (0, 5924)	0.0625062004226305
  (0, 526)	0.022120398250058872
  (0, 3591)	0.05574723101481619
  (0, 2304)	0.10253694721708613
  (0, 2456)	0.02353224766325066
  (0, 6093)	0.05710555776228502
  (0, 6920)	0.08059504282962508
  (0, 6502)	0.04038029272881981
  (0, 6321)	0.04131713096347153
  (0, 3972)	0.05850372491535683
  (0, 1265)	0.028389257436090356
  (0, 3070)	0.07049978051635222
  (0, 1106)	0.08911445820962889
  (0, 3554)	0.0325433610733976
  :	:
  (0, 518)	0.03557887126727706
  (0, 3357)	0.053545593200327285
  (0, 5382)	0.08605325879085532
  (0, 7138)	0.04026764255743444
  (0, 3091)	0.041685965458091
  (0, 4576)	0.08312378875

#### Query

In [8]:
def top_k(arr, k):
    """Return the indices of the `k` largest values of `arr`, largest first.

    If `k` exceeds the number of elements, all indices are returned.
    """
    # argsort is ascending; reverse it so the best score comes first.
    descending_order = np.argsort(arr)[::-1]
    return descending_order[:k]

In [9]:
# Lunar query: project the query text into the TF-IDF space of the fitted
# vectorizer (same vocabulary and IDF weights as the corpus).
query = ["lunar orbit"]
query_tfidf = vectorizer.transform(query)

In [10]:
# One similarity score per document row of `features`, as a flat 1-D array.
similarity_matrix = cosine_similarity(features, query_tfidf)
cosine_similarities = similarity_matrix.flatten()

In [11]:
# Indices are row positions in `features`, ordered from best to worst match.
top_related_indices = top_k(cosine_similarities, 5)
# Show the ranked indices, then their similarity scores on the next line.
print(top_related_indices, cosine_similarities[top_related_indices], sep="\n")

[  3 877 288 299 298]
[0.06993976 0.         0.         0.         0.        ]


In [12]:
# Satellite query: vectorize it, score it against every document, rank the top 5.
query = ["satellite"]
query_tfidf = vectorizer.transform(query)

cosine_similarities = cosine_similarity(features, query_tfidf).flatten()
top_related_indices = top_k(cosine_similarities, 5)

# Ranked row positions first, their similarity scores on the following line.
print(top_related_indices, cosine_similarities[top_related_indices], sep="\n")

[877 288 299 298 297]
[0. 0. 0. 0. 0.]
