In [51]:
import csv
import numpy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the corpus: each CSV row is read as (topic file name, topic text).
print("Getting list of topic names and texts...")
# newline="" is what the csv module requires so that quoted fields containing
# embedded newlines (the topic texts are multi-line) are parsed correctly.
# NOTE(review): encoding="utf-8" assumes the export is UTF-8 — confirm; it makes
# the load independent of the platform's default encoding.
with open("01_topic_text.csv", newline="", encoding="utf-8") as file:
    topic_names = []
    topic_texts = []
    for line in csv.reader(file):
        # Column 0 is used as the topic name, column 1 as its text.
        topic_names.append(line[0])
        topic_texts.append(line[1])
print("Done!")

Getting list of topic names and texts...
Done!


In [6]:
# Sanity check: show the name and the first 200 characters of one loaded topic.
topic_id = 1
print(f"Topic {topic_id} is:", topic_names[topic_id])
print(f"{topic_texts[topic_id][:200]}...")

Topic 1 is: configuring-project-and-ide-settings.html
Configuring the IDE
IntelliJ IDEA allows you to configure the settings on several levels: the module level, the project level, and globally.
Global settings apply to all projects that you open with a ...


In [138]:
import string
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import word_tokenize

class LemmaTokenizer:
    """Callable tokenizer that splits a document with NLTK and lemmatizes each kept token.

    Tokens are kept only if they are longer than one character, fully
    alphanumeric, and not a punctuation string.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens for ``doc``."""
        lemmas = []
        for token in word_tokenize(doc):
            # Drop punctuation, single characters and anything non-alphanumeric.
            if token in string.punctuation:
                continue
            if len(token) <= 1 or not token.isalnum():
                continue
            lemmas.append(self.wnl.lemmatize(token))
        return lemmas

class StemTokenizer:
    """Callable tokenizer that splits a document with NLTK and Porter-stems each kept token.

    Tokens are kept only if they are longer than one character, fully
    alphanumeric, and not a punctuation string.
    """

    def __init__(self):
        self.ps = PorterStemmer()

    def __call__(self, doc):
        """Return the list of stemmed tokens for ``doc``."""
        stems = []
        for token in word_tokenize(doc):
            keep = (
                token not in string.punctuation
                and len(token) > 1
                and token.isalnum()
            )
            if keep:
                stems.append(self.ps.stem(token))
        return stems

In [202]:
# TF-IDF vectorizer that delegates tokenization (and stemming) to StemTokenizer.
# token_pattern=None: the default regex is never used once a custom tokenizer is
# supplied, and leaving it set makes scikit-learn emit a UserWarning on fit.
vectorizer = TfidfVectorizer(tokenizer=StemTokenizer(), token_pattern=None)

In [203]:
# Learn the vocabulary and IDF weights from the topic texts and build the
# document-term matrix (one row per topic) in a single pass.
corpus_vec = vectorizer.fit_transform(topic_texts)

In [204]:
# Matrix dimensions: (number of topics, number of distinct terms in the vocabulary).
print(corpus_vec.shape)

(1505, 5528)


In [205]:
# Inspect the learned vocabulary (sorted terms, one per matrix column).
# NOTE(review): the terms at the very end of the output sort after all ASCII
# entries, so they appear to start with a non-Latin look-alike of "c" (likely
# Cyrillic) — if so, they are separate features from "compil", "current",
# "custom" and the source texts may need normalizing. Confirm.
vectorizer.get_feature_names_out()

array(['0032a0', '0047e4', '007c7c', ..., 'сompil', 'сurrent', 'сustom'],
      dtype=object)

In [207]:
# Look up the column index assigned to one term; the key must be the token as
# produced by the tokenizer ("enabl" is presumably the stem of "enable"/"enabled").
vectorizer.vocabulary_["enabl"]

1547

In [208]:
# Sample search queries; the first two differ only by plural form, which
# presumably checks that stemming makes them rank identically.
queries = ["file template", "file templates", "reader mode"]

In [209]:
# Rank every topic against each query by TF-IDF cosine similarity and print the best hits.
top_k = 5  # maximum number of hits shown per query

for query in queries:
    # Project the query into the vocabulary learned from the corpus.
    query_vec = vectorizer.transform([query])
    # cosine_similarity returns an (n_topics, 1) column; flatten to one score per topic.
    results = cosine_similarity(corpus_vec, query_vec)
    scores = results.ravel()

    print(f"Query: {query}")

    if not scores.any():
        # No query term is in the vocabulary: every score is 0, so any "top 5"
        # would be an arbitrary slice of the corpus rather than real matches.
        print("(no matching topics)")
    else:
        # Highest score first; stop as soon as the remaining topics share no term with the query.
        for i in scores.argsort()[::-1][:top_k]:
            if scores[i] <= 0:
                break
            print(topic_names[i])

    print("--- --- ---")

Query: file template
settings-file-and-code-templates.html
save-file-as-template-dialog.html
using-file-and-code-templates.html
templates-dialog.html
templates.html
--- --- ---
Query: file templates
settings-file-and-code-templates.html
save-file-as-template-dialog.html
using-file-and-code-templates.html
templates-dialog.html
templates.html
--- --- ---
Query: reader mode
reader-mode.html
accessibility.html
ide-viewing-modes.html
viewing-modes.html
settings-appearance.html
--- --- ---
