In [1]:
import pandas
import gensim
import csv
import nltk

In [2]:
print("Getting list of topic names...")
# newline="" hands line-ending handling to the csv module, as the csv docs
# require for file objects (otherwise quoted fields with embedded newlines
# are split incorrectly).
with open("01_topic_text.csv", newline="") as file:
    # Keep only the first column of every row. The position of a name in this
    # list is the topic ID used by later cells (e.g. topic_names[topic_id]).
    topic_names = [row[0] for row in csv.reader(file)]
print("Done!")

Getting list of topic names...
Done!


In [3]:
print("Getting corpus of word tokens...")
# newline="" hands line-ending handling to the csv module, as the csv docs
# require for file objects passed to csv.reader.
with open("02a_word_corpus.csv", newline="") as file:
    # Each CSV row becomes one list of string tokens; rows are indexed by the
    # same topic ID as topic_names in later cells.
    corpus = list(csv.reader(file))
print("Done!")

Getting corpus of word tokens...
Done!


In [4]:
# Sample topic that later cells keep inspecting via `topic_id`.
topic_id = 100
sample_name, sample_tokens = topic_names[topic_id], corpus[topic_id]
print(f"Topic {topic_id} is:", sample_name)
print(sample_tokens)

Topic 100 is: create-ssh-configurations.html
['create', 'ssh', 'configurations', 'intellij', 'idea', 'save', 'remote', 'server', 'ssh', 'connection', 'parameters', 'dedicated', 'ssh', 'configuration', 'created', 'configuration', 'used', 'configuring', 'remote', 'interpreters', 'connecting', 'sftp', 'deployment', 'servers', 'launching', 'ssh', 'sessions', 'settings', 'preferences', 'dialog', 'ctrl', 'alt', 'go', 'tools', 'ssh', 'configurations', 'left', 'hand', 'pane', 'lists', 'existing', 'ssh', 'configurations', 'click', 'use', 'visible', 'project', 'checkbox', 'configure', 'visibility', 'server', 'access', 'configuration', 'select', 'checkbox', 'restrict', 'use', 'ssh', 'configuration', 'current', 'project', 'ssh', 'configuration', 'cannot', 'reused', 'outside', 'current', 'project', 'appear', 'list', 'available', 'configurations', 'projects', 'ssh', 'configurations', 'stored', 'idea', 'directory', 'together', 'project', 'allows', 'sharing', 'team', 'members', 'vcs', 'ssh', 'configur

In [5]:
print("Lets create a dictionary...")
# Map every distinct token of the word corpus to an integer ID.
dictionary = gensim.corpora.Dictionary(documents=corpus)
print(dictionary)

Lets create a dictionary...
Dictionary<6860 unique tokens: ['accessibility', 'added', 'allow', 'aloud', 'also']...>


In [8]:
# The dictionary can be queried in both directions: ID -> token and token -> ID.
word_for_id = dictionary[806]
id_for_word = dictionary.token2id["write"]
print("Here is how to get a word by its ID:", word_for_id)
print("And this is how to find the ID of a word:", id_for_word)

Here is how to get a word by its ID: workspace
And this is how to find the ID of a word: 1123


In [9]:
print("Now we can vectorize all topics based on the term frequencies...")
# One bag-of-words vector (list of (token_id, count) pairs) per topic.
freq_vectors = list(map(dictionary.doc2bow, corpus))
print("Done!")

Now we can vectorize all topics based on the term frequencies...
Done!


In [10]:
# Show the bag-of-words vector of the sample topic and how many distinct tokens it has.
sample_vector = freq_vectors[topic_id]
print(f"Here is a frequency vector of {topic_names[topic_id]} with {len(sample_vector)} unique tokens:")
print(sample_vector)

Here is a frequency vector of create-ssh-configurations.html with 134 unique tokens:
[(6, 1), (9, 1), (10, 1), (16, 5), (19, 1), (20, 1), (21, 2), (37, 8), (38, 6), (41, 1), (42, 1), (46, 1), (61, 1), (66, 1), (68, 3), (69, 1), (71, 3), (75, 2), (86, 4), (90, 1), (95, 2), (97, 2), (98, 1), (104, 1), (105, 1), (110, 2), (115, 1), (116, 6), (117, 1), (120, 1), (124, 1), (125, 2), (127, 1), (128, 1), (130, 1), (148, 1), (160, 1), (167, 5), (168, 3), (177, 1), (187, 1), (215, 1), (218, 1), (227, 1), (232, 1), (235, 1), (238, 1), (250, 1), (253, 1), (255, 1), (268, 1), (275, 1), (281, 1), (284, 1), (286, 1), (289, 2), (295, 4), (315, 4), (320, 5), (321, 1), (340, 1), (347, 1), (349, 1), (367, 1), (380, 1), (393, 1), (397, 1), (401, 1), (402, 1), (407, 1), (435, 1), (436, 1), (485, 2), (520, 1), (525, 1), (541, 1), (560, 6), (566, 1), (604, 1), (647, 1), (658, 6), (688, 2), (707, 3), (731, 1), (734, 1), (756, 1), (766, 3), (779, 17), (787, 1), (791, 2), (795, 1), (876, 2), (880, 1), (929, 1)

In [11]:
print("Lets train a TFIDF model...")
# Fit TFIDF weights on the bag-of-words vectors of all topics.
tfidf = gensim.models.TfidfModel(corpus=freq_vectors)
print("Done!")

Lets train a TFIDF model...
Done!


In [12]:
# Indexing the model with a bag-of-words vector yields its TFIDF-weighted form.
sample_tfidf = tfidf[freq_vectors[topic_id]]
print("We can pass a topic's frequency vector to calculate the topic's TFIDF vector.")
print("For example, here is the TFIDF vector of", topic_names[topic_id])
print(sample_tfidf)

We can pass a topic's frequency vector to calculate the topic's TFIDF vector.
For example, here is the TFIDF vector of create-ssh-configurations.html
[(6, 0.028909835466749238), (9, 0.015589860254431619), (10, 0.010228950367729235), (16, 0.07070292231325756), (19, 0.010708240852542744), (20, 0.01622338213237244), (21, 0.031179720508863237), (37, 0.03043509046940126), (38, 0.02297947991575833), (41, 0.007572747968287224), (42, 0.0347619597773027), (46, 0.039699483188862975), (61, 0.019538562618527842), (66, 0.010810069328799356), (68, 0.008167578957028304), (69, 0.010150550626298166), (71, 0.022570579000358474), (75, 0.013082336239924428), (86, 0.017771130926609636), (90, 0.022725588254986443), (95, 0.05105142835391318), (97, 0.06586522475071023), (98, 0.010506654089129869), (104, 0.017936522691191562), (105, 0.022725588254986443), (110, 0.008357781906063599), (115, 0.04495774252938348), (116, 0.09353916152658971), (117, 0.0332068980094234), (120, 0.010426779076705625), (124, 0.00475216

In [13]:
print("We can similarly pass any bag of words from our dictionary and count the TFIDF values for it.")
words = ["notifications", "template", "notification", "use", "templates", "file", "code"]
print("For example, lets try:", words)

# Vectorize once and reuse the result for both printouts.
words_bow = dictionary.doc2bow(words)
print("Frequency vector:", words_bow)
print("TFIDF vector:", tfidf[words_bow])

# Spot-check which token is behind one ID.
spot_check_id = 128
print(dictionary[spot_check_id])

We can similarly pass any bag of words from our dictionary and count the TFIDF values for it.
For example, lets try: ['notifications', 'template', 'notification', 'use', 'templates', 'file', 'code']
Frequency vector: [(49, 1), (50, 1), (86, 1), (111, 1), (130, 1), (578, 1), (579, 1)]
TFIDF vector: [(49, 0.48232568064825554), (50, 0.7595777640152309), (86, 0.04701211346661294), (111, 0.09242627487709114), (130, 0.07002985236199662), (578, 0.28410187782432184), (579, 0.306647592352226)]
existing


In [20]:
# ID 579 is one of the entries in the vectors printed by the previous cell; show its token.
token_id = 579
print(dictionary[token_id])

templates


In [21]:
print("Now lets create a search index...")
# Index the TFIDF-weighted vectors of all topics; the feature space is the dictionary size.
tfidf_corpus = tfidf[freq_vectors]
index = gensim.similarities.SparseMatrixSimilarity(tfidf_corpus, num_features=len(dictionary))
print("Done!")

Now lets create a search index...
Done!


In [22]:
# Iterating the index yields one array per indexed topic: its similarities to all topics.
# NOTE(review): this prints a row for every topic, which is a lot of output.
for similarity_row in index:
    print(similarity_row)

[1.         0.01587038 0.01944593 ... 0.00204912 0.04178446 0.0105508 ]
[0.01587038 1.         0.02791603 ... 0.00984822 0.02748322 0.00518559]
[0.01944593 0.02791603 0.9999998  ... 0.00594655 0.03418799 0.01316745]
[0.00635474 0.01377373 0.01523568 ... 0.00635849 0.02032415 0.00656701]
[0.01791443 0.01909164 0.03998194 ... 0.01124821 0.07644653 0.26521984]
[0.00748923 0.01352946 0.01088319 ... 0.07084955 0.0232591  0.02996931]
[0.00778598 0.00871798 0.00915771 ... 0.00753994 0.01822897 0.05254068]
[0.04951287 0.0131467  0.03974944 ... 0.0125767  0.04015921 0.01295632]
[0.03935448 0.04895874 0.1249641  ... 0.02321024 0.09182823 0.02447462]
[0.01689503 0.02785479 0.05132109 ... 0.00706234 0.05915422 0.00935975]
[0.01242426 0.02062029 0.09520372 ... 0.01219787 0.02898123 0.00819882]
[0.00578108 0.03921133 0.02211501 ... 0.00400371 0.03241805 0.0021794 ]
[0.007708   0.01433083 0.02576579 ... 0.01950181 0.0922685  0.01391675]
[0.01282994 0.01481464 0.03274583 ... 0.03143226 0.04540933 0.00

In [27]:
print("We can pass a TFIDF vector to the index and it will count how close it is to the TFIDF vectors of every topic.")
print("For example, we can pass a TFIDF vector of a search query:")

query = "file templates"
print(query)

# Plain whitespace tokenization of the query, then vectorize with the word dictionary.
query_tokens = query.split()
q_bow = dictionary.doc2bow(query_tokens)
print("Frequency vector of the query:", q_bow)

# Weight the query with the same TFIDF model the index was built from.
q_tfidf = tfidf[q_bow]
print("TFIDF vector of the query:", q_tfidf)

# One score per indexed topic; later cells rank topics by this array.
distance = index[q_tfidf]
first_ten = distance[:10]
print("Distance to TFIDF vectors of the first 10 topics:", first_ten)

We can pass a TFIDF vector to the index and it will count how close it is to the TFIDF vectors of every topic.
For example, we can pass a TFIDF vector of a search query:
file templates
Frequency vector of the query: [(130, 1), (579, 1)]
TFIDF vector of the query: [(130, 0.22264041570138327), (579, 0.9749006335500636)]
Distance to TFIDF vectors of the first 10 topics: [0.         0.00455083 0.00907133 0.00615042 0.01254935 0.16825102
 0.         0.02288241 0.01711549 0.01379965]


In [25]:
# Reverse lookup: which token has ID 578?
dictionary[578]

'template'

In [26]:
# Rank all topics by their score for the current query and show the ten best.
# NOTE(review): this cell's saved output carries execution count 26, lower than the
# query cell's count 27, and it differs from the identical cell below. It therefore
# reflects an older `distance`; rerun the notebook top to bottom to refresh it.
ranked = sorted(enumerate(distance), key=lambda pair: pair[1], reverse=True)
for doc_idx, similarity in ranked[:10]:
    print(doc_idx, topic_names[doc_idx], similarity)

1118 settings-live-templates.html 0.63848656
1260 creating-and-editing-live-templates.html 0.6264195
751 templates.html 0.5929057
348 templates-dialog.html 0.59279495
1082 saving-project-as-template.html 0.59258115
418 save-file-as-template-dialog.html 0.57191086
899 settings-file-and-code-templates.html 0.5164528
826 structural-search-and-replace-dialogs.html 0.5084825
1445 template-data-languages-settings.html 0.4671379
494 templates-with-multiple-files.html 0.45209965


In [28]:
# Ten best-scoring topics for the current query, highest score first.
ranked = sorted(enumerate(distance), key=lambda pair: pair[1], reverse=True)
for doc_idx, similarity in ranked[:10]:
    print(doc_idx, topic_names[doc_idx], similarity)

899 settings-file-and-code-templates.html 0.67035156
1256 using-file-and-code-templates.html 0.5668079
418 save-file-as-template-dialog.html 0.45621872
118 sharing-live-templates.html 0.4007301
1244 using-live-templates.html 0.38993555
1024 parse-directive.html 0.3532419
20 settings-postfix-completion.html 0.28665146
1082 saving-project-as-template.html 0.22076602
572 symfony-templates.html 0.21789482
348 templates-dialog.html 0.21414979


In [29]:
results_csv = "search_test_results.csv"

# How many hits to keep per query, and the minimum score a hit must exceed.
TOP_N = 5
MIN_SCORE = 0.01

with open(results_csv, "r") as file:
    df = pandas.read_csv(file)

search_queries = df["query"].tolist()

all_results = []

for query in search_queries:
    # The corpus tokens printed earlier are all lowercase (e.g. 'intellij', 'ssh'),
    # so fold the query the same way; otherwise capitalized query words never match.
    query_tokens = query.lower().split()
    distance = index[tfidf[dictionary.doc2bow(query_tokens)]]

    # Best TOP_N topics by score, highest first.
    best_hits = sorted(enumerate(distance), key=lambda x: x[1], reverse=True)[:TOP_N]

    # One topic name per line (each followed by "\n"), keeping only hits above the threshold.
    all_results.append(
        "".join(topic_names[document_number] + "\n"
                for document_number, score in best_hits
                if score > MIN_SCORE)
    )

# Store the word-level TFIDF results next to the queries.
df["word_tfidf"] = all_results

In [30]:
# Write the frame, now including the "word_tfidf" column, back to the same CSV it was read from.
df.to_csv(results_csv, index=False)

# LEMMAS

In [31]:
def lem(token):
    """Lemmatize one POS-tagged token with the WordNet lemmatizer.

    Args:
        token: a ``(word, tag)`` pair as produced by ``nltk.pos_tag``.

    Returns:
        The lemma of ``word``, using the WordNet part of speech chosen from
        the first letter of ``tag``: ``V`` -> verb, ``J`` -> adjective,
        ``R`` -> adverb, anything else -> noun.
    """
    word, tag = token[0], token[1]
    wordnet = nltk.corpus.wordnet

    # The original chain was `if V / if J / if R ... else NOUN`, so the final
    # `else` overwrote the verb and adjective results with a noun lemma.
    # A single dispatch makes the branches mutually exclusive.
    # NOTE(review): the lemma corpus is loaded from a file; confirm it was
    # built with the same tag mapping so that query and corpus lemmas agree.
    pos_by_prefix = {
        "V": wordnet.VERB,
        "J": wordnet.ADJ,
        "R": wordnet.ADV,
    }
    pos = pos_by_prefix.get(tag[:1], wordnet.NOUN)

    lemmatizer = nltk.stem.wordnet.WordNetLemmatizer()
    return lemmatizer.lemmatize(word, pos)

In [32]:
print("Getting corpus of lemmas...")
# newline="" hands line-ending handling to the csv module, as the csv docs
# require for file objects passed to csv.reader.
with open("02b_lemma_corpus.csv", newline="") as file:
    # Each CSV row becomes one list of lemma strings.
    corpus_lemmas = list(csv.reader(file))
print("Done!")

Getting corpus of lemmas...
Done!


In [33]:
# Peek at the first ten lemmas of the first topic.
print(corpus_lemmas[0][:10])

['notification', 'use', 'page', 'enable', 'disable', 'notification', 'certain', 'event', 'change', 'presentation']


In [34]:
print("Lets create a dictionary...")
# Map every distinct lemma of the lemma corpus to an integer ID.
dictionary_lemmas = gensim.corpora.Dictionary(documents=corpus_lemmas)
print(dictionary_lemmas)

Lets create a dictionary...
Dictionary<6119 unique tokens: ['accessibility', 'added', 'allow', 'aloud', 'also']...>


In [40]:
# Look up the ID that the lemma dictionary assigned to "template".
dictionary_lemmas.token2id["template"]

529

In [41]:
print("Now we can vectorize all topics based on the lemma frequencies...")
# One bag-of-words vector (list of (lemma_id, count) pairs) per topic.
freq_vectors_lemmas = list(map(dictionary_lemmas.doc2bow, corpus_lemmas))
print("Done!")

Now we can vectorize all topics based on the lemma frequencies...
Done!


In [42]:
# Show the lemma-based bag-of-words vector of the sample topic.
sample_vector_lemmas = freq_vectors_lemmas[topic_id]
print(f"Here is a frequency vector of {topic_names[topic_id]} with {len(sample_vector_lemmas)} unique tokens:")
print(sample_vector_lemmas)

Here is a frequency vector of create-ssh-configurations.html with 128 unique tokens:
[(6, 1), (9, 1), (10, 1), (15, 5), (18, 1), (19, 1), (20, 2), (34, 8), (35, 6), (38, 2), (42, 1), (55, 1), (60, 1), (62, 3), (63, 1), (64, 1), (65, 3), (69, 2), (77, 1), (80, 4), (83, 1), (88, 2), (90, 2), (91, 1), (97, 1), (98, 1), (103, 2), (108, 1), (109, 12), (110, 1), (113, 1), (116, 1), (117, 2), (119, 1), (120, 1), (122, 1), (139, 1), (149, 1), (155, 8), (164, 1), (169, 1), (174, 1), (176, 1), (202, 1), (204, 1), (209, 3), (213, 1), (218, 1), (221, 1), (224, 1), (235, 1), (238, 1), (240, 1), (251, 1), (258, 1), (261, 1), (264, 1), (267, 1), (269, 1), (271, 2), (277, 4), (295, 4), (300, 6), (317, 1), (324, 1), (326, 1), (343, 1), (356, 3), (369, 1), (376, 1), (377, 1), (381, 1), (406, 1), (450, 2), (479, 1), (483, 1), (497, 1), (515, 9), (518, 1), (553, 1), (586, 1), (637, 3), (657, 1), (660, 1), (681, 1), (690, 3), (702, 17), (709, 1), (712, 2), (785, 2), (789, 1), (834, 1), (873, 1), (967, 3), 

In [43]:
print("Lets train a TFIDF model for lemmatized vectors...")
# Fit TFIDF weights on the lemma-based bag-of-words vectors of all topics.
tfidf_lemmas = gensim.models.TfidfModel(corpus=freq_vectors_lemmas)
print("Done!")

Lets train a TFIDF model for lemmatized vectors...
Done!


In [44]:
print("Now lets create a search index for the lemmatized TFIDF model...")
# Index the TFIDF-weighted lemma vectors; the feature space is the lemma dictionary size.
tfidf_corpus_lemmas = tfidf_lemmas[freq_vectors_lemmas]
index_lemmas = gensim.similarities.SparseMatrixSimilarity(
    tfidf_corpus_lemmas,
    num_features=len(dictionary_lemmas),
)
print("Done!")

Now lets create a search index for the lemmatized TFIDF model...
Done!


In [47]:
print("We can pass a TFIDF vector to the index and it will count how close it is to the TFIDF vectors of every topic.")
print("For example, we can pass a TFIDF vector of a search query:")

query = "file template"
print(query)

# POS-tag the whitespace tokens, lemmatize each tagged token, then vectorize.
tagged_tokens = nltk.pos_tag(query.split())
query_lemmas = list(map(lem, tagged_tokens))
q_bow = dictionary_lemmas.doc2bow(query_lemmas)
print("Frequency vector of the query:", q_bow)

# Weight the query with the same TFIDF model the lemma index was built from.
q_tfidf = tfidf_lemmas[q_bow]
print("TFIDF vector of the query:", q_tfidf)

# One score per indexed topic; later cells rank topics by this array.
distance = index_lemmas[q_tfidf]
first_ten = distance[:10]
print("Distance to TFIDF vectors of the first 10 topics:", first_ten)

We can pass a TFIDF vector to the index and it will count how close it is to the TFIDF vectors of every topic.
For example, we can pass a TFIDF vector of a search query:
file template
Frequency vector of the query: [(122, 1), (529, 1)]
TFIDF vector of the query: [(122, 0.1871069791629675), (529, 0.9823395433089868)]
Distance to TFIDF vectors of the first 10 topics: [0.         0.00288103 0.00806485 0.00554526 0.01498164 0.3031664
 0.         0.01984372 0.02869705 0.01564465]


In [245]:
# Reverse lookup of lemma ID 541.
# NOTE(review): execution count 245 is far out of sequence with the neighbouring cells, and
# ID 541 is not in the query vector printed above; this looks like a leftover ad-hoc check.
dictionary_lemmas[541]

'transformation'

In [46]:
# Ten best-scoring topics for the current lemma query, highest score first.
ranked = sorted(enumerate(distance), key=lambda pair: pair[1], reverse=True)
for doc_idx, similarity in ranked[:10]:
    print(doc_idx, topic_names[doc_idx], similarity)

899 settings-file-and-code-templates.html 0.8896681
418 save-file-as-template-dialog.html 0.7931419
1256 using-file-and-code-templates.html 0.77156055
1118 settings-live-templates.html 0.7303256
348 templates-dialog.html 0.71707034
1082 saving-project-as-template.html 0.70117843
751 templates.html 0.6870214
1260 creating-and-editing-live-templates.html 0.66808915
826 structural-search-and-replace-dialogs.html 0.60541964
118 sharing-live-templates.html 0.56853396


In [48]:
# Same ranking as the previous cell: top ten topics by score for the lemma query.
top_ten = sorted(enumerate(distance), key=lambda pair: pair[1], reverse=True)[:10]
for doc_idx, similarity in top_ten:
    print(doc_idx, topic_names[doc_idx], similarity)

899 settings-file-and-code-templates.html 0.8896681
418 save-file-as-template-dialog.html 0.7931419
1256 using-file-and-code-templates.html 0.77156055
1118 settings-live-templates.html 0.7303256
348 templates-dialog.html 0.71707034
1082 saving-project-as-template.html 0.70117843
751 templates.html 0.6870214
1260 creating-and-editing-live-templates.html 0.66808915
826 structural-search-and-replace-dialogs.html 0.60541964
118 sharing-live-templates.html 0.56853396


In [49]:
results_csv = "search_test_results.csv"

# How many hits to keep per query, and the minimum score a hit must exceed.
TOP_N = 5
MIN_SCORE = 0.01

with open(results_csv, "r") as file:
    df = pandas.read_csv(file)

search_queries = df["query"].tolist()

all_results = []

for query in search_queries:
    # Tag the tokens in their original case, then lowercase each word before
    # lemmatizing: the lemma corpus shown earlier is all lowercase, so a
    # capitalized query word would otherwise never match the dictionary.
    query_lemmas = [lem((word.lower(), tag)) for word, tag in nltk.pos_tag(query.split())]
    distance = index_lemmas[tfidf_lemmas[dictionary_lemmas.doc2bow(query_lemmas)]]

    # Best TOP_N topics by score, highest first.
    best_hits = sorted(enumerate(distance), key=lambda x: x[1], reverse=True)[:TOP_N]

    # One topic name per line (each followed by "\n"), keeping only hits above the threshold.
    all_results.append(
        "".join(topic_names[document_number] + "\n"
                for document_number, score in best_hits
                if score > MIN_SCORE)
    )

# Store the lemma-level TFIDF results next to the queries.
df["lemma_tfidf"] = all_results

In [50]:
# Write the frame, now including the "lemma_tfidf" column, back to the same CSV it was read from.
df.to_csv(results_csv, index=False)

# STEMMING

In [51]:
print("Getting corpus of stemmed tokens...")
# newline="" hands line-ending handling to the csv module, as the csv docs
# require for file objects passed to csv.reader.
with open("02c_stemmed_corpus.csv", newline="") as file:
    # Each CSV row becomes one list of stemmed token strings.
    corpus_stemmed = list(csv.reader(file))
print("Done!")

Getting corpus of stemmed tokens...
Done!


In [52]:
print("Lets create a dictionary...")
# Map every distinct stem of the stemmed corpus to an integer ID.
dictionary_stemmed = gensim.corpora.Dictionary(documents=corpus_stemmed)
print(dictionary_stemmed)

Lets create a dictionary...
Dictionary<4557 unique tokens: ['access', 'ad', 'allow', 'aloud', 'also']...>


In [53]:
# Look up the ID that the stemmed dictionary assigned to the stem "notif".
dictionary_stemmed.token2id["notif"]

43

In [54]:
print("Now we can vectorize all topics based on the stemmed token frequencies...")
# One bag-of-words vector (list of (stem_id, count) pairs) per topic.
freq_vectors_stemmed = list(map(dictionary_stemmed.doc2bow, corpus_stemmed))
print("Done!")

Now we can vectorize all topics based on the stemmed token frequencies...
Done!


In [55]:
# Show the stem-based bag-of-words vector of the sample topic.
sample_vector_stemmed = freq_vectors_stemmed[topic_id]
print(f"Here is a frequency vector of {topic_names[topic_id]} with {len(sample_vector_stemmed)} unique tokens:")
print(sample_vector_stemmed)

Here is a frequency vector of create-ssh-configurations.html with 117 unique tokens:
[(0, 2), (1, 1), (2, 2), (6, 1), (9, 1), (10, 1), (15, 5), (18, 14), (19, 1), (20, 2), (30, 2), (34, 8), (35, 6), (38, 2), (40, 1), (52, 1), (57, 1), (59, 4), (60, 1), (61, 3), (65, 2), (73, 1), (76, 5), (79, 1), (85, 1), (90, 1), (91, 1), (95, 2), (100, 1), (102, 2), (103, 1), (106, 1), (107, 2), (108, 1), (109, 1), (111, 1), (126, 1), (128, 1), (136, 1), (142, 8), (148, 1), (152, 2), (156, 1), (158, 1), (181, 1), (183, 1), (187, 5), (194, 1), (197, 1), (200, 1), (208, 1), (211, 1), (220, 1), (224, 1), (227, 1), (230, 1), (233, 1), (236, 1), (237, 1), (239, 2), (244, 4), (259, 4), (263, 6), (281, 1), (283, 1), (302, 3), (314, 1), (321, 1), (322, 1), (326, 1), (342, 2), (350, 1), (388, 2), (418, 1), (429, 1), (439, 9), (442, 1), (471, 3), (497, 1), (536, 3), (551, 1), (554, 1), (570, 1), (579, 3), (589, 17), (592, 3), (595, 1), (598, 2), (649, 2), (682, 1), (705, 1), (785, 1), (822, 1), (879, 3), (888,

In [56]:
print("Lets train a TFIDF model for stemmed vectors...")
# Fit TFIDF weights on the stem-based bag-of-words vectors of all topics.
tfidf_stemmed = gensim.models.TfidfModel(corpus=freq_vectors_stemmed)
print("Done!")

Lets train a TFIDF model for stemmed vectors...
Done!


In [57]:
print("Now lets create a search index for the stemmed TFIDF model...")
# Index the TFIDF-weighted stem vectors; the feature space is the stemmed dictionary size.
tfidf_corpus_stemmed = tfidf_stemmed[freq_vectors_stemmed]
index_stemmed = gensim.similarities.SparseMatrixSimilarity(
    tfidf_corpus_stemmed,
    num_features=len(dictionary_stemmed),
)
print("Done!")

Now lets create a search index for the stemmed TFIDF model...
Done!


In [61]:
print("We can pass a TFIDF vector to the index and it will count how close it is to the TFIDF vectors of every topic.")
print("For example, we can pass a TFIDF vector of a search query:")

query = "file templates"
print(query)

# `stemmer` is reused by the evaluation cell further down.
stemmer = nltk.stem.porter.PorterStemmer()
# Stem the raw whitespace tokens directly (no lemmatization step), then vectorize.
query_stems = list(map(stemmer.stem, query.split()))
q_bow = dictionary_stemmed.doc2bow(query_stems)
print("Frequency vector of the query:", q_bow)

# Weight the query with the same TFIDF model the stemmed index was built from.
q_tfidf = tfidf_stemmed[q_bow]
print("TFIDF vector of the query:", q_tfidf)

# One score per indexed topic; later cells rank topics by this array.
distance = index_stemmed[q_tfidf]
first_ten = distance[:10]
print("Distance to TFIDF vectors of the first 10 topics:", first_ten)

We can pass a TFIDF vector to the index and it will count how close it is to the TFIDF vectors of every topic.
For example, we can pass a TFIDF vector of a search query:
file templates
Frequency vector of the query: [(111, 1), (451, 1)]
TFIDF vector of the query: [(111, 0.18749390746061886), (451, 0.9822657658012666)]
Distance to TFIDF vectors of the first 10 topics: [0.         0.00292482 0.00825783 0.00548792 0.01541227 0.30546805
 0.         0.02124273 0.0305678  0.01598057]


In [59]:
# Reverse lookup of stem ID 451, one of the IDs in the stemmed query's frequency vector.
dictionary_stemmed[451]

'templat'

In [60]:
# Ten best-scoring topics for the current stemmed query, highest score first.
ranked = sorted(enumerate(distance), key=lambda pair: pair[1], reverse=True)
for doc_idx, similarity in ranked[:10]:
    print(doc_idx, topic_names[doc_idx], similarity)

899 settings-file-and-code-templates.html 0.89820004
418 save-file-as-template-dialog.html 0.8130379
1256 using-file-and-code-templates.html 0.80103093
1118 settings-live-templates.html 0.7402638
348 templates-dialog.html 0.73867685
1082 saving-project-as-template.html 0.7160894
751 templates.html 0.6934399
1260 creating-and-editing-live-templates.html 0.67852545
826 structural-search-and-replace-dialogs.html 0.64273345
1024 parse-directive.html 0.6133199


In [62]:
# Same ranking as the previous cell: top ten topics by score for the stemmed query.
top_ten = sorted(enumerate(distance), key=lambda pair: pair[1], reverse=True)[:10]
for doc_idx, similarity in top_ten:
    print(doc_idx, topic_names[doc_idx], similarity)

899 settings-file-and-code-templates.html 0.89820004
418 save-file-as-template-dialog.html 0.8130379
1256 using-file-and-code-templates.html 0.80103093
1118 settings-live-templates.html 0.7402638
348 templates-dialog.html 0.73867685
1082 saving-project-as-template.html 0.7160894
751 templates.html 0.6934399
1260 creating-and-editing-live-templates.html 0.67852545
826 structural-search-and-replace-dialogs.html 0.64273345
1024 parse-directive.html 0.6133199


In [63]:
results_csv = "search_test_results.csv"

with open(results_csv, "r") as file:
    df = pandas.read_csv(file)

search_queries = df["query"].tolist()

all_results = []

for query in search_queries:
    # Stem the whitespace tokens with the stemmer created in the demo cell, then score all topics.
    query_stems = [stemmer.stem(token) for token in query.split()]
    distance = index_stemmed[tfidf_stemmed[dictionary_stemmed.doc2bow(query_stems)]]

    # Five best topics by score, highest first.
    best_five = sorted(enumerate(distance), key=lambda pair: pair[1], reverse=True)[:5]

    # One topic name per line (each followed by "\n"), keeping only scores above 0.01.
    all_results.append(
        "".join(topic_names[doc_idx] + "\n"
                for doc_idx, similarity in best_five
                if similarity > 0.01)
    )

# Store the stem-level TFIDF results next to the queries.
df["stemmed_tfidf"] = all_results

In [64]:
# Write the frame, now including the "stemmed_tfidf" column, back to the same CSV it was read from.
df.to_csv(results_csv, index=False)