In [85]:
import pandas
import nltk
import string
import numpy
from collections import defaultdict as dd

from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

stop_words = nltk.corpus.stopwords.words("english")

import gensim

In [65]:
colnames = ["topic", "text"]
df = pandas.read_csv("01_topic_text.csv", names=colnames, header=0)

df.head(5)

Unnamed: 0,topic,text
0,settings-notifications.html,Notifications\nUse this page to enable and dis...
1,configuring-project-and-ide-settings.html,Configuring the IDE\nIntelliJ IDEA allows you ...
2,php-built-in-web-server.html,Built-in web server\nThe following is only val...
3,work-with-scala-worksheet-and-ammonite.html,Scala worksheet and Ammonite\nYou can quickly ...
4,finding-and-replacing-text-in-project.html,Search and replace a target within a project\n...


In [66]:
print("Lets make a list of all topic names...")
topic_names = list(df["topic"])
print("First 10:", topic_names[:10])

print("And now a list of topic texts...")
topic_texts = list(df["text"])
print("Here is the first one:", repr(topic_texts[0][:100]))

Lets make a list of all topic names...
First 10: ['settings-notifications.html', 'configuring-project-and-ide-settings.html', 'php-built-in-web-server.html', 'work-with-scala-worksheet-and-ammonite.html', 'finding-and-replacing-text-in-project.html', 'using-postfix-templates.html', 'extract-constant-refactoring-dialog.html', 'edit-log-files-aliases-dialog.html', 'run-debug-configuration-attests.html', 'symbols.html']
And now a list of topic texts...
Here is the first one: 'Notifications\nUse this page to enable and disable notifications about certain events, change their p'


In [67]:
print("Tokenizing and removing punctuation and stopwords...")
all_tokens = [[word for word in nltk.word_tokenize(doc.lower().replace("/", " ").replace("\\", " ")) if word not in stop_words and word not in string.punctuation] for doc in topic_texts]
print("Created a list of bags of words")

Tokenizing and removing punctuation and stopwords...
Created a list of bags of words


In [68]:
print("First topic:")
print(len(all_tokens[0]), "tokens")
print(all_tokens[0])

First topic:
171 tokens
['notifications', 'use', 'page', 'enable', 'disable', 'notifications', 'certain', 'events', 'change', 'presentation', 'optionally', 'enable', 'logging', 'item', 'description', 'display', 'balloon', 'notifications', 'select', 'checkbox', 'enable', 'event', 'notifications', 'intellij', 'idea', 'notifications', 'generally', 'shown', 'balloons', 'appear', 'screen', 'corresponding', 'events', 'take', 'place', 'regardless', 'state', 'option', 'always', 'find', 'notifications', 'current', 'session', 'event', 'log', 'enable', 'system', 'notifications', 'select', 'checkbox', 'allow', 'showing', 'system', 'notification', 'windows', 'may', 'want', 'disable', 'option', 'hide', 'intellij', 'idea', 'icon', 'system', 'tray', 'group', 'column', 'lists', 'groups', 'events', 'may', 'notified', 'may', 'logged', 'popup', 'display', 'balloon', 'notification', 'checkbox', 'selected', 'settings', 'column', 'specify', 'notifications', 'corresponding', 'group', 'events', 'shown', 'avail

In [69]:
print("Counting term frequencies...")

frequency = dd(int)

for doc in all_tokens:
    for token in doc:
        frequency[token] += 1

print("There are", len(frequency), "tokens in total.")

print("The 10 most frequent are:", dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True)[:10]))

Counting term frequencies...
There are 12879 tokens in total.
The 10 most frequent are: {'select': 12012, 'run': 9247, 'click': 7625, 'intellij': 7462, 'idea': 7443, 'file': 7379, 'project': 6497, 'configuration': 6449, 'code': 5493, 'dialog': 5449}


In [70]:
word = "permissions"
print("The word", word.upper(), "appears", frequency[word], "times.")

The word PERMISSIONS appears 50 times.


In [71]:
print("Lets get rid of tokens that appear only once...")

corpus = [[token for token in doc if frequency[token] > 1] for doc in all_tokens]

unique = set()

for doc in corpus:
    for token in doc:
        unique.add(token)

print("Now we have", len(unique), "tokens left.")

print("First topic:")
print(len(corpus[0]))
print(corpus[0])

Lets get rid of tokens that appear only once...
Now we have 8260 tokens left.
First topic:
170
['notifications', 'use', 'page', 'enable', 'disable', 'notifications', 'certain', 'events', 'change', 'presentation', 'optionally', 'enable', 'logging', 'item', 'description', 'display', 'balloon', 'notifications', 'select', 'checkbox', 'enable', 'event', 'notifications', 'intellij', 'idea', 'notifications', 'generally', 'shown', 'balloons', 'appear', 'screen', 'corresponding', 'events', 'take', 'place', 'regardless', 'state', 'option', 'always', 'find', 'notifications', 'current', 'session', 'event', 'log', 'enable', 'system', 'notifications', 'select', 'checkbox', 'allow', 'showing', 'system', 'notification', 'windows', 'may', 'want', 'disable', 'option', 'hide', 'intellij', 'idea', 'icon', 'system', 'tray', 'group', 'column', 'lists', 'groups', 'events', 'may', 'notified', 'may', 'logged', 'popup', 'display', 'balloon', 'notification', 'checkbox', 'selected', 'settings', 'column', 'specify

In [72]:
print("Here is how to get some topic file name:", topic_names[101])

print("And its bag of words from the corpus:")
print(corpus[101])

Here is how to get some topic file name: navigation-in-groovy.html
And its bag of words from the corpus:
['navigation', 'groovy', 'intellij', 'idea', 'offers', 'standard', 'navigation', 'actions', 'work', 'groovy', 'recent', 'files', 'time', 'work', 'finite', 'set', 'files', 'need', 'switch', 'quickly', 'real', 'time-saver', 'action', 'called', 'recent', 'files', 'invoked', 'pressing', 'ctrl+e', 'default', 'focus', 'last', 'accessed', 'file', 'note', 'also', 'open', 'tool', 'window', 'action', 'navigate', 'class', 'available', 'pressing', 'ctrl+n', 'supports', 'sophisticated', 'expressions', 'including', 'camel', 'humps', 'paths', 'line', 'navigate', 'middle', 'name', 'matching', 'many', 'call', 'twice', 'shows', 'results', 'project', 'classes', 'navigate', 'file', 'works', 'similarly', 'pressing', 'ctrl+shift+n', 'used', 'files', 'folders', 'navigate', 'folder', 'end', 'expression', 'slash', 'character', 'navigate', 'symbol', 'available', 'pressing', 'ctrl+alt+shift+n', 'allows', 'fin

In [73]:
print("Lets create a dictionary...")
dictionary = gensim.corpora.Dictionary(corpus)
print(dictionary)

Lets create a dictionary...
Dictionary(8260 unique tokens: ['accessibility', 'added', 'allow', 'aloud', 'also']...)


In [175]:
print("Here is how to get a word by its ID:", dictionary[807])
print("And this is how to find the ID of a word:", dictionary.token2id["commandline"])

Here is how to get a word by its ID: ssh


KeyError: 'commandline'

In [75]:
print("Now we can vectorize all topics based on the term frequencies...")
freq_vectors = [dictionary.doc2bow(doc) for doc in corpus]

Now we can vectorize all topics based on the term frequencies...


In [76]:
id = 100
print("Here is a frequency vector of", topic_names[id])
print(freq_vectors[id])

Here is a frequency vector of create-ssh-configurations.html
[(6, 1), (9, 1), (10, 1), (16, 5), (19, 1), (20, 1), (21, 2), (37, 6), (38, 6), (41, 1), (42, 1), (46, 1), (61, 1), (66, 1), (68, 3), (69, 1), (71, 3), (75, 2), (86, 4), (90, 1), (96, 2), (98, 2), (104, 1), (105, 1), (110, 2), (114, 6), (115, 1), (118, 1), (122, 1), (123, 2), (125, 1), (126, 1), (128, 1), (145, 1), (157, 1), (164, 5), (165, 3), (174, 1), (183, 1), (211, 1), (213, 1), (222, 1), (228, 1), (231, 1), (234, 1), (245, 1), (248, 1), (250, 1), (264, 1), (271, 1), (277, 1), (280, 1), (282, 1), (285, 2), (292, 4), (313, 4), (318, 5), (319, 1), (339, 1), (346, 1), (352, 1), (372, 1), (385, 1), (402, 1), (406, 1), (407, 1), (412, 1), (439, 1), (440, 1), (463, 1), (497, 2), (532, 1), (537, 1), (552, 1), (571, 6), (577, 1), (614, 1), (638, 2), (680, 6), (710, 2), (731, 3), (754, 1), (757, 1), (780, 1), (793, 3), (807, 15), (819, 1), (823, 2), (827, 1), (910, 2), (918, 1), (974, 1), (1048, 1), (1136, 3), (1162, 1), (1170, 1

In [78]:
print("Lets train a TFIDF model...")

tfidf = gensim.models.TfidfModel(freq_vectors)

Lets train a TFIDF model...


In [79]:
print("We can pass a topic's frequency vector to calculate the topic's TFIDF vector.")
print("For example, here is the TFIDF vector of", topic_names[id])
print(tfidf[freq_vectors[id]])

We can pass a topic's frequency vector to calculate the topic's TFIDF vector.
For example, here is the TFIDF vector of create-ssh-configurations.html
[(6, 0.030019874966861534), (9, 0.016188457942874147), (10, 0.010621707322915893), (16, 0.07341767441315639), (19, 0.011140490704105091), (20, 0.016846304909399732), (21, 0.03237691588574829), (37, 0.02402113060999782), (38, 0.02402113060999782), (41, 0.007863515772167057), (42, 0.036096700976314434), (46, 0.04122380851836431), (61, 0.02032701045496918), (66, 0.011225139278530517), (68, 0.008555118362237227), (69, 0.010560609563914249), (71, 0.023693053431385794), (75, 0.013584653522109209), (86, 0.018508801283859018), (90, 0.0235981736647283), (96, 0.0531261172586214), (98, 0.06839422569369911), (104, 0.01872841636470138), (105, 0.023645568983061954), (110, 0.009700360751982105), (114, 0.09765917716416737), (115, 0.03448193011775401), (118, 0.020442282453173845), (122, 0.004934635864199034), (123, 0.03214361172712941), (125, 0.0102986135

In [81]:
print("We can similarly pass any bag of words from our dictionary and count the TFIDF values for it.")
words = ["notifications", "template", "notification", "use", "templates", "file", "code"]
print("For example, lets try:", words)

print("Frequency vector:", dictionary.doc2bow(words))
print("TFIDF vector:", tfidf[dictionary.doc2bow(words)])

print(dictionary[128])
print(frequency["use"])

We can similarly pass any bag of words from our dictionary and count the TFIDF values for it.
For example, lets try: ['notifications', 'template', 'notification', 'use', 'templates', 'file', 'code']
Frequency vector: [(49, 1), (50, 1), (86, 1), (111, 1), (128, 1), (589, 1), (590, 1)]
TFIDF vector: [(49, 0.4822266231359381), (50, 0.7594217659277688), (86, 0.04714336089537494), (111, 0.09259541587067503), (128, 0.07001546999912177), (589, 0.28468116526805004), (590, 0.30658461468200177)]
file
5115


In [82]:
print("Now lets create a search index...")
index = gensim.similarities.SparseMatrixSimilarity(tfidf[freq_vectors], num_features=len(dictionary))
print("Done!")

Now lets create a search index...
Done!


In [83]:
for sim in index:
    print(sim)

[1.         0.019305   0.01967547 ... 0.00200838 0.04250559 0.01035289]
[0.019305   1.0000002  0.03063431 ... 0.01134181 0.03088375 0.00672705]
[0.01967547 0.03063431 1.0000001  ... 0.00540346 0.03195567 0.01674758]
[0.00660839 0.01717903 0.01599788 ... 0.00604091 0.02111693 0.0063573 ]
[0.01766419 0.01828648 0.03795993 ... 0.0099074  0.06784267 0.2588701 ]
[0.00746643 0.0169356  0.01098995 ... 0.06841523 0.02251978 0.02968446]
[0.00773716 0.01083436 0.00931545 ... 0.00590314 0.01860577 0.05157303]
[0.0499848  0.01648132 0.04019947 ... 0.01179612 0.03844546 0.0135881 ]
[0.03873462 0.05766625 0.12147315 ... 0.02255065 0.08779094 0.02183409]
[0.01681823 0.03081714 0.05142988 ... 0.00656184 0.05717475 0.00953472]
[0.01761781 0.03568901 0.12317863 ... 0.01559065 0.033058   0.0118446 ]
[0.00590193 0.0479845  0.0223975  ... 0.0034165  0.03232911 0.00215604]
[0.00766483 0.01852947 0.02436591 ... 0.00908439 0.08896468 0.01369652]
[0.01268977 0.01815884 0.03373889 ... 0.02974984 0.04212256 0.00

In [170]:
print("We can pass a TFIDF vector to the index and it will count how close it is to the TFIDF vectors of every topic.")
print("For example, we can pass a TFIDF vector of a search query:")

query = "flutter"
print(query)

q_bow = dictionary.doc2bow(query.split())
print("Frequency vector of the query:", q_bow)

q_tfidf = tfidf[q_bow]
print("TFIDF vector of the query:", q_tfidf)

distance = index[q_tfidf]
print("Distance to TFIDF vectors of the first 10 topics:", distance[:10])

We can pass a TFIDF vector to the index and it will count how close it is to the TFIDF vectors of every topic.
For example, we can pass a TFIDF vector of a search query:
flutter
Frequency vector of the query: []
TFIDF vector of the query: []
Distance to TFIDF vectors of the first 10 topics: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [86]:
numpy.argmax(distance)

899

In [87]:
topic_names[899]

'settings-file-and-code-templates.html'

In [171]:
for document_number, score in sorted(enumerate(distance), key=lambda x: x[1], reverse=True)[:10]:
    print(document_number, topic_names[document_number], score)

0 settings-notifications.html 0.0
1 configuring-project-and-ide-settings.html 0.0
2 php-built-in-web-server.html 0.0
3 work-with-scala-worksheet-and-ammonite.html 0.0
4 finding-and-replacing-text-in-project.html 0.0
5 using-postfix-templates.html 0.0
6 extract-constant-refactoring-dialog.html 0.0
7 edit-log-files-aliases-dialog.html 0.0
8 run-debug-configuration-attests.html 0.0
9 symbols.html 0.0


In [90]:
for document_number, score in sorted(enumerate(distance), key=lambda x: x[1], reverse=True)[:10]:
    print(document_number, topic_names[document_number], score)

1118 settings-live-templates.html 0.63922334
1260 creating-and-editing-live-templates.html 0.6217025
348 templates-dialog.html 0.59002995
751 templates.html 0.5833273
418 save-file-as-template-dialog.html 0.56446
1082 saving-project-as-template.html 0.5490642
899 settings-file-and-code-templates.html 0.5154492
826 structural-search-and-replace-dialogs.html 0.50830114
1445 template-data-languages-settings.html 0.4658313
494 templates-with-multiple-files.html 0.4638382


# LEMMATIZATION

In [91]:
print("Lets try to lemmatize our tokens...")

lemmatizer = WordNetLemmatizer()

def lem(token):
    if token[1].startswith("V"):
        lemma = lemmatizer.lemmatize(token[0], wordnet.VERB)
    if token[1].startswith("J"):
        lemma = lemmatizer.lemmatize(token[0], wordnet.ADJ)
    if token[1].startswith("R"):
        lemma = lemmatizer.lemmatize(token[0], wordnet.ADV)
    else:
        lemma = lemmatizer.lemmatize(token[0], wordnet.NOUN)
    return lemma

all_lemmas = [[lem(token) for token in nltk.pos_tag(doc)] for doc in all_tokens]

print(all_lemmas[0][:20])

Lets try to lemmatize our tokens...
['notification', 'use', 'page', 'enable', 'disable', 'notification', 'certain', 'event', 'change', 'presentation', 'optionally', 'enable', 'logging', 'item', 'description', 'display', 'balloon', 'notification', 'select', 'checkbox']


In [165]:
lemmatizer.lemmatize("using", pos="r")

'using'

In [118]:
nltk.pos_tag(["best", "better", "using"])

[('best', 'RB'), ('better', 'RBR'), ('using', 'VBG')]

In [92]:
print("Counting lemma frequencies...")

frequency_lemmas = dd(int)

for doc in all_lemmas:
    for token in doc:
        frequency_lemmas[token] += 1

print("There are", len(frequency_lemmas), "tokens in total.")

print("The 10 most frequent are:", dict(sorted(frequency_lemmas.items(), key=lambda item: item[1], reverse=True)[:10]))

Counting lemma frequencies...
There are 11968 tokens in total.
The 10 most frequent are: {'select': 12012, 'file': 11431, 'run': 9497, 'configuration': 7982, 'click': 7630, 'intellij': 7462, 'idea': 7444, 'project': 7151, 'option': 6273, 'open': 5873}


In [93]:
print("Lets get rid of lemmas that appear only once...")

corpus_lemmas = [[token for token in doc if frequency_lemmas[token] > 1] for doc in all_lemmas]

unique = set()

for doc in corpus_lemmas:
    for token in doc:
        unique.add(token)

print("Now we have", len(unique), "tokens left.")

print("First topic:")
print(len(corpus_lemmas[0]))
print(corpus_lemmas[0])

Lets get rid of lemmas that appear only once...
Now we have 7528 tokens left.
First topic:
170
['notification', 'use', 'page', 'enable', 'disable', 'notification', 'certain', 'event', 'change', 'presentation', 'optionally', 'enable', 'logging', 'item', 'description', 'display', 'balloon', 'notification', 'select', 'checkbox', 'enable', 'event', 'notification', 'intellij', 'idea', 'notification', 'generally', 'shown', 'balloon', 'appear', 'screen', 'corresponding', 'event', 'take', 'place', 'regardless', 'state', 'option', 'always', 'find', 'notification', 'current', 'session', 'event', 'log', 'enable', 'system', 'notification', 'select', 'checkbox', 'allow', 'showing', 'system', 'notification', 'window', 'may', 'want', 'disable', 'option', 'hide', 'intellij', 'idea', 'icon', 'system', 'tray', 'group', 'column', 'list', 'group', 'event', 'may', 'notified', 'may', 'logged', 'popup', 'display', 'balloon', 'notification', 'checkbox', 'selected', 'setting', 'column', 'specify', 'notificatio

In [94]:
print("Lets create a dictionary...")
dictionary_lemmas = gensim.corpora.Dictionary(corpus_lemmas)
print(dictionary_lemmas)

Lets create a dictionary...
Dictionary(7528 unique tokens: ['accessibility', 'added', 'allow', 'aloud', 'also']...)


In [168]:
dictionary_lemmas.token2id["flutter"]

KeyError: 'flutter'

In [124]:
print("Now we can vectorize all topics based on the lemma frequencies...")
freq_vectors_lemmas = [dictionary_lemmas.doc2bow(doc) for doc in corpus_lemmas]

Now we can vectorize all topics based on the lemma frequencies...


In [125]:
print("Lets train a TFIDF model for lemmatized vectors...")

tfidf_lemmas = gensim.models.TfidfModel(freq_vectors_lemmas)

Lets train a TFIDF model for lemmatized vectors...


In [126]:
print("Now lets create a search index for the lemmatized TFIDF model...")
index_lemmas = gensim.similarities.SparseMatrixSimilarity(tfidf_lemmas[freq_vectors_lemmas], num_features=len(dictionary_lemmas))
print("Done!")

Now lets create a search index for the lemmatized TFIDF model...
Done!


In [166]:
print("We can pass a TFIDF vector to the index and it will count how close it is to the TFIDF vectors of every topic.")
print("For example, we can pass a TFIDF vector of a search query:")

query = "file template"
print(query)

q_bow = dictionary_lemmas.doc2bow([lem(token) for token in nltk.pos_tag(query.split())])
print("Frequency vector of the query:", q_bow)

q_tfidf = tfidf_lemmas[q_bow]
print("TFIDF vector of the query:", q_tfidf)

distance = index_lemmas[q_tfidf]
print("Distance to TFIDF vectors of the first 10 topics:", distance[:10])

We can pass a TFIDF vector to the index and it will count how close it is to the TFIDF vectors of every topic.
For example, we can pass a TFIDF vector of a search query:
file template
Frequency vector of the query: [(120, 1), (540, 1)]
TFIDF vector of the query: [(120, 0.1871069791629675), (540, 0.9823395433089868)]
Distance to TFIDF vectors of the first 10 topics: [0.         0.00348058 0.00818068 0.00566972 0.0148157  0.30278903
 0.         0.01975496 0.02870232 0.01555367]


In [63]:
dictionary_lemmas[540]

'template'

In [167]:
for document_number, score in sorted(enumerate(distance), key=lambda x: x[1], reverse=True)[:10]:
    print(document_number, topic_names[document_number], score)

899 settings-file-and-code-templates.html 0.8874572
418 save-file-as-template-dialog.html 0.7845077
1256 using-file-and-code-templates.html 0.76745254
1118 settings-live-templates.html 0.72966576
348 templates-dialog.html 0.7154784
751 templates.html 0.67601347
1082 saving-project-as-template.html 0.66522175
1260 creating-and-editing-live-templates.html 0.66313416
826 structural-search-and-replace-dialogs.html 0.60396296
494 templates-with-multiple-files.html 0.5787665


In [138]:
for document_number, score in sorted(enumerate(distance), key=lambda x: x[1], reverse=True)[:10]:
    print(document_number, topic_names[document_number], score)

0 settings-notifications.html 0.7579207
792 event-log-tool-window.html 0.34506154
1481 perforce-working-offline.html 0.2000306
676 navigating-between-text-and-message-file.html 0.08861907
786 authentication-required.html 0.07660091
380 database-users-and-roles.html 0.069045775
652 compiler.html 0.06755538
855 code-with-me-guest-ui-overview.html 0.06311048
735 choosing-your-testing-framework.html 0.058963507
327 configuring-javascript-debugger.html 0.056894135


# STEMMING

In [139]:
print("Now lets stem the lemmas...")

stemmer = nltk.stem.porter.PorterStemmer()

all_lemmas_stemmed = [[stemmer.stem(lemma) for lemma in doc] for doc in all_lemmas]

print(all_lemmas_stemmed[0][:20])

Now lets stem the lemmas...
['notif', 'use', 'page', 'enabl', 'disabl', 'notif', 'certain', 'event', 'chang', 'present', 'option', 'enabl', 'log', 'item', 'descript', 'display', 'balloon', 'notif', 'select', 'checkbox']


In [140]:
print("Counting stemmed lemma frequencies...")

frequency_lemmas_stemmed = dd(int)

for doc in all_lemmas_stemmed:
    for token in doc:
        frequency_lemmas_stemmed[token] += 1

print("There are", len(frequency_lemmas_stemmed), "stemmed tokens in total.")

print("The 10 most frequent are:", dict(sorted(frequency_lemmas_stemmed.items(), key=lambda item: item[1], reverse=True)[:10]))

Counting stemmed lemma frequencies...
There are 9863 stemmed tokens in total.
The 10 most frequent are: {'select': 16528, 'file': 11434, 'run': 11122, 'configur': 10926, 'use': 8118, 'click': 7881, 'intellij': 7462, 'idea': 7444, 'project': 7154, 'specifi': 6690}


In [141]:
print("Lets get rid of stemmed lemmas that appear only once...")

corpus_lemmas_stemmed = [[token for token in doc if frequency_lemmas_stemmed[token] > 1] for doc in all_lemmas_stemmed]

unique = set()

for doc in corpus_lemmas_stemmed:
    for token in doc:
        unique.add(token)

print("Now we have", len(unique), "tokens left.")

print("First topic:")
print(len(corpus_lemmas_stemmed[0]))
print(corpus_lemmas_stemmed[0])

Lets get rid of stemmed lemmas that appear only once...
Now we have 5954 tokens left.
First topic:
170
['notif', 'use', 'page', 'enabl', 'disabl', 'notif', 'certain', 'event', 'chang', 'present', 'option', 'enabl', 'log', 'item', 'descript', 'display', 'balloon', 'notif', 'select', 'checkbox', 'enabl', 'event', 'notif', 'intellij', 'idea', 'notif', 'gener', 'shown', 'balloon', 'appear', 'screen', 'correspond', 'event', 'take', 'place', 'regardless', 'state', 'option', 'alway', 'find', 'notif', 'current', 'session', 'event', 'log', 'enabl', 'system', 'notif', 'select', 'checkbox', 'allow', 'show', 'system', 'notif', 'window', 'may', 'want', 'disabl', 'option', 'hide', 'intellij', 'idea', 'icon', 'system', 'tray', 'group', 'column', 'list', 'group', 'event', 'may', 'notifi', 'may', 'log', 'popup', 'display', 'balloon', 'notif', 'checkbox', 'select', 'set', 'column', 'specifi', 'notif', 'correspond', 'group', 'event', 'shown', 'avail', 'display', 'option', 'balloon', 'balloon', 'notif', '

In [142]:
print("Lets create a dictionary...")
dictionary_lemmas_stemmed = gensim.corpora.Dictionary(corpus_lemmas_stemmed)
print(dictionary_lemmas_stemmed)

Lets create a dictionary...
Dictionary(5954 unique tokens: ['access', 'ad', 'allow', 'aloud', 'also']...)


In [45]:
dictionary_lemmas_stemmed.token2id["notif"]

43

In [156]:
stemmer.stem("usage")

'usag'

In [143]:
print("Now we can vectorize all topics based on the stemmed lemma frequencies...")
freq_vectors_lemmas_stemmed = [dictionary_lemmas_stemmed.doc2bow(doc) for doc in corpus_lemmas_stemmed]

Now we can vectorize all topics based on the stemmed lemma frequencies...


In [144]:
print("Lets train a TFIDF model for lemmatized and stemmed vectors...")

tfidf_lemmas_stemmed = gensim.models.TfidfModel(freq_vectors_lemmas_stemmed)

Lets train a TFIDF model for lemmatized and stemmed vectors...


In [145]:
print("Now lets create a search index for the lemmatized and stemmed TFIDF model...")
index_lemmas_stemmed = gensim.similarities.SparseMatrixSimilarity(tfidf_lemmas_stemmed[freq_vectors_lemmas_stemmed], num_features=len(dictionary_lemmas_stemmed))
print("Done!")

Now lets create a search index for the lemmatized and stemmed TFIDF model...
Done!


In [148]:
print("We can pass a TFIDF vector to the index and it will count how close it is to the TFIDF vectors of every topic.")
print("For example, we can pass a TFIDF vector of a search query:")

query = "file template"
print(query)

q_bow = dictionary_lemmas_stemmed.doc2bow([stemmer.stem(lem(token)) for token in nltk.pos_tag(query.split())])
print("Frequency vector of the query:", q_bow)

q_tfidf = tfidf_lemmas_stemmed[q_bow]
print("TFIDF vector of the query:", q_tfidf)

distance = index_lemmas_stemmed[q_tfidf]
print("Distance to TFIDF vectors of the first 10 topics:", distance[:10])

We can pass a TFIDF vector to the index and it will count how close it is to the TFIDF vectors of every topic.
For example, we can pass a TFIDF vector of a search query:
file template
Frequency vector of the query: [(109, 1), (463, 1)]
TFIDF vector of the query: [(109, 0.18749390746061886), (463, 0.9822657658012666)]
Distance to TFIDF vectors of the first 10 topics: [0.         0.00354089 0.0083671  0.00569922 0.01524156 0.30511686
 0.         0.02035361 0.03050604 0.01584896]


In [52]:
dictionary_lemmas_stemmed[463]

'templat'

In [147]:
for document_number, score in sorted(enumerate(distance), key=lambda x: x[1], reverse=True)[:10]:
    print(document_number, topic_names[document_number], score)

899 settings-file-and-code-templates.html 0.8963451
418 save-file-as-template-dialog.html 0.8043806
1256 using-file-and-code-templates.html 0.7949943
1118 settings-live-templates.html 0.73883104
348 templates-dialog.html 0.73729944
751 templates.html 0.68210924
1082 saving-project-as-template.html 0.67989385
1260 creating-and-editing-live-templates.html 0.67350936
826 structural-search-and-replace-dialogs.html 0.64086074
494 templates-with-multiple-files.html 0.6046936


In [149]:
for document_number, score in sorted(enumerate(distance), key=lambda x: x[1], reverse=True)[:10]:
    print(document_number, topic_names[document_number], score)

899 settings-file-and-code-templates.html 0.8963451
418 save-file-as-template-dialog.html 0.8043806
1256 using-file-and-code-templates.html 0.7949943
1118 settings-live-templates.html 0.73883104
348 templates-dialog.html 0.73729944
751 templates.html 0.68210924
1082 saving-project-as-template.html 0.67989385
1260 creating-and-editing-live-templates.html 0.67350936
826 structural-search-and-replace-dialogs.html 0.64086074
494 templates-with-multiple-files.html 0.6046936
