In [1]:
from collections import defaultdict as dd
import nltk
stop_words = nltk.corpus.stopwords.words("english")

import csv

In [2]:
print("Lets make a list of all topic names and a list of corresponding topic texts...")
with open("01_topic_text.csv") as file:
    topic_names = []
    topic_texts = []
    for line in csv.reader(file):
        topic_names.append(line[0])
        topic_texts.append(line[1])
print("Done!")

Lets make a list of all topic names and a list of corresponding topic texts...
Done!


In [3]:
topic_id = 100
print(f"Topic {topic_id} is:", topic_names[topic_id])
print(topic_texts[topic_id][:200])

Topic 100 is: create-ssh-configurations.html
Create SSH configurations
In IntelliJ IDEA, you can save the remote server SSH connection parameters as a dedicated SSH configuration. The created configuration can be then used for configuring remote


# Simple tokenization into words

In [33]:
from nltk.tokenize import RegexpTokenizer

# Tokenize by word bounds only
tknzr = RegexpTokenizer("\w+")

# Also tokenize shortcuts "ctrl+alt+s" and version numbers "2020.1"
# tknzr = RegexpTokenizer("\w+(\+\w+)+|\d+\.\d|\w+")

# text = topic_texts[1]
text = "Hello, world! How are you?\nI am good/bad//the worst. Create \etc\not\\new\\user. IntelliJ IDEA 2020.1. IntelliJ_IDEA. Ctrl+Alt+S."

tkns = [word for word in tknzr.tokenize(text.lower()) if word not in stop_words]
print(tkns)

['hello', 'world', 'good', 'bad', 'worst', 'create', 'etc', 'ot', 'new', 'user', 'intellij', 'idea', '2020', '1', 'intellij_idea', 'ctrl', 'alt']


In [38]:
print("Let's get rid of stop words, punctuation, and other symbols...")
word_tokens = [[word for word in tknzr.tokenize(doc.lower()) if word not in stop_words] for doc in topic_texts]
print("Created a list of bags of words")

Let's get rid of stop words, punctuation, and other symbols...
Created a list of bags of words


In [31]:
# print("Tokenizing, removing punctuation and stopwords, replacing slashes with spaces...")
# word_tokens = [[word for word in nltk.word_tokenize(doc.lower().replace("/", " ").replace("\\", " ")) if word not in stop_words and word not in string.punctuation] for doc in topic_texts]
# print("Created a list of bags of words")

Tokenizing, removing punctuation and stopwords, replacing slashes with spaces...
Created a list of bags of words


In [39]:
print(f"Topic {topic_id} is:", topic_names[topic_id])
print(len(word_tokens[topic_id]), "tokens")
print(word_tokens[100])

Topic 100 is: create-ssh-configurations.html
261 tokens
['create', 'ssh', 'configurations', 'intellij', 'idea', 'save', 'remote', 'server', 'ssh', 'connection', 'parameters', 'dedicated', 'ssh', 'configuration', 'created', 'configuration', 'used', 'configuring', 'remote', 'interpreters', 'connecting', 'sftp', 'deployment', 'servers', 'launching', 'ssh', 'sessions', 'settings', 'preferences', 'dialog', 'ctrl', 'alt', 'go', 'tools', 'ssh', 'configurations', 'left', 'hand', 'pane', 'lists', 'existing', 'ssh', 'configurations', 'click', 'use', 'visible', 'project', 'checkbox', 'configure', 'visibility', 'server', 'access', 'configuration', 'select', 'checkbox', 'restrict', 'use', 'ssh', 'configuration', 'current', 'project', 'ssh', 'configuration', 'cannot', 'reused', 'outside', 'current', 'project', 'appear', 'list', 'available', 'configurations', 'projects', 'ssh', 'configurations', 'stored', 'idea', 'directory', 'together', 'project', 'allows', 'sharing', 'team', 'members', 'vcs', 'ssh'

In [40]:
def tokens_to_corpus(tkns):
    print("Counting term frequencies...")
    frequency = dd(int)

    for doc in tkns:
        for token in doc:
            frequency[token] += 1

    print("There are", len(frequency), "tokens in total.")
    print("The 10 most frequent are:", dict(sorted(frequency.items(), key=lambda item: item[1], reverse=True)[:10]))

    print("\nLets get rid of tokens that appear only once...")

    corpus = [[token for token in doc if frequency[token] > 1] for doc in tkns]

    unique = set()

    for doc in corpus:
        for token in doc:
            unique.add(token)

    print("Now we have", len(unique), "tokens left.")

    print(f"Topic {topic_id} is:", topic_names[topic_id])
    print(len(corpus[topic_id]), "tokens")
    print(corpus[100])

    return corpus

In [30]:
word_corpus = tokens_to_corpus(word_tokens)

Counting term frequencies...
There are 9581 tokens in total.
The 10 most frequent are: {'select': 12020, 'run': 9301, 'click': 8513, 'idea': 7916, 'intellij': 7506, 'file': 7427, 'project': 6600, 'configuration': 6493, 'code': 5509, 'dialog': 5451}

Lets get rid of tokens that appear only once...
Now we have 6860 tokens left.
Topic 100 is: create-ssh-configurations.html
244 tokens
['create', 'ssh', 'configurations', 'intellij', 'idea', 'save', 'remote', 'server', 'ssh', 'connection', 'parameters', 'dedicated', 'ssh', 'configuration', 'created', 'configuration', 'used', 'configuring', 'remote', 'interpreters', 'connecting', 'sftp', 'deployment', 'servers', 'launching', 'ssh', 'sessions', 'settings', 'preferences', 'dialog', 'ctrl', 'alt', 'go', 'tools', 'ssh', 'configurations', 'left', 'hand', 'pane', 'lists', 'existing', 'ssh', 'configurations', 'click', 'use', 'visible', 'project', 'checkbox', 'configure', 'visibility', 'server', 'access', 'configuration', 'select', 'checkbox', 'restr

In [41]:
print("Saving word tokens to csv...")
with open("02a_word_corpus.csv", "w") as file:
    csv.writer(file).writerows(word_corpus)
print("Done!")

Saving word tokens to csv...
Done!


# Lemmatization

In [34]:
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [42]:
print("Lets try to lemmatize our tokens...")

lemmatizer = WordNetLemmatizer()

def lem(token):
    if token[1].startswith("V"):
        lemma = lemmatizer.lemmatize(token[0], wordnet.VERB)
    if token[1].startswith("J"):
        lemma = lemmatizer.lemmatize(token[0], wordnet.ADJ)
    if token[1].startswith("R"):
        lemma = lemmatizer.lemmatize(token[0], wordnet.ADV)
    else:
        lemma = lemmatizer.lemmatize(token[0], wordnet.NOUN)
    return lemma

lemma_tokens = [[lem(token) for token in nltk.pos_tag(doc)] for doc in word_tokens]
print("Done!")

Lets try to lemmatize our tokens...
Done!


In [43]:
lemma_corpus = tokens_to_corpus(lemma_tokens)

Counting term frequencies...
There are 8661 tokens in total.
The 10 most frequent are: {'select': 12020, 'file': 11483, 'run': 9551, 'click': 8519, 'configuration': 8031, 'idea': 7917, 'intellij': 7506, 'project': 7258, 'option': 6279, 'open': 5893}

Lets get rid of tokens that appear only once...
Now we have 6119 tokens left.
Topic 100 is: create-ssh-configurations.html
244 tokens
['create', 'ssh', 'configuration', 'intellij', 'idea', 'save', 'remote', 'server', 'ssh', 'connection', 'parameter', 'dedicated', 'ssh', 'configuration', 'created', 'configuration', 'used', 'configuring', 'remote', 'interpreter', 'connecting', 'sftp', 'deployment', 'server', 'launching', 'ssh', 'session', 'setting', 'preference', 'dialog', 'ctrl', 'alt', 'go', 'tool', 'ssh', 'configuration', 'left', 'hand', 'pane', 'list', 'existing', 'ssh', 'configuration', 'click', 'use', 'visible', 'project', 'checkbox', 'configure', 'visibility', 'server', 'access', 'configuration', 'select', 'checkbox', 'restrict', 'use

In [44]:
print("Saving word tokens to csv...")
with open("02b_lemma_corpus.csv", "w") as file:
    csv.writer(file).writerows(lemma_corpus)
print("Done!")

Saving word tokens to csv...
Done!


# Stemming

In [45]:
print("Now lets try stemming the tokens instead...")

stemmer = nltk.stem.porter.PorterStemmer()

stemmed_tokens = [[stemmer.stem(token) for token in doc] for doc in word_tokens]
print("Done!")

Now lets try stemming the tokens instead...
Done!


In [46]:
stemmed_corpus = tokens_to_corpus(stemmed_tokens)

Counting term frequencies...
There are 6605 tokens in total.
The 10 most frequent are: {'select': 16545, 'file': 11486, 'run': 11178, 'configur': 10996, 'click': 8813, 'use': 8532, 'idea': 7917, 'intellij': 7506, 'project': 7261, 'specifi': 6692}

Lets get rid of tokens that appear only once...
Now we have 4557 tokens left.
Topic 100 is: create-ssh-configurations.html
244 tokens
['creat', 'ssh', 'configur', 'intellij', 'idea', 'save', 'remot', 'server', 'ssh', 'connect', 'paramet', 'dedic', 'ssh', 'configur', 'creat', 'configur', 'use', 'configur', 'remot', 'interpret', 'connect', 'sftp', 'deploy', 'server', 'launch', 'ssh', 'session', 'set', 'prefer', 'dialog', 'ctrl', 'alt', 'go', 'tool', 'ssh', 'configur', 'left', 'hand', 'pane', 'list', 'exist', 'ssh', 'configur', 'click', 'use', 'visibl', 'project', 'checkbox', 'configur', 'visibl', 'server', 'access', 'configur', 'select', 'checkbox', 'restrict', 'use', 'ssh', 'configur', 'current', 'project', 'ssh', 'configur', 'cannot', 'reus',

In [47]:
print("Saving word tokens to csv...")
with open("02c_stemmed_corpus.csv", "w") as file:
    csv.writer(file).writerows(stemmed_corpus)
print("Done!")

Saving word tokens to csv...
Done!
