# Topic Modeling

Topic modeling is basicaly a way to find the main topics of an article or any text resource

## 1. Installing requirements

In [2]:
%pip install nltk gensim pyLDAvis

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


## 2. Downloading required datasets

In [5]:
import nltk


pkgs = ['punkt', 'wordnet', 'averaged_perceptron_tagger', 'stopwords']
for pkg in pkgs:
    nltk.download(pkg)

[nltk_data] Downloading package punkt to /home/amiresm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/amiresm/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/amiresm/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/amiresm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## 3. Getting the data

In [30]:
import wikipedia

wikipedia_page = wikipedia.page('Javascript').content
print(wikipedia_page)

JavaScript (), often abbreviated as JS, is a programming language that conforms to the ECMAScript specification. JavaScript is high-level, often just-in-time compiled, and multi-paradigm. It has curly-bracket syntax, dynamic typing, prototype-based object-orientation, and first-class functions.
Alongside HTML and CSS, JavaScript is one of the core technologies of the World Wide Web. JavaScript enables interactive web pages and is an essential part of web applications. The vast majority of websites use it for client-side page behavior, and all major web browsers have a dedicated JavaScript engine to execute it.
As a multi-paradigm language, JavaScript supports event-driven, functional, and imperative programming styles. It has application programming interfaces (APIs) for working with text, dates, regular expressions, standard data structures, and the Document Object Model (DOM). However, the language itself does not include any input/output (I/O), such as networking, storage, or graphi

## 4. Pre-Processing

In [31]:
# Tokenizing sentences
from nltk.tokenize import sent_tokenize

sentences = sent_tokenize(wikipedia_page)
print(sentences)

['JavaScript (), often abbreviated as JS, is a programming language that conforms to the ECMAScript specification.', 'JavaScript is high-level, often just-in-time compiled, and multi-paradigm.', 'It has curly-bracket syntax, dynamic typing, prototype-based object-orientation, and first-class functions.', 'Alongside HTML and CSS, JavaScript is one of the core technologies of the World Wide Web.', 'JavaScript enables interactive web pages and is an essential part of web applications.', 'The vast majority of websites use it for client-side page behavior, and all major web browsers have a dedicated JavaScript engine to execute it.', 'As a multi-paradigm language, JavaScript supports event-driven, functional, and imperative programming styles.', 'It has application programming interfaces (APIs) for working with text, dates, regular expressions, standard data structures, and the Document Object Model (DOM).', 'However, the language itself does not include any input/output (I/O), such as netw

In [32]:
# Tokenize words
from nltk.tokenize import word_tokenize
words = list()

for sent in sentences:
    words.extend(word_tokenize(sent))
    
print(words)

['JavaScript', '(', ')', ',', 'often', 'abbreviated', 'as', 'JS', ',', 'is', 'a', 'programming', 'language', 'that', 'conforms', 'to', 'the', 'ECMAScript', 'specification', '.', 'JavaScript', 'is', 'high-level', ',', 'often', 'just-in-time', 'compiled', ',', 'and', 'multi-paradigm', '.', 'It', 'has', 'curly-bracket', 'syntax', ',', 'dynamic', 'typing', ',', 'prototype-based', 'object-orientation', ',', 'and', 'first-class', 'functions', '.', 'Alongside', 'HTML', 'and', 'CSS', ',', 'JavaScript', 'is', 'one', 'of', 'the', 'core', 'technologies', 'of', 'the', 'World', 'Wide', 'Web', '.', 'JavaScript', 'enables', 'interactive', 'web', 'pages', 'and', 'is', 'an', 'essential', 'part', 'of', 'web', 'applications', '.', 'The', 'vast', 'majority', 'of', 'websites', 'use', 'it', 'for', 'client-side', 'page', 'behavior', ',', 'and', 'all', 'major', 'web', 'browsers', 'have', 'a', 'dedicated', 'JavaScript', 'engine', 'to', 'execute', 'it', '.', 'As', 'a', 'multi-paradigm', 'language', ',', 'JavaSc

In [33]:
# POS tagging
from nltk.tag import pos_tag

tagged_words = pos_tag(words)
print(tagged_words)

[('JavaScript', 'NNP'), ('(', '('), (')', ')'), (',', ','), ('often', 'RB'), ('abbreviated', 'VBN'), ('as', 'IN'), ('JS', 'NNP'), (',', ','), ('is', 'VBZ'), ('a', 'DT'), ('programming', 'JJ'), ('language', 'NN'), ('that', 'WDT'), ('conforms', 'VBZ'), ('to', 'TO'), ('the', 'DT'), ('ECMAScript', 'NNP'), ('specification', 'NN'), ('.', '.'), ('JavaScript', 'NNP'), ('is', 'VBZ'), ('high-level', 'JJ'), (',', ','), ('often', 'RB'), ('just-in-time', 'NN'), ('compiled', 'VBD'), (',', ','), ('and', 'CC'), ('multi-paradigm', 'NN'), ('.', '.'), ('It', 'PRP'), ('has', 'VBZ'), ('curly-bracket', 'NN'), ('syntax', 'NN'), (',', ','), ('dynamic', 'JJ'), ('typing', 'NN'), (',', ','), ('prototype-based', 'JJ'), ('object-orientation', 'NN'), (',', ','), ('and', 'CC'), ('first-class', 'JJ'), ('functions', 'NNS'), ('.', '.'), ('Alongside', 'NNP'), ('HTML', 'NNP'), ('and', 'CC'), ('CSS', 'NNP'), (',', ','), ('JavaScript', 'NNP'), ('is', 'VBZ'), ('one', 'CD'), ('of', 'IN'), ('the', 'DT'), ('core', 'NN'), ('tec

In [34]:
# Lemmatization with pos tagged words
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

roles = {'J': 'a', 'V': 'v', 'N': 'n', 'R': 'r'}
stop_words = set(stopwords.words('english'))
punk = string.punctuation
lemma_pos = list()
lemmatizer = WordNetLemmatizer()

for word, tag in tagged_words:
    if (word not in stop_words) and (word not in punk) and (word.isalpha()):
        if tag[0] in roles:
            lemma_pos.append(lemmatizer.lemmatize(word=word, pos=roles[tag[0]]))

print(lemma_pos)

['JavaScript', 'often', 'abbreviate', 'JS', 'programming', 'language', 'conform', 'ECMAScript', 'specification', 'JavaScript', 'often', 'compile', 'syntax', 'dynamic', 'typing', 'function', 'Alongside', 'HTML', 'CSS', 'JavaScript', 'core', 'technology', 'World', 'Wide', 'Web', 'JavaScript', 'enable', 'interactive', 'web', 'page', 'essential', 'part', 'web', 'application', 'vast', 'majority', 'website', 'use', 'page', 'behavior', 'major', 'web', 'browser', 'dedicate', 'JavaScript', 'engine', 'execute', 'language', 'JavaScript', 'support', 'functional', 'imperative', 'programming', 'style', 'application', 'program', 'interface', 'APIs', 'work', 'text', 'date', 'regular', 'expression', 'standard', 'data', 'structure', 'Document', 'Object', 'Model', 'DOM', 'However', 'language', 'include', 'networking', 'storage', 'graphic', 'facility', 'host', 'environment', 'usually', 'web', 'browser', 'provide', 'APIs', 'JavaScript', 'engine', 'originally', 'use', 'web', 'browser', 'embed', 'server', 'u

## 5. Finding Topics

In [37]:
from gensim import corpora, models, similarities

word2id = corpora.Dictionary([lemma_pos])
corpus = [word2id.doc2bow([word]) for word in lemma_pos]

lda_model = models.LdaModel(corpus=corpus,
                           id2word=word2id,
                           num_topics=7,
                           random_state=100,
                           update_every=1,
                           chunksize=100,
                           passes=10,
                           alpha='symmetric',
                           per_word_topics=True)
print(lda_model.print_topics())

[(0, '0.093*"Java" + 0.048*"object" + 0.030*"also" + 0.020*"Windows" + 0.017*"source" + 0.014*"Object" + 0.014*"native" + 0.014*"format" + 0.012*"enable" + 0.012*"paper"'), (1, '0.099*"code" + 0.060*"browser" + 0.050*"function" + 0.037*"web" + 0.031*"program" + 0.024*"tool" + 0.023*"example" + 0.019*"video" + 0.014*"information" + 0.013*"transpilers"'), (2, '0.069*"library" + 0.037*"vulnerability" + 0.033*"syntax" + 0.033*"use" + 0.027*"sandbox" + 0.024*"development" + 0.020*"user" + 0.019*"JSON" + 0.016*"developer" + 0.016*"security"'), (3, '0.030*"site" + 0.028*"attack" + 0.028*"Internet" + 0.024*"implementation" + 0.024*"standard" + 0.023*"Explorer" + 0.018*"Microsoft" + 0.018*"system" + 0.017*"typing" + 0.016*"call"'), (4, '0.058*"language" + 0.036*"create" + 0.026*"attacker" + 0.024*"Press" + 0.024*"Starch" + 0.022*"flaw" + 0.016*"application" + 0.014*"regular" + 0.013*"buffer" + 0.013*"Crockford"'), (5, '0.297*"JavaScript" + 0.047*"page" + 0.039*"Web" + 0.037*"script" + 0.032*"ru

In [38]:
#Visualizing

import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, word2id)
vis