#Micro Trainning: Topic Modelling

##Use Case

Imagine we have 1000 documents without a catelog to classify them into any catgories or interesting categories

In [35]:
# Grab my Tools
import os
import utils
import json
from collections import Counter
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaMulticore, TfidfModel

# Data Path
path = os.path.join(os.getcwd(), 'data')

#### Steps to Model topics
* Reading Data into Machine Understandable Format
* Cleaning
* Building a Word Frequency Dictonary 
* Fitting a [LDA](https://ai.stanford.edu/~ang/papers/jair03-lda.pdf) ([for dummies](https://www.quora.com/What-is-a-good-explanation-of-Latent-Dirichlet-Allocation))
* Visualizing Results
* Building on Top of Result

In [13]:
# Reading Data
files = utils.readfiles(path)
lst_files = utils.parsebibfiles(files)

# Sample
print("Read {0} files\nSample Entry:\n {1}".format(len(lst_files), json.dumps(lst_files[1], indent=2, sort_keys=True)))

Read 29 files
Sample Entry:
 {
  "ENTRYTYPE": "article",
  "ID": "Boci2015",
  "abstract": "The first building block of the Federal Aviation Administration's (FAA) Next Generation Air Transportation System (NextGen) initiative to modernize the US national airspace system (NAS) was the implementation of the Automatic Dependent Surveillance-Broadcast (ADS-B) ground infrastructure. A primary aspect of the ADS-B program design is the terrestrial radio station infrastructure. It determined the terrestrial radio stations layout throughout the US and was optimized to meet system performance, safety and security in the NAS. In March 2014, the FAA completed the nationwide infrastructure upgrade, enabling air traffic controllers to track aircraft with greater accuracy and reliability, while giving pilots more information in the cockpit. More than 650 ADS-B radios communicate with equipped aircraft, supporting the new satellite-based surveillance system. Currently, the ADS-B system ingests proces

In [14]:
# Clean Data
lst_clean_bow = [utils.clean_abstract(file) for file in lst_files]

# Sample
print("Cleaned Text\nSample\nTop 10 Words in Document\n{0}".format(Counter(lst_clean_bow[2]).most_common(10)))

Cleaned Text
Sample
Top 10 Words in Document
[('data', 5), ('research', 4), ('databas', 3), ('meet', 2), ('impact', 2), ('report', 2), ('divers', 2), ('manag', 2), ('everi', 1), ('year', 1)]


In [15]:
# build gensim dict
corpus_dict = Dictionary(lst_clean_bow)

# filter low freq words threshold > 15 Abstracts
corpus_dict.filter_extremes(no_below=2)

# gensim doc2bow
bow_corpus = [corpus_dict.doc2bow(doc) for doc in lst_clean_bow]

# tf-idf model over bow
tfidf_model = TfidfModel(bow_corpus)
corpus_tfidf = tfidf_model[bow_corpus]




In [17]:
# gensim LDA model over tf-idf
lda_tfidf_model = LdaMulticore(corpus_tfidf, num_topics=10, id2word=corpus_dict, passes=2, workers=4)

# preview
for idx, topic in lda_tfidf_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

for index, score in sorted(lda_tfidf_model[bow_corpus[12]], key=lambda tup: -1 * tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_tfidf_model.print_topic(index, 10)))

Topic: 0 Word: 0.013*"tabl" + 0.010*"locat" + 0.008*"relat" + 0.007*"lake" + 0.007*"recommend" + 0.007*"element" + 0.006*"learn" + 0.006*"peopl" + 0.006*"related" + 0.006*"featur"
Topic: 1 Word: 0.011*"cluster" + 0.010*"structur" + 0.009*"schema" + 0.009*"web" + 0.008*"custom" + 0.008*"text" + 0.007*"within" + 0.007*"dimens" + 0.007*"reduct" + 0.007*"statist"
Topic: 2 Word: 0.008*"short" + 0.008*"cluster" + 0.008*"attribut" + 0.007*"text" + 0.007*"flexibl" + 0.007*"schema" + 0.006*"framework" + 0.006*"sourc" + 0.006*"represent" + 0.005*"entiti"
Topic: 3 Word: 0.012*"match" + 0.010*"mani" + 0.010*"improv" + 0.009*"schema" + 0.008*"multipl" + 0.008*"combin" + 0.007*"reduct" + 0.006*"techniqu" + 0.006*"basic" + 0.006*"work"
Topic: 4 Word: 0.014*"lake" + 0.012*"clean" + 0.009*"store" + 0.007*"warehous" + 0.007*"error" + 0.007*"queri" + 0.006*"analyz" + 0.006*"solut" + 0.006*"unstructur" + 0.006*"could"
Topic: 5 Word: 0.016*"map" + 0.014*"lsd" + 0.013*"learner" + 0.012*"sourc" + 0.010*"syst

In [None]:
# Visualize
var = utils.visualize_lda_model(model=lda_tfidf_model, corpus=bow_corpus, corpus_dict=corpus_dict)
print(var)

AttributeError: module 'utils' has no attribute 'visualize_lda_model'