# Micro Training: Topic Modelling

## Use Case

Imagine we have 1000 documents without a catalog to classify them into any categories, let alone interesting ones.

In [26]:
# Grab my Tools
import os
from . import utils
import json
from collections import Counter
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore, TfidfModel
import pyLDAvis.gensim
import pandas as pd

# Input location: the "data" folder directly under the current working directory.
working_dir = os.getcwd()
path = os.path.join(working_dir, 'data')

# Put pyLDAvis into notebook mode so prepared visualizations display in cell output.
pyLDAvis.enable_notebook()

#### Steps to Model topics
* Reading Data into Machine Understandable Format
* Cleaning
* Building a Word Frequency Dictionary
* Fitting a [LDA](https://ai.stanford.edu/~ang/papers/jair03-lda.pdf) ([for dummies](https://www.quora.com/What-is-a-good-explanation-of-Latent-Dirichlet-Allocation))
* Visualizing Results
* Building on Top of Result

In [27]:
# Read the raw files found under `path`, then parse them into a list of entries
# (each entry is JSON-serializable, as the preview below relies on).
files = utils.readfiles(path)
lst_files = utils.parsebibfiles(files)

# Preview: number of parsed entries plus the second entry, pretty-printed with sorted keys.
sample_entry = json.dumps(lst_files[1], indent=2, sort_keys=True)
print("Read {0} files\nSample Entry:\n {1}".format(len(lst_files), sample_entry))

Read 1485 files
Sample Entry:
 {
  "ENTRYTYPE": "inproceedings",
  "ID": "ISI:000432607700002",
  "abstract": "{Motion capture acting is a challenging task, it requires trained and\nexperienced actors who can highly rely on their acting and imagination\nskills to deliver believable performances. This is especially the case\nwhen preparation times are short and scenery needs to be imagined, as it\nis commonly the case for shoots in the gaming industry. To support\nactors in such cases, we developed a mixed reality application that\nallows showing digital scenery and triggering emotions while performing.\nIn this paper we tested our hypothesis that a mixed reality head-mounted\nprojection display can support motion capture acting through the help of\nexperienced motion capture actors performing short acting scenes common\nfor game productions. We evaluated our prototype with four motion\ncapture actors and four motion capture experts. Both groups considered\nour application as helpful, e

In [28]:
# Run every parsed entry through utils.clean_abstract; the result is one collection
# of tokens per entry (it is counted token-by-token in the preview below).
lst_clean_bow = list(map(utils.clean_abstract, lst_files))

# Preview: the ten most frequent tokens of the third document.
top_words = Counter(lst_clean_bow[2]).most_common(10)
print("Cleaned Text\nSample\nTop 10 Words in Document\n{0}".format(top_words))

Cleaned Text
Sample
Top 10 Words in Document
[('biosecur', 15), ('catcher', 6), ('catch', 5), ('awar', 5), ('threat', 4), ('practic', 4), ('thin', 3), ('good', 3), ('studi', 3), ('high', 2)]


In [29]:
# Build the gensim vocabulary (token <-> integer id mapping) from the cleaned documents.
corpus_dict = Dictionary(lst_clean_bow)

# Prune the vocabulary in place: no_below=2 drops tokens that appear in fewer than
# 2 documents. All other filter_extremes arguments are left at their gensim defaults.
corpus_dict.filter_extremes(no_below=2)

# Encode each cleaned document as a bag-of-words vector against the pruned vocabulary.
bow_corpus = list(map(corpus_dict.doc2bow, lst_clean_bow))

# Fit TF-IDF on the bag-of-words corpus, then wrap that corpus in the fitted transform.
tfidf_model = TfidfModel(bow_corpus)
corpus_tfidf = tfidf_model[bow_corpus]




In [30]:
### Bigram & Trigram Analysis
#### 1. Corpus
#### 2. Bigram Model
#### 3. Trigram Model


In [31]:
# Gather each entry's 'abstract' value (None where the key is absent) into a pandas Series.
abstracts = [entry.get('abstract') for entry in lst_files]
pds_corpus = pd.Series(abstracts)

# Bigram and trigram analysis over the abstracts; both helpers receive 10 as their
# second argument.
df_bigram = utils.bigrams(pds_corpus, 10)
df_trigram = utils.trigram_analysis(pds_corpus, 10)

NameError: name 'util' is not defined

In [None]:
# Fit a 10-topic LDA model on the TF-IDF weighted corpus.
# random_state pins the model's random initialisation so that Restart & Run All
# yields comparable topics (multi-worker training may still add small run-to-run noise).
lda_tfidf_model = LdaMulticore(
    corpus_tfidf,
    num_topics=10,
    id2word=corpus_dict,
    passes=2,
    workers=4,
    random_state=42,
)

# Preview: print every topic (-1 = all topics) with its highest-weighted words.
for idx, topic in lda_tfidf_model.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))

# Topic mixture of one sample document (index 12), strongest topic first.
# The document is passed through the TF-IDF transform before inference so the model
# sees the same weighting it was trained on rather than raw term counts.
sample_doc_tfidf = tfidf_model[bow_corpus[12]]
for index, score in sorted(lda_tfidf_model[sample_doc_tfidf], key=lambda pair: pair[1], reverse=True):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_tfidf_model.print_topic(index, 10)))

# Build the interactive pyLDAvis view; the bare `vis` on the last line renders it.
vis = pyLDAvis.gensim.prepare(topic_model=lda_tfidf_model, corpus=bow_corpus, dictionary=corpus_dict)
vis


In [None]:
# Visualize
# Re-display the pyLDAvis panel that the LDA cell above already prepared, instead of
# running the identical (and slow) prepare() call a second time.
vis