In [1]:
import os, json
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
import gensim
from gensim import corpora, models



In [2]:
# Porter stemmer instance shared by the preprocessing code below
p_stemmer = PorterStemmer()

# English stop-word list supplied by the stop_words package
en_stop = get_stop_words('en')

# Regex tokenizer: every run of word characters (\w+) becomes one token
tokenizer = RegexpTokenizer(r'\w+')

In [13]:
def _loadReviews(path):
    """Return the review records stored in the JSON file at ``path``.

    The file is expected to contain a JSON list. Any other JSON value
    (for example the ``{}`` of an empty placeholder file) yields an empty
    list. The file handle is always closed before returning.

    Raises:
        IOError: the file cannot be opened or read.
        ValueError: the content is not valid JSON.
    """
    with open(path) as handle:
        data = json.load(handle)
    return data if isinstance(data, list) else []


def _preprocessComment(comment, stopWords):
    """Lower-case, tokenize, stop-word filter and stem one review comment.

    Returns the list of stemmed tokens (possibly empty).
    """
    tokens = tokenizer.tokenize(comment.lower())
    return [p_stemmer.stem(token) for token in tokens if token not in stopWords]


def extractDocuments(chunk = False, docCount=10000, dataDir=os.path.join(".", "Data")):
    """Collect the preprocessed review comments found under ``dataDir``.

    The directory tree is walked bottom-up. Every file below the top-level
    directory is read as JSON; files located directly in ``dataDir`` are
    reported with a "Skipping" message and ignored. For each entry of a
    JSON list that has both a ``comment`` and a ``timestampMsec`` key, the
    comment is lower-cased, tokenized, stripped of English stop words and
    stemmed, and the resulting token list is appended to the result.

    Args:
        chunk: when true, stop early once ``docCount`` files were processed.
        docCount: maximum number of files to process when ``chunk`` is true.
            Note that this limits *files*, not individual reviews.
        dataDir: root directory of the review dump. Defaults to ``Data``
            inside the current working directory.

    Returns:
        A list with one list of stemmed tokens per accepted review.

    Files that cannot be read or parsed are skipped silently and do not
    count towards ``docCount``; reviews already collected from such a file
    before the failure are kept.
    """
    texts = []
    count = 0
    # Set membership is O(1); testing against the raw list is O(len(list))
    # for every single token.
    stopWords = frozenset(en_stop)
    topDir = os.path.normpath(dataDir)

    for root, _dirs, files in os.walk(dataDir, topdown=False):
        # Only files inside a sub-directory are review files. Comparing
        # normalized paths keeps this independent of the OS path separator.
        atTopLevel = os.path.normpath(root) == topDir
        for reviewFile in files:
            if atTopLevel:
                print("Skipping  %s" % reviewFile)
                continue

            try:
                for review in _loadReviews(os.path.join(root, reviewFile)):
                    try:
                        comment = review['comment']
                        # Reviews without a timestamp are treated as
                        # malformed; the value itself is not used.
                        review['timestampMsec']
                    except (KeyError, TypeError):
                        # Missing key, or the entry is not a JSON object.
                        continue
                    texts.append(_preprocessComment(comment, stopWords))
            except (ValueError, IOError, IndexError):
                # Unreadable / invalid JSON file, or an IndexError raised
                # while processing a comment (presumably from the stemmer
                # -- TODO confirm): give up on this file.
                continue

            count += 1
            if chunk and count >= docCount:
                print("%d  documents extracted " % count)
                return texts

    print("%d  documents extracted in total" % count)
    return texts

In [14]:
# Run the extraction with its defaults (chunk=False, so no file limit applies)
texts = extractDocuments()

362555  documents extracted in total


In [15]:
# Build the gensim dictionary from the tokenized documents
dictionary = corpora.Dictionary(texts)

# Turn each tokenized document into its bag-of-words representation
corpus = []
for doc_tokens in texts:
    corpus.append(dictionary.doc2bow(doc_tokens))

In [19]:
# Fixed seed: LDA training is stochastic, so without it every run of this
# cell yields different topics even on the same corpus.
LDA_RANDOM_STATE = 42

# generate LDA model (10 topics, 20 passes over the corpus)
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus,
    num_topics=10,
    id2word=dictionary,
    passes=20,
    random_state=LDA_RANDOM_STATE,
)
print("Done!")

Done!


In [20]:
# Print every entry returned by print_topics() on its own line
for topic in ldamodel.print_topics():
    print(topic)

(0, u'0.032*ok + 0.014*home + 0.014*info + 0.014*root + 0.013*photo + 0.011*la + 0.011*audio + 0.010*languag + 0.010*s + 0.010*theme')
(1, u'0.180*app + 0.082*great + 0.056*work + 0.046*thank + 0.021*use + 0.014*keep + 0.014*well + 0.011*realli + 0.010*perfect + 0.010*sound')
(2, u'0.145*good + 0.143*game + 0.113*love + 0.048*like + 0.040*fun + 0.039*awesom + 0.033*play + 0.033*realli + 0.022*great + 0.016*s')
(3, u'0.143*nice + 0.064*cool + 0.062*app + 0.033*applic + 0.023*must + 0.021*wallpap + 0.021*pictur + 0.015*map + 0.015*tv + 0.014*color')
(4, u'0.027*add + 0.026*need + 0.021*just + 0.020*like + 0.018*make + 0.018*better + 0.017*bad + 0.015*amaz + 0.015*can + 0.013*s')
(5, u'0.044*app + 0.018*use + 0.017*can + 0.017*one + 0.015*s + 0.014*ad + 0.013*like + 0.011*version + 0.010*need + 0.008*phone')
(6, u'0.062*use + 0.049*easi + 0.020*help + 0.018*simpl + 0.014*learn + 0.013*word + 0.009*live + 0.009*beauti + 0.009*recommend + 0.009*accur')
(7, u'0.029*fix + 0.028*star + 0.027*5