# Topic modeling

### Get data from Google search

In [1]:
from googlesearch.googlesearch import GoogleSearch
import re
import unicodedata
def googleSearch(query, num_results):
    """Search Google for *query* and collect the text of the English result pages.

    Args:
        query: search string passed to ``GoogleSearch().search``.
        num_results: number of results requested from the search.

    Returns:
        A ``(doc_complete, urls, titles)`` tuple of three parallel lists: the
        whitespace-normalized text, the URL and the title of every result
        whose text could be read and was detected as English.
    """
    response = GoogleSearch().search(query, num_results=num_results)
    doc_complete = []
    urls = []
    titles = []
    for result in response.results:
        try:
            # Read the page text once and reuse it for every check below.
            text = result.getText()
            if text is not None and langdetect(text) == 'en':
                # Gather all three values before appending so the lists
                # cannot get out of step if one of them fails.
                entry = (re.sub(r"\s+", " ", text), result.url, result.title)
                doc_complete.append(entry[0])
                urls.append(entry[1])
                titles.append(entry[2])
        except Exception:
            # Best effort: a page that cannot be read or classified is
            # reported and skipped. KeyboardInterrupt/SystemExit still
            # propagate, unlike with a bare ``except:``.
            print("failed to fetch text for page " + result.url)
    return doc_complete, urls, titles

In [2]:
from langdetect import detect
def langdetect(txt):
    """Return the language code that ``detect`` reports for *txt*."""
    language = detect(txt)
    return language
#langdetect("hello, my name is ")

### Cleaning and Preprocessing

In [12]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import string
from itertools import islice
# Shared preprocessing resources used by clean().
lemma = WordNetLemmatizer()
# Porter stemmer instance; clean() currently lemmatizes instead of stemming.
p_stemmer = PorterStemmer()
# Tokens are runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
# English stop words and single punctuation characters to drop.
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
def clean(doc):
    """Turn a raw document string into a list of lemmatized tokens.

    The text is lower-cased and split with the module-level ``tokenizer``;
    tokens found in ``stop`` or ``exclude`` are dropped and every remaining
    token is passed through ``lemma.lemmatize``.
    """
    lowered = doc.lower()
    kept = []
    for token in tokenizer.tokenize(lowered):
        # Skip stop words and bare punctuation tokens.
        if token in stop or token in exclude:
            continue
        kept.append(lemma.lemmatize(token))
    return kept

def ngrams(tokens, n):
    """Build word n-grams from a list of tokens.

    Args:
        tokens: list of token strings.
        n: size of the n-grams.

    Returns:
        For ``n == 1`` the ``tokens`` list itself, unchanged. Otherwise a new
        list of strings, each made of ``n`` consecutive words joined by a
        single space; the list is empty when fewer than ``n`` words exist.
    """
    if n == 1:
        return tokens

    # Rebuild the text: tokens that start with an apostrophe and punctuation
    # tokens are glued to the preceding word, every other token is preceded
    # by a space. Splitting the result on whitespace yields the words.
    pieces = [
        " " + tok if not tok.startswith("'") and tok not in string.punctuation else tok
        for tok in tokens
    ]
    words = "".join(pieces).strip().split()

    # range() rather than the Python 2-only xrange(), so this runs on both
    # Python 2 and Python 3.
    return [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]


### Models functions

In [4]:
import gensim
from gensim import corpora, models, similarities
from six import iteritems
 

def gettfidf(doc_term_matrix):
    """Fit a TF-IDF model on the corpus and return the corpus transformed by it."""
    model = models.TfidfModel(doc_term_matrix)
    return model[doc_term_matrix]

def Lsi(corpus_tfidf,dictionary,nbtopic,num_words):
    """Build and return an LSI model with ``nbtopic`` topics.

    ``corpus_tfidf`` is the training corpus and ``dictionary`` is passed as
    ``id2word``. ``num_words`` is accepted but not used by this function.
    """
    return models.LsiModel(corpus_tfidf, num_topics=nbtopic, id2word=dictionary)

def LDA(doc_term_matrix,dictionary,nbtopic,num_words,passes=50):
    """Train and return an LDA model on the document-term matrix.

    Args:
        doc_term_matrix: corpus passed to ``models.LdaModel``.
        dictionary: mapping passed as ``id2word``.
        nbtopic: number of topics, passed as ``num_topics``.
        num_words: accepted but not used by this function.
        passes: value forwarded as the model's ``passes`` argument; defaults
            to 50, the value that used to be hard-coded.

    Returns:
        The ``LdaModel`` instance created from these arguments.
    """
    return models.LdaModel(
        doc_term_matrix,
        num_topics=nbtopic,
        id2word=dictionary,
        passes=passes,
    )

def print_topics(topics):
    """Print one ``topic <index> : <description>`` line per topic.

    Args:
        topics: iterable of indexable items; the element at position 1 of
            each item is the text that gets printed.
    """
    for i, topic in enumerate(topics):
        # Single-argument parenthesized print: same output on Python 2 and 3.
        print("topic %s : %s" % (i, topic[1]))
        
def topics_toArray(topics):
    """Print every topic and return the topic descriptions as a list.

    Args:
        topics: iterable of indexable items; the element at position 1 of
            each item is the topic description.

    Returns:
        A list with the description (element 1) of every topic, in input
        order. The printed output is the same ``topic <index> : <description>``
        line per topic as before; the return value is new (it used to be
        ``None``), so the function now matches its name.
    """
    descriptions = []
    for i, topic in enumerate(topics):
        # Single-argument parenthesized print: same output on Python 2 and 3.
        print("topic %s : %s" % (i, topic[1]))
        descriptions.append(topic[1])
    return descriptions

# Mains

### Search for documents on the web

In [10]:
# Fetch the documents to model from a web search.
query = "data analytic"
num_results_search=10
doc_complete,urls,titles=googleSearch(query,num_results_search)
# Preview the first 100 characters of every document.
# Single-argument parenthesized prints give the same output on Python 2 and 3.
for i,d in enumerate(doc_complete):
    print("============================== doc %s ======================================================" % i)
    print(d[:100])

 What is data analytics (DA)? - Definition from WhatIs.com SearchDataManagement Search the TechTarge
 What is big data analytics? - Definition from WhatIs.com SearchBusinessAnalytics Search the TechTar
 Data analysis - Wikipedia Data analysis From Wikipedia, the free encyclopedia Jump to: navigation, 
 Data Analysis Courses | Coursera Toggle navigationNavigation openNavigation closedCatalogBrowseSear
 Big data analytics: What it is and why it matters | SAS SAS | The Power to Know Sign In Welcome Edi
 What is Data Analytics? - Definition from Techopedia ALERT [WEBINAR] Enabling the Mobile Workforce 
 Data Science vs. Big Data vs. Data Analytics: Comparison | Simplilearn All Courses Business Courses
 What is Data Analytics: Definition | Informatica US Products Intelligent Big Data Big Data Manageme
 How to Become a Data Analyst | Data Analyst Salary Master's in Data ScienceTop Schools 23 Great Sch
 Data Analytics Definition | Investopedia Topics What's New Stocks Reboot After Another Tec

### Cleaning and splitting into n-grams

In [13]:
# Clean every document and split it into n-grams.
n_gram=2
doc_clean = [ngrams(clean(doc),n_gram) for doc in doc_complete]
# Preview the first 50 n-grams of every document.
# Single-argument parenthesized prints give the same output on Python 2 and 3.
for i,d in enumerate(doc_clean):
    print("============================== doc %s ======================================================" % i)
    print(d[:50])

[u'data analytics', u'analytics da', u'da definition', u'definition whatis', u'whatis com', u'com searchdatamanagement', u'searchdatamanagement search', u'search techtarget', u'techtarget network', u'network sign', u'sign start', u'start free', u'free unlimited', u'unlimited access', u'access login', u'login register', u'register techtarget', u'techtarget network', u'network news', u'news feature', u'feature tip', u'tip content', u'content answer', u'answer buyer', u'buyer guide', u'guide essential', u'essential guide', u'guide opinion', u'opinion photo', u'photo story', u'story podcasts', u'podcasts quiz', u'quiz tutorial', u'tutorial sponsored', u'sponsored community', u'community multimedia', u'multimedia searchdatamanagement', u'searchdatamanagement topic', u'topic data', u'data warehouse', u'warehouse integration', u'integration resource', u'resource quality', u'quality governance', u'governance dbms', u'dbms bi', u'bi ecm', u'ecm mdm', u'mdm vertical', u'vertical topic']
[u'big d

### Preparing Dictionary and Document-Term Matrix

In [14]:
#Preparing Dictionary and Document-Term Matrix
from six import iteritems
import gensim 
from gensim import corpora
#dir="/Users/kimtaing/Documents/github/WebMining/topics/python/"
# Create the term dictionary of the corpus, where every unique term is assigned an index.
dictionary = corpora.Dictionary(doc_clean)
# dictionary.dfs holds per-token document frequencies (see the gensim Dictionary
# docs), so this selects the tokens found in at most one document.
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq <= 1]
dictionary.filter_tokens(once_ids)  # drop those tokens from the dictionary
#dictionary.save(dir+'mydict.dict')  # store the dictionary, for future reference
# Print the dictionary to check it.
# Single-argument parenthesized print gives the same output on Python 2 and 3.
for k, v in dictionary.iteritems():
    print("id %s => %s" % (k, v))

id 0 => encompasses searchsqlserver
id 1 => item network
id 2 => schedule container
id 3 => easier path
id 4 => web service
id 5 => process data
id 6 => quality governance
id 7 => u privacy
id 8 => google file
id 9 => term use
id 864 => maximum character
id 11 => diverse data
id 13 => access login
id 14 => content management
id 15 => analysis qualitative
id 16 => community multimedia
id 17 => really know
id 18 => developer community
id 19 => raw information
id 20 => text based
id 21 => data data
id 22 => product service
id 23 => analytics involves
id 355 => enterprise different
id 457 => information used
id 26 => predictive model
id 27 => network news
id 28 => hypothesis data
id 29 => multitenant technology
id 30 => relationship predictive
id 31 => username comment
id 32 => l n
id 33 => true false
id 34 => email address
id 35 => science data
id 36 => submitting personal
id 37 => ad google
id 38 => cover latest
id 39 => techtarget close
id 41 => financial service
id 42 => service intell

In [15]:
# Convert the list of documents (corpus) into a document-term matrix using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
# Show every (term, count) pair of every document.
# Single-argument parenthesized prints give the same output on Python 2 and 3.
for i,doc in enumerate(doc_term_matrix):
    print("============ doc %s =======================" % (i))
    for d in doc:
        print("%s : %s" % (dictionary[d[0]], d[1]))

encompasses searchsqlserver : 1
item network : 1
schedule container : 1
easier path : 1
process data : 1
quality governance : 1
u privacy : 1
google file : 1
term use : 2
baseline crucial : 1
commerce company : 1
access login : 1
content management : 1
analysis qualitative : 1
community multimedia : 1
really know : 1
developer community : 1
text based : 1
product service : 1
network news : 1
hypothesis data : 1
multitenant technology : 1
relationship predictive : 1
username comment : 1
l n : 1
true false : 1
email address : 4
submitting personal : 1
ad google : 1
techtarget close : 1
scan upgrade : 1
financial service : 1
improvement need : 1
g h : 1
member offer : 1
database national : 1
umbrella term : 1
premise cloud : 1
e commerce : 1
site map : 1
know sql : 2
community turn : 1
engineer data : 1
handbook big : 1
science analytics : 1
making data : 2
crowded open : 1
hybrid cloud : 1
changing better : 1
predictive analytics : 2
configuration management : 1
join conversation : 1
mad

In [16]:
# doc_term_matrix structure: one row per document, holding (term id, number of
# occurrences of that term in the document) pairs.
# Single-argument parenthesized prints give the same output on Python 2 and 3.
for i,doc in enumerate(doc_term_matrix):
    print("======== doc  %s =========" % i)
    # doc is doc_term_matrix[i]; show its first 10 pairs.
    print(doc[:10])

[(0, 1), (1, 1), (2, 1), (3, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1)]
[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2), (7, 1), (8, 1), (9, 2), (10, 1)]
[(5, 2), (9, 1), (15, 1), (19, 1), (21, 3), (24, 2), (26, 1), (28, 1), (33, 1), (34, 1)]
[(72, 1), (73, 1), (78, 1), (118, 1), (122, 2), (191, 1), (210, 1), (277, 1), (290, 1), (334, 1)]
[(9, 1), (20, 1), (22, 1), (38, 1), (45, 1), (48, 1), (53, 1), (67, 3), (69, 1), (73, 1)]
[(9, 1), (21, 1), (32, 1), (42, 1), (44, 1), (46, 1), (48, 1), (50, 1), (59, 1), (72, 1)]
[(5, 1), (21, 5), (23, 1), (25, 1), (35, 1), (38, 1), (41, 2), (46, 2), (48, 3), (49, 3)]
[(4, 2), (6, 2), (9, 1), (21, 1), (34, 5), (35, 2), (42, 1), (45, 1), (48, 2), (59, 9)]
[(7, 1), (9, 1), (11, 1), (24, 1), (25, 1), (35, 2), (48, 2), (49, 5), (52, 1), (53, 1)]
[(5, 1), (9, 1), (11, 1), (19, 1), (32, 1), (44, 1), (50, 1), (59, 4), (67, 2), (78, 1)]


### Parameters for the LSI and LDA models

In [18]:
# Settings shared by the LSI and LDA runs: number of topics and number of
# words shown per topic.
nbtopic, num_words = 3, 5

### Running Lsi Model

## Lsi Model

In [15]:
# Weight the document-term matrix with TF-IDF.
corpus_tfidf=gettfidf(doc_term_matrix)
#for i,doc in enumerate(corpus_tfidf): # both bow->tfidf and tfidf->lsi transformations are actually executed here, on the fly
#    print "======== doc ",i,"========="
#    print(doc)

# Train the LSI model and show its topics.
lsi=Lsi(corpus_tfidf,dictionary,nbtopic,num_words)
topics=lsi.print_topics(nbtopic,num_words=num_words)
print_topics(topics)
# Single-argument parenthesized prints give the same output on Python 2 and 3.
print("\n======= topic rate by doc =======================================================================================")
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
for i,doc in enumerate(corpus_lsi):
    print("===> doc  %s  :" % i)
    print(doc)

topic 0 : 0.556*"course specialization" + 0.271*"specialization university" + 0.186*"data analyst" + 0.175*"data science" + 0.150*"big data"
topic 1 : 0.460*"course specialization" + 0.224*"specialization university" + -0.212*"data analyst" + -0.188*"data management" + -0.176*"big data"
topic 2 : -0.516*"data analyst" + -0.263*"data science" + 0.239*"data management" + -0.152*"per year" + -0.142*"lean six"

===> doc  0  :
[(0, 0.37870001022863919), (1, -0.50952962771325239), (2, 0.17320417599987201)]
===> doc  1  :
[(0, 0.25668334428003647), (1, -0.314485443280628), (2, -0.16690200031485453)]
===> doc  2  :
[(0, 0.33874448109203698), (1, -0.43225888071261753), (2, 0.34335641641235887)]
===> doc  3  :
[(0, 0.24740980288080033), (1, -0.3290708903393792), (2, 0.20881288745153762)]
===> doc  4  :
[(0, 0.77238175272008602), (1, 0.5935850164137354), (2, 0.048065150209669213)]
===> doc  5  :
[(0, 0.76896096217514376), (1, 0.59937921099039959), (2, 0.038135921081810825)]
===> doc  6  :
[(0, 0.

# Running LDA Model

In [19]:
# Train the LDA model and show its topics.
ldamodel=LDA(doc_term_matrix,dictionary,nbtopic,num_words)
topics=ldamodel.print_topics(num_topics=nbtopic, num_words=num_words)
print_topics(topics)
# Single-argument parenthesized prints give the same output on Python 2 and 3.
print("\n======= topic rate by doc =======================================================================================")
# Show the topic distribution the model assigns to every document.
for i,doc in enumerate(doc_term_matrix):
    print("===> doc  %s  :" % i)
    print(ldamodel.get_document_topics(doc))

topic 0 : 0.079*"data analysis" + 0.030*"data analyst" + 0.022*"data science" + 0.017*"set data" + 0.014*"data visualization"
topic 1 : 0.078*"big data" + 0.051*"data analytics" + 0.024*"data science" + 0.022*"data management" + 0.011*"data quality"
topic 2 : 0.035*"data analytics" + 0.031*"big data" + 0.009*"add comment" + 0.009*"comment cancel" + 0.006*"data set"

===> doc  0  :
[(2, 0.99911167132010614)]
===> doc  1  :
[(2, 0.99924657796224892)]
===> doc  2  :
[(0, 0.99806666948615963)]
===> doc  3  :
[(0, 0.61611052418815104), (1, 0.37627537967213126)]
===> doc  4  :
[(1, 0.99745113356837822)]
===> doc  5  :
[(0, 0.16837178037643266), (1, 0.39825119768270872), (2, 0.43337702194085859)]
===> doc  6  :
[(1, 0.99780474967981325)]
===> doc  7  :
[(1, 0.99764577704167201)]
===> doc  8  :
[(0, 0.99712324708626443)]
===> doc  9  :
[(0, 0.25359116160518164), (1, 0.33111601669632384), (2, 0.41529282169849457)]
