# Topic modeling

### Get data from Google search

In [2]:
from googlesearch.googlesearch import GoogleSearch
import re
import unicodedata
def googleSearch(query,num_results):
    """Run a Google search and collect the text of the English result pages.

    Parameters
    ----------
    query : search string handed to ``GoogleSearch().search``.
    num_results : number of results requested from the search.

    Returns
    -------
    (doc_complete, urls, titles) : three parallel lists holding, for every
        result that was kept, its whitespace-normalised text, its URL and
        its title.

    A result is skipped when ``getText()`` returns ``None``, when
    ``langdetect`` does not report ``'en'``, or when handling it raises an
    ``Exception``; in the last case a message is printed and the loop moves
    on to the next result.
    """
    response = GoogleSearch().search(query, num_results = num_results)
    doc_complete = []
    urls=[]
    titles=[]
    for result in response.results:
        try:
            # Ask for the text once and reuse it.
            # NOTE(review): getText() presumably downloads the page; confirm.
            text = result.getText()
            if text is not None and langdetect(text) == 'en':
                # Read every field before appending anything, so a failure
                # cannot leave the three lists with different lengths.
                url = result.url
                title = result.title
                doc_complete.append(re.sub(r"\s+", " ", text))
                urls.append(url)
                titles.append(title)
        except Exception:
            # Best effort: report the page and continue. Catching Exception
            # (not a bare except) lets KeyboardInterrupt stop the cell.
            print("failed to fetch text for page " + result.url)
    return doc_complete,urls,titles

In [3]:
from langdetect import detect
def langdetect(txt):
    """Return the value that ``detect`` reports for ``txt``.

    Thin wrapper around ``detect`` imported from the ``langdetect`` package.
    """
    language = detect(txt)
    return language
#langdetect("hello, my name is ")

### Cleaning and Preprocessing

In [4]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import string
from itertools import islice
# Tokenizer that keeps every run of word characters (regex \w+).
tokenizer = RegexpTokenizer(r'\w+')

# Shared normaliser instances: WordNet lemmatizer and Porter stemmer.
lemma = WordNetLemmatizer()
p_stemmer = PorterStemmer()

# Lookup sets used for filtering tokens: NLTK's English stop-word list and
# the individual characters of string.punctuation.
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
def clean(doc):
    """Lower-case, tokenize, filter and lemmatize one document string.

    Parameters
    ----------
    doc : the document text.

    Returns
    -------
    list of tokens: ``doc`` is lower-cased and split with the module-level
    ``tokenizer``; tokens found in ``stop`` or in ``exclude`` are dropped;
    each remaining token is passed through ``lemma.lemmatize``.

    Only lemmatization is applied. No stemming is performed because the
    returned tokens are the lemmatized ones.
    """
    # lower-case and tokenize the document string
    tokens = tokenizer.tokenize(doc.lower())

    # drop stop words and tokens that are a single punctuation character
    kept = [tok for tok in tokens if tok not in stop and tok not in exclude]

    # lemmatize what is left
    return [lemma.lemmatize(tok) for tok in kept]

def ngrams(tokens, n):
    """Build word n-grams from a list of tokens.

    Parameters
    ----------
    tokens : list of token strings.
    n : size of the n-grams; must be >= 1.

    Returns
    -------
    For ``n == 1`` the ``tokens`` list itself (same object, not a copy).
    Otherwise a list of strings, each made of ``n`` consecutive words joined
    by a single space; the list is empty when fewer than ``n`` words exist.

    Raises
    ------
    ValueError : if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer, got %r" % (n,))
    if n == 1:
        return tokens

    # Rebuild the text from the tokens: a token is glued to the previous one
    # (no space) when it starts with an apostrophe or when it is found in
    # string.punctuation. string.punctuation is a str, so this is a
    # substring test, not a per-character test.
    raw = "".join(
        [" " + i if not i.startswith("'") and i not in string.punctuation else i for i in tokens]
    ).strip()
    words = raw.split()

    # range() instead of xrange() so the function runs on Python 2 and 3.
    return [' '.join(words[start:start + n]) for start in range(len(words) - n + 1)]


### Model functions

In [5]:
import gensim
from gensim import corpora, models, similarities
from six import iteritems
 

def gettfidf(doc_term_matrix):
    """Fit a ``models.TfidfModel`` on ``doc_term_matrix`` and apply it.

    Returns the result of indexing the fitted model with the same corpus,
    i.e. ``tfidf[doc_term_matrix]``.
    """
    tfidf_model = models.TfidfModel(doc_term_matrix)
    return tfidf_model[doc_term_matrix]

def Lsi(corpus_tfidf,dictionary,nbtopic,num_words):
    """Create and return a ``models.LsiModel`` with ``nbtopic`` topics.

    ``corpus_tfidf`` is the training corpus and ``dictionary`` is passed as
    ``id2word``. ``num_words`` is accepted but not used by this function.
    """
    return models.LsiModel(corpus_tfidf, num_topics=nbtopic, id2word=dictionary)

def LDA(doc_term_matrix,dictionary,nbtopic,num_words,passes=50,random_state=None):
    """Train and return a gensim ``LdaModel`` on the document-term matrix.

    Parameters
    ----------
    doc_term_matrix : training corpus handed to ``models.LdaModel``.
    dictionary : passed as ``id2word``.
    nbtopic : number of topics (``num_topics``).
    num_words : accepted but not used by this function.
    passes : number of training passes over the corpus (default 50).
    random_state : forwarded to ``LdaModel``. The default ``None`` leaves
        seeding to gensim; pass a fixed value to get the same topics on
        every run.

    Returns
    -------
    The trained ``LdaModel``.
    """
    return models.LdaModel(
        doc_term_matrix,
        num_topics=nbtopic,
        id2word=dictionary,
        passes=passes,
        random_state=random_state,
    )

def print_topics(topics):
    """Print one line per topic in the form ``topic <position> : <description>``.

    ``topics`` is an iterable of pairs. The second element of each pair is
    printed, and the number shown is the position in ``topics`` (starting at
    0), not the first element of the pair.
    """
    for position, topic in enumerate(topics):
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print("topic %s : %s" % (position, topic[1]))
        
def topics_toArray(topics):
    """Return the topic descriptions as a list, one entry per topic.

    ``topics`` is an iterable of pairs; the second element of each pair is
    collected, in order. Each topic is also printed as
    ``topic <position> : <description>`` so that cells relying on the
    printed output keep working.
    """
    descriptions = []
    for position, topic in enumerate(topics):
        print("topic %s : %s" % (position, topic[1]))
        descriptions.append(topic[1])
    return descriptions

# Mains

### Search info from net

In [9]:
# Fetch the pages returned by a Google search for the query.
num_results_search=10
query = "data analytic"
doc_complete,urls,titles=googleSearch(query,num_results_search)

# Preview: the first 50 characters of every retrieved document.
for i,d in enumerate(doc_complete):
    print("============================== doc %s ======================================================" % i)
    print(d[:50])

 What is data analytics (DA)? - Definition from Wh
 Data analysis - Wikipedia Data analysis From Wiki
 Big data analytics: What it is and why it matters
 What is Data Analytics? - Definition from Techope
 Data Analysis Courses | Coursera Toggle navigatio
 Data Analytics Courses | Coursera Toggle navigati
 Data Science vs. Big Data vs. Data Analytics: Com
 Data Analytics Lab, ETH Zürich home people resear
 How to Become a Data Analyst | Data Analyst Salar
 What is Data Analytics: Definition | Informatica 


### Cleaning + split in ngrams

In [10]:
# Clean every document, then split it into n-grams of size n_gram.
n_gram=2
doc_clean = [ngrams(clean(doc),n_gram) for doc in doc_complete]

# Preview the first 50 n-grams of each document. enumerate() binds i here,
# so each header shows that document's own index.
for i,d in enumerate(doc_clean):
    print("============================== doc %s ======================================================" % i)
    print(d[:50])

[u'data analytics', u'analytics da', u'da definition', u'definition whatis', u'whatis com', u'com searchdatamanagement', u'searchdatamanagement search', u'search techtarget', u'techtarget network', u'network sign', u'sign start', u'start free', u'free unlimited', u'unlimited access', u'access login', u'login register', u'register techtarget', u'techtarget network', u'network news', u'news feature', u'feature tip', u'tip content', u'content answer', u'answer buyer', u'buyer guide', u'guide essential', u'essential guide', u'guide opinion', u'opinion photo', u'photo story', u'story podcasts', u'podcasts quiz', u'quiz tutorial', u'tutorial sponsored', u'sponsored community', u'community multimedia', u'multimedia searchdatamanagement', u'searchdatamanagement topic', u'topic data', u'data warehouse', u'warehouse integration', u'integration resource', u'resource quality', u'quality governance', u'governance dbms', u'dbms bi', u'bi ecm', u'ecm mdm', u'mdm vertical', u'vertical topic']
[u'data 

### Preparing Dictionary and Document-Term Matrix

In [11]:
#Preparing Dictionary and Document-Term Matrix
from six import iteritems
import gensim 
from gensim import corpora
#dir="/Users/kimtaing/Documents/github/WebMining/topics/python/"
# Build the term dictionary of the corpus: every unique term gets an integer id.
dictionary = corpora.Dictionary(doc_clean)

# dictionary.dfs holds, per token id, the number of documents containing the
# token. Collect the ids of terms found in at most one document and drop them.
once_ids = []
for tokenid, docfreq in iteritems(dictionary.dfs):
    if docfreq <= 1:
        once_ids.append(tokenid)
dictionary.filter_tokens(once_ids)
#dictionary.save(dir+'mydict.dict')  # store the dictionary, for future reference

# Print the remaining (id, term) pairs as a check.
for k, v in dictionary.iteritems():
    print("id %s => %s" % (k, v))

id 0 => term use
id 1 => analysis identify
id 2 => process data
id 3 => quality governance
id 4 => u privacy
id 5 => information data
id 6 => analysis qualitative
id 7 => text based
id 8 => model using
id 9 => data data
id 10 => product service
id 11 => solution solution
id 12 => hypothesis data
id 13 => natural language
id 14 => l n
id 15 => true false
id 16 => email address
id 17 => science data
id 18 => cover latest
id 19 => closedcatalogbrowsesearchfor enterpriselog
id 20 => financial service
id 21 => service intelligent
id 22 => g h
id 23 => brazil canada
id 24 => e book
id 25 => data big
id 26 => per year
id 27 => umbrella term
id 28 => identify new
id 29 => database management
id 30 => analytics data
id 31 => engineer data
id 32 => company corporate
id 186 => making data
id 34 => hybrid cloud
id 35 => predictive analytics
id 36 => certified base
id 37 => skill pwc
id 38 => design data
id 39 => mining data
id 182 => use big
id 41 => transportation utility
id 42 => kingdom united


In [12]:
# Turn every cleaned document into a bag-of-words vector (list of
# (term id, count) pairs) using the dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

# Show each document as "term : count" lines.
for i,doc in enumerate(doc_term_matrix):
    print("============ doc %s =======================" % (i))
    for d in doc:
        term_id, count = d[0], d[1]
        print("%s : %s" % (dictionary[term_id], count))

term use : 2
analysis identify : 1
process data : 1
quality governance : 1
u privacy : 1
information data : 1
analysis qualitative : 1
text based : 1
model using : 1
product service : 1
hypothesis data : 1
l n : 1
true false : 1
email address : 4
financial service : 1
g h : 1
umbrella term : 1
database management : 1
engineer data : 1
company corporate : 1
real time : 3
hybrid cloud : 2
predictive analytics : 2
analytics applies : 1
statistical technique : 1
data query : 1
need know : 1
data analyzed : 1
analyzing data : 2
self service : 1
right reserved : 1
confirmatory data : 1
platform support : 1
customer data : 2
analytics process : 5
data cleansing : 1
analytics use : 1
open source : 2
j k : 1
data management : 1
play role : 1
job responsibility : 1
data processing : 1
programming data : 1
project management : 1
data source : 1
machine learning : 4
b c : 1
card company : 1
analysis data : 1
raw data : 1
draw conclusion : 1
trend pattern : 1
service provider : 2
understand data : 

In [13]:
# doc_term_matrix layout: one row per document; each row is a list of
# (term id, number of occurrences of that term in the document) pairs.
# Show the first 10 pairs of every row.
for i,doc in enumerate(doc_term_matrix):
    print("======== doc  %s =========" % i)
    print(doc[:10])

[(0, 2), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (10, 1)]
[(0, 1), (1, 1), (2, 2), (6, 1), (9, 3), (12, 1), (15, 1), (16, 1), (35, 2), (39, 1)]
[(0, 1), (7, 1), (10, 1), (11, 1), (13, 1), (18, 1), (23, 1), (25, 1), (28, 1), (32, 1)]
[(0, 1), (9, 1), (14, 1), (21, 1), (22, 1), (24, 1), (25, 1), (30, 1), (33, 1), (38, 1)]
[(19, 1), (37, 1), (38, 1), (39, 1), (46, 1), (53, 4), (66, 1), (70, 1), (71, 1), (80, 1)]
[(17, 2), (19, 1), (37, 1), (46, 1), (53, 2), (66, 1), (70, 1), (71, 1), (80, 1), (84, 1)]
[(2, 1), (5, 2), (9, 5), (17, 1), (18, 1), (20, 2), (24, 2), (25, 3), (26, 3), (30, 1)]
[(8, 1), (13, 3), (44, 3), (54, 1), (107, 6), (151, 2), (174, 5), (183, 1), (239, 1), (246, 1)]
[(0, 1), (4, 1), (17, 2), (25, 2), (26, 5), (27, 1), (28, 1), (29, 1), (30, 1), (31, 2)]
[(0, 1), (3, 2), (9, 1), (11, 3), (16, 5), (17, 2), (21, 1), (23, 1), (25, 2), (30, 9)]


### Parameters for the LSI and LDA models

In [14]:
# Parameters shared by the LSI and LDA cells:
#   nbtopic   - number of topics to extract
#   num_words - number of words shown per topic
nbtopic, num_words = 3, 5

### Running Lsi Model

## Lsi Model

In [15]:
# Weight the bag-of-words corpus with TF-IDF.
corpus_tfidf=gettfidf(doc_term_matrix)

# Fit the LSI model on the TF-IDF corpus and print the top words per topic.
lsi=Lsi(corpus_tfidf,dictionary,nbtopic,num_words)
topics=lsi.print_topics(nbtopic,num_words=num_words)
print_topics(topics)

# Project every document into the LSI topic space (bow -> tfidf -> lsi)
# and print its topic weights.
print("\n======= topic rate by doc =======================================================================================")
corpus_lsi = lsi[corpus_tfidf]
for i,doc in enumerate(corpus_lsi):
    print("===> doc  %s  :" % i)
    print(doc)

topic 0 : 0.556*"course specialization" + 0.271*"specialization university" + 0.186*"data analyst" + 0.175*"data science" + 0.150*"big data"
topic 1 : 0.460*"course specialization" + 0.224*"specialization university" + -0.212*"data analyst" + -0.188*"data management" + -0.176*"big data"
topic 2 : -0.516*"data analyst" + -0.263*"data science" + 0.239*"data management" + -0.152*"per year" + -0.142*"lean six"

===> doc  0  :
[(0, 0.37870001022863919), (1, -0.50952962771325239), (2, 0.17320417599987201)]
===> doc  1  :
[(0, 0.25668334428003647), (1, -0.314485443280628), (2, -0.16690200031485453)]
===> doc  2  :
[(0, 0.33874448109203698), (1, -0.43225888071261753), (2, 0.34335641641235887)]
===> doc  3  :
[(0, 0.24740980288080033), (1, -0.3290708903393792), (2, 0.20881288745153762)]
===> doc  4  :
[(0, 0.77238175272008602), (1, 0.5935850164137354), (2, 0.048065150209669213)]
===> doc  5  :
[(0, 0.76896096217514376), (1, 0.59937921099039959), (2, 0.038135921081810825)]
===> doc  6  :
[(0, 0.

### Running LDA Model

In [16]:
# Train the LDA model on the raw bag-of-words corpus and print the top words per topic.
ldamodel=LDA(doc_term_matrix,dictionary,nbtopic,num_words)
topics=ldamodel.print_topics(num_topics=nbtopic, num_words=num_words)
print_topics(topics)

# Print the topic distribution that the model assigns to each document.
print("\n======= topic rate by doc =======================================================================================")
for i,doc in enumerate(doc_term_matrix):
    print("===> doc  %s  :" % i)
    print(ldamodel.get_document_topics(doc))

topic 0 : 0.072*"data analysis" + 0.049*"big data" + 0.044*"data science" + 0.034*"data analyst" + 0.025*"data analytics"
topic 1 : 0.055*"big data" + 0.044*"course specialization" + 0.036*"data analytics" + 0.022*"specialization university" + 0.015*"data science"
topic 2 : 0.068*"data analytics" + 0.036*"big data" + 0.027*"data management" + 0.017*"data quality" + 0.012*"data integration"

===> doc  0  :
[(2, 0.99582140411250664)]
===> doc  1  :
[(0, 0.99791962869082096)]
===> doc  2  :
[(1, 0.99685617481487432)]
===> doc  3  :
[(2, 0.99401149781747522)]
===> doc  4  :
[(1, 0.99586433857931522)]
===> doc  5  :
[(1, 0.99396173206896998)]
===> doc  6  :
[(0, 0.99756740859961923)]
===> doc  7  :
[(0, 0.97709138026869258), (1, 0.011125139320809189), (2, 0.011783480410498291)]
===> doc  8  :
[(0, 0.99714019787729524)]
===> doc  9  :
[(2, 0.99752816113350695)]
