## Import Required Packages

In [31]:
import time
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups_vectorized, fetch_20newsgroups
import requests

In [2]:
dataset = fetch_20newsgroups(remove=('headers', 'footers', 'quotes'))

In [3]:
len(dataset.data)

11314

## Import Data set (from IEEE)

In [87]:
ieee_data = pd.read_csv("Data/IEEE-Computer-Science-2013.csv")
ieee_data

Unnamed: 0,Title,Abstract,Keywords
0,An Overview of Recent Progress in the Study of...,This paper reviews some main results and progr...,"Delay effects,\nNetwork topology,\nHeuristic a..."
1,Speech recognition with deep recurrent neural ...,Recurrent neural networks (RNNs) are a powerfu...,"Speech recognition,\nRecurrent neural networks..."
2,A survey of energy-efficient wireless communic...,Reducing energy consumption in wireless commun...,"Energy consumption,\nQuality of service,\nWire..."
3,A Survey on Human Activity Recognition using W...,Providing accurate and opportune information o...,"Feature extraction,\nAccelerometers,\nPervasiv..."
4,Enhanced Computer Vision With Microsoft Kinect...,With the invention of the low-cost Microsoft K...,"Computer vision,\nData integration,\nSensors,\..."
5,Deformable Medical Image Registration: A Survey,Deformable image registration is a fundamental...,"Deformable models,\nMathematical model,\nBiome..."
6,Decentralized Charging Control of Large Popula...,This paper develops a strategy to coordinate t...,"Nash equilibrium,\nGames,\nElectricity,\nTraje..."
7,Nonlocally Centralized Sparse Representation f...,Sparse representation models code an image pat...,"Dictionaries,\nEncoding,\nImage restoration,\n..."
8,Mobile Data Offloading: How Much Can WiFi Deli...,This paper presents a quantitative study on th...,"IEEE 802.11 Standards,\nMobile communication,\..."
9,Efficiency Resource Allocation for Device-to-D...,Peer-to-peer communication has been recently c...,"Resource management,\nInterference,\nReceivers..."


## Data Preprocessing

### Combine all text data into one column

In [88]:
ieee_data["Combined_text"] = ieee_data["Title"] + ieee_data["Abstract"] + ieee_data["Keywords"]
raw = ieee_data["Combined_text"]
raw = raw.dropna()

### Tokenization and stemming

In [79]:
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import re
import string
#Tokenize the text
def tokenize(text):

    #Create Stemmer
    stemmer = WordNetLemmatizer()

    #Remove irrelevant character
    text = re.sub(r"[^a-zA-Z]", ' ', text)

    #Tokenization
    tokens = word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]

    #Stemming
    stems = stem_tokens(tokens, stemmer)
    return stems

#Stemming Function
def stem_tokens(t,s):
    stemmed=[]
    for item in t:
        stemmed.append(s.lemmatize(item))
    return stemmed

### Tfidf Vectorizer and Count Vectorizer

In [89]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=tokenize)
tf_vectorizer = CountVectorizer(stop_words='english', tokenizer=tokenize, max_df=500, max_features=1000)

### Convert text to tfidf and tf format

In [90]:
ieee_tfidf = tfidf_vectorizer.fit_transform(raw)
print ieee_tfidf.shape

(1449, 12561)


In [91]:
ieee_tf = tf_vectorizer.fit_transform(raw)
print ieee_tf.shape
ieee_tf.data

(1449, 1000)


array([1, 1, 1, ..., 2, 1, 1])

## Model Training

### Fit to NMF model (Frobenius norm)

In [21]:
nmf = NMF(n_components=10, alpha = 0.1, l1_ratio=0.5).fit(ieee_tfidf)

### check NMF model result

In [82]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % (topic_idx+1)
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    

In [24]:
print_top_words(nmf, tfidf_vectorizer.get_feature_names(), 10)

Topic #0: algorithm data predict network use comput train applic process model
Topic #1: web servic design interfac defect solut ws modular d evalu
Topic #2: kidney diseas chronic predict tree analyt classifi decis logist vector
Topic #3: degrad ber fec pre rout failur detect reduc affect optic
Topic #4: industri initi medic variou research healthcar past analyt data healthcarein
Topic #5: follow therapi endocrin adjuv patient care medic appoint relat record
Topic #6: nfr recal precis fr function requir dataset secur supervis classifi
Topic #7: wave paramet relationship determin approach ml buoy convers power method
Topic #8: signal behavior mental process versu physiolog bodi make clinic represent
Topic #9: estim kernel semisupervis increment prior neighborhood se bandwidth data label


### Fit NMF model (Kullback-Leibler divergence)

In [25]:
nmf = NMF(n_components=10, beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,l1_ratio=.5).fit(ieee_tfidf)

In [26]:
print_top_words(nmf, tfidf_vectorizer.get_feature_names(), 10)

Topic #0: data model support network base perform pattern use algorithm process
Topic #1: power level design comput requir softwar shown thi highli empir
Topic #2: thi support techniqu vector use remain paper classifi interfac s
Topic #3: propos set mobil occur result protect match thi analysi help
Topic #4: thi potenti use open health imag follow defect healthcar fine
Topic #5: random classifi experi mani rate secur linear comparison term non
Topic #6: novel method just issu propos paper handl svm mcdbn rate
Topic #7: increasingli storag svm train continu use studi core regress comput
Topic #8: analyt decis parallel art data paper sever rule perform dataset
Topic #9: possibl statu studi predictor signific s hip posit thi robot


### Fit LDA Model

In [33]:
class TopicTree:
    def __init__(self, level=0, index=1, children=[], keywords=[]):
        self.children = children
        self.keywords = keywords
        self.level = level
        self.index = index
        self.dup_index = 0
        self.association_index = 0
        self.base_url = 'https://api.wordassociations.net/associations/v1.0/json/search?apikey=85618aa1-21c2-4382-9dc6-373c5b1424b8&lang=en&limit=100'
        self.n_top_words = 20
    
    def get_number_of_children(self):
        return len(self.children)

    def get_children(self):
        return children
    
    def get_topic_content(self):
        return self.topic_content

    def compute_dup_index(self):
        if not self.children:
            return
        children_keywords = []
        for child in self.children:
            children_keywords += child.keywords
            child.compute_dup_index()
        original_len = len(children_keywords)
        children_keywords = list(set(children_keywords))
        num_of_duplicate = original_len - len(children_keywords)
        self.dup_index = float(num_of_duplicate) / original_len
        
    def get_whole_tree_dup_index(self):
        if self.dup_index == 0:
            self.compute_dup_index()
        if not self.children:
            return self.dup_index
        children_index = 0
        for child in self.children:
            children_index += child.get_whole_tree_dup_index()
        return (self.dup_index + children_index) / float(1 + len(self.children))
    
    def compute_association_index(self):
        base_url = self.base_url
        word_sum = 0
        if len(self.keywords) <= 10:
            for word in self.keywords:
                base_url = base_url + '&text=' + word
            response = requests.get(base_url).json()['response']
        elif len(self.keywords) <= 20:
            for word in self.keywords[:10]:
                base_url = base_url + '&text=' + word
            response = requests.get(base_url).json()['response']
            base_url = self.base_url
            for word in self.keywords[10:]:
                base_url = base_url + '&text=' + word
            response = response + requests.get(base_url).json()['response']
        else:
            for word in self.keywords[:10]:
                base_url = base_url + '&text=' + word
            response = requests.get(base_url).json()['response']
            base_url = self.base_url
            for word in self.keywords[10:20]:
                base_url = base_url + '&text=' + word
            response = response + requests.get(base_url).json()['response']
            base_url = self.base_url
            for word in self.keywords[20:]:
                base_url = base_url + '&text=' + word
            response = response + requests.get(base_url).json()['response']
        for word_json in response:
            items = word_json['items']
            for item in items:
                if item['item'].lower() in self.keywords:
                    word_sum += float(item['weight']) / 100.0
        self.association_index = float(word_sum) / self.n_top_words / (self.n_top_words - 1)
    
    def get_whole_tree_association_index(self):
        if self.association_index == 0:
            self.compute_association_index()
        if not self.children:
            return self.association_index
        children_index = 0
        for child in self.children:
            children_index += float(child.get_whole_tree_association_index())
        # children_index = children_index / len(self.children)
        return (self.association_index + children_index) / float(1 + len(self.children))

In [11]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % (topic_idx+1)
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
        
def get_top_words(level, model, feature_names, n_top_words):
    children = []
    for topic_idx, topic in enumerate(model.components_):
        topic_words = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        children.append(TopicTree(level=level, index=topic_idx+1, keywords=topic_words))
    return children

def build_root(tree,data, feature_names):
    print "\nlevel 0"
    message = "#Topic 1: "
    centroids = np.mean(data,axis=0)
    topic_words = [feature_names[i]
                         for i in centroids.argsort()[:20]]
    tree.keywords = topic_words
    message += " ".join(topic_words)   
    print message

In [12]:
def get_MPC(doc_topic_weight_mat):
    num_doc = doc_topic_weight_mat.shape[0]
    num_topic = doc_topic_weight_mat.shape[1]
    pc = np.sum(np.square(doc_topic_weight_mat)) / num_doc
    if num_topic == 1:
        mpc = 0.50
    else:
        mpc = 1 - float(num_topic) / (num_topic - 1) * (1 - pc)
    return mpc

In [92]:
max_topic = 5
max_level = 2
topic_tree = TopicTree(level=1, index=1)
build_root(topic_tree, ieee_tf.toarray(), tf_vectorizer.get_feature_names())


level 0
#Topic 1: caused observer quantitative supply stream svm cause successfully attention kinect biological metal sub validation sensitive sparsity implement artifact meet score


In [73]:
def Hierarchical_lda(tree, max_topic, max_level, doc_word_matrix, level, index, dictionary):
    if level > max_level:
        return
    
    target_mpc = 0
    for topic_index in range(2,max_topic+1):
        model = LatentDirichletAllocation(n_components=topic_index, max_iter = 20, learning_method="batch", learning_offset=10)
        model.fit(doc_word_matrix)
        mpc = get_MPC(model.transform(doc_word_matrix))
        mpc = model.perplexity(doc_word_matrix)*(1-mpc)
        if target_mpc == 0:
            target_mpc = mpc
            target_num_topic = topic_index
            target_model = model
        elif mpc < target_mpc:
            target_mpc = mpc
            target_num_topic = topic_index
            target_model = model
    print "Level %d topic %d" %(level, index)
    print "Detect %d topics" %target_num_topic
    tree.children = get_top_words(level=level, n_top_words=20, model=target_model, feature_names=dictionary)
    print_top_words(target_model, dictionary, n_top_words=20)
    doc_topic_matrix = target_model.transform(doc_word_matrix)
    
    for topic_index in range(target_num_topic):
        weight_vector = doc_topic_matrix.T[topic_index]
        weight_vector = weight_vector.reshape(len(weight_vector),)
        new_doc_word_matrix = doc_word_matrix[:]
        for i in range(len(weight_vector)):
            new_doc_word_matrix[i] = (new_doc_word_matrix[i]*weight_vector[i]*target_num_topic*10).astype(int)
        Hierarchical_lda(tree.children[topic_index], max_topic, max_level, new_doc_word_matrix, level+1, topic_index+1, dictionary)
        

In [93]:
import time
start = time.time()

Hierarchical_lda(tree=topic_tree, max_topic = max_topic, max_level = max_level, doc_word_matrix=ieee_tf, level=1, index=1, dictionary=tf_vectorizer.get_feature_names())

print time.time() - start

Level 1 topic 1
Detect 3 topics
Topic #1: network algorithm user channel problem wireless sensor communication performance mobile data scheme time energy rate cloud node optimal service protocol
Topic #2: power control frequency design high voltage model current energy circuit architecture performance time device grid optical antenna measurement low simulation
Topic #3: image algorithm data model feature approach analysis time problem detection information learning used set signal accuracy different vector technique noise
Level 2 topic 1
Detect 4 topics
Topic #1: cloud data service computing network mobile algorithm delay time video user application traffic vehicle resource performance routing scheduling framework problem
Topic #2: channel rate user interference problem signal coding noise algorithm code performance error optimal bound network receiver cognitive n scheme mimo
Topic #3: network user security model data social privacy sensor scheme attack wireless communication ha applic

In [94]:
topic_tree.get_whole_tree_dup_index()

0.065

In [95]:
topic_tree.get_whole_tree_association_index()

0.04203596491228071

#### Get Topic Docuemnt Association

In [18]:
topic_document_association_16 = lda_16.transform(ieee_tf)
print topic_document_association_16.shape
print topic_document_association_16

(1693, 10)
[[  7.52007740e-04   7.52009568e-04   7.52075078e-04 ...,   7.51909706e-04
    3.26960539e-01   1.92206110e-01]
 [  5.74800645e-04   5.74770696e-04   5.74858043e-04 ...,   5.74778461e-04
    9.94826557e-01   5.74824124e-04]
 [  5.43534364e-04   5.43618385e-04   5.43632548e-04 ...,   5.43550054e-04
    5.31083328e-01   5.43600511e-04]
 ..., 
 [  6.45430633e-02   9.34692717e-04   1.09913827e-01 ...,   9.34642882e-04
    9.34793652e-04   2.28162224e-01]
 [  1.14960557e-03   1.14969817e-03   8.00667687e-01 ...,   1.14947725e-03
    1.14963424e-03   1.14970015e-03]
 [  7.69432053e-04   6.81390887e-01   7.69386957e-04 ...,   7.69327685e-04
    7.69428287e-04   1.91882154e-01]]


### Try Nested Topic Modeling on LDA

#### Fit LDA Model 8 Topics

In [44]:
lda_8 = LatentDirichletAllocation(n_components=8, max_iter = 100, learning_method="batch", learning_offset=10)
lda_8.fit(lda_16.components_)
print_top_words(lda_8, tf_vectorizer.get_feature_names(), 8)

Topic #0: design servic hardwar techniqu defect web approach complex
Topic #1: hemorrhag detect imag counselor flow distort visual gpu
Topic #2: use differ servic evalu cost health provid cloud
Topic #3: zero gaussian semant cepstral clip mel learningaudio secondli
Topic #4: model use featur measur mesh d distribut method
Topic #5: zero gaussian semant cepstral clip mel learningaudio secondli
Topic #6: data algorithm use predict method base model network
Topic #7: predict follow patient failur endocrin adjuv therapi cloud


In [45]:
lda_8.perplexity(lda_16.components_)

1500.4501327232101

#### Fit LDA Model 4 Topics

In [46]:
lda_4 = LatentDirichletAllocation(n_components=4, max_iter = 100, learning_method="batch", learning_offset=10)
lda_4.fit(lda_8.components_)
print_top_words(lda_4, tf_vectorizer.get_feature_names(), 4)

Topic #0: cloud differ servic provid
Topic #1: design servic hemorrhag hardwar
Topic #2: addit way usag shown
Topic #3: data algorithm use model


In [47]:
lda_4.perplexity(lda_8.components_)

1309.2208992631054

#### Fit LDA Model 1 Topic

In [49]:
lda_1 = LatentDirichletAllocation(n_components=1, max_iter = 100, learning_method="batch", learning_offset=10)
lda_1.fit(lda_4.components_)
print_top_words(lda_1, tf_vectorizer.get_feature_names(), 1)

Topic #0: data


In [50]:
lda_1.perplexity(lda_4.components_)

1406.2830331290431