## Import Required Packages

In [1]:
import time
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation

## Import Data set (from IEEE)

In [2]:
ieee_data = pd.read_csv("Data/ieee_data.csv")
ieee_data

Unnamed: 0,Title,Abstract,Keywords
0,Data Mining and Analytics in the Process Indus...,Data mining and analytics have played an impor...,"Data mining,\nIndustries,\nData models,\nMachi..."
1,Automatically Classifying Functional and Non-f...,"In this paper, we take up the second RE17 data...","Training,\nUsability,\nSecurity,\nFeature extr..."
2,MACORD: Online Adaptive Machine Learning Frame...,Future high-performance computing (HPC) system...,"Training,\nAlgorithm design and analysis,\nHeu..."
3,TGE: Machine Learning Based Task Graph Embeddi...,Task mapping is an important problem in parall...,"Topology,\nBenchmark testing,\nNetwork topolog..."
4,Amazon?s Echo Look: Harnessing the Power of Ma...,"Here we are in 2017, but, at times, it feels a...","Algorithm design and analysis,\nConsumer elect..."
5,The Trust Value Calculating for Social Network...,"In this paper, a social network model is built...","Training,\nSocial network services,\nLogistics..."
6,Japanese Fingerspelling Recognition Based on C...,Sign language is a very important communicatio...,"Gesture recognition,\nAssistive technology,\nS..."
7,A fully configurable and scalable neural copro...,This paper presents a fully configurable and p...,"Coprocessors,\nComputer architecture,\nIP netw..."
8,Category Classification of Text Data with Mach...,The beginner counselors have more likely to co...,"Employee welfare,\nSupport vector machines,\nD..."
9,Understanding the feasibility of machine learn...,The key enabling technology in dynamic spectru...,"Games,\nSwitches,\nMachine learning algorithms..."


## Data Preprocessing

### Combine all text data into one column

In [3]:
ieee_data["Combined_text"] = ieee_data["Title"] + ieee_data["Abstract"] + ieee_data["Keywords"]
ieee_data["Combined_text"]

0     Data Mining and Analytics in the Process Indus...
1     Automatically Classifying Functional and Non-f...
2     MACORD: Online Adaptive Machine Learning Frame...
3     TGE: Machine Learning Based Task Graph Embeddi...
4     Amazon?s Echo Look: Harnessing the Power of Ma...
5     The Trust Value Calculating for Social Network...
6     Japanese Fingerspelling Recognition Based on C...
7     A fully configurable and scalable neural copro...
8     Category Classification of Text Data with Mach...
9     Understanding the feasibility of machine learn...
10    Machine learning approach for optimal determin...
11    Audio Classification Method Based on Machine L...
12    Matheuristic with machine-learning-based predi...
13    Machine Learning Techniques for Analyzing Trai...
14    A Comparison of Distributed Machine Learning P...
15    Machine-Learning Based Threat-Aware System in ...
16    Machine-Learning Classifiers for Security in C...
17    Machine learning based prediction of therm

### Tokenization and stemming

In [4]:
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
import re
import string
#Tokenize the text
def tokenize(text):

    #Create Stemmer
    stemmer = PorterStemmer()

    #Remove irrelevant character
    text = re.sub(r"[^a-zA-Z]", ' ', text)

    #Tokenization
    tokens = word_tokenize(text)
    tokens = [i for i in tokens if i not in string.punctuation]

    #Stemming
    stems = stem_tokens(tokens, stemmer)
    return stems

#Stemming Function
def stem_tokens(t,s):
    stemmed=[]
    for item in t:
        stemmed.append(s.stem(item))
    return stemmed

### Tfidf Vectorizer and Count Vectorizer

In [5]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, stop_words='english', tokenizer=tokenize)
tf_vectorizer = CountVectorizer(max_df=0.95, stop_words='english', tokenizer=tokenize)

### Convert text to tfidf and tf format

In [6]:
ieee_tfidf = tfidf_vectorizer.fit_transform(ieee_data['Combined_text'])
print ieee_tfidf.shape

(53, 1552)


In [7]:
ieee_tf = tf_vectorizer.fit_transform(ieee_data['Combined_text'])
print ieee_tf.shape
ieee_tf.data

(53, 1552)


array([1, 3, 1, ..., 1, 1, 1])

## Model Training

### Fit to NMF model (Frobenius norm)

In [21]:
nmf = NMF(n_components=10, alpha = 0.1, l1_ratio=0.5).fit(ieee_tfidf)

### check NMF model result

In [22]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    

In [23]:
print tfidf_vectorizer.get_feature_names()

[u'abbrevi', u'abil', u'abl', u'abov', u'absolut', u'abstract', u'acceler', u'accept', u'access', u'accessth', u'accid', u'accompani', u'accomplish', u'accord', u'account', u'accru', u'accumul', u'accur', u'accuraci', u'achiev', u'act', u'action', u'activ', u'actual', u'ad', u'adapt', u'add', u'addit', u'address', u'adjuv', u'admet', u'administr', u'adopt', u'adult', u'advanc', u'advantag', u'advers', u'affect', u'aforement', u'age', u'aggreg', u'aim', u'air', u'alarm', u'alexnet', u'algorithm', u'alloc', u'allow', u'altern', u'amazon', u'ambigu', u'amen', u'analog', u'analysi', u'analyt', u'analyticsmachin', u'analyz', u'ani', u'anim', u'ann', u'anneal', u'annual', u'anomali', u'anorexia', u'anticip', u'anxieti', u'anybodi', u'api', u'appar', u'appear', u'appendix', u'appli', u'applic', u'applicationsthi', u'appoint', u'approach', u'approachdespit', u'approachesmetabolom', u'approachrobust', u'appropri', u'approxim', u'arbitrari', u'architectur', u'area', u'aris', u'array', u'art', u'

In [24]:
print_top_words(nmf, tfidf_vectorizer.get_feature_names(), 10)

Topic #0: algorithm data predict network use comput train applic process model
Topic #1: web servic design interfac defect solut ws modular d evalu
Topic #2: kidney diseas chronic predict tree analyt classifi decis logist vector
Topic #3: degrad ber fec pre rout failur detect reduc affect optic
Topic #4: industri initi medic variou research healthcar past analyt data healthcarein
Topic #5: follow therapi endocrin adjuv patient care medic appoint relat record
Topic #6: nfr recal precis fr function requir dataset secur supervis classifi
Topic #7: wave paramet relationship determin approach ml buoy convers power method
Topic #8: signal behavior mental process versu physiolog bodi make clinic represent
Topic #9: estim kernel semisupervis increment prior neighborhood se bandwidth data label


### Fit NMF model (Kullback-Leibler divergence)

In [25]:
nmf = NMF(n_components=10, beta_loss='kullback-leibler', solver='mu', max_iter=1000, alpha=.1,l1_ratio=.5).fit(ieee_tfidf)

In [26]:
print_top_words(nmf, tfidf_vectorizer.get_feature_names(), 10)

Topic #0: data model support network base perform pattern use algorithm process
Topic #1: power level design comput requir softwar shown thi highli empir
Topic #2: thi support techniqu vector use remain paper classifi interfac s
Topic #3: propos set mobil occur result protect match thi analysi help
Topic #4: thi potenti use open health imag follow defect healthcar fine
Topic #5: random classifi experi mani rate secur linear comparison term non
Topic #6: novel method just issu propos paper handl svm mcdbn rate
Topic #7: increasingli storag svm train continu use studi core regress comput
Topic #8: analyt decis parallel art data paper sever rule perform dataset
Topic #9: possibl statu studi predictor signific s hip posit thi robot


### Fit LDA Model (16 Topics)

In [36]:
lda_16 = LatentDirichletAllocation(n_components=16, max_iter = 100, learning_method="batch", learning_offset=10)
lda_16.fit(ieee_tf)
print_top_words(lda_16, tf_vectorizer.get_feature_names(), 10)

Topic #0: predict follow patient failur endocrin therapi adjuv cloud data signal
Topic #1: use predict method model fingerspel network node recognit valu trust
Topic #2: model use featur measur method data mesh base network distribut
Topic #3: health use forest random organ osteosarcoma rumor zika studi world
Topic #4: algorithm elm data propos use predict train framework result base
Topic #5: audio classif data method propos featur base classifi paper ha
Topic #6: design servic techniqu hardwar approach defect web complex dure autom
Topic #7: data model estim prior method network work accuraci avail analyt
Topic #8: use servic differ cost network web cloud neural comput secur
Topic #9: data kernel estim vector support semisupervis increment svm propos base
Topic #10: wave network data paramet approach base model sdn relationship control
Topic #11: use data effect pattern gpu comput distort model applic parallel
Topic #12: data use stage task algorithm applic train process svm vector
T

#### Perplexity for LDA Model

In [37]:
lda_16.perplexity(ieee_tf)

1015.0373438840458

#### Get Topic Docuemnt Association

In [38]:
topic_document_association_16 = lda_16.transform(ieee_tf)
print topic_document_association_16.shape
print topic_document_association_16

(53, 16)
[[  7.62195561e-04   7.62195527e-04   7.62195454e-04   7.62195256e-04
    7.62195492e-04   7.62195290e-04   7.62195303e-04   9.88567069e-01
    7.62195405e-04   7.62195311e-04   7.62195418e-04   7.62195669e-04
    7.62195468e-04   7.62195252e-04   7.62195381e-04   7.62195408e-04]
 [  5.16529120e-04   5.16529171e-04   9.92252062e-01   5.16529206e-04
    5.16529186e-04   5.16529154e-04   5.16529148e-04   5.16529216e-04
    5.16529244e-04   5.16529184e-04   5.16529093e-04   5.16529099e-04
    5.16529256e-04   5.16529150e-04   5.16529134e-04   5.16529156e-04]
 [  4.52898837e-04   4.52898767e-04   4.52898818e-04   4.52898682e-04
    9.93206518e-01   4.52898651e-04   4.52898999e-04   4.52898734e-04
    4.52898720e-04   4.52898639e-04   4.52898760e-04   4.52898762e-04
    4.52898985e-04   4.52898703e-04   4.52898813e-04   4.52898777e-04]
 [  5.16529182e-04   5.16529241e-04   5.16529294e-04   5.16528976e-04
    5.16529242e-04   5.16529117e-04   5.16529087e-04   5.16529166e-04
    5.16

### Try Nested Topic Modeling on LDA

#### Fit LDA Model 8 Topics

In [44]:
lda_8 = LatentDirichletAllocation(n_components=8, max_iter = 100, learning_method="batch", learning_offset=10)
lda_8.fit(lda_16.components_)
print_top_words(lda_8, tf_vectorizer.get_feature_names(), 8)

Topic #0: design servic hardwar techniqu defect web approach complex
Topic #1: hemorrhag detect imag counselor flow distort visual gpu
Topic #2: use differ servic evalu cost health provid cloud
Topic #3: zero gaussian semant cepstral clip mel learningaudio secondli
Topic #4: model use featur measur mesh d distribut method
Topic #5: zero gaussian semant cepstral clip mel learningaudio secondli
Topic #6: data algorithm use predict method base model network
Topic #7: predict follow patient failur endocrin adjuv therapi cloud


In [45]:
lda_8.perplexity(lda_16.components_)

1500.4501327232101

#### Fit LDA Model 4 Topics

In [46]:
lda_4 = LatentDirichletAllocation(n_components=4, max_iter = 100, learning_method="batch", learning_offset=10)
lda_4.fit(lda_8.components_)
print_top_words(lda_4, tf_vectorizer.get_feature_names(), 4)

Topic #0: cloud differ servic provid
Topic #1: design servic hemorrhag hardwar
Topic #2: addit way usag shown
Topic #3: data algorithm use model


In [47]:
lda_4.perplexity(lda_8.components_)

1309.2208992631054

#### Fit LDA Model 1 Topic

In [49]:
lda_1 = LatentDirichletAllocation(n_components=1, max_iter = 100, learning_method="batch", learning_offset=10)
lda_1.fit(lda_4.components_)
print_top_words(lda_1, tf_vectorizer.get_feature_names(), 1)

Topic #0: data


In [50]:
lda_1.perplexity(lda_4.components_)

1406.2830331290431