In [1]:
import re
import random
from string import punctuation

import pandas as pd
import numpy as np
from nltk import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [2]:
# DE: keywords
# ID: extended keywords
# TI: title
# AB: abstract

In [3]:
print('load data..')
whole_data = None
for i in range(1, 7):
    df = pd.read_csv('data/{}.txt'.format(str(i)), 
                     delimiter='\t',  
                     usecols=['DE', 'ID', 'TI', 'AB'], 
                     encoding='utf8',
                     index_col=False,
                     dtype=np.str)
    df = df[df['AB'].notnull() & df['TI'].notnull()]  # filter null abstract
    df = df.fillna('')
    if whole_data is None:
        whole_data = df
    else:
        whole_data = pd.concat([whole_data, df], ignore_index=True)
whole_data.info()
whole_data.head()

load data..
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2596 entries, 0 to 2595
Data columns (total 4 columns):
TI    2596 non-null object
DE    2596 non-null object
ID    2596 non-null object
AB    2596 non-null object
dtypes: object(4)
memory usage: 40.6+ KB


Unnamed: 0,TI,DE,ID,AB
0,Identifying Emerging Trends and Temporal Patte...,Self-driving car; Clustering; Term burst detec...,SCIENCE; VEHICLES; DYNAMICS,Self-driving is an emerging technology which h...
1,Data Analysis of Tourists' Online Reviews on R...,Online reviews; Text mining; Latent Dirichlet ...,PERCEPTIONS; CULTURE; QUALITY,The proliferation of online consumer reviews h...
2,Development strategies for heavy duty electric...,Heavy duty vehicle; Electric vehicle; Battery;...,EARTH-ELEMENTS DEMAND; FUEL-CELL TECHNOLOGY; P...,This paper investigates the development of hea...
3,Topic-based rank search with verifiable social...,Topic-based rank search; Verifiable social dat...,ENABLING EFFICIENT; QUERY,As the explosive development of social network...
4,Identifying topic relevant hashtags in Twitter...,Text mining; Topic modeling; Latent Dirichlet ...,,Hashtags have become a crucial social media to...


In [4]:
random.choices(whole_data['AB'], k=5)

["In traditional health recommending system the recommendations are not personalized according to the patient's, the recommendation extremely depends on physical, emotional and psychological matters of the user list is generated based on the diseases the patient navigates. In health era, personalized health recommendation helps us to extract personalized health content form overloading information's available on the web. the patient's, the initial step is to identify the patient interest in which health related issues they needs recommendation. In this paper we have used statistical topic modeling technique Hierarchical Latent Dirichlet Allocation (HLDA) to identify the user interest which provides robust and interpretable topic representation. After identifying the user interest, neighborhood selection is done based on ranking and finally recommendation is done according to user preference. In this model we have learned six parameters, in parameter(1) the topical distribution of each 

In [5]:
wnl = WordNetLemmatizer()
url = '(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'
def preprocess(abstract):
    '''
    preprocess abstract: lower, remove punctuations, tokenize
    :param abstract: str
    :return: list
    '''
    abstract = re.sub(url, ' ', abstract)
    abstract = re.sub('\d+?', ' ', abstract)
    for p in punctuation:
        abstract = re.sub(re.escape(p), ' ', abstract)
    abstract = abstract.lower()
    abstract = [wnl.lemmatize(w) for w in word_tokenize(abstract)]
    filtered = [w for w in abstract if w not in stopwords.words('english')]
    return filtered

In [6]:
def do_lda(input_text, feature_method='tf', topic_num=5, vocabulary=None, method="batch"):
    '''
    do lda process
    :param input_text: list, [str,], preprocessed text
    :return: tuple of np.array, terms, doc-topic probability, topic-word probability, perplexity
    '''
    vector = None
    if feature_method == 'tf':
        vector = CountVectorizer(ngram_range=(1, 1), vocabulary=vocabulary, stop_words='english')
        vector.build_analyzer()
    if feature_method == 'idf':
        vector = TfidfVectorizer(ngram_range=(2, 2), vocabulary=vocabulary, stop_words='english')
        vector.build_analyzer()
    x = vector.fit_transform(input_text)
    lda = LatentDirichletAllocation(n_components=topic_num, learning_method=method, max_iter=20, random_state=0,
                                    batch_size=128, topic_word_prior=0.5 / topic_num)
    lda_topics = lda.fit_transform(x)
    return np.array(vector.get_feature_names()), lda_topics, lda.components_, lda.perplexity(x)

In [7]:
def print_topics(topic_word, terms, num=20):
    '''
    print topics
    :param topic_word: np.array, topic-word probability
    :param terms: np.array, feature names
    :param num: int, term num of topic to print
    :return: None
    '''
    for idx, t in enumerate(topic_word):
        sort_idx = np.argsort(t)
        print("#", idx + 1, "-" * 20)
        print(terms[sort_idx[-1:-num - 1:-1]])

In [8]:
input_text = [' '.join(preprocess(a)) for a in whole_data['AB']]
vocab = None
topic_param = 20
terms, doc_topic, topic_word, perplexity = do_lda(input_text, 'tf', topic_param, vocab)
print_topics(topic_word, terms)

# 1 --------------------
['review' 'analysis' 'online' 'product' 'research' 'sentiment' 'data'
 'study' 'opinion' 'customer' 'text' 'mining' 'aspect' 'consumer'
 'business' 'student' 'public' 'topic' 'result' 'service']
# 2 --------------------
['model' 'topic' 'latent' 'method' 'document' 'dirichlet' 'approach'
 'using' 'based' 'paper' 'allocation' 'lda' 'proposed' 'distribution'
 'word' 'result' 'sentence' 'summarization' 'algorithm' 'task']
# 3 --------------------
['software' 'topic' 'code' 'developer' 'source' 'application' 'lda'
 'latent' 'result' 'project' 'approach' 'dirichlet' 'allocation' 'based'
 'analysis' 'research' 'model' 'method' 'paper' 'using']
# 4 --------------------
['message' 'drug' 'learning' 'spam' 'latent' 'method' 'approach' 'wa'
 'group' 'lda' 'allocation' 'dirichlet' 'study' 'sm' 'ha' 'data'
 'information' 'language' 'comment' 'used']
# 5 --------------------
['topic' 'question' 'information' 'concept' 'model' 'latent' 'method'
 'behavior' 'allocation' 'diri

In [13]:
whole_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2596 entries, 0 to 2595
Data columns (total 4 columns):
TI    2596 non-null object
DE    2596 non-null object
ID    2596 non-null object
AB    2596 non-null object
dtypes: object(4)
memory usage: 40.6+ KB


In [16]:
def get_keywords(raw_list):
    tmp_set = set()
    for r in raw_list:
        for keyword in r.split(";"):
            processed = " ".join([wnl.lemmatize(w) for w in word_tokenize(keyword.lower())])
            tmp_set.add(processed)
    return sorted(list(tmp_set))

In [17]:
keywords = get_keywords(whole_data["DE"])
random.choices(keywords, k=10)

['social audience',
 'application essay',
 'customer',
 'theme',
 'topic analysis',
 'latent dirchlet allocation',
 'active learning',
 'behavioral segmentation',
 'siti',
 'hadoop/mapreduce']

In [22]:
import time
start_time = time.time()
input_text = [' '.join(preprocess(a)) for a in whole_data['AB']]
vocab = ["latent dirchlet allocation"]
topic_param = 20
terms, doc_topic, topic_word, perplexity = do_lda(input_text, 'tf', topic_param, vocab)
print_topics(topic_word, terms)
print("used:", time.time()-start_time)

  perword_bound = bound / word_cnt


# 1 --------------------
['latent dirchlet allocation']
# 2 --------------------
['latent dirchlet allocation']
# 3 --------------------
['latent dirchlet allocation']
# 4 --------------------
['latent dirchlet allocation']
# 5 --------------------
['latent dirchlet allocation']
# 6 --------------------
['latent dirchlet allocation']
# 7 --------------------
['latent dirchlet allocation']
# 8 --------------------
['latent dirchlet allocation']
# 9 --------------------
['latent dirchlet allocation']
# 10 --------------------
['latent dirchlet allocation']
# 11 --------------------
['latent dirchlet allocation']
# 12 --------------------
['latent dirchlet allocation']
# 13 --------------------
['latent dirchlet allocation']
# 14 --------------------
['latent dirchlet allocation']
# 15 --------------------
['latent dirchlet allocation']
# 16 --------------------
['latent dirchlet allocation']
# 17 --------------------
['latent dirchlet allocation']
# 18 --------------------
['latent dirch

  perword_bound = bound / word_cnt
