In [1]:
# Load the 20 Newsgroups *train* subset with headers, footers and quotes removed,
# plus the NLTK English stop-word list.
from sklearn.datasets import fetch_20newsgroups
# Imported under an alias so that binding the word list to `stopwords` below does
# not shadow the corpus reader; the assignment can then be re-run on its own.
from nltk.corpus import stopwords as nltk_stopwords

stopwords = nltk_stopwords.words('english')

# The variable was originally called `newsgroups_test` although subset='train' is
# what gets loaded; the old name is kept as an alias for backward compatibility.
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
newsgroups_test = newsgroups_train

# Raw document strings used by the cells that follow.
docs = newsgroups_train.data

In [2]:
#load library
from CTM import *
from tm_eval import *

In [3]:
# Instantiate CTM (default constructor arguments); the `ctm` object provides the
# cleaning, vectorization and topic-modeling methods called in the cells below.
ctm = CTM()

In [4]:
# Pre-processing: pass every raw document through ctm.clean_text, keeping order.
doc_clean = list(map(ctm.clean_text, docs))

In [5]:
# Vocabulary extraction, optimization and vectorization of the cleaned documents.
# The call returns four values, unpacked here as the vocabulary, the word-vector
# model and two lookup tables (presumably word->index and index->word -- confirm in CTM).
# NOTE(review): the meaning of optimized=0 and njobs=1 is defined inside CTM and is
# not visible from this notebook.
vocab,w2vmodel, wtoi, itow = ctm.getvocab_and_vectorized(doc_clean, optimized=0, njobs=1)

In [7]:
# Sanity-check the obtained vectors: list the words most similar to "apple".
# With tpn=10 the displayed result is a list of ten (word, score) pairs.
ctm.similar_words("apple", w2vmodel,wtoi,tpn=10)

[('mac', 0.5695577236138913),
 ('macweek', 0.5548453866991787),
 ('hardware', 0.4888073132688122),
 ('powerpc', 0.4664941923445318),
 ('ibm', 0.4599373278135579),
 ('nubus', 0.4492192341190451),
 ('built', 0.44733814880641254),
 ('cpu', 0.43918939804266843),
 ('os', 0.43303595339317),
 ('macuser', 0.4167949704931615)]

In [8]:
# Run the k-means based topic modeling with n=20; the call returns the topics and
# the centroids.
# NOTE(review): initA="random" suggests a random initialisation and no seed is set
# in this notebook, so the topics may differ between runs -- confirm whether
# kMeans_tm accepts a seed.

topics, centroids = ctm.kMeans_tm(vocab=vocab, wtoi=wtoi, wvmodel=w2vmodel, n=20, 
                                  initA="random", max_iter=1000, topM=10)


In [9]:
# Print the first ten words of each entry in `topics`, one line per topic.
for i, topic in enumerate(topics):
    print("topic", i, ':', topic[:10])

topic 0 : ['facility', 'empire', 'seized', 'expected', 'letter', 'gradually', 'accommodate', 'spain', 'carrier', 'month']
topic 1 : ['bios', 'reichel', 'amour', 'formatting', 'sutter', 'explosive', 'galley', 'ottoman', 'beranek', 'physicist']
topic 2 : ['quicktime', 'dod', 'doom', 'braking', 'ab', 'fluid', 'dot', 'deposited', 'additive', 'argument']
topic 3 : ['american', 'former', 'seven', 'forward', 'oil', 'russian', 'working', 'sanction', 'comment', 'vat']
topic 4 : ['people', 'going', 'know', 'investigation', 'became', 'basement', 'armored', 'something', 'come', 'head']
topic 5 : ['arg', 'converter', 'setting', 'string', 'default', 'osf', 'resource', 'widget', 'src', 'callback']
topic 6 : ['commercial', 'revenue', 'telecommunication', 'mariner', 'probe', 'contract', 'surface', 'venture', 'voyager', 'venus']
topic 7 : ['award', 'date', 'special', 'avoid', 'attempt', 'supplemental', 'prompt', 'open', 'variable', 'buf']
topic 8 : ['ontario', 'protect', 'flexible', 'device', 'specially

In [10]:
# Run the word network clustering (WNC) based topic modeling; the call returns the
# network G and the topics.
# NOTE(review): this rebinds `topics`, replacing the k-means result assigned in the
# earlier cell; every later cell that reads `topics` sees the WNC topics.

G, topics = ctm.build_network_and_clustering(vocab, w2vmodel, wtoi, resolution = 0.87, nodeOpti=50, ranking=1)

In [11]:
# Print the first ten words of each entry in `topics`, one line per topic.
for i, topic in enumerate(topics):
    print("topic", i, ':', topic[:10])

topic 0 : ['would', 'way', 'well', 'year', 'without', 'work', 'world', 'want', 'written', 'yet']
topic 1 : ['use', 'via', 'using', 'unix', 'various', 'version', 'system', 'us', 'window', 'software']
topic 2 : ['wayne', 'york', 'trophy', 'team', 'tue', 'tampa', 'wale', 'winnipeg', 'yale', 'standing']
topic 3 : ['volume', 'vehicle', 'space', 'venture', 'usaf', 'surveillance', 'study', 'reported', 'spain', 'venus']
topic 4 : ['time', 'two', 'went', 'woman', 'whole', 'told', 'took', 'thing', 'see', 'three']
topic 5 : ['war', 'zealand', 'south', 'virtually', 'university', 'ship', 'wiped', 'southern', 'secret', 'village']
topic 6 : ['user', 'usenet', 'site', 'widespread', 'service', 'vary', 'related', 'may', 'privacy', 'telephony']
topic 7 : ['used', 'usually', 'wire', 'wiring', 'wider', 'wall', 'trip', 'together', 'versus', 'sometimes']
topic 8 : ['united', 'state', 'watson', 'txt', 'waiting', 'transcript', 'theatre', 'senate', 'rkba', 'union']
topic 9 : ['working', 'think', 'specific', 'to

In [103]:
# Prepare the evaluation corpus: one token list per non-empty sentence of the raw
# documents, a gensim Dictionary over those tokens, and the bag-of-words corpus.
from gensim.corpora import Dictionary

# NOTE(review): sent_tokenize / word_tokenize are not imported explicitly in this
# notebook; presumably they arrive through the star imports from CTM / tm_eval --
# confirm.
tokens_text = []
for doc in docs:
    for sent in sent_tokenize(doc):
        words = word_tokenize(sent)
        if words:  # skip sentences that tokenize to nothing
            tokens_text.append(words)

dictionary = Dictionary(tokens_text)
# The dictionary was just built from tokens_text, so every token already has an id.
# allow_update=True is therefore not needed, and it would add each sentence to the
# dictionary's document statistics (num_docs, dfs, ...) a second time.
bow_corpus = [dictionary.doc2bow(sent) for sent in tokens_text]

In [119]:
# Evaluation: score the current `topics` with the three coherence helpers for each
# top-word count in `tpns`, collecting one list of scores per measure.
# np.arange(10, 15, 5) contains only the value 10, so the loop runs once.
umass, uci, cv = [], [], []
tpns = np.arange(10, 15, 5)
for tpn in tpns:
    tc1 = Umass_coherence(topics, tpn, bow_corpus, dictionary)
    tc2 = Uci_coherence(topics, tpn, tokens_text, dictionary)
    tc3 = cv_coherence(topics, tpn, tokens_text, dictionary)

    umass += [tc1]
    uci += [tc2]
    cv += [tc3]


In [120]:
# Gather the per-measure score lists into one mapping keyed by measure name.
eval_tc = dict(umass=umass, uci=uci, cv=cv)

In [121]:
# Display the collected coherence scores.
eval_tc

{'umass': [-13.89305816234572],
 'uci': [-7.565277945544544],
 'cv': [0.3326210811349603]}