In [1]:
import ast
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn import mixture
import itertools, operator
import scipy.stats

In [27]:
n_components = 50
n_top_words = 50
n_clusters = 10

In [3]:
whole_docs = []
whole_doc_labels = []
whole_doc_id = []
file = open("whole_dataset.txt", 'r')
for line in file:
    line = ast.literal_eval(line)
    whole_doc_id.append(line[0])
    whole_doc_labels.append(line[1])
    whole_docs.append(line[2])

In [4]:
docs = []
doc_topics = []
doc_id = []
file = open("sample_dataset.txt", 'r')
for line in file:
    line = ast.literal_eval(line)
    doc_id.append(line[0])
    doc_topics.append(line[1])
    docs.append(line[2])

In [5]:
topic_words_dict = {}

In [6]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        topic_words_dict[topic_idx] = message.split()[2:] 
        print(message)
    print()

In [7]:
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(whole_docs)
print("Done")

Extracting tf features for LDA...
Done


In [8]:
print("Fitting LDA models with tf features...")
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)
print("Done")

Fitting LDA models with tf features...
Done


In [9]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: bob jupiter jets com sea sir vice friday afternoon cleansing kg balanced persia wednesday milwaukee away pitchers blew retreat mph lords tek smiley carter darren smell coast tactic polish fever stay __________ extremist cassini bikers striking justifiable equate contingent wade precious anxiety crooks realization panicking manhattan challenges betweenthe qualifies doctrinal
Topic #1: attorney northern handguns ira antenna toxic tampa handheld asteroids comets slot cabin killings ireland cannon picks applause ambulance embassy greed luna province scott echl microphone affiliation courage bridges destiny zones telecom zuma vhf statute antennas predecessors du mydisplay charger botched nyx plutonium basil appletalk frederick bourque 1944 main_win conner noonan
Topic #2: batf captain concealed dreams rocks abiding citizens tire grenades decade deadly perry hacker recipient schism respected comic anecdotal nationalism bent society whatthey proportion louisvil

Topic #28: turkey policy militia minister served courts yugoslavia christmas confidential leaders briefed burnt leftover celebration truelove violating escrow hut veteran parallels scandal lawenforcement threatens enacted microcircuits judiciary chronicle subcommittee thinkers embraced theus sawed gic tlu casserole northwest concert philippines itcould coca consultations cola safeguarding hunters amendments longstanding economist palo mykotronx theattorney
Topic #29: 10 12 25 15 20 11 14 turkish 16 13 18 17 19 30 40 21 50 22 28 arab 000 lord 23 26 24 27 31 29 33 jehovah 35 jesus man 34 israel 38 60 32 42 37 power 36 43 75 39 45 41 51 1st men
Topic #30: ripem funding tragedy remark sh lemieux fprintf debt identity investigators bristol stderr 102nd democrats libxmu rabin reserves capitals authorization shield rocketry encrypting jointly crossroads twm parse ls bel socialists deployment openwinhome authentic openwin malpractice visual impacts instruct bash burzynski cud enjoyment sanders

In [10]:
topic_words_prob_dict = {}

In [11]:
lda.components_ /= lda.components_.sum(axis=1)[:, np.newaxis]
for i in range(n_components):
    topic_words_prob_dict[i] = sorted(lda.components_[i])[::-1][:n_top_words] 
    print("Topic ",i,": ", sorted(lda.components_[i])[::-1][:n_top_words])

Topic  0 :  [0.025322754054945974, 0.0201396031259974, 0.01691275053844443, 0.01680265388090994, 0.01653902612417313, 0.016152683468290862, 0.015432820930234537, 0.015018221677366207, 0.012948328232583529, 0.012121023172876302, 0.010022799715605737, 0.009687040101615169, 0.008482309337945715, 0.008428708811807998, 0.00821733140718083, 0.007191554027847411, 0.007165841224381235, 0.0069863209907023395, 0.0068350523020606355, 0.00682162935054214, 0.006494264978728542, 0.006028631569645656, 0.0058812968403416516, 0.005843015077513683, 0.005842250668063869, 0.005757082648369103, 0.00573399847611549, 0.005707428906594485, 0.005680881874259285, 0.00560900600042331, 0.005162375703110892, 0.004851100579834409, 0.0048220348137625445, 0.004817396193554933, 0.004703702678288774, 0.004669133312136042, 0.004583278528830006, 0.004299960985867705, 0.004265303274255722, 0.004246655507201367, 0.004062988763297621, 0.004046373592791727, 0.003755058560104933, 0.003665221266794836, 0.00365279471168147, 0.0

Topic  9 :  [0.03253992696099961, 0.023917540782147303, 0.021111852753479474, 0.020085798856223643, 0.01582045553711047, 0.012840470906219509, 0.01274517647616971, 0.01245124418099841, 0.011230871023586221, 0.010612117213670188, 0.00948863734641941, 0.00947615608019327, 0.007860765728292567, 0.007750125909569822, 0.007217833196260896, 0.00614010406605785, 0.0060034120977509415, 0.005855913122040918, 0.005702593330657882, 0.005625247539904206, 0.005357143171264391, 0.005274591215461146, 0.00483247637307162, 0.004795201961225108, 0.004789855420410042, 0.004707574208629016, 0.004663200907850315, 0.004650754064154823, 0.004602023402481568, 0.004544562902489608, 0.004420869244047223, 0.004265474571457251, 0.00419546806717695, 0.00419127092094093, 0.004157502934412823, 0.004087396660410768, 0.004072835941131068, 0.00407229392576208, 0.004040663759965339, 0.003923064841693489, 0.003800461049132662, 0.003655817921641471, 0.003574034133259635, 0.0035610469117114806, 0.0033998287941711504, 0.003

Topic  19 :  [0.02119903897535069, 0.014241262889212904, 0.01062447867468996, 0.010110782992493945, 0.010057874785145653, 0.009626231546084472, 0.009538762886031693, 0.007078433556528416, 0.00692759558034711, 0.006838317891990822, 0.006724101637895202, 0.006697833016605656, 0.006648037395559195, 0.006334545242739752, 0.006042364641365609, 0.005923662930036233, 0.005863609955834683, 0.0054225444091349825, 0.005386410126850154, 0.005377939816696128, 0.005292715525658993, 0.005289587389793565, 0.005199664970372475, 0.0051539475443999, 0.00498024960062105, 0.00488783848615296, 0.004873311964026071, 0.004830276849727176, 0.0047374516004259175, 0.0046325156358361695, 0.004473097992430455, 0.004244568655421472, 0.004214910191984655, 0.004161697722912445, 0.0040365974156107345, 0.004025942484340809, 0.004015771050610128, 0.0039034057506414205, 0.0038907911641322896, 0.0038836226439268235, 0.003854046169722655, 0.003822478429278112, 0.0035434660235020166, 0.003519610349449357, 0.003505752486676

Topic  29 :  [0.03380050759249456, 0.024649572457055628, 0.024552668416987583, 0.0229870400990776, 0.022916211746661367, 0.02113248973166927, 0.0191870066178175, 0.019086603680111863, 0.017120125591086142, 0.016212326593283317, 0.015484976145669791, 0.015191412701103616, 0.013951269615583744, 0.01387419453132334, 0.011854637128835194, 0.011794702487930154, 0.011668563179652498, 0.011468689330498107, 0.011033514360610923, 0.010556421557975787, 0.010265157649445522, 0.010255590075867368, 0.010156283984815336, 0.009867191100033004, 0.009487809106666798, 0.009106597922023034, 0.00811041736214126, 0.007318748627482964, 0.007043706995521849, 0.006794550128185029, 0.006750556358543371, 0.006523030972964235, 0.006313954615948663, 0.006273515498034808, 0.006045467382312093, 0.0057984231586302405, 0.00578481949500558, 0.005630402357225023, 0.0054487951573757975, 0.005186785420814078, 0.0051312769223808485, 0.005123393468033874, 0.004858176732406382, 0.004809237828283239, 0.004783341216691276, 0.

Topic  40 :  [0.024332348315138367, 0.02060029898795136, 0.017594786322980886, 0.01119225160408722, 0.010400395884015505, 0.010126849136380373, 0.009648941263058729, 0.009489187674922284, 0.008095406896939272, 0.007931510472594128, 0.007901036582720304, 0.007351319263516272, 0.007053486089026785, 0.006727109059632955, 0.00672415703640415, 0.0066545062417343115, 0.006317465615685715, 0.006089729611322164, 0.006039529864179709, 0.005890905350852963, 0.005844516418433757, 0.005612469024237412, 0.00538514233951704, 0.005118497382389124, 0.00502341914061764, 0.004721281261417807, 0.0046955769197405154, 0.004648533924648794, 0.004563178044524117, 0.00449547725704325, 0.004480689110638167, 0.004315187228349545, 0.003947159811543549, 0.0037471339441679827, 0.0036676871471839237, 0.0036338741445653268, 0.003616527099676236, 0.0036153988095026833, 0.0035956319599936776, 0.0035848684964591864, 0.003541264803708447, 0.0033702874119823337, 0.003363567445362385, 0.003301060555459436, 0.0032411845983

In [12]:
import nltk
import enchant
d = enchant.Dict("en_US")
def predict_doc_topic(doc, topic_words_dict, topic_words_prob_dict):
    text = nltk.tokenize.word_tokenize(doc)
#     for word in text:
#         if not d.check(word):
#             if len(word) < 4:
#                 text.remove(word)
    temp_dist = nltk.FreqDist(text)
    word_dist = {}
    for word in temp_dist:
        word_dist[word] = temp_dist[word]
#     for word in temp_dist:
#         if word_dist[word] > 10:
#             word_dist.pop(word)
    doc_topics_prob = {}
    for topic in topic_words_dict:
        score = 0
        word_list = topic_words_dict[topic]
        prob_list = topic_words_prob_dict[topic]
        for index in range(len(word_list)):
            if word_list[index] in word_dist:
                score += word_dist[word_list[index]] * prob_list[index]
        doc_topics_prob[topic] = score
    return doc_topics_prob

In [None]:
docs[27]

In [13]:
new_vectors = []
for doc in docs:
    doc_topic_probs = predict_doc_topic(doc, topic_words_dict, topic_words_prob_dict)
    vector = []
    for i in range(n_components):
        vector.append(doc_topic_probs[i])
    new_vectors.append(vector)

In [14]:
new_vectors = np.array(new_vectors)

In [15]:
new_vectors.shape

(4491, 50)

In [16]:
clf = mixture.GaussianMixture(n_components=10, covariance_type='full')
print("Creating Mixture object")
clf.fit(new_vectors)
print("Done fitting data")

Creating Mixture object
Done fitting data


In [17]:
pred = clf.predict_proba(new_vectors)

In [18]:
pred.shape

(4491, 10)

In [19]:
def group_by_label(l):
    it = itertools.groupby(l, operator.itemgetter(1))
    counts = []
    for key, subiter in it:
        counts.append(sum(item[0] for item in subiter))
    return counts

In [20]:
def compute_homogeneity(preds, labels):
    cluster_label_counts = []
    for pred in preds.transpose():
        cluster_label_counts.append(group_by_label([(p,label) for p,label in zip(pred,labels)]))
    
    entropys = []
    for cluster_label_count in cluster_label_counts:
        entropys.append(scipy.stats.entropy(cluster_label_count))
         
    return np.mean(entropys)

In [21]:
def compute_completeness(preds, labels, num_clusters, num_labels):
    label_cluster_counts = {label:np.zeros(num_clusters) for label in range(num_labels)}
    
    for pred, label in zip(preds, labels):
        label_cluster_counts[label] = np.sum([label_cluster_counts[label], pred], axis=0)
    
    entropys = []
    for label_cluster_count in label_cluster_counts.values():
        entropys.append(scipy.stats.entropy(label_cluster_count))
          
    return np.mean(entropys)

In [22]:
def v_measure(preds, labels, num_clusters, num_labels):
    if len(labels) == 0:
        return 1.0, 1.0, 1.0
      
    homogeneity = compute_homogeneity(preds, labels)
    completeness = compute_completeness(preds, labels, num_clusters, num_labels)
    
    if homogeneity==0.0 and completeness==0.0:
        return 0.0, 0.0, 0.0
    v_measure_score = (2.0 * homogeneity * completeness /
                   (homogeneity + completeness))
      
    return homogeneity, completeness, v_measure_score

In [23]:
topic_label_mapping = {}
label = 0
for topic in set(doc_topics):
    topic_label_mapping[topic] = label
    label += 1

In [24]:
labels = []
for topic in doc_topics:
    labels.append(topic_label_mapping[topic])

In [25]:
v_measure(pred, labels, 10, 20)

(0.6474939380403169, 0.4510979418031044, 0.531741018915238)

In [28]:
soft_cluster_size = [0 for i in range(n_clusters)]
for p in pred:
    for i in range(len(p)):
        soft_cluster_size[i] += soft_cluster_size[i] + p[i]

  after removing the cwd from sys.path.


In [29]:
max_index = -1
max_size = -1234214141241431414
for cluster_size in soft_cluster_size:
    if cluster_size > max_size:
        max_size = cluster_size
        max_index = soft_cluster_size.index(cluster_size)

In [30]:
cluster_labels = clf.predict(new_vectors)

In [31]:
doc_indexs_in_cluster_0 = []
for index in range(len(cluster_labels)):
    if cluster_labels[index] == 0:
        doc_indexs_in_cluster_0.append(index)

In [32]:
doc_indexs_in_cluster_0

[0,
 1,
 2,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 17,
 18,
 19,
 20,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 48,
 50,
 51,
 52,
 53,
 55,
 56,
 57,
 58,
 59,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 87,
 88,
 89,
 90,
 91,
 92,
 94,
 95,
 96,
 97,
 99,
 101,
 102,
 103,
 106,
 107,
 108,
 109,
 110,
 112,
 113,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 192,
 193,
 194,
 195,
 196,
 197,
 198,
 199,
 200,

In [33]:
topics_in_cluster = []
for index in doc_indexs_in_cluster_0:
    topics_probs = predict_doc_topic(docs[index], topic_words_dict, topic_words_prob_dict)
    topics_in_cluster.append(max(topics_probs.items(), key=operator.itemgetter(1))[0])

In [34]:
topic_counter = {}
for i in range(n_components):
    topic_counter[i] = 0
for topic in topics_in_cluster:
    topic_counter[topic] += 1 

In [35]:
topic_counter

{0: 234,
 1: 12,
 2: 14,
 3: 3,
 4: 2,
 5: 32,
 6: 317,
 7: 8,
 8: 2,
 9: 19,
 10: 34,
 11: 24,
 12: 3,
 13: 21,
 14: 4,
 15: 3,
 16: 5,
 17: 12,
 18: 23,
 19: 3,
 20: 3,
 21: 11,
 22: 41,
 23: 3,
 24: 33,
 25: 10,
 26: 40,
 27: 751,
 28: 3,
 29: 219,
 30: 2,
 31: 4,
 32: 74,
 33: 97,
 34: 1006,
 35: 1,
 36: 6,
 37: 47,
 38: 6,
 39: 3,
 40: 78,
 41: 10,
 42: 1,
 43: 0,
 44: 4,
 45: 84,
 46: 2,
 47: 2,
 48: 22,
 49: 45}

In [36]:
soft_cluster_size = [0 for i in range(n_clusters)]
for p in pred:
    for i in range(len(p)):
        soft_cluster_size[i] += soft_cluster_size[i] + p[i]

  after removing the cwd from sys.path.


In [37]:
np.argmax(soft_cluster_size)

0