In [1]:
import ast
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn import mixture
import itertools, operator
import scipy.stats

In [31]:
n_components = 10
n_top_words = 50
n_clusters = 10

In [3]:
whole_docs = []
whole_doc_labels = []
whole_doc_id = []
file = open("whole_dataset.txt", 'r')
for line in file:
    line = ast.literal_eval(line)
    whole_doc_id.append(line[0])
    whole_doc_labels.append(line[1])
    whole_docs.append(line[2])

In [4]:
docs = []
doc_topics = []
doc_id = []
file = open("sample_dataset.txt", 'r')
for line in file:
    line = ast.literal_eval(line)
    doc_id.append(line[0])
    doc_topics.append(line[1])
    docs.append(line[2])

In [5]:
topic_words_dict = {}

In [6]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        topic_words_dict[topic_idx] = message.split()[2:] 
        print(message)
    print()

In [7]:
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(whole_docs)
print("Done")

Extracting tf features for LDA...
Done


In [8]:
print("Fitting LDA models with tf features...")
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)
print("Done")

Fitting LDA models with tf features...
Done


In [9]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: 10 25 12 15 20 11 00 14 000 16 13 30 17 18 19 team 50 40 arab 28 22 new 26 21 23 24 27 season 31 russia vs jehovah play 29 35 33 60 league power 34 38 1st 32 80 det 42 series 37 45 75
Topic #1: people don just think like know time said did say good way right does make believe ve going government years really want point year new things law ll didn fact come question let day mr life long state president use better sure children used thing got doesn true case course
Topic #2: myers cancer adl pts gm bullock lebanon dee w7 cx chronic clinical 17 pitching sky c_ liver corpses uw ck nutrition hz t7 cubs chz ss defeated b6 lk sp breast embargo defensive gant defamation w1 24 pms providence magnesium pitchers leaf hitter calcium bc ron smokeless al alomar a7
Topic #3: edu space com university 1993 nasa mail list information available ftp april cs pub send ca research server sun san army 1992 national news anonymous 93 gov contact mit pit address center subject i

In [10]:
topic_words_prob_dict = {}

In [11]:
lda.components_ /= lda.components_.sum(axis=1)[:, np.newaxis]
for i in range(n_components):
    topic_words_prob_dict[i] = sorted(lda.components_[i])[::-1][:n_top_words] 
    print("Topic ",i,": ", sorted(lda.components_[i])[::-1][:n_top_words])

Topic  0 :  [0.01759491971046672, 0.01239994974952365, 0.011806640988006557, 0.011583959365739515, 0.011437053197477983, 0.009939797385691245, 0.009776647072210085, 0.008902063714394181, 0.0086912192712741, 0.008519816139045252, 0.0077730891261401855, 0.007713252322532368, 0.007433130664784713, 0.007073472182527312, 0.006536247391303389, 0.0064022001357547645, 0.006295022963667286, 0.0059347417254066205, 0.0056630506800525995, 0.005504146842298651, 0.005467368788261871, 0.0054462600424007015, 0.0053613379331516175, 0.005315948081883595, 0.004780863109826516, 0.004639703480735854, 0.004595891629704095, 0.004154248494753469, 0.004041491589903295, 0.0037998996997051226, 0.0037099413467352664, 0.0036107124082551826, 0.00360747992271911, 0.0035860753364310707, 0.0034738252979943872, 0.0034340857105170097, 0.003268632956535425, 0.003210099694204126, 0.0030524408306140637, 0.0030456444282097625, 0.002912344884513777, 0.002888929677520292, 0.0028316779229265445, 0.002791236220568835, 0.0027492

Topic  8 :  [0.036350925693910664, 0.018372743238581016, 0.016467145286651382, 0.014015326785038291, 0.013916554056279666, 0.013723905699069429, 0.01203855252142939, 0.011254615152027558, 0.010663310945355062, 0.01047462746408932, 0.00939567020506815, 0.00909319518131555, 0.00854076540116057, 0.007899162103010442, 0.007151413655867338, 0.007043315863715094, 0.00610130259681285, 0.005958506856854531, 0.005827274989926142, 0.005602447287507066, 0.005348430430120089, 0.004840755090846378, 0.004550326119156347, 0.004230322737468553, 0.004196040276320416, 0.004087142053309006, 0.004039578469829165, 0.003991824895136781, 0.003891748405973123, 0.0036065613978433727, 0.0034949675830049766, 0.0034716554920020066, 0.003455000152157379, 0.0034258162580423895, 0.00340296475520623, 0.0033956067042299255, 0.0033628427141547575, 0.0031163157219543255, 0.0030729426661569004, 0.003026671615697633, 0.0030118338535545294, 0.002819157630207219, 0.0028106648906197616, 0.002802753371472271, 0.00276122089486

In [12]:
import nltk
import enchant
d = enchant.Dict("en_US")
def predict_doc_topic(doc, topic_words_dict, topic_words_prob_dict):
    text = nltk.tokenize.word_tokenize(doc)
#     for word in text:
#         if not d.check(word):
#             if len(word) < 4:
#                 text.remove(word)
    temp_dist = nltk.FreqDist(text)
    word_dist = {}
    for word in temp_dist:
        word_dist[word] = temp_dist[word]
#     for word in temp_dist:
#         if word_dist[word] > 10:
#             word_dist.pop(word)
    doc_topics_prob = {}
    for topic in topic_words_dict:
        score = 0
        word_list = topic_words_dict[topic]
        prob_list = topic_words_prob_dict[topic]
        for index in range(len(word_list)):
            if word_list[index] in word_dist:
                score += word_dist[word_list[index]] * prob_list[index]
        doc_topics_prob[topic] = score
    return doc_topics_prob

In [None]:
docs[27]

In [13]:
new_vectors = []
for doc in docs:
    doc_topic_probs = predict_doc_topic(doc, topic_words_dict, topic_words_prob_dict)
    vector = []
    for i in range(n_components):
        vector.append(doc_topic_probs[i])
    new_vectors.append(vector)

In [14]:
new_vectors = np.array(new_vectors)

In [15]:
new_vectors.shape

(4491, 10)

In [16]:
clf = mixture.GaussianMixture(n_components=n_clusters, covariance_type='full')
print("Creating Mixture object")
clf.fit(new_vectors)
print("Done fitting data")

Creating Mixture object
Done fitting data


In [17]:
pred = clf.predict_proba(new_vectors)

In [None]:
pred.shape

In [18]:
def group_by_label(l):
    it = itertools.groupby(l, operator.itemgetter(1))
    counts = []
    for key, subiter in it:
        counts.append(sum(item[0] for item in subiter))
    return counts

In [19]:
def compute_homogeneity(preds, labels):
    cluster_label_counts = []
    for pred in preds.transpose():
        cluster_label_counts.append(group_by_label([(p,label) for p,label in zip(pred,labels)]))
    
    entropys = []
    for cluster_label_count in cluster_label_counts:
        entropys.append(scipy.stats.entropy(cluster_label_count))
         
    return np.mean(entropys)

In [20]:
def compute_completeness(preds, labels, num_clusters, num_labels):
    label_cluster_counts = {label:np.zeros(num_clusters) for label in range(num_labels)}
    
    for pred, label in zip(preds, labels):
        label_cluster_counts[label] = np.sum([label_cluster_counts[label], pred], axis=0)
    
    entropys = []
    for label_cluster_count in label_cluster_counts.values():
        entropys.append(scipy.stats.entropy(label_cluster_count))
          
    return np.mean(entropys)

In [21]:
def v_measure(preds, labels, num_clusters, num_labels):
    if len(labels) == 0:
        return 1.0, 1.0, 1.0
      
    homogeneity = compute_homogeneity(preds, labels)
    completeness = compute_completeness(preds, labels, num_clusters, num_labels)
    
    if homogeneity==0.0 and completeness==0.0:
        return 0.0, 0.0, 0.0
    v_measure_score = (2.0 * homogeneity * completeness /
                   (homogeneity + completeness))
      
    return homogeneity, completeness, v_measure_score

In [22]:
topic_label_mapping = {}
label = 0
for topic in set(doc_topics):
    topic_label_mapping[topic] = label
    label += 1

In [23]:
labels = []
for topic in doc_topics:
    labels.append(topic_label_mapping[topic])

In [26]:
# print(v_measure(preds, labels, num_clusters, num_labels))
v_measure(pred, labels, 10, 20)

(1.1674909457219775, 0.8323492689969504, 0.9718378779264868)

In [30]:
pred.shape

(4491, 10)

In [37]:
soft_cluster_size = [0 for i in range(n_clusters)]
for p in pred:
    for i in range(len(p)):
        soft_cluster_size[i] += soft_cluster_size[i] + p[i]

  after removing the cwd from sys.path.


In [42]:
np.argmax(soft_cluster_size)

0

In [43]:
max_index = -1
max_size = -1234214141241431414
for cluster_size in soft_cluster_size:
    if cluster_size > max_size:
        max_size = cluster_size
        max_index = soft_cluster_size.index(cluster_size)

In [45]:
cluster_labels = clf.predict(new_vectors)

In [50]:
doc_indexs_in_cluster_0 = []
for index in range(len(cluster_labels)):
    if cluster_labels[index] == 0:
        doc_indexs_in_cluster_0.append(index)
    

In [51]:
doc_indexs_in_cluster_0

[2,
 3,
 4,
 9,
 10,
 16,
 18,
 28,
 30,
 31,
 33,
 34,
 37,
 44,
 54,
 57,
 59,
 60,
 62,
 63,
 64,
 65,
 67,
 73,
 77,
 78,
 83,
 91,
 95,
 98,
 104,
 105,
 109,
 110,
 115,
 124,
 126,
 128,
 129,
 132,
 138,
 143,
 146,
 149,
 151,
 152,
 159,
 187,
 201,
 206,
 209,
 217,
 234,
 268,
 285,
 293,
 298,
 317,
 341,
 346,
 360,
 378,
 385,
 388,
 399,
 403,
 414,
 431,
 448,
 467,
 473,
 521,
 567,
 621,
 640,
 653,
 700,
 727,
 824,
 843,
 905,
 945,
 1001,
 1035,
 1073,
 1076,
 1099,
 1108,
 1122,
 1133,
 1140,
 1152,
 1155,
 1164,
 1172,
 1177,
 1190,
 1219,
 1227,
 1252,
 1260,
 1266,
 1273,
 1327,
 1338,
 1349,
 1371,
 1394,
 1408,
 1434,
 1523,
 1537,
 1574,
 1607,
 1641,
 1665,
 1686,
 1690,
 1694,
 1712,
 1716,
 1717,
 1719,
 1721,
 1728,
 1731,
 1733,
 1751,
 1762,
 1764,
 1765,
 1766,
 1767,
 1768,
 1769,
 1779,
 1781,
 1788,
 1798,
 1799,
 1800,
 1807,
 1809,
 1815,
 1821,
 1828,
 1834,
 1848,
 1849,
 1853,
 1855,
 1859,
 1864,
 1866,
 1870,
 1873,
 1874,
 1879,
 1882,
 18

In [55]:
predict_doc_topic(docs[2], topic_words_dict, topic_words_prob_dict)

{0: 0,
 1: 0.02654177904481197,
 2: 0,
 3: 0,
 4: 0,
 5: 0,
 6: 0,
 7: 0.005984284250031998,
 8: 0.009681510181692755,
 9: 0}

In [59]:
topics_in_cluster = []
for index in doc_indexs_in_cluster_0:
    topics_probs = predict_doc_topic(docs[index], topic_words_dict, topic_words_prob_dict)
    topics_in_cluster.append(max(topics_probs.items(), key=operator.itemgetter(1))[0])

In [61]:
set(topics_in_cluster)

{0, 1, 2, 3, 6, 7, 8, 9}

In [68]:
topic_counter = {}
for i in range(n_components):
    topic_counter[i] = 0
for topic in topics_in_cluster:
    topic_counter[topic] += 1 

In [69]:
topic_counter

{0: 179, 1: 407, 2: 7, 3: 1, 4: 0, 5: 0, 6: 4, 7: 67, 8: 212, 9: 38}