In [51]:
import ast
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn import mixture
import itertools, operator
import scipy.stats

In [2]:
n_components = 20
n_top_words = 50

In [3]:
whole_docs = []
whole_doc_labels = []
whole_doc_id = []
file = open("whole_dataset.txt", 'r')
for line in file:
    line = ast.literal_eval(line)
    whole_doc_id.append(line[0])
    whole_doc_labels.append(line[1])
    whole_docs.append(line[2])

In [14]:
docs = []
doc_topics = []
doc_id = []
file = open("sample_dataset.txt", 'r')
for line in file:
    line = ast.literal_eval(line)
    doc_id.append(line[0])
    doc_topics.append(line[1])
    docs.append(line[2])

In [4]:
topic_words_dict = {}

In [5]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        topic_words_dict[topic_idx] = message.split()[2:] 
        print(message)
    print()

In [6]:
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(whole_docs)
print("Done")

Extracting tf features for LDA...
Done


In [7]:
print("Fitting LDA models with tf features...")
lda = LatentDirichletAllocation(n_components=n_components, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)
print("Done")

Fitting LDA models with tf features...
Done


In [8]:
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0: 10 israel 12 25 15 20 11 jesus 14 13 16 00 17 18 lord israeli 30 19 arab 22 21 28 myers 26 40 23 27 24 john jewish 50 31 vs 29 god st 34 33 arabs 35 38 son 60 42 37 insurance shall 32 36 elohim
Topic #1: turkish atf taxes msg printer flyers energy print husband apple canon girls caps raid baptism morris injury laser coach hp dust teacher tied paper edu kidney ii boston establishment wright dry dept armored outlets district ball struck preserve protest aluminum detectors michael mothers discharge mac coat clark outlet virginia bathroom
Topic #2: car 000 water miles south cars radio oil british engine air road power new vehicle years earth cold light auto city station professor fuel gm york high resistance vancouver pope radar palestine holocaust probe canada clutch ground 1914 wiring north chicago street great lights activities heavy american room bmw montreal
Topic #3: hiv zone lebanon candida clinical surgery wounded hicnet contest sharks defensive infe

In [9]:
topic_words_prob_dict = {}

In [10]:
lda.components_ /= lda.components_.sum(axis=1)[:, np.newaxis]
for i in range(n_components):
    topic_words_prob_dict[i] = sorted(lda.components_[i])[::-1][:n_top_words] 
    print("Topic ",i,": ", sorted(lda.components_[i])[::-1][:n_top_words])

Topic  0 :  [0.01954601605045436, 0.017362991786550895, 0.01612802089995114, 0.015491786803195889, 0.014130983417542608, 0.013938671022748872, 0.013665295321369845, 0.01319606657551081, 0.012470437867539751, 0.010659587130202115, 0.010490246737488438, 0.010327353307400006, 0.010152412218925006, 0.009972607600043063, 0.009381688131825092, 0.00937807966774125, 0.009238428697868109, 0.008981147749734214, 0.007718464986846207, 0.007431084824530577, 0.007410142967604377, 0.0073693273544623876, 0.007290161363043345, 0.007216093683073148, 0.007052593314617261, 0.006969794461210755, 0.0062915020175743136, 0.00626370371575111, 0.006034398207208108, 0.006025074306446298, 0.0054849312464691725, 0.005448951469361569, 0.00514916634116765, 0.004862128123941317, 0.004797224448347936, 0.004668863819701198, 0.004333482921848851, 0.004329173325040134, 0.004293069765054766, 0.004254993927514811, 0.004027048721446008, 0.003965598241877788, 0.003762786560413856, 0.0037179969325351796, 0.003548528886180912,

Topic  10 :  [0.03010940283855854, 0.02060459355183286, 0.018313531519718387, 0.012133135234985708, 0.012108149692922848, 0.01028438326028334, 0.01015334298898388, 0.008708293608547664, 0.007612387944007893, 0.007216126358335083, 0.006896894017382291, 0.006589876515255363, 0.0062660866621292366, 0.006163934769373111, 0.00594752021107348, 0.00548402027929277, 0.005404070481579255, 0.0053347701897506145, 0.0053320789289505255, 0.004966870737175993, 0.00477344405152298, 0.004288406436671742, 0.0042740797583517495, 0.003965654562460228, 0.0035251395493404317, 0.003478175288200262, 0.0034444885527906326, 0.0032704029488830437, 0.0031834016104880843, 0.0030960993028355433, 0.003050925647410552, 0.003049753967968371, 0.002964320565398728, 0.002878265204714951, 0.002797814172045718, 0.002725952833625145, 0.0026795703061869573, 0.002592743065044326, 0.0025856525146721356, 0.0025552043171786134, 0.002463286111648168, 0.002362293569591973, 0.0023441754408027766, 0.0021698345670238613, 0.002141452

In [29]:
import nltk
import enchant
d = enchant.Dict("en_US")
def predict_doc_topic(doc, topic_words_dict, topic_words_prob_dict):
    text = nltk.tokenize.word_tokenize(doc)
#     for word in text:
#         if not d.check(word):
#             if len(word) < 4:
#                 text.remove(word)
    temp_dist = nltk.FreqDist(text)
    word_dist = {}
    for word in temp_dist:
        word_dist[word] = temp_dist[word]
#     for word in temp_dist:
#         if word_dist[word] > 10:
#             word_dist.pop(word)
    doc_topics_prob = {}
    for topic in topic_words_dict:
        score = 0
        word_list = topic_words_dict[topic]
        prob_list = topic_words_prob_dict[topic]
        for index in range(len(word_list)):
            if word_list[index] in word_dist:
                score += word_dist[word_list[index]] * prob_list[index]
        doc_topics_prob[topic] = score
    return doc_topics_prob

In [28]:
docs[27]

"this is hell.  hasn't anyone noticed?"

In [22]:
new_vectors = []
for doc in docs:
    doc_topic_probs = predict_doc_topic(doc, topic_words_dict, topic_words_prob_dict)
    vector = []
    for i in range(n_components):
        vector.append(doc_topic_probs[i])
    new_vectors.append(vector)

In [23]:
new_vectors = np.array(new_vectors)

In [None]:
new_vectors.shape

In [31]:
clf = mixture.GaussianMixture(n_components=10, covariance_type='full')
print("Creating Mixture object")
clf.fit(new_vectors)
print("Done fitting data")

Creating Mixture object
Done fitting data


In [33]:
pred = clf.predict_proba(new_vectors)

In [34]:
pred.shape

(4491, 10)

In [36]:
def group_by_label(l):
    it = itertools.groupby(l, operator.itemgetter(1))
    counts = []
    for key, subiter in it:
        counts.append(sum(item[0] for item in subiter))
    return counts

In [37]:
def compute_homogeneity(preds, labels):
    cluster_label_counts = []
    for pred in preds.transpose():
        cluster_label_counts.append(group_by_label([(p,label) for p,label in zip(pred,labels)]))
    
    entropys = []
    for cluster_label_count in cluster_label_counts:
        entropys.append(scipy.stats.entropy(cluster_label_count))
         
    return np.mean(entropys)

In [38]:
def compute_completeness(preds, labels, num_clusters, num_labels):
    label_cluster_counts = {label:np.zeros(num_clusters) for label in range(num_labels)}
    
    for pred, label in zip(preds, labels):
        label_cluster_counts[label] = np.sum([label_cluster_counts[label], pred], axis=0)
    
    entropys = []
    for label_cluster_count in label_cluster_counts.values():
        entropys.append(scipy.stats.entropy(label_cluster_count))
          
    return np.mean(entropys)

In [39]:
def v_measure(preds, labels, num_clusters, num_labels):
    if len(labels) == 0:
        return 1.0, 1.0, 1.0
      
    homogeneity = compute_homogeneity(preds, labels)
    completeness = compute_completeness(preds, labels, num_clusters, num_labels)
    
    if homogeneity==0.0 and completeness==0.0:
        return 0.0, 0.0, 0.0
    v_measure_score = (2.0 * homogeneity * completeness /
                   (homogeneity + completeness))
      
    return homogeneity, completeness, v_measure_score

In [45]:
topic_label_mapping = {}
label = 0
for topic in set(doc_topics):
    topic_label_mapping[topic] = label
    label += 1

In [47]:
labels = []
for topic in doc_topics:
    labels.append(topic_label_mapping[topic])

In [52]:
v_measure(pred, labels, 10, 20)

(1.579257172937448, 1.1403383403493508, 1.3243789341274603)