In [1]:
import operator

import nltk
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import NMF  # Data matrix A factored into W(weights)*H(features)
# from sklearn.decomposition import LatentDirichletAllocation  # For NMF vs. LDA comparisons
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from textprocessor import TextProcessor

In [2]:
def printTopics(model, featureNames, numTopWords):
    """Print, for every row of ``model.components_``, a header and its top terms.

    Args:
        model: object exposing ``components_``; each row must provide
            ``argsort()`` (e.g. a row of a NumPy array).
        featureNames: sequence indexed by column position that yields the
            term belonging to that column.
        numTopWords: number of highest-weight columns to list per topic.

    Output is written with ``print``: a ``Topic <i>:`` line followed by the
    selected terms joined with ``", "``, strongest first.
    """
    for topicIdx, weights in enumerate(model.components_):
        print("Topic %d:" % topicIdx)
        # argsort() is ascending: reverse it, then keep the leading positions.
        ranked = weights.argsort()[::-1][:numTopWords]
        topTerms = (featureNames[col] for col in ranked)
        print(", ".join(topTerms))

In [3]:
# Load the text file, request n=100 common phrases from the TextProcessor, and
# keep element [0] of every returned entry as one "document" for vectorisation.
tp = TextProcessor.from_file(f_path='data/news.txt')
commonPhrases = tp.get_most_common_phrases(n=100)
documents = [entry[0] for entry in commonPhrases]
len(documents)

58

In [4]:
numFeatures = 10000  # Upper bound on unique terms (columns) kept in the term-document matrix

# Use tf-idf to process documents for NMF.
# max_df=0.95 discards terms occurring in more than 95% of the documents,
# min_df=1 keeps any term seen in at least one document, and English stop
# words are removed before the vocabulary is built.
tfidfVectorizer = TfidfVectorizer(vocabulary=None, max_df=0.95, min_df=1, max_features=numFeatures, stop_words='english')
tfidf = tfidfVectorizer.fit_transform(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() when it exists and convert its array to a
# plain list so downstream indexing yields ordinary Python strings; fall back
# to the legacy method on older installations.
if hasattr(tfidfVectorizer, 'get_feature_names_out'):
    tfidfFeatureNames = tfidfVectorizer.get_feature_names_out().tolist()
else:
    tfidfFeatureNames = tfidfVectorizer.get_feature_names()

In [5]:
# Number of topics or clusters to extract (usually k in the literature); user-tunable
numTopics = 10

# Run NMF: configure the estimator, then fit it on the tf-idf matrix.
# NOTE(review): the `alpha` keyword was deprecated in scikit-learn 1.0 in favour
# of alpha_W / alpha_H (which are scaled differently) — confirm the installed
# version still accepts it before re-running.
nmfEstimator = NMF(
    n_components=numTopics,
    init='nndsvd',
    l1_ratio=.5,
    alpha=.1,
    random_state=1,
)
nmf = nmfEstimator.fit(tfidf)

# Run LDA (disabled; note that `no_topics` and `tf` are not defined in the visible cells)
#lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)

# Show the strongest terms of every NMF topic.
numTopWords = 10
printTopics(model=nmf, featureNames=tfidfFeatureNames, numTopWords=numTopWords)
#printTopics(lda, tfFeatureNames, numTopWords)

Topic 0:
davis, bust, cds, equipment, explicit, software, firm, publishing, attorney, stricter
Topic 1:
computer, circles, equipment, nearly, magazine, national, porn, pay, play, sophisticated
Topic 2:
mail, electronic, private, inside, large, community, federal, explicit, equipment, entire
Topic 3:
attorney, district, holmes, bob, macy, computer, firm, federal, explicit, equipment
Topic 4:
city, oklahoma, police, clown, laws, district, computerized, computers, davis, electronic
Topic 5:
board, bulletin, newsletter, operators, service, commercial, sophisticated, numerous, boards, entire
Topic 6:
law, federal, pornography, enforcement, stricter, community, firm, explicit, equipment, entire
Topic 7:
material, pornographic, explicit, allegedly, illegal, materials, computerized, computers, firm, federal
Topic 8:
community, standards, real, national, firm, federal, explicit, equipment, entire, enforcement
Topic 9:
case, porn, stricter, community, firm, federal, explicit, equipment, entire, 

In [6]:
numTopWords = 10
wrds = {}
# Build a term -> weight mapping: for each row of the features (term-to-topic)
# matrix H, take its numTopWords highest-weight columns and remember, per term,
# the largest weight seen in any row where it made that cut.
for topicWeights in nmf.components_:
    topColumns = topicWeights.argsort()[::-1][:numTopWords]
    for col in topColumns:
        term = tfidfFeatureNames[col]
        best = wrds.get(term, 0)
        weight = topicWeights[col]
        # Keep the stored value unless the new weight is strictly larger.
        wrds[term] = weight if weight > best else best
print(wrds)

{'davis': 1.6495354270273892, 'bust': 0.15878782074079922, 'cds': 0.15878782074079922, 'equipment': 0.099579730537515232, 'explicit': 0.26972844619731651, 'software': 0.039314604897338623, 'firm': 0.039314604897338623, 'publishing': 0.039314604897338623, 'attorney': 1.250157088002154, 'stricter': 0, 'computer': 1.5990254703323554, 'circles': 0.17367561134112047, 'nearly': 0.091285159014620282, 'magazine': 0.080254082003283878, 'national': 0.1993127122157245, 'porn': 0.25605458286249178, 'pay': 0.044177041286732478, 'play': 0.044177041286732478, 'sophisticated': 0.12836245631912671, 'mail': 1.3349099921562573, 'electronic': 0.55829267159979623, 'private': 0.16632083394779326, 'inside': 0.16053287525614265, 'large': 0.16053287525614265, 'community': 1.1783483551280112, 'federal': 0.26973990307082224, 'entire': 0, 'district': 0.4209534055941701, 'holmes': 0.22819280356730276, 'bob': 0.12840283006481107, 'macy': 0.12840283006481107, 'city': 1.0853893231940561, 'oklahoma': 0.643608294963213

In [16]:
# `operator` is imported with the other imports at the top of the notebook so
# that this cell carries no hidden dependency of its own.

# Rank every collected term by its stored weight, strongest first.
output = sorted(wrds.items(), key=operator.itemgetter(1), reverse=True)

# Use the ten highest-ranked terms as filter words and print element [0]
# (the phrase text) of each entry the TextProcessor returns for them.
topWords = [word for word, _weight in output[:10]]
for p in tp.get_most_common_phrases(filter_words=topWords):
    print(p[0])

explicit material davis
davis computer
davis computer equipment
davis case
davis attorney
computer porn case
pornographic computerized materials
national computer community
computer bulletin board system
sophisticated commercial computer bulletin board system


In [8]:
# Display get_most_common_phrases() called with its default arguments (no
# filter_words); the recorded output is a list of (phrase, count) tuples.
tp.get_most_common_phrases()

[('computer bulletin board system', 3),
 ('sophisticated commercial computer bulletin board system', 3),
 ('explicit material davis', 2),
 ('davis computer', 2),
 ('davis computer equipment', 2),
 ('davis case', 2),
 ('davis attorney', 2),
 ('davis system', 2),
 ('computer porn case', 2),
 ('computer system', 2)]