In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score,recall_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
# The four newsgroups this notebook works with; later cells reuse this list.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Training split restricted to those groups (no `remove` filter is applied here).
newsgroups_train = fetch_20newsgroups(categories=categories, subset='train')

# Baseline: TF-IDF with default settings, vocabulary learned from the training posts.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)

# Displayed as the cell output: (number of documents, vocabulary size).
vectors.shape

(2034, 34118)

In [None]:
# NMF with TF-IDF and Naive Bayes
#
# Pipeline: TF-IDF features -> 50-component NMF -> MultinomialNB.
# Everything is fitted on the training split and scored on the test split of
# the newsgroups listed in `categories`.

# Fixed seed for NMF. scikit-learn computes the NNDSVD initialisation with a
# randomized SVD, so without a seed the factors -- and therefore the scores
# printed below -- can change from one run to the next.
NMF_SEED = 42

# Headers, footers and quoted replies are stripped from every post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)

# Keep at most 10000 terms; drop terms that occur in more than half of the
# documents or in fewer than two, plus English stop words.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5,
    max_features=10000,
    min_df=2,
    stop_words='english',
    use_idf=True,
)
vectors = tfidf_vectorizer.fit_transform(newsgroups_train.data)

# Factorise the document-term matrix into `no_topics` non-negative components.
no_topics = 50
nmf_model = NMF(n_components=no_topics, init="nndsvd", random_state=NMF_SEED)
nmf_train = nmf_model.fit_transform(vectors)

# The NMF document weights are non-negative, which MultinomialNB requires.
clf = MultinomialNB(alpha=.01)
clf.fit(nmf_train, newsgroups_train.target)

# Test split: same filtering as training. The vectorizer and the NMF model are
# only applied here (transform), never refitted on test data.
newsgroups_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)
vectors_test = tfidf_vectorizer.transform(newsgroups_test.data)
nmf_test = nmf_model.transform(vectors_test)
pred = clf.predict(nmf_test)

# Macro averaging weights every newsgroup equally regardless of its size.
print("F1 Score:" + str(metrics.f1_score(newsgroups_test.target, pred, average='macro')))
print("Accuracy:" + str(accuracy_score(newsgroups_test.target, pred)))
print("Precision:" + str(precision_score(newsgroups_test.target, pred, average='macro')))
print("Recall:" + str(recall_score(newsgroups_test.target, pred, average='macro')))

In [None]:
# NMF with Bag of Words and Naive Bayes
#
# Pipeline: raw term counts -> 50-component NMF -> MultinomialNB.
# Everything is fitted on the training split and scored on the test split of
# the newsgroups listed in `categories`.

# Fixed seed for NMF. scikit-learn computes the NNDSVD initialisation with a
# randomized SVD, so without a seed the factors -- and therefore the scores
# printed below -- can change from one run to the next.
NMF_SEED = 42

# Headers, footers and quoted replies are stripped from every post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)

# Keep at most 10000 terms; drop terms that occur in more than half of the
# documents or in fewer than two, plus English stop words.
count_vectorizer = CountVectorizer(
    max_df=0.5,
    max_features=10000,
    min_df=2,
    stop_words='english',
)
vectors = count_vectorizer.fit_transform(newsgroups_train.data)

# Factorise the document-term count matrix into `no_topics` non-negative components.
no_topics = 50
nmf_model = NMF(n_components=no_topics, init="nndsvd", random_state=NMF_SEED)
nmf_train = nmf_model.fit_transform(vectors)

# The NMF document weights are non-negative, which MultinomialNB requires.
clf = MultinomialNB(alpha=.01)
clf.fit(nmf_train, newsgroups_train.target)

# Test split: same filtering as training. The vectorizer and the NMF model are
# only applied here (transform), never refitted on test data.
newsgroups_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)
vectors_test = count_vectorizer.transform(newsgroups_test.data)
nmf_test = nmf_model.transform(vectors_test)
pred = clf.predict(nmf_test)

# Macro averaging weights every newsgroup equally regardless of its size.
print("F1 Score:" + str(metrics.f1_score(newsgroups_test.target, pred, average='macro')))
print("Accuracy:" + str(accuracy_score(newsgroups_test.target, pred)))
print("Precision:" + str(precision_score(newsgroups_test.target, pred, average='macro')))
print("Recall:" + str(recall_score(newsgroups_test.target, pred, average='macro')))