In [29]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score,recall_score,matthews_corrcoef,cohen_kappa_score
from sklearn.preprocessing import MinMaxScaler

import time as time

In [30]:
# Restrict the experiments to a four-newsgroup subset of 20 Newsgroups.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# Training split, fetched without any `remove=` filtering.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
)

# TF-IDF with default settings; the trailing expression displays the
# shape of the resulting document-term matrix as the cell output.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 34118)

In [31]:
### LDA with TF-IDF + Naive Bayes

# --- Training documents -> TF-IDF matrix -------------------------------
# Headers, footers and quoted replies are removed from every post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5,
    max_features=10000,
    min_df=2,
    stop_words='english',
    use_idf=True,
)
vectors = tfidf_vectorizer.fit_transform(newsgroups_train.data)
no_topics = 50

# --- Fit LDA on the training matrix (timed) ----------------------------
start1 = time.time()
lda_model = LatentDirichletAllocation(
    n_components=no_topics,
    random_state=0,
)
lda_train = lda_model.fit_transform(vectors)
end1 = time.time()

# --- Multinomial Naive Bayes trained on the LDA output ------------------
clf = MultinomialNB(alpha=.01)
clf.fit(lda_train, newsgroups_train.target)

# --- Test documents: reuse the fitted vectorizer and LDA model ----------
newsgroups_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)
vectors_test = tfidf_vectorizer.transform(newsgroups_test.data)

# Only the LDA transform of the test matrix is timed here.
start2 = time.time()
lda_test = lda_model.transform(vectors_test)
end2 = time.time()

pred = clf.predict(lda_test)

# --- Report --------------------------------------------------------------
# "Time" is the LDA fit_transform duration plus the LDA test-transform
# duration; vectorizing and classifier fit/predict are not included.
print("F1 Score:", metrics.f1_score(newsgroups_test.target, pred, average='macro'), sep="")
print("Accuracy:", accuracy_score(newsgroups_test.target, pred), sep="")
print("Precision:", precision_score(newsgroups_test.target, pred, average='macro'), sep="")
print("Recall:", recall_score(newsgroups_test.target, pred, average='macro'), sep="")
print("Time:", (end1 - start1) + (end2 - start2), sep="")
print("Matthew's correlation coefficient:", matthews_corrcoef(newsgroups_test.target, pred), sep="")
print("Cohen Kappa Score:", cohen_kappa_score(newsgroups_test.target, pred), sep="")

[1 1 1 ... 2 2 2]
F1 Score:0.2821432258352467
Accuracy:0.4050258684405026
Precision:0.3802636587408329
Recall:0.35232473491391836
Time:8.814049005508423
Matthew's correlation coefficient:0.2144463659722853
Cohen Kappa Score:0.16328908937545816


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
### LDA with Bag of Words + Naive Bayes

# --- Training documents -> raw term-count matrix -------------------------
# Headers, footers and quoted replies are removed from every post.
newsgroups_train = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)
count_vectorizer = CountVectorizer(
    max_df=0.5,
    max_features=10000,
    min_df=2,
    stop_words='english',
)
vectors = count_vectorizer.fit_transform(newsgroups_train.data)

# --- Fit LDA on the count matrix -----------------------------------------
no_topics = 50
lda_model = LatentDirichletAllocation(
    n_components=no_topics,
    random_state=0,
)
lda_train = lda_model.fit_transform(vectors)

# --- Multinomial Naive Bayes trained on the LDA output -------------------
clf = MultinomialNB(alpha=.01)
clf.fit(lda_train, newsgroups_train.target)

# --- Test documents: reuse the fitted vectorizer and LDA model -----------
newsgroups_test = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    categories=categories,
)
vectors_test = count_vectorizer.transform(newsgroups_test.data)
lda_test = lda_model.transform(vectors_test)
pred = clf.predict(lda_test)

# --- Report (macro-averaged over the classes) -----------------------------
print("F1 Score:", metrics.f1_score(newsgroups_test.target, pred, average='macro'), sep="")
print("Accuracy:", accuracy_score(newsgroups_test.target, pred), sep="")
print("Precision:", precision_score(newsgroups_test.target, pred, average='macro'), sep="")
print("Recall:", recall_score(newsgroups_test.target, pred, average='macro'), sep="")