In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score,recall_score

In [2]:
newsgroups_train = fetch_20newsgroups(subset='train')


Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [3]:
from pprint import pprint
pprint(list(newsgroups_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [4]:
newsgroups_train.filenames.shape

(11314,)

In [5]:
newsgroups_train.target.shape


(11314,)

In [6]:
newsgroups_train.target[:10]


array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [7]:
cats = ['alt.atheism', 'sci.space']

In [8]:
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)

In [9]:
list(newsgroups_train.target_names)


['alt.atheism', 'sci.space']

In [10]:
newsgroups_train.filenames.shape


(1073,)

In [11]:
newsgroups_train.target.shape


(1073,)

In [12]:
newsgroups_train.target[:10]


array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0])

In [13]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 34118)

In [14]:
vectors.nnz / float(vectors.shape[0])   #just to visualize how sparese is our vectors


159.0132743362832

In [15]:
### Naive Bayes

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),categories=categories)  #Too remove overfittinng                                     
vectors = vectorizer.fit_transform(newsgroups_train.data)
clf = MultinomialNB(alpha=.01)
clf.fit(vectors, newsgroups_train.target)
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),categories=categories) 
vectors_test = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(vectors_test)
print("F1 Score:"+str(metrics.f1_score(newsgroups_test.target, pred, average='macro')))
print("Accuracy:"+str(accuracy_score(newsgroups_test.target,pred)))
print("Precision:"+str(precision_score(newsgroups_test.target,pred, average='macro')))
print("Recall:"+str(recall_score(newsgroups_test.target,pred, average='macro')))

F1 Score:0.7699517518452172
Accuracy:0.7908351810790836
Precision:0.774066879616238
Recall:0.7683692790737239


In [83]:
### LDA with TF-IDF + Naive Bayes

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),categories=categories)  #Too remove overfit,categories=categoriestinng                                     
tfidf_vectorizer = TfidfVectorizer()

vectors = tfidf_vectorizer.fit_transform(newsgroups_train.data)
no_topics = 50
lda_model = LatentDirichletAllocation(n_components=no_topics, random_state=0)
lda_train = lda_model.fit_transform(vectors)

clf = MultinomialNB(alpha=.01)
clf.fit(lda_train, newsgroups_train.target)


newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),categories=categories) 
vectors_test = tfidf_vectorizer.transform(newsgroups_test.data)
lda_test = lda_model.transform(vectors_test)
pred = clf.predict(lda_test)

print("F1 Score:"+str(metrics.f1_score(newsgroups_test.target, pred, average='macro')))
print("Accuracy:"+str(accuracy_score(newsgroups_test.target,pred)))
print("Precision:"+str(precision_score(newsgroups_test.target,pred, average='macro')))
print("Recall:"+str(recall_score(newsgroups_test.target,pred, average='macro')))




F1 Score:0.125610085073
Accuracy:0.284552845528
Precision:0.128295145049
Recall:0.244403520676


In [84]:
### LDA with Bag of Words + Naive Bayes

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),categories=categories)  #Too remove overfit,categories=categoriestinng                                     
count_vectorizer = CountVectorizer()

vectors = count_vectorizer.fit_transform(newsgroups_train.data)
no_topics = 50
lda_model = LatentDirichletAllocation(n_components=no_topics, random_state=0)
lda_train = lda_model.fit_transform(vectors)

clf = MultinomialNB(alpha=.01)
clf.fit(lda_train, newsgroups_train.target)


newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),categories=categories) 
vectors_test = count_vectorizer.transform(newsgroups_test.data)
lda_test = lda_model.transform(vectors_test)
pred = clf.predict(lda_test)

print("F1 Score:"+str(metrics.f1_score(newsgroups_test.target, pred, average='macro')))
print("Accuracy:"+str(accuracy_score(newsgroups_test.target,pred)))
print("Precision:"+str(precision_score(newsgroups_test.target,pred, average='macro')))
print("Recall:"+str(recall_score(newsgroups_test.target,pred, average='macro')))



F1 Score:0.517358150212
Accuracy:0.623059866962
Precision:0.727866040722
Recall:0.571272492872


In [17]:
#NMF with TF-IDF and Naive Bayes
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),categories=categories)  #Too remove overfit,categories=categoriestinng                                     
tfidf_vectorizer = TfidfVectorizer()

vectors = tfidf_vectorizer.fit_transform(newsgroups_train.data)
no_topics = 50
nmf_model = NMF(n_components=2, init='random', random_state=0)
nmf_train = nmf_model.fit_transform(vectors)

clf = MultinomialNB(alpha=.01)
clf.fit(nmf_train, newsgroups_train.target)


newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),categories=categories) 
vectors_test = tfidf_vectorizer.transform(newsgroups_test.data)
nmf_test = nmf_model.transform(vectors_test)
pred = clf.predict(nmf_test)

print("F1 Score:"+str(metrics.f1_score(newsgroups_test.target, pred, average='macro')))
print("Accuracy:"+str(accuracy_score(newsgroups_test.target,pred)))
print("Precision:"+str(precision_score(newsgroups_test.target,pred, average='macro')))
print("Recall:"+str(recall_score(newsgroups_test.target,pred, average='macro')))





F1 Score:0.11276473955352032
Accuracy:0.29120473022912047
Precision:0.07280118255728012
Recall:0.25


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
