In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix

In [3]:
newsgroups_train = fetch_20newsgroups(subset='train')


In [4]:
from pprint import pprint
pprint(list(newsgroups_train.target_names))

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']


In [5]:
newsgroups_train.filenames.shape

(11314,)

In [6]:
newsgroups_train.target.shape


(11314,)

In [7]:
newsgroups_train.target[:10]


array([ 7,  4,  4,  1, 14, 16, 13,  3,  2,  4])

In [8]:
cats = ['alt.atheism', 'sci.space']

In [9]:
newsgroups_train = fetch_20newsgroups(subset='train', categories=cats)

In [10]:
list(newsgroups_train.target_names)


['alt.atheism', 'sci.space']

In [11]:
newsgroups_train.filenames.shape


(1073,)

In [12]:
newsgroups_train.target.shape


(1073,)

In [13]:
newsgroups_train.target[:10]


array([0, 1, 1, 1, 0, 1, 1, 0, 0, 0])

In [14]:
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories)
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape

(2034, 34118)

In [15]:
vectors.nnz / float(vectors.shape[0])   #just to visualize how sparese is our vectors


159.0132743362832

In [21]:
### Naive Bayes

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),categories=categories)  #Too remove overfittinng                                     
vectors = vectorizer.fit_transform(newsgroups_train.data)
clf = MultinomialNB(alpha=.01)
clf.fit(vectors, newsgroups_train.target)
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),categories=categories) 
vectors_test = vectorizer.transform(newsgroups_test.data)
pred = clf.predict(vectors_test)
metrics.f1_score(newsgroups_test.target, pred, average='micro')
confusion_matrix(y_test,y_pred)

0.79083518107908357

In [22]:
### LDA with TF-IDF + Naive Bayes

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),categories=categories)  #Too remove overfit,categories=categoriestinng                                     
tfidf_vectorizer = TfidfVectorizer()

vectors = tfidf_vectorizer.fit_transform(newsgroups_train.data)
no_topics = 50
lda_model = LatentDirichletAllocation(n_components=no_topics, random_state=0)
lda_train = lda_model.fit_transform(vectors)

clf = MultinomialNB(alpha=.01)
clf.fit(lda_train, newsgroups_train.target)


newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),categories=categories) 
vectors_test = tfidf_vectorizer.transform(newsgroups_test.data)
lda_test = lda_model.transform(vectors_test)
pred = clf.predict(lda_test)

metrics.f1_score(newsgroups_test.target, pred, average='micro')




0.28455284552845528

In [23]:
### LDA with Bag of Words + Naive Bayes

newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),categories=categories)  #Too remove overfit,categories=categoriestinng                                     
count_vectorizer = CountVectorizer()

vectors = count_vectorizer.fit_transform(newsgroups_train.data)
no_topics = 50
lda_model = LatentDirichletAllocation(n_components=no_topics, random_state=0)
lda_train = lda_model.fit_transform(vectors)

clf = MultinomialNB(alpha=.01)
clf.fit(lda_train, newsgroups_train.target)


newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),categories=categories) 
vectors_test = count_vectorizer.transform(newsgroups_test.data)
lda_test = lda_model.transform(vectors_test)
pred = clf.predict(lda_test)

metrics.f1_score(newsgroups_test.target, pred, average='micro')



0.62305986696230597