<a href="https://colab.research.google.com/github/a-essa/20newsgroups/blob/master/Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import LatentDirichletAllocation

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV

import nltk
from nltk.corpus import stopwords
# Download the NLTK 'stopwords' corpus; stopwords.words('english') in later
# cells reads from it. As the last expression of the cell, the call's return
# value is also displayed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
# Load the 'train' and 'test' partitions of the 20 Newsgroups corpus, both
# shuffled. The generator is consumed left to right, so 'train' is fetched
# before 'test'.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=subset_name, shuffle=True)
    for subset_name in ('train', 'test')
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
# Baseline text classifier: token counts -> TF-IDF weighting -> multinomial
# Naive Bayes. The step names ('vect', 'tfidf', 'clf') are the prefixes used
# by the grid-search parameter keys.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [0]:
# Linear SVM variant of the text pipeline: same count + TF-IDF front end, with
# an SGD-trained classifier using hinge loss and an L2 penalty. The seed is
# fixed so repeated runs give the same model.
text_clf_svm = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (
            'clf-svm',
            SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42),
        ),
    ]
)

In [7]:
# Train the Naive Bayes pipeline on the training split (fit() returns the same
# pipeline object, so no rebinding is needed), predict the test split, and
# display the fraction of correct predictions.
text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)
(predicted == twenty_test.target).mean()

0.8096123207647371

In [3]:
# Train the SVM pipeline on the training split (fit() returns the same
# pipeline object), predict the test split, and display the test accuracy.
text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
(predicted_svm == twenty_test.target).mean()

0.822623473181094

In [5]:
# Hyper-parameter grid for the Naive Bayes pipeline. Keys are
# '<step name>__<parameter>' and address the steps of text_clf.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],            # unigrams vs. unigrams + bigrams
              'vect__stop_words': (None, stopwords.words('english')),  # keep vs. drop NLTK English stop words
              'tfidf__use_idf': (True, False),                  # with / without IDF re-weighting
              'clf__alpha': (1e-2, 1e-3),                       # NB smoothing strength
}
# Cross-validated search over every combination, using all available cores.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)
# Only the last expression of a cell is displayed, so a bare
# `gs_clf.best_params_` placed before this line was evaluated and discarded.
# It has been removed; inspect gs_clf.best_params_ in its own cell.
gs_clf.best_score_



0.9138239303476172

In [10]:
# Search space for the Naive Bayes pipeline; each key is
# '<step name>__<parameter>' for a step of text_clf.
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'vect__stop_words': (None, stopwords.words('english')),
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
# Run the cross-validated grid search on all cores; fit() returns the fitted
# search object, so construction and fitting are chained.
gs_clf = GridSearchCV(estimator=text_clf, param_grid=parameters, n_jobs=-1).fit(
    twenty_train.data, twenty_train.target
)
gs_clf.best_score_



0.9157684864695698

In [11]:
# Display the parameter combination selected by the Naive Bayes grid search.
gs_clf.best_params_

{'clf__alpha': 0.001,
 'tfidf__use_idf': True,
 'vect__ngram_range': (1, 2),
 'vect__stop_words': None}

In [4]:
# Hyper-parameter grid for the SVM pipeline. Keys are
# '<step name>__<parameter>' and address the steps of text_clf_svm.
# (GridSearchCV is already imported in the notebook's import cell, so the
# redundant in-cell import was removed.)
parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)],   # unigrams vs. unigrams + bigrams
                'tfidf__use_idf': (True, False),           # with / without IDF re-weighting
                'clf-svm__alpha': (1e-2, 1e-3),            # regularisation strength
}
# Cross-validated search over every combination, using all available cores.
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(twenty_train.data, twenty_train.target)
# Only the last expression of a cell is displayed. The cell previously ended
# with best_score_, which silently discarded best_params_; end on the
# parameters here and inspect gs_clf_svm.best_score_ in its own cell.
gs_clf_svm.best_params_



{'clf-svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}

In [5]:
# Display the best mean cross-validated score found by the SVM grid search.
gs_clf_svm.best_score_

0.9038363275277961

In [14]:
# Topic-model features: stop-word-filtered token counts -> LDA with 400
# components -> multinomial Naive Bayes on the LDA output.
# stop_words is passed as a list: CountVectorizer documents 'english', a list,
# or None, and recent scikit-learn releases reject a set during parameter
# validation.
# NOTE: this rebinds text_clf and predicted, replacing any earlier pipeline
# and predictions held under those names.
text_clf = Pipeline([('vect', CountVectorizer(stop_words=stopwords.words('english'))),
                    ('lda', LatentDirichletAllocation(n_components=400, random_state=0)),
                    ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)
# Fraction of test documents classified correctly.
np.mean(predicted == twenty_test.target)

0.6457780138077536

In [13]:
# Topic-model features: stop-word-filtered token counts -> LDA with 200
# components -> multinomial Naive Bayes on the LDA output.
# stop_words is passed as a list: CountVectorizer documents 'english', a list,
# or None, and recent scikit-learn releases reject a set during parameter
# validation.
# NOTE: this rebinds text_clf and predicted, replacing any earlier pipeline
# and predictions held under those names.
text_clf = Pipeline([('vect', CountVectorizer(stop_words=stopwords.words('english'))),
                    ('lda', LatentDirichletAllocation(n_components=200, random_state=0)),
                    ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)
# Fraction of test documents classified correctly.
np.mean(predicted == twenty_test.target)

0.6295804567180032

In [12]:
# Topic-model features: stop-word-filtered token counts -> LDA with 100
# components -> multinomial Naive Bayes on the LDA output.
# stop_words is passed as a list: CountVectorizer documents 'english', a list,
# or None, and recent scikit-learn releases reject a set during parameter
# validation.
# NOTE: this rebinds text_clf and predicted, replacing any earlier pipeline
# and predictions held under those names.
text_clf = Pipeline([('vect', CountVectorizer(stop_words=stopwords.words('english'))),
                    ('lda', LatentDirichletAllocation(n_components=100, random_state=0)),
                    ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)
# Fraction of test documents classified correctly.
np.mean(predicted == twenty_test.target)

0.5728890069038768

In [11]:
# Topic-model features: stop-word-filtered token counts -> LDA with 50
# components -> multinomial Naive Bayes on the LDA output.
# stop_words is passed as a list: CountVectorizer documents 'english', a list,
# or None, and recent scikit-learn releases reject a set during parameter
# validation.
# NOTE: this rebinds text_clf and predicted, replacing any earlier pipeline
# and predictions held under those names.
text_clf = Pipeline([('vect', CountVectorizer(stop_words=stopwords.words('english'))),
                    ('lda', LatentDirichletAllocation(n_components=50, random_state=0)),
                    ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)
# Fraction of test documents classified correctly.
np.mean(predicted == twenty_test.target)

0.5561603823685608

In [10]:
# Topic-model features: stop-word-filtered token counts -> LDA with 20
# components -> multinomial Naive Bayes on the LDA output.
# stop_words is passed as a list: CountVectorizer documents 'english', a list,
# or None, and recent scikit-learn releases reject a set during parameter
# validation.
# NOTE: this rebinds text_clf and predicted, replacing any earlier pipeline
# and predictions held under those names.
text_clf = Pipeline([('vect', CountVectorizer(stop_words=stopwords.words('english'))),
                    ('lda', LatentDirichletAllocation(n_components=20, random_state=0)),
                    ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)
# Fraction of test documents classified correctly.
np.mean(predicted == twenty_test.target)

0.460169941582581

In [9]:
# Topic-model features: stop-word-filtered token counts -> LDA with 10
# components -> multinomial Naive Bayes on the LDA output.
# stop_words is passed as a list: CountVectorizer documents 'english', a list,
# or None, and recent scikit-learn releases reject a set during parameter
# validation.
# NOTE: this rebinds text_clf and predicted, replacing any earlier pipeline
# and predictions held under those names.
text_clf = Pipeline([('vect', CountVectorizer(stop_words=stopwords.words('english'))),
                    ('lda', LatentDirichletAllocation(n_components=10, random_state=0)),
                    ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(twenty_test.data)
# Fraction of test documents classified correctly.
np.mean(predicted == twenty_test.target)

0.36152416356877326