In [8]:
import numpy as np 
import sklearn
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split

twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)

print(twenty_train.keys())


dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


Using the guide given in https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a
The data is in key 'data' with the target labels in key 'target'

As sklearn already provides the data in the form of train and test data, I tried combining them into one but was unable to do so. So I will split the train data in an 80-20 ratio and then finally run the classifier on the unused default test data.

In [9]:
X_train, X_test, y_train, y_test = train_test_split(twenty_train['data'], twenty_train['target'], test_size=0.2, random_state=42)

Testing out the CountVectorizer and TfidfTransformer

In [10]:
# print(twenty_train.target_names) #prints all the categories
# print("\n".join(twenty_train.data[0].split("\n"))) 

count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print(X_train_counts.shape)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

(11314, 130107)
(11314, 130107)


Using pipelines to remove stopwords, use the tf-idf vectorizer and train using Naive Bayes as the classifier for the training data. We'll use the 80-20 splitted data here

In [11]:
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),('tfidf', TfidfTransformer()), ('clf', MultinomialNB())])
text_clf = text_clf.fit(X_train, y_train)


Use the test data to measure accuracy

In [12]:
predicted = text_clf.predict(X_test)
accuracy  = np.mean(predicted == y_test)
f1_score  = sklearn.metrics.f1_score(y_test, predicted, average='macro')
print(f" accuracy is {accuracy} and f1 score is {f1_score}")

 accuracy is 0.8842244807777286 and f1 score is 0.8733314452741745


We will try to improve some hyperparameter using gridsearch here

In [13]:
from sklearn.model_selection import GridSearchCV
parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False), 'clf__alpha': (1e-2, 1e-3)}

gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

gs_clf.best_score_
gs_clf.best_params_

print(f"best performance was {gs_clf.best_score_} with parameters {gs_clf.best_params_}")

best performance was 0.8932714617169374 with parameters {'clf__alpha': 0.01, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}


The performance improved marginally from 88% to 89%. The result shows that use of tf-idf is beneficial. As there was a section of the data set as test data that we haven't used until now, let's test our classifier in that unseen data.


In [14]:
X_train, X_test, y_train, y_test = train_test_split(twenty_test['data'], twenty_test['target'], test_size=0.2, random_state=42)
predicted = text_clf.predict(X_test)
accuracy  = np.mean(predicted == y_test)
f1_score  = sklearn.metrics.f1_score(y_test, predicted, average='macro')
print(f" accuracy is {accuracy} and f1 score is {f1_score}")

 accuracy is 0.814200398142004 and f1 score is 0.796604882220717
