In [1]:
Data_Dir = "./bbc"

In [2]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_files # Helps to load data from different class in an individual folder


In [3]:
data = load_files(Data_Dir,encoding='utf-8',decode_error='replace')

In [7]:
# calculate count of each category

labels,counts = np.unique(data.target, return_counts=True)

print(labels,counts)



(array([0, 1, 2, 3, 4]), array([510, 386, 417, 511, 401]))

In [9]:
labels_str = np.array(data.target_names)[labels]
print(dict(zip(labels_str,counts)))

{'business': 510, 'entertainment': 386, 'politics': 417, 'sport': 511, 'tech': 401}


# Data Preparation

In [14]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(data.data,data.target,random_state=42)
len(x_train),len(x_test)

(1668, 557)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [17]:
vectorizer = TfidfVectorizer(stop_words='english',max_features=1000,decode_error='ignore')
vectorizer.fit(x_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='ignore',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=1000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

# Build Model

In [18]:
from sklearn.naive_bayes import MultinomialNB
cls = MultinomialNB()
cls.fit(vectorizer.transform(x_train),y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
from sklearn.metrics import classification_report,accuracy_score
y_train_pred = cls.predict(vectorizer.transform(x_train))
y_test_pred = cls.predict(vectorizer.transform(x_test))


In [25]:
print('Accuracy on training subset is: \n\n{:.3f}'.format(accuracy_score(y_train,y_train_pred)))
print('Accuracy on test subset is: \n\n{:.3f}'.format(accuracy_score(y_test,y_test_pred)))


Accuracy on training subset is: 

0.974
Accuracy on test subset is: 

0.959


In [28]:
print('Classification report on test subset : \n\n',classification_report(y_test,y_test_pred))

Classification report on test subset : 

              precision    recall  f1-score   support

          0       0.96      0.95      0.95       135
          1       0.95      0.96      0.96       102
          2       0.96      0.95      0.95        98
          3       0.99      0.99      0.99       134
          4       0.93      0.93      0.93        88

avg / total       0.96      0.96      0.96       557

