# Text classification with a Naive Bayes classifier

In [None]:
# Data loading
from sklearn.datasets import fetch_20newsgroups

#Preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer

#Model/Estimators
from sklearn.naive_bayes import MultinomialNB

#Pipeline utility
from sklearn.pipeline import Pipeline, make_pipeline

#Model evaluation
from sklearn.metrics import ConfusionMatrixDisplay

#Plotting
import matplotlib.pyplot as plt

## Dataset

In [None]:
# Fetch the 20 Newsgroups dataset using the loader's default arguments; this
# copy is only used to inspect the available category names.
data = fetch_20newsgroups()

In [None]:
# Show the names of the categories present in the fetched dataset.
data.target_names

There are 20 categories in the dataset. For simplicity, we will work with only four of them.

In [None]:
# The four newsgroups this notebook restricts itself to.
categories = [
    'talk.religion.misc',
    'soc.religion.christian',
    'sci.space',
    'comp.graphics',
]

# Load the train split, then the test split, each limited to `categories`.
train, test = (
    fetch_20newsgroups(subset=split_name, categories=categories)
    for split_name in ('train', 'test')
)

In [None]:
# Print one raw training document (index 5) to see what the input text looks like.
print(train.data[5])

## Data preprocessing and modelling

In [None]:
# Chain the TF-IDF vectorizer and the multinomial Naive Bayes classifier into
# one pipeline, so raw text can be passed straight to fit/predict.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [None]:
# Fit the whole pipeline on the training documents and their target labels.
model.fit(train.data, train.target)

## Model evaluation

In [None]:
# Plot the confusion matrix of the fitted pipeline on the test documents.
# An explicit Axes is created (instead of relying on implicit pyplot state)
# so the figure can carry a title and stand alone when the notebook is skimmed.
fig, ax = plt.subplots()
ConfusionMatrixDisplay.from_estimator(
    model,
    test.data,
    test.target,
    display_labels=test.target_names,
    xticks_rotation='vertical',  # vertical tick labels for the category names
    ax=ax,
)
ax.set_title('Confusion matrix on the test set')

plt.show()

## Using the Model

In [None]:
def predict_category(s, train=train, model=model):
    """Return the category name the model predicts for a single text.

    Parameters
    ----------
    s
        The text to classify. It is wrapped in a one-element list before
        being handed to ``model.predict``.
    train
        Object whose ``target_names`` sequence is indexed by the predicted
        label. Defaults to the notebook-level ``train``.
    model
        Estimator exposing ``predict``. Defaults to the notebook-level
        ``model``.

    Returns
    -------
    The entry of ``train.target_names`` at the predicted label.

    Notes
    -----
    The default values of ``train`` and ``model`` are bound when this cell
    is executed. If either name is reassigned in another cell, re-run this
    cell (or pass the new objects explicitly) to pick up the change.
    """
    # predict works on a batch, so submit a batch of one and take its only label.
    label_index = model.predict([s])[0]
    return train.target_names[label_index]


In [None]:
# Try the classifier on a short sentence about spaceflight.
predict_category('sending a payload to the ISS')

In [None]:
# Try the classifier on a short sentence about religion.
predict_category('discussing hinduism vs atheism')

In [None]:
# Try the classifier on a short sentence about computer displays.
# (The input previously misspelled "determining", so the vectorizer never saw
# the intended word.)
predict_category('determining the screen resolution')