In [1]:
# Block 1: Imports and Load Data
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, accuracy_score

# Fetch the 'train' and then the 'test' subset of 20 Newsgroups with identical
# settings; `remove` asks the loader to strip headers, footers and quoted
# replies from every post.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split, remove=('headers', 'footers', 'quotes'))
    for split in ('train', 'test')
)

# Pull documents (.data) and labels (.target) out of each fetched subset.
X_train, y_train = newsgroups_train.data, newsgroups_train.target
X_test, y_test = newsgroups_test.data, newsgroups_test.target

# Category names, used later to label the report and example predictions.
target_names = newsgroups_train.target_names

In [3]:
# Block 2: Build and Train Pipeline (TF-IDF + Naive Bayes)
# Both steps use their default settings; the pipeline feeds the vectorizer's
# output into the classifier, so fit/predict accept the raw documents directly.
tfidf_step = TfidfVectorizer()
naive_bayes_step = MultinomialNB()
model = make_pipeline(tfidf_step, naive_bayes_step)

# Fit on the training split only; the test split is kept for evaluation.
model.fit(X_train, y_train)

In [4]:
# Block 3: Predict and Evaluate
# Label every test document with the fitted pipeline, then compare the
# predictions against the true test labels.
y_pred = model.predict(X_test)

# Per-category report (rows labelled via target_names) and overall accuracy.
report = classification_report(y_test, y_pred, target_names=target_names)
accuracy = accuracy_score(y_test, y_pred)

# Accuracy is shown to four decimals, followed by a blank line and the report.
print(f"Accuracy: {accuracy:.4f}\n")
print("Classification Report:", report, sep="\n")

Accuracy: 0.6062

Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.81      0.07      0.13       319
           comp.graphics       0.72      0.62      0.67       389
 comp.os.ms-windows.misc       0.70      0.50      0.59       394
comp.sys.ibm.pc.hardware       0.55      0.75      0.64       392
   comp.sys.mac.hardware       0.81      0.61      0.69       385
          comp.windows.x       0.83      0.74      0.78       395
            misc.forsale       0.86      0.69      0.77       390
               rec.autos       0.82      0.68      0.74       396
         rec.motorcycles       0.89      0.63      0.73       398
      rec.sport.baseball       0.95      0.69      0.80       397
        rec.sport.hockey       0.59      0.90      0.71       399
               sci.crypt       0.47      0.80      0.59       396
         sci.electronics       0.77      0.43      0.55       393
                 sci.med       0.8

In [5]:
# Block 4: Example Prediction
def predict_category(s, train=newsgroups_train, model=model):
    """Return the category name that `model` predicts for one piece of text.

    Parameters
    ----------
    s : the text to classify; it is wrapped in a one-element list before
        being handed to ``model.predict``.
    train : object whose ``target_names`` sequence is indexed by the
        predicted label. Defaults to ``newsgroups_train``.
    model : object exposing ``predict``. Defaults to the notebook's ``model``.

    Returns
    -------
    The entry of ``train.target_names`` at the predicted label.

    Note
    ----
    The default values for ``train`` and ``model`` are bound when this cell
    runs, not on each call. After refitting or replacing ``model``, re-run
    this cell or pass the new objects explicitly.
    """
    # predict() is given a single-item batch, so only the first label is used.
    label_index = model.predict([s])[0]
    return train.target_names[label_index]

example_text = "Sending graphics card drivers update soon"
example_text_2 = "The final frontier, exploring space and planets"

# Same two-line report for every sample: echo the text (preceded by a blank
# line), then the category name returned by predict_category.
for sample in (example_text, example_text_2):
    print(f"\nExample text: '{sample}'")
    print(f"Predicted category: {predict_category(sample)}")


Example text: 'Sending graphics card drivers update soon'
Predicted category: comp.os.ms-windows.misc

Example text: 'The final frontier, exploring space and planets'
Predicted category: sci.space
