<a href="https://colab.research.google.com/github/Zh5rakib/data-science-portfolio/blob/main/05-text-classification-20newsgroups.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Project 5: NLP — Text Classification (20 Newsgroups subset)
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

categories = ["sci.space", "rec.sport.hockey", "talk.politics.misc"]
train = fetch_20newsgroups(subset="train", categories=categories, remove=("headers","footers","quotes"))
test = fetch_20newsgroups(subset="test", categories=categories, remove=("headers","footers","quotes"))

clf = Pipeline([
    ("tfidf", TfidfVectorizer(stop_words="english", max_df=0.9)),
    ("svm", LinearSVC())
])

clf.fit(train.data, train.target)
pred = clf.predict(test.data)

print(f"Accuracy: {accuracy_score(test.target, pred):.3f}")
print("\nClassification report:\n")
print(classification_report(test.target, pred, target_names=categories))

samples = [
    "The rocket successfully entered orbit around Mars.",
    "The referee missed an obvious offside call in the third period.",
    "The new tax bill sparked debate in Congress."
]
pred_ids = clf.predict(samples)
for s, i in zip(samples, pred_ids):
    print(f"\nText: {s}\nPredicted class: {categories[i]}")

Accuracy: 0.899

Classification report:

                    precision    recall  f1-score   support

         sci.space       0.95      0.93      0.94       399
  rec.sport.hockey       0.85      0.93      0.89       394
talk.politics.misc       0.90      0.83      0.86       310

          accuracy                           0.90      1103
         macro avg       0.90      0.89      0.90      1103
      weighted avg       0.90      0.90      0.90      1103


Text: The rocket successfully entered orbit around Mars.
Predicted class: rec.sport.hockey

Text: The referee missed an obvious offside call in the third period.
Predicted class: sci.space

Text: The new tax bill sparked debate in Congress.
Predicted class: talk.politics.misc
