In [8]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd


categories = ['sci.space', 'alt.atheism']
data = fetch_20newsgroups(subset='all', categories=categories)

df = pd.DataFrame({'review': data.data, 'target': data.target})

df['review'] = df['review'].str.replace(r'[^a-zA-Z\s]', '', regex=True).str.lower()

# 4. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['target'], test_size=0.2, random_state=42)

# 5. Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=2000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# 6. Model Building
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

# 7. Prediction & Results
y_pred = model.predict(X_test_tfidf)
print(f"Model Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 98.60%

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.97      0.98       157
           1       0.98      1.00      0.99       201

    accuracy                           0.99       358
   macro avg       0.99      0.98      0.99       358
weighted avg       0.99      0.99      0.99       358

