In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# Fetch the 20 Newsgroups corpus.
# `categories` is the filter handed to the loader: None keeps every
# category, a list of category names restricts the load to those groups.
categories = None
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    # Ask the loader to strip headers, footers and quoted replies from each post.
    remove=('headers', 'footers', 'quotes'),
)

In [3]:
# Features are the loader's `data` field and labels its `target` field.
X, y = newsgroups.data, newsgroups.target

In [4]:
# Peek at the label vector; as the cell's last expression its repr is displayed.
y

array([10,  3, 17, ...,  3,  1,  7])

In [5]:
# Split dataset
# Hold out 20% of the documents for final evaluation. The split is
# stratified on the labels and seeded so it is identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
# Candidate classifiers and the hyperparameter grid to search for each one.
# Every entry maps a display name to:
#   "model"  - an unfitted estimator instance
#   "params" - the grid for GridSearchCV; keys carry the `clf__` prefix
#              because the estimator is mounted as the 'clf' pipeline step.
# Estimators that draw random numbers are seeded with 42 (the same seed as
# the train/test split) so a fresh Restart & Run All reproduces the scores.
models = {
    "MultinomialNB": {
        "model": MultinomialNB(),
        "params": {
            'clf__alpha': [0.1, 0.5, 1.0]
        }
    },
    "Logistic Regression": {
        # random_state pins the data shuffling scikit-learn documents for liblinear.
        "model": LogisticRegression(max_iter=1000, solver='liblinear', random_state=42),
        "params": {
            'clf__C': [0.1, 1, 10]
        }
    },
    "Random Forest": {
        # Without a seed the bootstrap samples and feature subsets differ on
        # every run, so both the chosen parameters and the accuracy would drift.
        "model": RandomForestClassifier(random_state=42),
        "params": {
            'clf__n_estimators': [50, 100],
            'clf__max_depth': [None, 10]
        }
    },
    "SVM": {
        "model": SVC(),
        "params": {
            'clf__C': [0.1, 1, 10],
            'clf__kernel': ['linear', 'rbf']
        }
    }
}

In [7]:
# Grid-search every candidate with 3-fold CV on the training split, keep the
# best estimator found for each, and report how it does on the test split.
best_models = {}

for name, mp in models.items():
    print(f"\n Tuning {name}...")

    # The vectorizer lives inside the pipeline, so the grid search tunes and
    # scores TF-IDF + classifier as one estimator.
    pipeline = Pipeline(steps=[
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', mp['model']),
    ])

    grid = GridSearchCV(
        estimator=pipeline,
        param_grid=mp['params'],
        scoring='accuracy',
        cv=3,
        n_jobs=-1,
    )
    grid.fit(X_train, y_train)
    best_models[name] = grid.best_estimator_

    # Evaluate the fitted search on the held-out documents.
    y_pred = grid.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(
        y_test,
        y_pred,
        target_names=newsgroups.target_names,
    )

    # One print call, newline-separated: same stdout as printing each line.
    print(
        f" Best Parameters: {grid.best_params_}",
        f" Accuracy: {acc:.4f}",
        " Classification Report:",
        report,
        "-" * 60,
        sep="\n",
    )


 Tuning MultinomialNB...
 Best Parameters: {'clf__alpha': 0.1}
 Accuracy: 0.7653
 Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.76      0.44      0.56       160
           comp.graphics       0.75      0.74      0.75       195
 comp.os.ms-windows.misc       0.79      0.68      0.73       197
comp.sys.ibm.pc.hardware       0.69      0.83      0.75       196
   comp.sys.mac.hardware       0.85      0.74      0.79       193
          comp.windows.x       0.87      0.88      0.88       198
            misc.forsale       0.83      0.74      0.78       195
               rec.autos       0.83      0.80      0.82       198
         rec.motorcycles       0.87      0.73      0.80       199
      rec.sport.baseball       0.94      0.85      0.89       199
        rec.sport.hockey       0.58      0.95      0.72       200
               sci.crypt       0.81      0.87      0.84       198
         sci.electronics       0.84