# Execute the 20news experiment

In [1]:
import nltk
import warnings
import pandas as pd
import numpy as np

import sklearn.metrics as mtr
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer

# Fix NumPy's global random state so repeated runs start from the same seed.
SEED = 42
np.random.seed(SEED)

# Blanket filter: every warning category is suppressed for the rest of the session.
warnings.filterwarnings(action='ignore')

# Download the 20news dataset

In [2]:
# Load the 'train' subset first, then the 'test' subset, of 20 Newsgroups.
data_train, data_test = (
    fetch_20newsgroups(subset=split) for split in ('train', 'test')
)

# Set up the competing algorithms

In [3]:
# Candidate classifiers, each constructed with its default arguments.
# The keys double as the display names in the progress logs and the results table.
models = dict([
    ('Logistic Regression', LogisticRegression()),
    ('Support Vector Machine', SVC()),
    ('Multinomial Naive Bayes', MultinomialNB()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
])

## Prepare train and test sets

Tokenize the documents with `nltk.word_tokenize` and vectorize them with TF-IDF.

In [4]:
# `nltk.word_tokenize` relies on NLTK's Punkt tokenizer data, which nothing else
# in this notebook installs. Probe the tokenizer once and download the data only
# when the lookup fails, so the notebook also runs on a fresh environment.
try:
    nltk.word_tokenize("tokenizer availability probe")
except LookupError:
    # The resource name depends on the installed NLTK version, so request both.
    for resource in ('punkt', 'punkt_tab'):
        nltk.download(resource, quiet=True)

# token_pattern=None: the default token regex is not used once a custom
# tokenizer is supplied, so state that explicitly.
vectorizer = TfidfVectorizer(tokenizer=nltk.word_tokenize, token_pattern=None)

# Learn the vocabulary and IDF weights on the training documents only, then
# reuse the fitted vectorizer to transform the test documents.
X_train = vectorizer.fit_transform(data_train.data)
X_test = vectorizer.transform(data_test.data)
y_train, y_test = data_train.target, data_test.target

## Fit

In [5]:
# Fit every candidate on the same training matrix. Estimators are fitted in
# place, so `models` holds the trained instances afterwards.
for model_name, model in models.items():
    print("Training: {}".format(model_name))
    model.fit(X_train, y_train)

Training: Logistic Regression
Training: Support Vector Machine
Training: Multinomial Naive Bayes
Training: Decision Tree
Training: Random Forest
Training: K-Nearest Neighbors


## Predict

In [6]:
# Test-set predictions of each model, keyed by the model's display name.
predictions = {}
for model_name, model in models.items():
    print("Predicting: {}".format(model_name))
    y_pred = model.predict(X_test)
    predictions[model_name] = y_pred

Predicting: Logistic Regression
Predicting: Support Vector Machine
Predicting: Multinomial Naive Bayes
Predicting: Decision Tree
Predicting: Random Forest
Predicting: K-Nearest Neighbors


# Print evaluation metrics

In [10]:
# Build one summary row per model from scikit-learn's classification report,
# then assemble the rows into a DataFrame indexed by model name.
rows = []
for estimator, y_pred in predictions.items():
    report = mtr.classification_report(
        y_test, y_pred, output_dict=True, zero_division=0
    )
    macro, weighted = report['macro avg'], report['weighted avg']
    rows.append({
        'Model': estimator,
        'Accuracy': report['accuracy'],
        'Avg Precision (macro)': macro['precision'],
        'Avg Recall (macro)': macro['recall'],
        'Avg F1-score (macro)': macro['f1-score'],
        'Avg Precision (weighted)': weighted['precision'],
        'Avg Recall (weighted)': weighted['recall'],
        'Avg F1-score (weighted)': weighted['f1-score'],
    })
E = pd.DataFrame(rows).set_index('Model')

In [11]:
# Display the metrics table built from `predictions` (indexed by 'Model').
# NOTE(review): the saved output below also lists 'STC-Q' rows, but no visible cell
# adds such entries to `models` or `predictions`, and the execution counter jumps
# from In [6] to In [10]. Presumably they came from cells that are no longer in the
# notebook; re-run from a fresh kernel to confirm what this cell really shows.
E

Model,Accuracy,Avg Precision (macro),Avg Recall (macro),Avg F1-score (macro),Avg Precision (weighted),Avg Recall (weighted),Avg F1-score (weighted)
Logistic Regression,0.805497,0.807608,0.79548,0.796723,0.810022,0.805497,0.803939
Support Vector Machine,0.78837,0.798908,0.779376,0.78363,0.801205,0.78837,0.789875
Multinomial Naive Bayes,0.744158,0.82181,0.724686,0.724273,0.817013,0.744158,0.739516
Decision Tree,0.549389,0.545602,0.542891,0.543218,0.552772,0.549389,0.550044
Random Forest,0.738582,0.747812,0.726654,0.725027,0.747611,0.738582,0.732729
K-Nearest Neighbors,0.529474,0.598623,0.528026,0.538529,0.607524,0.529474,0.544031
STC-Q,0.863516,0.86304,0.855514,0.85554,0.865961,0.863516,0.861311
STC-Q (p=1/3),0.873208,0.871232,0.866238,0.866334,0.874176,0.873208,0.871496
