## Imports

In [None]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from tqdm import tqdm

## Classifiers & Features
Final classifiers not decided yet

In [None]:
# Candidate models, keyed by a short name.
classifiers = dict(
    perceptron=Perceptron(),
    naive_bayes=MultinomialNB(),
    sgdc=SGDClassifier(),
)

# Feature weightings applied after the vectorizer, keyed by a short name.
# 'count' maps to None, i.e. no transformer object is supplied for it.
features = dict(
    count=None,
    tf=TfidfTransformer(use_idf=False),
    tfidf=TfidfTransformer(),
)

## Sets of parameters for Grid Search
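Split into three grids: the first leaves the analyzer and n-gram range at their defaults, while the word and character analyzers each need their own set of n-gram ranges.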

In [None]:
# Settings that every grid explores, split in two so that the analyzer-specific
# entries can be placed between them (keeps the key order of each grid stable).
_case_and_stop_words = {
    'vect__lowercase': (True, False),
    'vect__stop_words': (None, 'english'),
}
_vocabulary_sizes = {
    'vect__max_features': (1000, 5000, 10000),
}

# Three grids for the 'vect' step: one without analyzer/n-gram entries,
# one for word n-grams and one for character n-grams.
parameters = [
    {
        **_case_and_stop_words,
        **_vocabulary_sizes,
    },
    {
        **_case_and_stop_words,
        'vect__analyzer': ['word'],
        'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
        **_vocabulary_sizes,
    },
    {
        **_case_and_stop_words,
        'vect__analyzer': ['char'],
        'vect__ngram_range': [(1, 2), (1, 3), (1, 4)],
        **_vocabulary_sizes,
    },
]

## Combinations of pipelines

In [None]:
def build_pipelines(classifiers: dict, features: dict) -> list:
    """Build one pipeline for every (classifier, feature) combination.

    Each pipeline has three steps: a fresh ``CountVectorizer`` named
    ``'vect'``, the feature transformer named ``'tfidf'`` and the classifier
    named ``'clf'``.

    Every pipeline receives its own unfitted copy of the classifier and the
    transformer. Reusing the instances from the input dicts would make all
    pipelines that share a classifier (or transformer) share its fitted
    state, so fitting one pipeline would silently overwrite another's model.

    Args:
        classifiers: Mapping of name -> classifier estimator.
        features: Mapping of name -> transformer, or ``None`` when no
            transformer should be used for that entry.

    Returns:
        A list of ``Pipeline`` objects ordered classifier-major: all features
        for the first classifier, then all features for the second, and so on.
    """
    # Local import: only this function needs it.
    from sklearn.base import clone

    def _fresh(step):
        # Placeholders such as None are not estimators and cannot be cloned;
        # hand them through unchanged.
        return clone(step) if hasattr(step, 'get_params') else step

    return [
        Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', _fresh(feat)),
            ('clf', _fresh(clf)),
        ])
        for clf in classifiers.values()
        for feat in features.values()
    ]

## Load train and test data

In [None]:
# Fetch the train split first, then the test split, both shuffled with the
# same fixed seed.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split, shuffle=True, random_state=42)
    for split in ('train', 'test')
)

In [None]:
# One pipeline per (classifier, feature) pair from the dicts defined above.
pipelines = build_pipelines(classifiers, features)

## Test for time (No need to run)
A small test of how long `.fit()` takes when only the first pipeline is grid-searched over each parameter set.<br>
First parameter set: 29.36s<br>
Second parameter set: 141.29s<br>
Third parameter set: 668.34s<br>
Total run time: 13m 59s

In [None]:
import time

# Time a complete grid search for each parameter grid, using only the first
# pipeline, to estimate how expensive the full experiment will be.
for params in parameters:
    gs = GridSearchCV(pipelines[0], params, cv=5, n_jobs=-1)
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be distorted by system clock adjustments the way a
    # time.time() difference can.
    start_time = time.perf_counter()
    gs.fit(twenty_train.data, twenty_train.target)
    end_time = time.perf_counter()
    total_time = end_time - start_time
    print("Current parameters", params)
    print(f"Total runtime is {total_time}")

## First Experiment
Test with all the default settings: each classifier is combined with each feature representation. No grid-search parameters are used.

In [None]:
# Results are appended in the same order as `pipelines`.
all_mean_acc = []
all_prf = []

# Iterate over the pipelines directly instead of indexing by position.
for pipe in tqdm(pipelines):
    pipe.fit(twenty_train.data, twenty_train.target)
    predictions = pipe.predict(twenty_test.data)

    # mean accuracy: fraction of test documents whose prediction equals the target
    mean_acc = np.mean(predictions == twenty_test.target)
    all_mean_acc.append(mean_acc)

    # precision, recall, f1 score (weighted average over the classes)
    prf = metrics.precision_recall_fscore_support(twenty_test.target, predictions, average='weighted')
    all_prf.append(prf)

    # do we need more metrics? For example: metrics.classification_report which gives for each class f1, precision & recall?

## Second Experiment: All combinations with all parameters
Needs more code to gather results each time

In [None]:
# One record per (pipeline, parameter grid) search. Previously every search
# result was overwritten by the next iteration and lost.
grid_results = []

# `pipe` stays the positional index of the pipeline, as before; enumerate()
# wraps tqdm() so the progress bar still knows the total number of pipelines.
for pipe, pipeline in enumerate(tqdm(pipelines)):
    for grid_idx, params in enumerate(parameters):
        gs_clf = GridSearchCV(pipeline, params, cv=5, n_jobs=-1)
        gs_clf.fit(twenty_train.data, twenty_train.target)
        grid_results.append({
            'pipeline': pipe,                      # index into `pipelines`
            'grid': grid_idx,                      # index into `parameters`
            'best_score': gs_clf.best_score_,      # best mean cross-validated score
            'best_params': gs_clf.best_params_,    # setting that achieved it
        })