## Imports

In [None]:
from copy import deepcopy
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from tqdm import tqdm

## Load train and test data

In [None]:
# Fetch both splits of the 20 newsgroups corpus with identical settings
# (shuffled, fixed seed): the training split first, then the test split.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split, shuffle=True, random_state=42)
    for split in ('train', 'test')
)

## Classifiers & Features

In [None]:
# Estimators under comparison, keyed by the short name that also indexes
# `hyperparameters` and labels the "Classifier" column of the result tables.
classifiers = dict(
    knn=KNeighborsClassifier(),
    nb=MultinomialNB(),
    sgdc=SGDClassifier(),
)

# Step placed after the CountVectorizer in each pipeline, keyed by the short
# name shown in the "Features" column. `None` means no extra transformer is
# supplied for plain counts.
features = dict(
    counts=None,
    tf=TfidfTransformer(use_idf=False),
    tfidf=TfidfTransformer(),
)

## Sets of parameters for Grid Search

In [None]:
# Vectorizer settings explored in the third experiment. Every dict is handed
# to GridSearchCV as a separate grid, so the groups are searched one at a time
# rather than combined with each other.
parameters = [
    # case folding on / off
    dict(vect__lowercase=(True, False)),
    # no stop-word removal vs. the built-in English list
    dict(vect__stop_words=(None, 'english')),
    # word n-grams
    dict(
        vect__analyzer=['word'],
        vect__ngram_range=[(1, 1), (1, 2), (2, 2)],
    ),
    # character n-grams
    dict(
        vect__analyzer=['char'],
        vect__ngram_range=[(2, 2), (3, 3), (4, 4)],
    ),
    # vocabulary size cap
    dict(vect__max_features=(1000, 5000, 10000)),
]

In [None]:
# Two of the k-NN neighbourhood sizes are derived from the corpus size:
# the (truncated) square root of the number of training documents, and
# half of that value.
doc_root_knn = int(np.sqrt(len(twenty_train.data)))
half_root_knn = doc_root_knn // 2

# Classifier grids for the second experiment, keyed like `classifiers`.
# The `clf__` prefix addresses the pipeline step named 'clf'.
hyperparameters = dict(
    knn={"clf__n_neighbors": [5, 11, 21, half_root_knn, doc_root_knn]},
    nb={"clf__alpha": [0.0001, 0.001, 0.01, 0.1, 1]},
    sgdc={"clf__alpha": [0.0001, 0.001, 0.01, 0.1, 1]},
)

## Combinations of pipelines

In [None]:
def build_pipelines(classifiers: dict, features: dict):
    """Build one text-classification pipeline per (classifier, feature) pair.

    Every pipeline has three steps: a default ``CountVectorizer`` named
    ``'vect'``, the feature step named ``'tfidf'`` and the classifier named
    ``'clf'``.

    Args:
        classifiers: Maps a short classifier name to an estimator.
        features: Maps a short feature name to the step placed after the
            vectorizer. Values are handed to ``Pipeline`` as given, so an
            entry may be ``None``.

    Returns:
        A dict mapping ``(classifier_name, feature_name)`` to its
        ``Pipeline``.
    """
    pipelines = {}
    for clf_key, clf in classifiers.items():
        for feat_key, feat in features.items():
            # Give each pipeline private copies of its steps. Placing the same
            # estimator object in several pipelines means fitting one pipeline
            # refits the object the others hold, so an earlier pipeline ends
            # up wrapping a model trained for a different feature setup.
            pipelines[(clf_key, feat_key)] = Pipeline([
                ('vect', CountVectorizer()),
                ('tfidf', deepcopy(feat)),
                ('clf', deepcopy(clf)),
            ])
    return pipelines

In [None]:
# One pipeline for every (classifier, feature) combination defined above.
pipelines = build_pipelines(classifiers, features)

## First Experiment
Test with all the default settings: each classifier is combined with each feature representation. No parameters are tuned.

In [None]:
# Baseline: fit every pipeline with its default settings on the training split
# and score it on the test split.
all_mean_acc = []
all_prf = []
all_conf = []
# Named like the lists in the later experiments. Reusing `classifiers` and
# `features` here would overwrite the dicts of the same name that
# build_pipelines() takes, so rebuilding the pipelines afterwards would fail.
classifier_names = []
feature_names = []

for (clf_name, feat_name), pipe in tqdm(pipelines.items()):
    pipe.fit(twenty_train.data, twenty_train.target)
    predicted = pipe.predict(twenty_test.data)

    # mean accuracy
    all_mean_acc.append(np.mean(predicted == twenty_test.target))

    # precision, recall, f1 score (weighted average); the fourth element of
    # the returned tuple is the support, which is dropped below
    all_prf.append(
        metrics.precision_recall_fscore_support(
            twenty_test.target, predicted, average='weighted'
        )
    )

    # collected per pipeline for inspection; not part of the CSV
    all_conf.append(metrics.confusion_matrix(twenty_test.target, predicted))

    classifier_names.append(clf_name)
    feature_names.append(feat_name)

results = pd.DataFrame({
    "Classifier": classifier_names,
    "Features": feature_names,
    "Mean Accuracy": all_mean_acc,
})
prf_df = pd.DataFrame(all_prf, columns=["Precision", "Recall", "F1", "_"]).drop(columns=['_'])
# Both frames carry the default positional index, so merging on it lines the
# rows up one-to-one.
results = results.merge(prf_df, left_index=True, right_index=True)
results = results.round(3)

# Create the output directory first so a missing folder cannot lose the run.
Path('results').mkdir(parents=True, exist_ok=True)
results.to_csv('results/experiment_default_knn.csv', index=False)
results


## Hyperparameter Tuning on Pipelines

In [None]:
# Second experiment: for every pipeline, grid-search the hyperparameters of
# its classifier with 5-fold cross-validation on the training split, then
# score the search's predictions on the test split.
all_mean_acc = []
all_prf_hyper = []
best_pipelines = {}  # (classifier, feature) -> best estimator; input to the third experiment
best_params = []
classifier_names = []
feature_names = []
all_cv_results = []
for key, pipe in tqdm(pipelines.items()):
    clf_name, feat_name = key
    # The grid depends only on the classifier, not on the feature step.
    parameter_set = hyperparameters[clf_name]

    gs_clf = GridSearchCV(pipe, parameter_set, cv=5, n_jobs=-1)
    gs_clf.fit(twenty_train.data, twenty_train.target)
    predicted = gs_clf.predict(twenty_test.data)

    all_cv_results.append(gs_clf.cv_results_)

    all_mean_acc.append(np.mean(predicted == twenty_test.target))
    # The fourth element of the returned tuple is the support; dropped below.
    all_prf_hyper.append(
        metrics.precision_recall_fscore_support(
            twenty_test.target, predicted, average='weighted'
        )
    )

    best_pipelines[key] = gs_clf.best_estimator_
    best_params.append(gs_clf.best_params_)

    classifier_names.append(clf_name)
    feature_names.append(feat_name)

results_exp2 = pd.DataFrame({
    "Classifier": classifier_names,
    "Features": feature_names,
    "Best Parameters": best_params,
    "Mean Accuracy": all_mean_acc,
})
prf_hyper_df = pd.DataFrame(all_prf_hyper, columns=["Precision", "Recall", "F1", "_"]).drop(columns=['_'])
# Both frames carry the default positional index, so merging on it lines the
# rows up one-to-one.
results_exp2 = results_exp2.merge(prf_hyper_df, left_index=True, right_index=True)
results_exp2 = results_exp2.round(3)

# Create the output directory first so a finished grid search is not lost to a
# missing folder.
Path('results').mkdir(parents=True, exist_ok=True)
# NOTE: unlike the first experiment, this CSV also contains the index column.
results_exp2.to_csv('results/experiment2_knn.csv')
results_exp2

## Third Experiment: Vectorizer Parameters
Uses the best estimators from the previous experiment.

In [None]:
# Third experiment: starting from each best pipeline of the second experiment,
# grid-search the vectorizer settings. Every entry of `parameters` is searched
# as its own grid, so each (pipeline, parameter group) pair yields one row.
prf_params = []
mean_acc_params = []
classifier_names = []
feature_names = []
vect_params = []

all_cv_results_params = []

for (clf_name, feat_name), pipe in tqdm(best_pipelines.items()):
    for params in parameters:
        gs_clf = GridSearchCV(pipe, params, cv=5, n_jobs=-1)
        gs_clf.fit(twenty_train.data, twenty_train.target)
        predicted = gs_clf.predict(twenty_test.data)

        mean_acc_params.append(np.mean(predicted == twenty_test.target))
        # The fourth element of the returned tuple is the support; dropped below.
        prf_params.append(
            metrics.precision_recall_fscore_support(
                twenty_test.target, predicted, average='weighted'
            )
        )

        vect_params.append(gs_clf.best_params_)
        classifier_names.append(clf_name)
        feature_names.append(feat_name)

        all_cv_results_params.append(gs_clf.cv_results_)

results_exp3 = pd.DataFrame({
    "Classifier": classifier_names,
    "Features": feature_names,
    "Vectorizer Parameters": vect_params,
    "Mean Accuracy": mean_acc_params,
})
prf_params_df = pd.DataFrame(prf_params, columns=["Precision", "Recall", "F1", "_"]).drop(columns=['_'])
# Both frames carry the default positional index, so merging on it lines the
# rows up one-to-one.
results_exp3 = results_exp3.merge(prf_params_df, left_index=True, right_index=True)
results_exp3 = results_exp3.round(3)

# Create the output directory first so a finished grid search is not lost to a
# missing folder.
Path('results').mkdir(parents=True, exist_ok=True)
# NOTE: unlike the first experiment, this CSV also contains the index column.
results_exp3.to_csv('results/experiment3_knn.csv')
results_exp3