## Imports

In [22]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from tqdm import tqdm

## Classifiers & Features
Final classifiers not decided yet

In [47]:
classifiers = {'lg': LogisticRegression(), 'nb': MultinomialNB(), 'sgdc': SGDClassifier()}
features = {'count': None, 'tf': TfidfTransformer(use_idf=False), 'tfidf': TfidfTransformer()}

## Sets of parameters for Grid Search

In [75]:
parameters = [
            {
                'vect__lowercase': (True, False),
            },
            {
                'vect__stop_words': (None, 'english'),
            },
            {
                'vect__analyzer': ['word'],
                'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
            },
            {
                'vect__analyzer': ['char'],
                'vect__ngram_range': [(1,4), (2,2), (2,3)]
            },
            {
                'vect__max_features': (1000, 5000, 10000)
            }
]

In [76]:
hyperparameters = {
                  "lg": {
                              "clf__random_state": [1, 2],
                              "clf__max_iter": [100, 500, 1000]
                        },
                  "nb": {
                              "clf__alpha": [0.0001, 0.001, 0.01, 0.1, 1]
                        },
                  "sgdc": {
                              "clf__alpha": [0.0001, 0.001, 0.01, 0.1, 1]
                          }
                  }

## Combinations of pipelines

In [77]:
def build_pipelines(classifiers: dict, features: dict):
    pipelines = {}
    for clf_key in classifiers.keys():
        for feat_key in features.keys():
            clf = classifiers.get(clf_key)
            feat = features.get(feat_key)
            pipe = Pipeline([
                            ('vect', CountVectorizer()),
                            ('tfidf', feat),
                            ('clf', clf)
            ])
            pipelines[(clf_key, feat_key)] = pipe
    return pipelines

## Load train and test data

In [78]:
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, random_state=42)
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, random_state=42)

In [79]:
pipelines = build_pipelines(classifiers=classifiers, features=features)

## First Experiment
Test with all the default settings, each classifier with each of the parameters. No parameters

In [None]:
all_mean_acc = []
all_prf = []
all_conf = []
classifiers = []
features = []

for i in tqdm(range(len(pipelines.values()))):
    pipelines[i].fit(twenty_train.data, twenty_train.target)
    predicted = pipelines[i].predict(twenty_test.data)

    # mean accuracy
    mean_acc = np.mean(predicted == twenty_test.target)
    all_mean_acc.append(mean_acc)

    # precision, recall, f1 score
    prf = metrics.precision_recall_fscore_support(twenty_test.target, predicted, average='weighted')
    all_prf.append(prf)

    conf_matrix = metrics.confusion_matrix(twenty_test.target, predicted)
    all_conf.append(conf_matrix)

    classifiers.append(str(pipelines[i].named_steps['clf']))
    feat = pipelines[i].named_steps['tfidf']
    if feat is None:
      feat = "Counts"
    features.append(str(feat))

df_data = {"Classifier": classifiers,
            "Features": features,
            "Mean Accuracy": all_mean_acc
          }
results = pd.DataFrame(df_data)
prf_df = pd.DataFrame(all_prf, columns=["Precision", "Recall", "F1", "_"]).drop(columns=['_'])
results.merge(prf_df, left_index=True, right_index=True)


In [38]:
results.to_csv('results/experiment_default.csv', index=False)

## Hyperparameter Tuning on Pipelines

In [None]:
all_mean_acc = []
all_prf = []
best_pipelines = []
best_params = []
classifier_names = []
feature_names = []
all_cv_results = []
for key, pipe in tqdm(pipelines.items()):
    parameter_set = hyperparameters[key[0]]

    gs_clf = GridSearchCV(pipe, parameter_set, cv=5, n_jobs=-1)
    gs_clf.fit(twenty_train.data, twenty_train.target)
    predicted = gs_clf.predict(twenty_test.data)

    all_cv_results.append(gs_clf.cv_results_)
    
    mean_acc = np.mean(predicted == twenty_test.target)
    all_mean_acc.append(mean_acc)
    prf = metrics.precision_recall_fscore_support(twenty_test.target, predicted, average='weighted')
    all_prf.append(prf)

    best_estimator = gs_clf.best_estimator_
    best_pipelines.append(best_estimator)

    best_param = gs_clf.best_params_
    best_params.append(best_param)

    classifier_names.append(str(best_estimator.named_steps['clf']))
    feature_names.append(str(best_estimator.named_steps['tfidf']))

In [101]:
df_data = {"Classifier": classifier_names,
            "Features": feature_names,
            "Best Parameters": best_params,
            "Mean Accuracy": all_mean_acc
          }
results_exp2 = pd.DataFrame(df_data)
prf_df = pd.DataFrame(all_prf, columns=["Precision", "Recall", "F1", "_"]).drop(columns=['_'])
results_exp2.merge(prf_df, left_index=True, right_index=True)
results_exp2.to_csv('results/experiment2.csv')
results_exp2

Unnamed: 0,Classifier,Features,Best Parameters,Mean Accuracy
0,LogisticRegression(random_state=1),,"{'clf__max_iter': 100, 'clf__random_state': 1}",0.792087
1,LogisticRegression(random_state=1),TfidfTransformer(use_idf=False),"{'clf__max_iter': 100, 'clf__random_state': 1}",0.726633
2,LogisticRegression(random_state=1),TfidfTransformer(),"{'clf__max_iter': 100, 'clf__random_state': 1}",0.827403
3,MultinomialNB(alpha=0.0001),,{'clf__alpha': 0.0001},0.796601
4,MultinomialNB(alpha=0.001),TfidfTransformer(use_idf=False),{'clf__alpha': 0.001},0.832714
5,MultinomialNB(alpha=0.01),TfidfTransformer(),{'clf__alpha': 0.01},0.835236
6,SGDClassifier(alpha=0.01),,{'clf__alpha': 0.01},0.807223
7,SGDClassifier(),TfidfTransformer(use_idf=False),{'clf__alpha': 0.0001},0.810542
8,SGDClassifier(),TfidfTransformer(),{'clf__alpha': 0.0001},0.853956


## Third Experiment: Vectorizer Parameters
Uses best estimators from previous experiment

In [None]:
prf_params = []
mean_acc_params = []
classifier_names = []
feature_names = []
vect_params = []

for p in tqdm(range(len(best_pipelines))):
    for params in parameters:
        gs_clf = GridSearchCV(best_pipelines[p], params, cv=5, n_jobs=-1)
        gs_clf.fit(twenty_train.data, twenty_train.target)
        predicted = gs_clf.predict(twenty_test.data)

        mean_acc = np.mean(predicted == twenty_test.target)
        mean_acc_params.append(mean_acc)
        prf = metrics.precision_recall_fscore_support(twenty_test.target, predicted, average='weighted')
        prf_params.append(prf)

        vect_params.append(gs_clf.best_params_)
        classifier_names.append(str(best_estimator.named_steps['clf']))
        feature_names.append(str(best_estimator.named_steps['tfidf']))

In [None]:
df_data = {"Classifier": classifier_names,
            "Features": feature_names,
            "Vectorizer Parameters": vect_params,
            "Mean Accuracy": mean_acc_params
          }
results_exp3 = pd.DataFrame(df_data)
prf_params_df = pd.DataFrame(prf_params, columns=["Precision", "Recall", "F1", "_"]).drop(columns=['_'])
results_exp3.merge(prf_params_df, left_index=True, right_index=True)
results_exp3.to_csv('results/experiment3.csv')