In [None]:
import argparse
import requests
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.calibration import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import Perceptron, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from spacy.lang.en.stop_words import STOP_WORDS
from sentence_transformers import SentenceTransformer
import torch

*A remark about this notebook:* Because we used computing resources of the RWTH Aachen High Performance Computing Cluster, we did not use a single coherent notebook. Instead, we used scripts to submit jobs to the cluster, and then collected all methods and results in this notebook. This is why the notebook is not as coherent as it could be.

### Convenience functions

In [None]:
# Only needed when the hyperparameter search is run as a script on the HPC cluster
def parse_arguments():
    """Parse the command-line options of the hyperparameter-search script.

    Returns:
        argparse.Namespace whose ``model_id`` attribute holds the integer
        passed via ``-i`` / ``--model_id`` (``None`` if the option is omitted).
    """
    cli = argparse.ArgumentParser(description="finding hyperparameters")
    # On the cluster the value given here is the SLURM_ARRAY_TASK_ID
    cli.add_argument(
        "-i",
        "--model_id",
        type=int,
        help="ID specifying a model",
    )
    parsed = cli.parse_args()
    return parsed

In [None]:
def save_model(model, fname):
    """Serialize ``model`` with pickle into the file ``fname`` (overwriting it)."""
    with open(fname, 'wb') as handle:
        pickle.dump(model, handle)

def load_model(fname):
    """Return the object stored in the pickle file ``fname``.

    Unpickling can execute arbitrary code, so only use this on files that
    were written by this project.
    """
    with open(fname, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [None]:
# Tab-separated dataset files, resolved relative to the working directory
TRAIN_SET = Path(".") / "a3_train_final.tsv"
TEST_SET = Path(".") / "a3_test.tsv"

In [None]:
# Telegram settings, used to retrieve real-time updates from the cluster.
# The values below are placeholders; never commit a real bot token.
TELEGRAM_BOT_TOKEN = "1234"
TELEGRAM_CHAT_ID = "1234"

def send_telegram_message(text):
    """Send ``text`` to the configured Telegram chat via the Bot API.

    The message is a best-effort progress notification: a network problem or
    an HTTP error response is reported on stdout instead of being raised, so a
    failed notification cannot abort a long-running training job.

    Args:
        text: Message body. It is passed as a query parameter so ``requests``
            URL-encodes characters such as ``&``, ``#`` or newlines, and it is
            cut to the 4096 characters Telegram accepts per message.
    """
    max_message_length = 4096  # limit of the Bot API `sendMessage` method
    url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    try:
        response = requests.get(
            url,
            params={"chat_id": TELEGRAM_CHAT_ID, "text": str(text)[:max_message_length]},
            timeout=10,  # never hang the job on an unreachable API
        )
        response.raise_for_status()
    except requests.RequestException as error:
        # Only the error type is printed: the full message contains the request
        # URL and therefore the bot token.
        print(f"Telegram notification failed: {type(error).__name__}")

### Helper functions

In [None]:
# Human-readable names of the two numeric class labels (0 and 1)
label_dict = dict(enumerate(("anti-vaccination", "pro-vaccination")))

def remove_ambiguous_annotations(df):
    """Collapse multi-annotator labels and drop rows the annotators disagree on.

    Every value of the ``"Annotation"`` column is read as ``/``-separated
    integer votes (e.g. ``"1/1/0"``). A unanimous vote is replaced by that
    single integer; any disagreement is marked ``-1`` and the row is removed.
    The share of unanimous rows is printed as "Annotator agreement".

    Args:
        df: DataFrame with an ``"Annotation"`` column. It is modified in place.

    Returns:
        The same ``df`` object, without the ambiguous rows.
    """
    def check_ambiguity(annotation):
        # str() lets integer cells take the same path as "a/b/c" strings. That
        # covers a column pandas parsed as int and a frame that was already
        # processed by this function, so re-running the cell is harmless.
        votes = [int(vote) for vote in str(annotation).split("/")]
        # unanimous -> keep that label, otherwise flag the row as ambiguous
        return votes[0] if len(set(votes)) == 1 else -1

    # apply ambiguity check to every element
    df["Annotation"] = df["Annotation"].apply(check_ambiguity)

    # report annotator agreement (skipped for an empty frame: nothing to divide by)
    is_ambiguous = df["Annotation"] == -1
    total = len(df)
    if total:
        annotator_agreement = (total - int(is_ambiguous.sum())) / total
        print(f'Annotator agreement: {annotator_agreement:.3%}')

    # drop all ambiguous annotations
    df.drop(df[is_ambiguous].index, inplace=True)
    return df

def data_cleanser(df, col):
    """Replace every run of disallowed characters in ``df[col]`` by one space.

    Allowed characters are ASCII letters, the apostrophe and whitespace;
    everything else (digits, punctuation, emoji, ...) is collapsed run-by-run
    into a single space.

    Args:
        df: DataFrame to clean; column ``col`` is overwritten in place.
        col: Name of the text column.

    Returns:
        The same ``df`` object.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence
    df[col] = df[col].replace(r"[^A-Za-z'\s]+", ' ', regex=True)
    return df

def get_model_pipeline(model):
    """Return a two-step pipeline: a default TfidfVectorizer named ``"tfidf"``
    followed by ``model`` under the step name ``"clf"``."""
    vectorize_step = ("tfidf", TfidfVectorizer())
    classify_step = ("clf", model)
    return Pipeline(steps=[vectorize_step, classify_step])

def training_pipeline(X, Y, model, parameters):
    """Run a randomized hyperparameter search for ``model`` on TF-IDF features.

    Args:
        X: Training texts.
        Y: Training labels.
        model: Estimator used as the ``"clf"`` step of the pipeline.
        parameters: Parameter distributions keyed by pipeline parameter name
            (``tfidf__...`` / ``clf__...``).

    Returns:
        Tuple ``(best_estimator, best_params)`` of the fitted search.
    """
    search = RandomizedSearchCV(
        get_model_pipeline(model),
        param_distributions=parameters,
        n_jobs=-1,        # use every available core
        random_state=0,   # fixed seed for the sampled candidates
    )
    search.fit(X, Y)
    return search.best_estimator_, search.best_params_

### Hyperparameter dictionaries
The hyperparameter dictionaries are used to store the hyperparameter distributions for the models. The hyperparameters are sampled from these distributions to create the models.

In [None]:
## Vectorizer hyperparameters
parameters_tfidf = dict(
    # Whether to limit the maximum number of features
    tfidf__max_features=[None, 1000],

    # Whether features are built from word or character n-grams
    tfidf__analyzer=["word", "char"],

    # Smooth idf weights or not
    tfidf__smooth_idf=[True, False],

    # Range of n-values for the n-grams, here: unigrams only, or unigrams and bigrams
    tfidf__ngram_range=[(1, 1), (1, 2)],

    # Enable inverse-document-frequency reweighting or not
    tfidf__use_idf=[True, False],

    # Whether to remove stop words like "and", "the", "him"...
    tfidf__stop_words=[None, list(STOP_WORDS)],
)

## Model hyperparameters
# The dummy baseline has nothing to tune
parameters_dc = {}

parameters_random_forest = {
    # Number of trees in random forest
    'clf__n_estimators': list(range(200, 2001, 200)),

    # Number of features to consider when looking for the best split.
    # 'auto' is deliberately not listed: for classifiers it was an alias of
    # 'sqrt' and scikit-learn >= 1.3 rejects it, which makes the fit fail.
    'clf__max_features': ['sqrt', 'log2', None],

    # Maximum number of levels in tree
    'clf__max_depth': list(range(10, 111, 10)),

    # Minimum number of samples required to split a node
    'clf__min_samples_split': [2, 5, 10],

    # Minimum number of samples required at each leaf node
    'clf__min_samples_leaf': [1, 2, 4],

    # With or without replacement
    'clf__bootstrap': [True, False]
}

parameters_svc = {
    # Determine loss function
    'clf__loss': ["hinge", "squared_hinge"],

    # Regularization parameter - regularization is inversely proportional to C
    'clf__C': [0.1, 1, 10, 100, 1000],

    # Tolerance for stopping criteria
    'clf__tol': [1e-3, 1e-4, 1e-5]
}

parameters_knn = {
    # Determine number of neighbors
    'clf__n_neighbors': list(range(1, 31)),

    # Determine weight function used in prediction
    'clf__weights': ["uniform", "distance"],

    # Determine algorithm used to compute the nearest neighbors
    'clf__algorithm': ["auto", "ball_tree", "kd_tree", "brute"]
}

parameters_naive_b = {
    # Decide whether to learn class prior probabilities or not
    'clf__fit_prior': (True, False),

    # Additive smoothing parameter
    'clf__alpha': (0.5, 1.0)
}

parameters_perceptron = {
    # Learning rate (constant by which the updates are multiplied)
    'clf__eta0': (0.1, 0.01, 0.001),

    # Type of regularization penalty
    'clf__penalty': ["l2", "l1", "elasticnet"],

    # Strength of the regularization term
    'clf__alpha': (0.0001, 0.00001, 0.000001)
}

parameters_decision_tree = {
    # Maximum number of levels in tree
    'clf__max_depth': list(range(10, 111, 10)),

    # Minimum number of samples required to split a node
    'clf__min_samples_split': [2, 5, 10],

    # Minimum number of samples required at each leaf node
    'clf__min_samples_leaf': [1, 2, 4],

    # Number of features to consider when looking for the best split
    # ('auto' omitted, see parameters_random_forest)
    'clf__max_features': ['sqrt', 'log2', None]
}

parameters_logistic_regression = {
    # Inverse of the regularization strength
    'clf__C': [0.1, 1, 10, 100, 1000],

    # Norm used in the penalty term
    # NOTE(review): not every solver supports every penalty, and 'elasticnet'
    # additionally needs `l1_ratio`; unsupported combinations fail to fit and
    # are scored with the search's `error_score` - consider pruning them.
    'clf__penalty': ["l2", "l1", "elasticnet"],

    # Optimization algorithm
    'clf__solver': ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
}

parameters_gradient_boosting = {
    # Number of boosting stages
    'clf__n_estimators': [100, 200, 300, 400, 500],

    # Maximum depth of the individual estimators
    'clf__max_depth': [3, 4, 5, 6, 7],

    # Learning rate shrinks the contribution of each tree
    'clf__learning_rate': [0.1, 0.01, 0.001],

    # Subsample ratio of the training instance
    'clf__subsample': [0.7, 0.8, 0.9, 1.0],

    # Maximum number of features to consider for making splits
    # ('auto' omitted, see parameters_random_forest)
    'clf__max_features': ['sqrt', 'log2', None]
}

In [None]:
# Each model instance is mapped to the parameter distributions searched for it.
# The insertion order matters: models are later selected by their position in
# this mapping.
models_param_dict = dict([
    (DummyClassifier(), parameters_dc),
    (GradientBoostingClassifier(), parameters_gradient_boosting),
    (RandomForestClassifier(), parameters_random_forest),
    (Perceptron(), parameters_perceptron),
    (DecisionTreeClassifier(), parameters_decision_tree),
    (LogisticRegression(), parameters_logistic_regression),
    (MultinomialNB(), parameters_naive_b),
    (LinearSVC(), parameters_svc),
    (KNeighborsClassifier(), parameters_knn),
])

### Preprocessing
The preprocessing functions are used to preprocess the data before it is used to train the models. We have separate preprocessing for the TfIdf and the Sentence Transformer.

In [None]:
# Preprocessing for TfIdf

# --- Training split: unanimous annotations only, cleaned, then shuffled ---
train_data = pd.read_csv(TRAIN_SET, sep='\t', header=None)
train_data.columns = ["Annotation", "Comment"]
remove_ambiguous_annotations(train_data)  # works in place
print(f"{len(train_data)} training samples remain.")
data_cleanser(train_data, "Comment")      # works in place
data_shuffled = train_data.sample(frac=1.0, random_state=0)

Xtrain = data_shuffled["Comment"]
Ytrain = data_shuffled["Annotation"]


# --- Test split: same cleaning, but no annotation filtering ---
test_data = pd.read_csv(TEST_SET, sep='\t', header=None)
test_data.columns = ["Annotation", "Comment"]
print(f"{len(test_data)} test samples.")
data_cleanser(test_data, "Comment")
data_shuffled_test = test_data.sample(frac=1.0, random_state=0)

Xtest = data_shuffled_test["Comment"]
Ytest = data_shuffled_test["Annotation"]

In [None]:
# Preprocessing for Sentence Transformer

# Encoding takes quite some time, so the embeddings are computed once, pickled,
# and simply reloaded on every later run. Delete the two .pkl files to force a
# re-encoding (e.g. after the data or the text cleaning changed).
XTRAIN_EMB_FILE = Path("xtrain_embd.pkl")
XTEST_EMB_FILE = Path("xtest_embd.pkl")

if XTRAIN_EMB_FILE.exists() and XTEST_EMB_FILE.exists():
    # Fast path: reuse the serialized embeddings
    x_train_emb = load_model(XTRAIN_EMB_FILE)
    x_test_emb = load_model(XTEST_EMB_FILE)
else:
    # This is just to make it run on the GPU when one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = SentenceTransformer("all-mpnet-base-v2").to(device)
    x_train_emb = model.encode(list(Xtrain))
    x_test_emb = model.encode(list(Xtest))
    save_model(x_train_emb, XTRAIN_EMB_FILE)
    save_model(x_test_emb, XTEST_EMB_FILE)

# A cache written for a different dataset would silently misalign with the labels
assert len(x_train_emb) == len(Xtrain), "stale xtrain_embd.pkl - delete it and re-run"
assert len(x_test_emb) == len(Xtest), "stale xtest_embd.pkl - delete it and re-run"

### Model evaluation

In [None]:
# First: fit each model without tuning any parameters

# Shared default TF-IDF features for all baseline models
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(Xtrain)
X_test_tfidf = vectorizer.transform(Xtest)

# Test accuracy per (default-configured) model instance
result_accuracy = {}
for model in models_param_dict:
    predictions = model.fit(X_train_tfidf, Ytrain).predict(X_test_tfidf)
    result_accuracy[model] = accuracy_score(Ytest, predictions)

# Print the accuracies in order of best to worst classifier
sorted_accuracy = sorted(result_accuracy.items(), key=lambda item: item[1], reverse=True)
for clf, accuracy in sorted_accuracy:
    print(clf, accuracy)

# Model instances ordered from most to least accurate
sorted_default_models = [ranked[0] for ranked in sorted_accuracy]

In [None]:
# Second: Find the best hyperparameter combination for each model

def find_best_hyperparameters_tfidf(id):
    """Tune the ``id``-th model of ``models_param_dict`` on TF-IDF features.

    The vectorizer and classifier distributions are merged into one search
    space; the winning pipeline is scored on the test split and the result is
    reported via Telegram.

    Args:
        id: Position of the model within ``models_param_dict``.

    Returns:
        Tuple ``(best_pipeline, test_accuracy, best_params)``.
    """
    candidates = list(models_param_dict.keys())
    clf = candidates[id]
    print(f"Train {str(clf)}!")
    search_space = parameters_tfidf | models_param_dict[clf]
    best_clf, best_clf_params = training_pipeline(Xtrain, Ytrain, clf, search_space)
    print(f"{str(clf)} trained!")
    test_predictions = best_clf.predict(Xtest)
    clf_test_accuracy = accuracy_score(Ytest, test_predictions)
    send_telegram_message(f"{clf} [{clf_test_accuracy}]: {best_clf_params}")
    return (best_clf, clf_test_accuracy, best_clf_params)

def find_best_hyperparameters_mpnet(id):
    """Tune the ``id``-th model of ``models_param_dict`` on the sentence embeddings.

    The best estimator is pickled to ``<ModelName>_embd.pkl`` and start/result
    messages are sent via Telegram.

    Args:
        id: Position of the model within ``models_param_dict``.

    Returns:
        Tuple ``(best_pipeline, test_accuracy, best_params)``.
    """
    clf = list(models_param_dict.keys())[id]
    model_name = str(clf).partition('(')[0]
    # A one-step pipeline keeps the "clf__" prefix of the parameter dicts valid
    random_search = RandomizedSearchCV(
        Pipeline(steps=[("clf", clf)]),
        param_distributions=models_param_dict[clf],
        n_iter=5,        # adjust to the available computational resources
        n_jobs=-1,       # use all available CPU cores
        random_state=0,
    )
    send_telegram_message(f"{model_name}: start")
    random_search.fit(x_train_emb, Ytrain)
    best_clf, best_clf_params = random_search.best_estimator_, random_search.best_params_
    test_predictions = best_clf.predict(x_test_emb)
    clf_test_accuracy = accuracy_score(Ytest, test_predictions)
    save_model(best_clf, f"{model_name}_embd.pkl")
    send_telegram_message(f"{model_name} [{clf_test_accuracy}]: {best_clf_params}")
    return (best_clf, clf_test_accuracy, best_clf_params)

# Running the hyperparameter search here reproduces the best models on every
# notebook run. The search starts at position 1 of models_param_dict, so the
# first entry is not searched.
best_models = []
n_models = len(models_param_dict)
for i in range(1, n_models):
    best_models.append(find_best_hyperparameters_tfidf(i))

In [None]:
# Collection of the best found hyperparameter combinations
# Note the limitation that due to computational limitations, only one random seed was used which can lead to highly skewed results!
# Each entry is (estimator, test accuracy reported by the search, best parameters).
# Where the search selected stop-word removal, the value is written as
# list(STOP_WORDS): that is the only non-None candidate in parameters_tfidf, and
# TfidfVectorizer does not depend on the order of the stop words.
best_models = [
  (Perceptron(), 0.7984306032368809, {'tfidf__use_idf': True, 'tfidf__stop_words': None, 'tfidf__smooth_idf': True, 'tfidf__ngram_range': (1, 1), 'tfidf__max_features': None, 'tfidf__analyzer': 'word', 'clf__penalty': 'l1', 'clf__eta0': 0.1, 'clf__alpha': 1e-06}),
  (MultinomialNB(), 0.8734673859735165, {'tfidf__use_idf': True, 'tfidf__stop_words': None, 'tfidf__smooth_idf': True, 'tfidf__ngram_range': (1, 2), 'tfidf__max_features': None, 'tfidf__analyzer': 'word', 'clf__fit_prior': False, 'clf__alpha': 1.0}),
  (DecisionTreeClassifier(), 0.6949485041687101, {'tfidf__use_idf': False, 'tfidf__stop_words': list(STOP_WORDS), 'tfidf__smooth_idf': True, 'tfidf__ngram_range': (1, 1), 'tfidf__max_features': 1000, 'tfidf__analyzer': 'word', 'clf__min_samples_split': 5, 'clf__min_samples_leaf': 1, 'clf__max_features': 'log2', 'clf__max_depth': 110}),
  (DummyClassifier(), 0.49975478175576266, {'tfidf__use_idf': False, 'tfidf__stop_words': None, 'tfidf__smooth_idf': False, 'tfidf__ngram_range': (1, 2), 'tfidf__max_features': None, 'tfidf__analyzer': 'char'}),
  (LinearSVC(), 0.8847474252084355, {'tfidf__use_idf': False, 'tfidf__stop_words': None, 'tfidf__smooth_idf': True, 'tfidf__ngram_range': (1, 2), 'tfidf__max_features': None, 'tfidf__analyzer': 'word', 'clf__tol': 0.0001, 'clf__loss': 'squared_hinge', 'clf__C': 10}),
  (GradientBoostingClassifier(), 0.7827366356056891, {'tfidf__use_idf': True, 'tfidf__stop_words': None, 'tfidf__smooth_idf': True, 'tfidf__ngram_range': (1, 2), 'tfidf__max_features': 1000, 'tfidf__analyzer': 'word', 'clf__subsample': 0.8, 'clf__n_estimators': 500, 'clf__max_features': 'log2', 'clf__max_depth': 5, 'clf__learning_rate': 0.01}),
  (LogisticRegression(), 0.8916135360470819, {'tfidf__use_idf': True, 'tfidf__stop_words': None, 'tfidf__smooth_idf': False, 'tfidf__ngram_range': (1, 2), 'tfidf__max_features': None, 'tfidf__analyzer': 'word', 'clf__solver': 'saga', 'clf__penalty': 'l1', 'clf__C': 100}),
  (KNeighborsClassifier(), 0.8263854830799412, {'tfidf__use_idf': True, 'tfidf__stop_words': None, 'tfidf__smooth_idf': True, 'tfidf__ngram_range': (1, 1), 'tfidf__max_features': None, 'tfidf__analyzer': 'word', 'clf__weights': 'distance', 'clf__n_neighbors': 26, 'clf__algorithm': 'auto'}),
  (RandomForestClassifier(), 0.784698381559588, {'tfidf__use_idf': False, 'tfidf__stop_words': list(STOP_WORDS), 'tfidf__smooth_idf': True, 'tfidf__ngram_range': (1, 1), 'tfidf__max_features': None, 'tfidf__analyzer': 'word', 'clf__n_estimators': 1200, 'clf__min_samples_split': 5, 'clf__min_samples_leaf': 1, 'clf__max_features': 'log2', 'clf__max_depth': 50, 'clf__bootstrap': False}),
]

# Order the classifiers from best to worst test accuracy
best_models = sorted(best_models, key=lambda x: x[1], reverse=True)

### Generating confusion matrices

In [None]:
# Confusion matrices
def generate_confusion_matrices(id):
    """Refit one tuned model and export its confusion matrix and error samples.

    The ``id``-th entry of ``best_models`` is rebuilt as a TF-IDF pipeline with
    its stored parameters, fitted on the training split and pickled to
    ``<ModelName>.pkl``. The confusion matrix on the test split is saved as
    ``<ModelName>.pdf`` and shown; up to ten misclassified samples are printed
    and written to ``<ModelName>.txt``.

    Args:
        id: Position of the model within ``best_models`` (which is sorted by
            accuracy, so this is NOT the position in ``models_param_dict``).
    """
    clf, clf_score, clf_params = best_models[id]
    model_name = str(clf).split('(')[0]

    pipe = get_model_pipeline(clf)
    pipe.set_params(**clf_params)
    pipe.fit(Xtrain, Ytrain)
    save_model(pipe, f"{model_name}.pkl")
    Ypred = pipe.predict(Xtest)

    cm = confusion_matrix(Ytest, Ypred)
    class_names = [label_dict[0], label_dict[1]]
    fig = plt.figure(figsize=(6,6))
    # The cells hold sample counts, hence the integer format
    sns.heatmap(cm, annot=True, fmt="d", linewidths=.5, square=True, cmap='OrRd', xticklabels=class_names, yticklabels=class_names)
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')
    # The title shows the accuracy stored in best_models, not one recomputed here
    all_sample_title = f'{model_name}\nAccuracy Score: {clf_score:.4f}'
    plt.title(all_sample_title, size=15)
    # Save BEFORE show(): showing releases the figure, so a later savefig()
    # would write an empty canvas.
    plt.savefig(Path(f"./{model_name}.pdf"), bbox_inches='tight')
    plt.show()
    plt.close(fig)  # this function runs in a loop; do not accumulate figures

    # Display a few samples that were wrongly classified
    is_wrong = np.asarray(Ytest != Ypred)
    misclassified_samples = Xtest[is_wrong][:10]
    misclassified_labels = Ytest[is_wrong][:10]
    misclassified_predictions = Ypred[is_wrong][:10]
    with open(Path(f"./{model_name}.txt"), "w", encoding="utf-8") as f:
        for sample, label, predicted in zip(misclassified_samples, misclassified_labels, misclassified_predictions):
            # Report what the model actually predicted instead of inferring it
            # as "the other class"
            s = f"Sample: {sample}\nClassified as: {label_dict[int(predicted)]}\nTrue Label: {label_dict[int(label)]}\n\n"
            print(s)
            f.write(s)


# generate_confusion_matrices() indexes best_models (sorted best-first), so the
# loop has to cover every position of that list - starting at 1 would skip the
# best-performing model.
for i in range(len(best_models)):
  generate_confusion_matrices(i)

In [None]:
# When performing the hyperparameter search on the RWTH Aachen High Performance Computing cluster, we submit a SLURM Array Job and use the task ID as a model identifier
# Note that we would use a Python script, not a notebook. Thus, also some part of this notebook might seem a bit unorganized and cluttered. But as a matter of fact, we never used one single notebook to run our code because it was infeasible to do so.
if __name__ == "__main__":
  args = parse_arguments()
  if args.model_id is None:
    raise SystemExit("Please pass the model to train via -i/--model_id")
  searched_clf = list(models_param_dict.keys())[args.model_id]
  _ = find_best_hyperparameters_tfidf(args.model_id)
  # model_id is a position in models_param_dict, whereas
  # generate_confusion_matrices() expects a position in the accuracy-sorted
  # best_models list. Translate by classifier type so both steps use the same model.
  matching_ids = [i for i, entry in enumerate(best_models) if type(entry[0]) is type(searched_clf)]
  if not matching_ids:
    raise LookupError(f"No entry for {searched_clf} in best_models")
  generate_confusion_matrices(matching_ids[0])

### Extracting important features

In [None]:
# Rebuild the tuned decision-tree pipeline and list its most important features.
# stop_words is list(STOP_WORDS): the pasted word list was exactly that value
# (the only non-None stop-word candidate of the search), and the vectorizer
# does not depend on the order of the words.
pipeline = Pipeline(steps = [("tfidf",
    TfidfVectorizer(
        use_idf=False,
        stop_words=list(STOP_WORDS),
        smooth_idf=True,
        ngram_range=(1, 1),
        max_features=1000,
        analyzer='word'
    )), ("clf",
    DecisionTreeClassifier(
        min_samples_split=5,
        min_samples_leaf=1,
        max_features='log2',
        max_depth=110
    ))])


pipeline.fit(Xtrain, Ytrain)

# Pair every vocabulary term with the importance the fitted tree assigns to it
feature_names = pipeline.named_steps['tfidf'].get_feature_names_out()
feature_importances = pipeline.named_steps['clf'].feature_importances_
feature_importance_dict = dict(zip(feature_names, feature_importances))
top_ten_importances = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)[:10]
for feature, importance in top_ten_importances:
    print(f"Feature: {feature}, Importance: {importance}")

In [None]:
# Rebuild the tuned logistic-regression pipeline and list, per class, the ten
# terms with the largest weights.
pipeline = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(
        use_idf=True,
        stop_words=None,
        smooth_idf=False,
        ngram_range=(1, 2),
        max_features=None,
        analyzer='word',
    )),
    ("clf", LogisticRegression(solver='saga', penalty='l1', C=100)),
])

pipeline.fit(Xtrain, Ytrain)

feature_names = pipeline.named_steps['tfidf'].get_feature_names_out()

# Only the first coefficient row is read; its negation is used as the
# weights in favour of class 0.
coefficients_class_1 = pipeline.named_steps['clf'].coef_[0]
coefficients_class_0 = -coefficients_class_1


def _weight_of(pair):
    """Sort key: the weight of a (feature, weight) pair."""
    return pair[1]


# Ten highest weights and corresponding words for class 1
top_ten_weights_class_1 = sorted(zip(feature_names, coefficients_class_1), key=_weight_of, reverse=True)[:10]

# Ten highest weights and corresponding words for class 0
top_ten_weights_class_0 = sorted(zip(feature_names, coefficients_class_0), key=_weight_of, reverse=True)[:10]

for heading, ranking in (
    ("Top features for Class 1:", top_ten_weights_class_1),
    ("\nTop features for Class 0:", top_ten_weights_class_0),
):
    print(heading)
    for feature, weight in ranking:
        print(f"Feature: {feature}, Weight: {weight}")

