# Comparisons File

For our two real datasets, let's go through the different combinations of pipeline and compare the results.

## Setup

Import the library components, construct the datasets, and get the lists of methods.

In [1]:
# imports
from nlp_pipelines.pipeline import Pipeline
from nlp_pipelines.evaluate import evaluate
from nlp_pipelines.dataset import Dataset

In [2]:
# datasets

newsgroups = Dataset.from_parquet("./demo_data/sample_5_newsgroup_text.parquet", text_field="text", truth_field="label_text")

abstracts = Dataset.from_json("./demo_data/springer-127-parsed.json", text_field="abstract", truth_field="keywords")

In [3]:
# get the lists of methods

# vectorizers
from nlp_pipelines.vectorizer import __all__ as vectorizers

# classifiers
from nlp_pipelines.classifier import __all__ as classifiers

# clusterers
from nlp_pipelines.clusterer import __all__ as clusterers

# labelers
from nlp_pipelines.labeler import __all__ as labelers

# preprocesors
from nlp_pipelines.preprocess import __all__ as preprocessors
preprocessors.append(None)

  from .autonotebook import tqdm as notebook_tqdm


## Clustering

Clustering is assigning each datapoint to a cluster, where the cluster meaning is not predefined.

In [4]:
cluster_test_results = {}

# combinations of preprocessor or none, vectorizer, clusterer
import itertools
import copy

combinations = itertools.product(preprocessors, vectorizers, clusterers) # pick one from each list (including None for preprocess)

# same test, train, val split
cluster_test, cluster_train = newsgroups.split(ratio=0.8, labeled=True, splitLabeled=True)
for x in combinations:
    preproc, vec, method = x
    key = f"{str(preproc)}.{vec}.{method}"
    try:
        m = {"name": "cluster", "method": f"clusterer.{method}"}
        if method in ['Kmeans, GraphAffinity']:
            m['params'] = {'num_clusters': 5}
        pipeline_list = [
            {"name": "vectorize", "method": f"vectorizer.{vec}"},
            m
        ]
        if preproc is not None:
            pipeline_list = [{"name": "preprocess", "method": f"preprocess.{preproc}"}] + pipeline_list
        # construct pipeline
        pipeline = Pipeline(pipeline_list)
        # copy data
        train = copy.deepcopy(cluster_train)
        test = copy.deepcopy(cluster_test)
        # set data
        pipeline.set_data(train_data=train, run_data=test)
        # run
        pipeline.run()
        # evaluate
        cluster_test_results[key] = evaluate(pipeline.run_data, cluster_mode=True)
    except BaseException as e:
        print(f"ERROR in {key}", e)



ERROR in Lemmatize.Tfidf.UmapHdbscan Mix of label input types (string and number)


Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  131569 lr:  0.000000 avg.loss:  2.705948 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  131374 lr:  0.000000 avg.loss:  2.704229 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  133761 lr:  0.000000 avg.loss:  2.668200 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  132890 lr:  0.000000 avg.loss:  2.679577 ETA:   0h 0m 0s
  w = np.where(isolated_node_mask, 1, np.sqrt(w))


ERROR in Lemmatize.SentenceEmbedding.GraphAffinity Input contains NaN.
ERROR in PosRemoval.Tfidf.UmapHdbscan Mix of label input types (string and number)




ERROR in PosRemoval.BagOfWords.UmapHdbscan Mix of label input types (string and number)


Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  131899 lr:  0.000000 avg.loss:  2.697429 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  131688 lr:  0.000000 avg.loss:  2.640370 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  132664 lr:  0.000000 avg.loss:  2.700014 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  132222 lr:  0.000000 avg.loss:  2.693625 ETA:   0h 0m 0s
  w = np.where(isolated_node_mask, 1, np.sqrt(w))


ERROR in PosRemoval.SentenceEmbedding.GraphAffinity Input contains NaN.


Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  133236 lr:  0.000000 avg.loss:  2.652409 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  131165 lr:  0.000000 avg.loss:  2.643854 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  133768 lr:  0.000000 avg.loss:  2.692045 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  131222 lr:  0.000000 avg.loss:  2.690133 ETA:   0h 0m 0s
  w = np.where(isolated_node_mask, 1, np.sqrt(w))


ERROR in Stem.SentenceEmbedding.GraphAffinity Input contains NaN.




ERROR in StopwordRemove.BagOfWords.UmapHdbscan Mix of label input types (string and number)


Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  131763 lr:  0.000000 avg.loss:  2.689323 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  131451 lr:  0.000000 avg.loss:  2.687964 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  132880 lr:  0.000000 avg.loss:  2.689950 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  131047 lr:  0.000000 avg.loss:  2.702494 ETA:   0h 0m 0s
  w = np.where(isolated_node_mask, 1, np.sqrt(w))


ERROR in StopwordRemove.SentenceEmbedding.GraphAffinity Input contains NaN.




ERROR in TokenFilter.BagOfWords.UmapHdbscan Mix of label input types (string and number)


Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  134036 lr:  0.000000 avg.loss:  2.695890 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  131145 lr:  0.000000 avg.loss:  2.697594 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  131123 lr:  0.000000 avg.loss:  2.673788 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  132034 lr:  0.000000 avg.loss:  2.682243 ETA:   0h 0m 0s
  w = np.where(isolated_node_mask, 1, np.sqrt(w))


ERROR in TokenFilter.SentenceEmbedding.GraphAffinity Input contains NaN.
ERROR in None.Tfidf.UmapHdbscan Mix of label input types (string and number)




ERROR in None.BagOfWords.UmapHdbscan Mix of label input types (string and number)


Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  132985 lr:  0.000000 avg.loss:  2.675301 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  130367 lr:  0.000000 avg.loss:  2.691245 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  128791 lr:  0.000000 avg.loss:  2.700564 ETA:   0h 0m 0s
Read 0M words
Number of words:  2086
Number of labels: 0
Progress: 100.0% words/sec/thread:  129073 lr:  0.000000 avg.loss:  2.699965 ETA:   0h 0m 0s
  w = np.where(isolated_node_mask, 1, np.sqrt(w))


ERROR in None.SentenceEmbedding.GraphAffinity Input contains NaN.


In [5]:
sorted_results = sorted(cluster_test_results.items(), key=lambda x: x[1]['f1_macro'], reverse=True)
for rank, (config, score) in enumerate(sorted_results, start=1):
    print(f"{rank:2}. {config:<40} | F1 Macro: {score['f1_macro']:.4f} | Accuracy: {score['accuracy']:.4f}")


 1. Lemmatize.SentenceEmbedding.Kmeans       | F1 Macro: 0.3869 | Accuracy: 0.5500
 2. StopwordRemove.SentenceEmbedding.Kmeans  | F1 Macro: 0.3807 | Accuracy: 0.5200
 3. TokenFilter.SentenceEmbedding.Kmeans     | F1 Macro: 0.3447 | Accuracy: 0.4700
 4. None.SentenceEmbedding.Kmeans            | F1 Macro: 0.3240 | Accuracy: 0.4500
 5. PosRemoval.SentenceEmbedding.Kmeans      | F1 Macro: 0.2943 | Accuracy: 0.4300
 6. Stem.SentenceEmbedding.Kmeans            | F1 Macro: 0.2789 | Accuracy: 0.4000
 7. StopwordRemove.Tfidf.Kmeans              | F1 Macro: 0.2603 | Accuracy: 0.3500
 8. Lemmatize.Tfidf.Kmeans                   | F1 Macro: 0.2280 | Accuracy: 0.3300
 9. None.SentenceEmbedding.UmapHdbscan       | F1 Macro: 0.2122 | Accuracy: 0.4500
10. TokenFilter.FastText.Kmeans              | F1 Macro: 0.2089 | Accuracy: 0.3300
11. Stem.SentenceEmbedding.UmapHdbscan       | F1 Macro: 0.2065 | Accuracy: 0.4100
12. Lemmatize.SentenceEmbedding.UmapHdbscan  | F1 Macro: 0.2056 | Accuracy: 0.4400
13. 

# Classification

Like clusters, but with predefined classes.

In [6]:
class_test_results = {}

newsgroup_labels = list(set(newsgroups.truths))

combinations = itertools.product(preprocessors, vectorizers, classifiers) # pick one from each list (including None for preprocess)

# same test train split within loop
class_test, class_train = newsgroups.split(ratio=0.8, labeled=True, splitLabeled=True)
for x in combinations:
    preproc, vec, method = x
    key = f"{str(preproc)}.{vec}.{method}"
    try:
        pipeline_list = [
            {"name": "vectorize", "method": f"vectorizer.{vec}"},
            {"name": "classifier", "method": f"classifier.{method}"}
        ]
        if preproc is not None:
            pipeline_list = [{"name": "preprocess", "method": f"preprocess.{preproc}"}] + pipeline_list
        # construct pipeline
        pipeline = Pipeline(pipeline_list)
        # copy data
        train = copy.deepcopy(class_train)
        test = copy.deepcopy(class_test)
        # set data
        pipeline.set_data(train_data=train, run_data=test, possible_labels=newsgroup_labels)
        # run
        pipeline.run()
        # evaluate
        class_test_results[key] = evaluate(pipeline.run_data)
    except BaseException as e:
        print(f"ERROR in {key}", e)



Device set to use mps:0


ERROR in Lemmatize.Tfidf.BartTag Length of `truths` and `results` must match.


Device set to use mps:0


ERROR in Lemmatize.BagOfWords.BartTag Length of `truths` and `results` must match.


Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  126137 lr:  0.000000 avg.loss:  2.733311 ETA:   0h 0m 0s
Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  127073 lr:  0.000000 avg.loss:  2.716863 ETA:   0h 0m 0s
Device set to use mps:0
Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  129403 lr:  0.000000 avg.loss:  2.693324 ETA:   0h 0m 0s


ERROR in Lemmatize.FastText.BartTag Length of `truths` and `results` must match.


Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  127687 lr:  0.000000 avg.loss:  2.683336 ETA:   0h 0m 0s
Device set to use mps:0


ERROR in Lemmatize.SentenceEmbedding.BartTag Length of `truths` and `results` must match.


Device set to use mps:0


ERROR in PosRemoval.Tfidf.BartTag Length of `truths` and `results` must match.


Device set to use mps:0


ERROR in PosRemoval.BagOfWords.BartTag Length of `truths` and `results` must match.


Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  128147 lr:  0.000000 avg.loss:  2.694401 ETA:   0h 0m 0s
Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  125778 lr:  0.000000 avg.loss:  2.682763 ETA:   0h 0m 0s
Device set to use mps:0
Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  125669 lr:  0.000000 avg.loss:  2.689538 ETA:   0h 0m 0s


ERROR in PosRemoval.FastText.BartTag Length of `truths` and `results` must match.


Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  127671 lr:  0.000000 avg.loss:  2.692022 ETA:   0h 0m 0s
Device set to use mps:0


ERROR in PosRemoval.SentenceEmbedding.BartTag Length of `truths` and `results` must match.


Device set to use mps:0


ERROR in Stem.Tfidf.BartTag Length of `truths` and `results` must match.


Device set to use mps:0


ERROR in Stem.BagOfWords.BartTag Length of `truths` and `results` must match.


Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  124659 lr:  0.000000 avg.loss:  2.710859 ETA:   0h 0m 0s
Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  125826 lr:  0.000000 avg.loss:  2.675167 ETA:   0h 0m 0s
Device set to use mps:0
Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  126191 lr:  0.000000 avg.loss:  2.687732 ETA:   0h 0m 0s


ERROR in Stem.FastText.BartTag Length of `truths` and `results` must match.


Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  127166 lr:  0.000000 avg.loss:  2.696028 ETA:   0h 0m 0s
Device set to use mps:0


ERROR in Stem.SentenceEmbedding.BartTag Length of `truths` and `results` must match.


Device set to use mps:0


ERROR in StopwordRemove.Tfidf.BartTag Length of `truths` and `results` must match.


Device set to use mps:0


ERROR in StopwordRemove.BagOfWords.BartTag Length of `truths` and `results` must match.


Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  126402 lr:  0.000000 avg.loss:  2.696874 ETA:   0h 0m 0s
Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  127681 lr:  0.000000 avg.loss:  2.681041 ETA:   0h 0m 0s
Device set to use mps:0
Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  125901 lr:  0.000000 avg.loss:  2.684826 ETA:   0h 0m 0s


ERROR in StopwordRemove.FastText.BartTag Length of `truths` and `results` must match.


Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  123689 lr:  0.000000 avg.loss:  2.719604 ETA:   0h 0m 0s
Device set to use mps:0


ERROR in StopwordRemove.SentenceEmbedding.BartTag Length of `truths` and `results` must match.


Device set to use mps:0


ERROR in TokenFilter.Tfidf.BartTag Length of `truths` and `results` must match.


Device set to use mps:0


ERROR in TokenFilter.BagOfWords.BartTag Length of `truths` and `results` must match.


Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  126127 lr:  0.000000 avg.loss:  2.717223 ETA:   0h 0m 0s
Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  130660 lr:  0.000000 avg.loss:  2.718211 ETA:   0h 0m 0s
Device set to use mps:0
Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  126989 lr:  0.000000 avg.loss:  2.684438 ETA:   0h 0m 0s


ERROR in TokenFilter.FastText.BartTag Length of `truths` and `results` must match.


Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:   99816 lr:  0.000000 avg.loss:  2.700806 ETA:   0h 0m 0s
Device set to use mps:0


ERROR in TokenFilter.SentenceEmbedding.BartTag Length of `truths` and `results` must match.


Device set to use mps:0


ERROR in None.Tfidf.BartTag Length of `truths` and `results` must match.


Device set to use mps:0


ERROR in None.BagOfWords.BartTag Length of `truths` and `results` must match.


Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  135684 lr:  0.000000 avg.loss:  2.683640 ETA:   0h 0m 0s
Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  131980 lr:  0.000000 avg.loss:  2.689818 ETA:   0h 0m 0s
Device set to use mps:0
Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:  132671 lr:  0.000000 avg.loss:  2.698976 ETA:   0h 0m 0s


ERROR in None.FastText.BartTag Length of `truths` and `results` must match.


Read 0M words
Number of words:  2105
Number of labels: 0
Progress: 100.0% words/sec/thread:   98782 lr:  0.000000 avg.loss:  2.699819 ETA:   0h 0m 0s
Device set to use mps:0


ERROR in None.SentenceEmbedding.BartTag Length of `truths` and `results` must match.


In [7]:
sorted_results = sorted(class_test_results.items(), key=lambda x: x[1]['f1_macro'], reverse=True)
for rank, (config, score) in enumerate(sorted_results, start=1):
    print(f"{rank:2}. {config:<40} | F1 Macro: {score['f1_macro']:.4f} | Accuracy: {score['accuracy']:.4f}")


 1. PosRemoval.SentenceEmbedding.LabelProp   | F1 Macro: 0.8501 | Accuracy: 0.8500
 2. None.SentenceEmbedding.LabelProp         | F1 Macro: 0.8406 | Accuracy: 0.8400
 3. PosRemoval.SentenceEmbedding.SimpleNNClassifier | F1 Macro: 0.8392 | Accuracy: 0.8400
 4. None.SentenceEmbedding.SimpleNNClassifier | F1 Macro: 0.8392 | Accuracy: 0.8400
 5. TokenFilter.SentenceEmbedding.LabelProp  | F1 Macro: 0.8323 | Accuracy: 0.8300
 6. Lemmatize.SentenceEmbedding.SimpleNNClassifier | F1 Macro: 0.8301 | Accuracy: 0.8300
 7. StopwordRemove.SentenceEmbedding.LabelProp | F1 Macro: 0.8225 | Accuracy: 0.8200
 8. Lemmatize.SentenceEmbedding.LabelProp    | F1 Macro: 0.8206 | Accuracy: 0.8200
 9. TokenFilter.SentenceEmbedding.SimpleNNClassifier | F1 Macro: 0.8102 | Accuracy: 0.8100
10. Stem.SentenceEmbedding.SimpleNNClassifier | F1 Macro: 0.8002 | Accuracy: 0.8000
11. Stem.SentenceEmbedding.LabelProp         | F1 Macro: 0.7952 | Accuracy: 0.7900
12. StopwordRemove.SentenceEmbedding.SimpleNNClassifier | F1 M