In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pathlib import Path
import json
from sklearn.metrics import classification_report
from spacy.cli.train import train

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import spacy
from spacy.tokens import DocBin
from spacy.lang.en import English
# blank English pipeline; preprocess() below uses it to turn raw text into Doc objects
nlp = spacy.blank("en")

## Bagging to BERT: A tour of applied NLP
### Part 2: Beyond bagging
### Table of Contents
* [CNN TextCat](#cnn)
* [BERT](#bert)



### Data processing <a class="anchor" id="data"></a>

Copied from part 1

You'll either need to download the [imdb review data](https://ai.stanford.edu/~amaas/data/sentiment/) and save it to this directory OR download the [processed data](https://drive.google.com/file/d/1oN_fO91IBkDHD_u6WXiUCvhhyNexQDJq/view?usp=sharing).

In [3]:
# # processing the original data into DataFrame
# # here for reference, don't need to run this if you're using reviews.pkl.gz
# source_path = Path('./aclImdb/')
# #neg_files = source_path.glob('./*/neg/*.txt')
# #pos_files = source_path.glob('./*/pos/*.txt')
# all_files = []
# for f in source_path.glob('./*/*/*.txt'):
#     filename = f.as_posix()
#     if 'unsup' not in filename:
#         # split up into useful components
#         _, split, sent, idx = filename.split('/')
#         idx = int(idx.split('_')[0])
#         all_files.append([idx, split, sent, f.read_text()])
# review_df = pd.DataFrame(all_files)
# review_df.columns = ['idx', 'split', 'label', 'text']
# # some minor html cruft is in here
# review_df['text'] = review_df['text'].str.replace('<br /><br />', '')
# review_df.to_pickle('reviews.pkl.gz')

In [None]:
# you may need to restart after this in Collab
!pip install spacy-transformers

In [4]:
# Colab only: mount Google Drive so files under /content/drive/ can be read
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [5]:
# fetch the revised_2023 branch of the tutorial repository into ./bagging_to_bert
!git clone -b revised_2023 https://github.com/bpben/bagging_to_bert

Cloning into 'bagging_to_bert'...
remote: Enumerating objects: 72, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (52/52), done.[K
remote: Total 72 (delta 36), reused 53 (delta 17), pack-reused 0[K
Unpacking objects: 100% (72/72), 27.39 MiB | 11.39 MiB/s, done.


In [3]:
def preprocess(text, labels, name):
    """Serialize labelled texts to ``<name>.spacy`` for use in a spaCy pipeline.

    Each text is run through the module-level ``nlp`` pipeline, tagged with
    two sentiment categories, and collected into a ``DocBin`` that is
    written to disk.

    Args:
        text: iterable of raw review strings.
        labels: iterable of labels paired with ``text`` by position; ``'pos'``
            marks a positive review, any other value is stored as negative.
        name: stem of the output file; ``.spacy`` is appended.
    """
    doc_bin = DocBin()
    # nlp.pipe streams the (text, label) pairs, which is slightly faster than
    # calling nlp() on each text individually
    for doc, label in nlp.pipe(zip(text, labels), as_tuples=True):
        # the labels are stored in the document's .cats mapping
        is_positive = bool(label == 'pos')
        doc.cats['pos'] = is_positive
        doc.cats['neg'] = not is_positive
        doc_bin.add(doc)
    # write the collected documents to disk
    doc_bin.to_disk(f'{name}.spacy')

In [4]:
# spaCy default corpus reader has certain expectations about format
#review_df = pd.read_pickle(
#    '/content/drive/MyDrive/talks/odsc_2023/reviews.pkl.gz')
# NOTE: unpickling can execute arbitrary code -- only load a reviews.pkl.gz you trust
review_df = pd.read_pickle('reviews.pkl.gz')
# copied from part 1: want to use the same train/test split
seed = 37
# train_test_split below is not given a random_state, so the split depends on
# this global NumPy seed being set first
np.random.seed(seed)
# fraction of the reviews assigned to the training split
pct_train = 0.7
X_train, X_test, y_train, y_test = train_test_split(
    review_df['text'],
    review_df['label'], train_size=pct_train)

In [5]:
# serialize each split to its own .spacy file (train.spacy, test.spacy)
for split_name, split_text, split_labels in (
        ('train', X_train, y_train),
        ('test', X_test, y_test)):
    preprocess(split_text, split_labels, split_name)

In [9]:
# train the text classifier described by config.cfg, saving into ./cnn_model
# can override config info with overrides
# the tutorial config file doesn't have the paths for train/dev corpora
# going to just run this for a few epochs, see how it works
# NOTE(review): the test split doubles as the dev corpus here, so the "best"
# model is presumably chosen on the same data the notebook evaluates on later
train("./spacy_materials/config.cfg",
      output_path='cnn_model',
      overrides={"paths.train": "train.spacy", 
                 "paths.dev": "test.spacy",
                 "training.max_epochs": 5},
      use_gpu = 0)

[38;5;4mℹ Saving to output directory: cnn_model[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['textcat'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  ------------  ----------  ------
  0       0          0.25       33.27    0.33
  0     200         50.01       33.43    0.33
  0     400         50.05       52.54    0.53
  0     600         49.71       33.27    0.33
  0     800         50.01       33.47    0.33
  0    1000         49.35       33.31    0.33
  0    1200         47.20       38.59    0.39
  0    1400         46.09       67.90    0.68
  0    1600         45.65       67.69    0.68
  0    1800         42.11       64.96    0.65
  0    2000         41.06       69.31    0.69
  0    2200         38.22       72.62    0.73
  0    2400         37.78       69.83    0.70
  0    2600         38.62       71.15    0.71
  0    2800         35.41       73.56    0.74
  0    3000    

In [5]:
# load the best version of the model
# (Colab option: a previously trained copy stored on the mounted Google Drive)
m = spacy.load('/content/drive/MyDrive/talks/odsc_2023/trained_cnn')

In [8]:
# load the best version of the model
# (local option: the model-best checkpoint inside the cnn_model output directory)
m = spacy.load('./cnn_model/model-best/')

In [9]:
# looking at the outputs from running a simple example
# .cats maps each category ('pos', 'neg') to the score the model gave it
m('This movie is great').cats

{'pos': 0.9997627139091492, 'neg': 0.00023723322374280542}

SpaCy has its own evaluation capabilities, but for comparison's sake, let's use the same evaluation approach we did with our other applications.

In [9]:
# if you're interested in trying out spacy's own evaluate
#from spacy.cli.evaluate import evaluate
#evaluate(model='./example_model/model-best/', data_path='test.spacy')


In [7]:
# if you're interested in trying out spacy's own evaluate
# runs it for the saved CNN model against the serialized test corpus
from spacy.cli.evaluate import evaluate
evaluate(model='./cnn_model/model-best/', data_path='test.spacy')

In [11]:
# for every test review, keep the category the model scored highest
predicted_labels = []
for scored_doc in m.pipe(X_test):
    category_scores = scored_doc.cats
    predicted_labels.append(max(category_scores, key=category_scores.get))
pred = np.array(predicted_labels)

In [12]:
# share of test reviews whose predicted label equals the true label
n_correct = np.where(pred == y_test)[0].shape[0]
n_total = y_test.shape[0]
print(f'accuracy: {n_correct/n_total}')
# per-class precision / recall / f1
report = classification_report(y_true=y_test, y_pred=pred)
print(report)

accuracy: 0.8256
              precision    recall  f1-score   support

         neg       0.83      0.82      0.83      7522
         pos       0.82      0.83      0.83      7478

    accuracy                           0.83     15000
   macro avg       0.83      0.83      0.83     15000
weighted avg       0.83      0.83      0.83     15000



### BERT <a class="anchor" id="bert"></a>
From [HF tutorials](https://huggingface.co/blog/sentiment-analysis-python).  The sentiment analysis pipeline packages together the tokenizer and the BERT model with a classification layer.  The default pipeline uses this [distilBERT model](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english). 

In [None]:
# this will need to be run if you don't already have this package
#!pip install transformers

In [11]:
from transformers import pipeline
# name the checkpoint explicitly instead of relying on the library default, so
# the results stay reproducible if that default ever changes
# truncation/padding are enabled so long reviews can be processed by the model
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    truncation=True,
    padding=True)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
Downloading: 100%|██████████| 629/629 [00:00<00:00, 394kB/s]
Downloading: 100%|██████████| 255M/255M [00:04<00:00, 57.3MB/s] 
Downloading: 100%|██████████| 48.0/48.0 [00:00<00:00, 14.5kB/s]
Downloading: 100%|██████████| 226k/226k [00:00<00:00, 4.39MB/s]


In [16]:
# for speed, only score the first 50 test reviews; the pipeline is handed a
# plain list of strings rather than a pandas Series
bert_output = sentiment_pipeline(X_test.head(n=50).tolist())
# map the pipeline's 'POSITIVE' label to 'pos' and anything else to 'neg'
bert_pred = ['pos' if p['label']=='POSITIVE' else 'neg' for p in bert_output]

In [17]:
# true labels for the same first 50 test reviews that were scored above;
# .iloc makes the positional slice explicit
y_true_bert = y_test.iloc[:50]
n_correct_bert = np.where(bert_pred == y_true_bert)[0].shape[0]
print(f'accuracy: {n_correct_bert/len(y_true_bert)}')
print(
    classification_report(y_pred=bert_pred,
                          y_true=y_true_bert))

accuracy: 0.88
              precision    recall  f1-score   support

         neg       0.90      0.90      0.90        30
         pos       0.85      0.85      0.85        20

    accuracy                           0.88        50
   macro avg       0.88      0.88      0.88        50
weighted avg       0.88      0.88      0.88        50



This is pretty good! But with some minor modifications, we can use our spaCy configuration with a transformer model.

In [None]:
# train the pipeline described by config_trf.cfg, saving into ./example_model
# can override config info with overrides
# the tutorial config file doesn't have the paths for train/dev corpora
# going to just run this for a few epochs, see how it works
# NOTE(review): the test split doubles as the dev corpus here, so the "best"
# model is presumably chosen on the same data the notebook evaluates on later
train("./spacy_materials/config_trf.cfg",
      output_path='example_model',
      overrides={"paths.train": "train.spacy", 
                 "paths.dev": "test.spacy",
                 "training.max_epochs": 5},
      use_gpu = 0)

[38;5;4mℹ Saving to output directory: example_model[0m
[38;5;4mℹ Using GPU: 0[0m
[1m


Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'textcat'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS TEXTCAT  CATS_SCORE  SCORE 
---  ------  -------------  ------------  ----------  ------
  0     400           0.01         29.13       84.27    0.84
  0     600           0.02         21.04       84.58    0.85
  0     800           0.04         28.55       83.93    0.84
  0    1000           0.05         30.94       84.81    0.85
  0    1200           0.06         27.66       88.08    0.88
  0    1400           0.06         20.69       82.18    0.82
  0    1600           0.09         28.52       86.84    0.87
  0    1800           0.09         22.07       84.13    0.84
  0    2000           0.09         25.36       82.88    0.83
  0    2200           0.14         21.18       86.39    0.86
  0    2400           0.16         28.54       85.74    0.86
  0    2600           0.12         15.82       79.71    0.80
  0    2800   

In [5]:
# load the best version of the model
# (Colab option: a previously trained copy stored on the mounted Google Drive)
m = spacy.load('/content/drive/MyDrive/talks/odsc_2023/trained_trf')

In [3]:
# load the best version of the model
# (local option: the model-best checkpoint inside the example_model output directory)
m = spacy.load('./example_model/model-best/')

In [4]:
# looking at the outputs from running a simple example
# .cats maps each category ('pos', 'neg') to the score the model gave it
m('This movie is great').cats

{'pos': 0.9998667240142822, 'neg': 0.00013326172484084964}

In [None]:
# if you're interested in trying out spacy's own evaluate
# runs it for the saved model in example_model against the serialized test corpus
from spacy.cli.evaluate import evaluate
evaluate(model='./example_model/model-best/', data_path='test.spacy')

In [13]:
# for every test review, keep the category the model scored highest
predicted_labels = []
for scored_doc in m.pipe(X_test):
    category_scores = scored_doc.cats
    predicted_labels.append(max(category_scores, key=category_scores.get))
pred = np.array(predicted_labels)

In [14]:
# share of test reviews whose predicted label equals the true label
n_correct = np.where(pred == y_test)[0].shape[0]
n_total = y_test.shape[0]
print(f'accuracy: {n_correct/n_total}')
# per-class precision / recall / f1
report = classification_report(y_true=y_test, y_pred=pred)
print(report)

accuracy: 0.902
              precision    recall  f1-score   support

         neg       0.92      0.89      0.90      7522
         pos       0.89      0.92      0.90      7478

    accuracy                           0.90     15000
   macro avg       0.90      0.90      0.90     15000
weighted avg       0.90      0.90      0.90     15000

