In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pathlib import Path
import json
from sklearn.metrics import classification_report
from spacy.cli.train import train

In [None]:
import spacy
from spacy.tokens import DocBin
from spacy.lang.en import English
# Blank English pipeline; preprocess() below runs texts through it (via nlp.pipe)
# to build the Doc objects that get serialized for training.
nlp = spacy.blank("en")

In [None]:
# Optional workaround: some people need this "fix" before shell/package commands
# work in colab
# source: https://stackoverflow.com/questions/56081324/why-are-google-colab-shell-commands-not-working
import locale

_FORCED_ENCODING = "UTF-8"


def getpreferredencoding(do_setlocale=True):
    """Stand-in for locale.getpreferredencoding that always reports UTF-8.

    `do_setlocale` is accepted only to keep the call signature compatible;
    it is ignored.
    """
    return _FORCED_ENCODING


# monkey-patch the stdlib lookup so every later caller sees UTF-8
locale.getpreferredencoding = getpreferredencoding

## Bagging to BERT: A tour of applied NLP
### Part 2: Beyond bagging
### Table of Contents
* [Data processing](#data)
* [CNN](#cnn)
* [BERT](#bert)



### Data processing <a class="anchor" id="data"></a>

Copied from part 1

You'll either need to download the [IMDb review data](https://ai.stanford.edu/~amaas/data/sentiment/) and save it to this directory OR download the [processed data](https://drive.google.com/file/d/1oN_fO91IBkDHD_u6WXiUCvhhyNexQDJq/view?usp=sharing).

In [None]:
# # processing the original data into DataFrame
# # here for reference, don't need to run this if you're using reviews.pkl.gz
# source_path = Path('./aclImdb/')
# #neg_files = source_path.glob('./*/neg/*.txt')
# #pos_files = source_path.glob('./*/pos/*.txt')
# all_files = []
# for f in source_path.glob('./*/*/*.txt'):
#     filename = f.as_posix()
#     if 'unsup' not in filename:
#         # split up into useful components
#         _, split, sent, idx = filename.split('/')
#         idx = int(idx.split('_')[0])
#         all_files.append([idx, split, sent, f.read_text()])
# review_df = pd.DataFrame(all_files)
# review_df.columns = ['idx', 'split', 'label', 'text']
# # some minor html cruft is in here
# review_df['text'] = review_df['text'].str.replace('<br /><br />', '')
# # to_pickle returns None, so don't assign its result back to review_df
# review_df.to_pickle('reviews.pkl.gz')

In [None]:
# restart your Colab runtime after this
!pip install spacy-transformers

In the live tutorial, I will load some resources from my GDrive so I don't need to wait on the training.  I'll keep these resources available publicly for a bit, feel free to copy them to your Drive.  Access the directory [here](https://drive.google.com/drive/folders/1Xqn66AIm19icDfFtsG1NUQVeIwPEVhML?usp=share_link).  

The path to use to load these resources is dependent on where you store the files.

In [None]:
# mounting GDrive to the colab instance
# (uses the google.colab package, so skip this cell when running outside Colab
# and keep data_dir pointed at a local directory instead)
from google.colab import drive
drive.mount('/content/drive/')

In [None]:
# Fetch the tutorial repo (revised_2023 branch); the spaCy config files passed to
# train() later in this notebook are read from ./bagging_to_bert/spacy_materials/.
!git clone -b revised_2023 https://github.com/bpben/bagging_to_bert

In [None]:
def preprocess(text, labels, name):
    """Serialize labelled texts to ``<name>.spacy`` for use as a spaCy corpus.

    Parameters
    ----------
    text : iterable of str
        Raw documents.
    labels : iterable
        One label per document. 'pos' marks a positive example; any other
        value is stored as negative.
    name : str
        Output stem; the corpus is written to f'{name}.spacy'.

    Relies on the module-level ``nlp`` pipeline. Returns nothing.
    """
    # DocBin is spaCy's serialized corpus container (easier use with spaCy pipeline)
    corpus = DocBin()
    # with as_tuples=True, nlp.pipe yields (Doc, label) pairs; piping is
    # slightly faster than processing each text individually
    for parsed, sentiment in nlp.pipe(zip(text, labels), as_tuples=True):
        is_positive = bool(sentiment == 'pos')
        # the classification labels live in the Doc's .cats dict
        parsed.cats['pos'] = is_positive
        parsed.cats['neg'] = not is_positive
        corpus.add(parsed)
    # write the whole corpus out in one go
    corpus.to_disk(f'{name}.spacy')

In [None]:
# Directory that holds reviews.pkl.gz and the pre-trained models.
# if you are using gdrive - you can change that directory here
gdrive_data_dir = '/content/drive/MyDrive/talks/odsc_2023/'
# The GDrive location is only used when it actually exists (i.e. the drive is
# mounted and the folder is there); otherwise fall back to the current directory.
# Keep the trailing slash: later cells build paths as f'{data_dir}<file>'.
data_dir = gdrive_data_dir if Path(gdrive_data_dir).is_dir() else './'

In [None]:
# split settings copied from part 1: want to use the same train/test split
seed = 37
pct_train = 0.7

# read in reviews pickle file - refer to part 1 for how this is created
# NOTE: unpickling can execute arbitrary code, so only load a reviews.pkl.gz you trust
review_df = pd.read_pickle(f'{data_dir}reviews.pkl.gz')

# train_test_split is given no random_state here, so (per the scikit-learn docs)
# it draws from numpy's global RNG - seed that right before splitting
np.random.seed(seed)
X_train, X_test, y_train, y_test = train_test_split(
    review_df['text'], review_df['label'],
    train_size=pct_train,
)

In [None]:
# running the preprocessing on each split; writes train.spacy and test.spacy
for split_name, split_text, split_labels in (('train', X_train, y_train),
                                             ('test', X_test, y_test)):
    preprocess(split_text, split_labels, split_name)

### CNN <a class="anchor" id="cnn"></a>

In [None]:
# the tutorial config file doesn't have the paths for train/dev corpora,
# so they are supplied here as overrides (the test split doubles as the dev set)
# going to just run this for a single epoch, see how it works
cnn_overrides = {
    "paths.train": "train.spacy",
    "paths.dev": "test.spacy",
    "training.max_epochs": 1,
}
train("./bagging_to_bert/spacy_materials/config.cfg",
      output_path='cnn_model',
      use_gpu=0,
      overrides=cnn_overrides)

In [None]:
# load the best version of the model from my GDrive, your directory structure will be different
# (this reads `trained_cnn` under data_dir, not the `cnn_model` output written by
# the single-epoch training cell above)
m = spacy.load(f'{data_dir}trained_cnn')

In [None]:
# looking at the outputs from running a simple example
# .cats is the per-label score dict - presumably keyed 'pos'/'neg' like the
# labels written in preprocess()
m('This movie is great').cats

SpaCy has its own evaluation capabilities, but for comparison's sake, let's use the same evaluation approach we did with our other applications.

In [None]:
# if you're interested in trying out spacy's own evaluate
#from spacy.cli.evaluate import evaluate
#evaluate(model='./cnn_model/model-best/', data_path='test.spacy')

In [None]:
# predicted category per test review = the label with the highest score in .cats
pred = np.array([
    max(doc.cats, key=doc.cats.get)
    for doc in m.pipe(X_test)
])

In [None]:
print(f'accuracy: {np.where(pred == y_test)[0].shape[0]/y_test.shape[0]}')
print(
    classification_report(y_pred=pred,
                          y_true=y_test))

### BERT <a class="anchor" id="bert"></a>
From [HF tutorials](https://huggingface.co/blog/sentiment-analysis-python).  The sentiment analysis pipeline packages together the tokenizer and the BERT model with a classification layer.  The default pipeline uses this [distilBERT model](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english). 

In [None]:
# this will need to be run if you don't already have this package
!pip install transformers

In [None]:
from transformers import pipeline
# Default sentiment-analysis pipeline (no model name given, so the library's default is used).
# truncation/padding are requested here so long reviews don't exceed the model's
# input limit - presumably forwarded to the tokenizer; TODO confirm.
sentiment_pipeline = pipeline("sentiment-analysis", truncation=True, padding=True)

In [None]:
# for speed, only score the first few test reviews; they are handed to the
# pipeline as a plain list of strings rather than a pandas Series
bert_sample_size = 50
bert_pred = sentiment_pipeline(X_test.head(n=bert_sample_size).tolist())
# map the pipeline's label ('POSITIVE' vs anything else) onto our 'pos'/'neg' labels
bert_pred = ['pos' if p['label']=='POSITIVE' else 'neg' for p in bert_pred]

In [None]:
np.where(bert_pred == y_test[:50])
print(f'accuracy: {np.where(bert_pred == y_test[:50])[0].shape[0]/50}')
print(
    classification_report(y_pred=bert_pred,
                          y_true=y_test[:50]))

This is pretty good! But with some minor modifications, we can use our spaCy configuration with a transformer model.

In [None]:
# the tutorial config file doesn't have the paths for train/dev corpora,
# so they are supplied here as overrides (the test split doubles as the dev set)
# going to just run this for a single epoch, see how it works
trf_overrides = {
    "paths.train": "train.spacy",
    "paths.dev": "test.spacy",
    "training.max_epochs": 1,
}
train("./bagging_to_bert/spacy_materials/config_trf.cfg",
      output_path='example_model',
      use_gpu=0,
      overrides=trf_overrides)

In [None]:
# load the best version of the model from my GDrive, your directory structure will be different
# (this reads `trained_trf` under data_dir, not the `example_model` output written
# by the single-epoch training cell above; it also rebinds `m`, replacing the CNN model)
m = spacy.load(f'{data_dir}trained_trf')

In [None]:
# looking at the outputs from running a simple example
# .cats is the per-label score dict - presumably keyed 'pos'/'neg' like the
# labels written in preprocess()
m('This movie is great').cats

In [None]:
# if you're interested in trying out spacy's own evaluate
# warning - this may bump up against RAM limits for free versions of colab
# note: this scores the model saved under ./example_model by the training cell
# above, not the `m` that was loaded from data_dir
from spacy.cli.evaluate import evaluate
evaluate(model='./example_model/model-best/', data_path='test.spacy')

In [None]:
# get the predicted category from the model
# again - may run into RAM limits on free colab, so for the sake of just
# running this only the first `sample_set_size` test reviews are scored
sample_set_size = 500
pred = np.array([
    max(doc.cats, key=doc.cats.get)
    for doc in m.pipe(X_test[:sample_set_size])
])

In [None]:
# Accuracy on the scored subset: fraction of predictions that match the true
# labels of the first `sample_set_size` test reviews (the same slice used for pred).
y_true_sample = y_test[:sample_set_size]
accuracy = np.where(pred == y_true_sample)[0].shape[0]/y_true_sample.shape[0]
print(f'accuracy: {accuracy}')
print(
    classification_report(y_pred=pred,
                          y_true=y_true_sample))