# PyData NLP workshop

In [None]:
# setup
import sys
import subprocess
import pkg_resources
from collections import Counter
import re


# packages the workshop relies on, compared against what is already installed
required = {'spacy', 'scikit-learn', 'spacy-transformers'}
installed = {dist.key for dist in pkg_resources.working_set}
missing = required.difference(installed)

# install whatever is absent using the interpreter running this notebook
if missing:
    python = sys.executable
    install_cmd = [python, '-m', 'pip', 'install']
    install_cmd.extend(missing)
    subprocess.check_call(install_cmd, stdout=subprocess.DEVNULL)

import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation


## Tokenization
This section shows some of the considerations to make when tokenizing your data.

Token = "Useful semantic unit"

But what does that mean? This section will detail some considerations here.

In [None]:
# importing different languages in spacy
from spacy.lang.en import English
# to run the Chinese model, will need to install jieba tokenizer (optional)
#!pip install jieba
from spacy.lang.zh import Chinese

# blank English model
en = English()
# blank Chinese model
zh = Chinese()

# tokenize the same Chinese sentence with both language models
zh_text = '我们正在做NLP。'
zh_tokens = [x.text for x in zh(zh_text)]
print('Tokenize in Chinese:', zh_tokens)
en_tokens = [x.text for x in en(zh_text)]
print('Tokenize in English:', en_tokens)

In [None]:
# lowercasing
text = 'We are doing NLP.'
# base python lowercases the whole string in one call
print('Base python: ', text.lower())
# spaCy exposes the lowercased form of each token as `lower_`
spacy_lowered = [x.lower_ for x in en(text)]
print('SpaCy:', spacy_lowered)

In [None]:
# handling non-alpha
text = 'We are doing NLP.'
# base python: delete every character that is not a letter, digit or space
strip_punct = '[^A-Za-z0-9 ]'
without_punct = re.sub(strip_punct, '', text)
print(without_punct)
# spacy: keep only the tokens flagged as alphabetic
alpha_tokens = [x.text for x in en(text) if x.is_alpha]
print(alpha_tokens)

In [None]:
# but what about contractions?
text = "We're doing NLP."
# base python: the apostrophe is deleted along with the other punctuation
strip_punct = '[^A-Za-z0-9 ]'
without_punct = re.sub(strip_punct, '', text)
print('Just removing punctuation:', without_punct)
# spacy: keep only the tokens flagged as alphabetic
alpha_tokens = [x.text for x in en(text) if x.is_alpha]
print('Removing non-alpha', alpha_tokens)

You can see here that the is_alpha flag is False for any tokens that have non-alpha characters.  We'll look into a better way for dealing with contractions later.

### Exercise: Create a tokenizer
In this exercise, you will write functions that use spaCy's base English model to tokenize a dataset according to specific parameters.  Each function takes a list of documents and returns one list of tokens per document.  In this case we're interested in outputting strings, rather than spaCy tokens.

In [None]:
# data: four short example documents (the last one contains a URL)
text_data = ["I'm at a workshop with PyData.",
            "I'm learning about Natural Language Processing.",
            "We are studying tokenization, vectorization and modelling.",
            "Check out the workshop on Github: https://github.com/bpben/pydata_nlp_workshop"]

In [None]:
# initialize model
en = English()

def tokenize_base(docs, model=en):
    """Tokenize every document with the spaCy model, keeping each token's text.

    docs: iterable of raw text documents
    model: spaCy pipeline used to parse each document
    Returns one list of token strings per document.
    """
    return [[tok.text for tok in model(doc)] for doc in docs]

def tokenize_lower_alpha(docs, model=en):
    """Tokenize documents, lowercasing tokens and dropping non-alphabetic ones.

    docs: iterable of raw text documents
    model: spaCy pipeline used to parse each document
    Returns one list of lowercased token strings per document.
    """
    return [[tok.lower_ for tok in model(doc) if tok.is_alpha]
            for doc in docs]

def tokenize_lower_alpha_url(docs, model=en):
    """Tokenize documents, lowercasing and dropping non-alphabetic and URL tokens.

    docs: iterable of raw text documents
    model: spaCy pipeline used to parse each document
    Returns one list of lowercased token strings per document.
    """
    def keep(tok):
        # a token survives only if it is alphabetic and does not look like a URL
        return tok.is_alpha and not tok.like_url

    return [[tok.lower_ for tok in model(doc) if keep(tok)] for doc in docs]

### Lemmatization and Stemming
Though word tense can sometimes carry a lot of useful information, it is often useful to reduce words to their common root.  For example, the word "be" has various forms like "are", "is", "been".  We might not want our vocabulary to contain all these forms and would rather count them all as instances of "be".

In [None]:
# read in English model with tagging/entity pipeline components
# you will need to run the line below beforehand
#!python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')
text = 'I am taking an NLP course.'
print(text)
# show the lemma the pipeline assigns to each token
lemmas = [x.lemma_ for x in nlp(text)]
print(lemmas)

### Stop words
Dealing with stop words involves making some pretty impactful decisions with your data.  Refer to the slides for details.  Here, we just remove stop words based on [spaCy's default set](https://github.com/explosion/spaCy/blob/master/spacy/lang/en/stop_words.py).

In [None]:
# fresh blank English model
en = English()
text = 'In May 2020, I went to a PyData meetup in Cambridge.'
print(text)
# keep only the tokens that spaCy does not flag as stop words
non_stop = [x.text for x in en(text) if not x.is_stop]
print(non_stop)

### Non-standard tokens (e.g. named-entities)
In text, some n-grams should not be treated as a concatenation of unigrams.  For example, New York City is fundamentally different from the individual words "new", "york" and "city".

Here we attempt to deal with some of these non-standard tokens

In [None]:
# urls
# base python
# regex from textacy: https://github.com/chartbeat-labs/textacy
SHORT_URL_REGEX = re.compile(
    r"(?:^|(?<![\w/.]))"
    # optional scheme
    r"(?:(?:https?://)?)"
    # domain
    r"(?:\w-?)*?\w+(?:\.[a-z]{2,12}){1,3}"
    r"/+",
    flags=re.IGNORECASE)
text = 'Check out these meetups: https://www.meetup.com/PyData-Boston-Cambridge/'
print(text)
# delete whatever the regex matches from the raw string
regex_stripped = SHORT_URL_REGEX.sub('', text)
print(regex_stripped)
# spacy: drop the tokens that look like a URL
kept_tokens = [x for x in en(text) if not x.like_url]
print(kept_tokens)
# spacy - replace with a standard token
replaced_tokens = ['-URL-' if x.like_url else x for x in en(text)]
print(replaced_tokens)

In [None]:
# named-entities
# read in English model with tagging/entity pipeline components
nlp = spacy.load('en_core_web_sm')
text = 'I am attending a meetup on Zoom on May 27th, 2020'
parsed = nlp(text)
# look at the individual tokens
tokens = list(parsed)
print(tokens)
# look at the identified named-entities and their types
for entity in parsed.ents:
    label = entity.label_
    print(entity, type(entity), label, spacy.explain(label))

### Exercise: A comprehensive tokenization pipeline

In [None]:
# data: four short example documents (the last one contains a URL)
text_data = ["I'm at a workshop with PyData.",
            "I'm learning about Natural Language Processing.",
            "We are studying tokenization, vectorization and modelling.",
            "Check out the workshop on Github: https://github.com/bpben/pydata_nlp_workshop"]

In [None]:
# initialize model
nlp = spacy.load('en_core_web_sm')

def tokenize_full(docs, model=nlp,
                  entities=False,
                  stop_words=False,
                  lowercase=True,
                  alpha_only=True,
                  lemma=True):
    """Full tokenizer with flags for processing steps

    docs: iterable of raw text documents
    model: spaCy pipeline used to parse each document
    entities: If False, replaces each named-entity span with its entity
        type (one label per span) and each URL with 'URL'
    stop_words: If False, removes stop words
    lowercase: If True, lowercases all tokens (the 'URL' and entity-type
        placeholders are left as they are)
    alpha_only: If True, removes tokens containing non-alpha characters
    lemma: If True, lemmatizes words

    Returns a list with one list of string tokens per document.
    """
    tokenized_docs = []
    for d in docs:
        parsed = model(d)
        # token collector
        tokens = []
        # label of the entity span currently being collapsed ('' if none)
        ent = ''
        for t in parsed:
            # only need this if we're replacing entities
            if not entities:
                # 'B'/'I' tags mark tokens inside an entity; an empty tag
                # (no NER in the pipeline) is treated like a regular token
                in_entity = t.ent_iob_ in ('B', 'I') and not t.like_url
                # emit the pending label once the span ends or a new one
                # begins, without swallowing the current token
                if ent and (not in_entity or t.ent_iob_ == 'B'):
                    tokens.append(ent)
                    ent = ''
                # replace URLs
                if t.like_url:
                    tokens.append('URL')
                    continue
                if in_entity:
                    ent = t.ent_type_
                    continue
            # only include stop words if stop_words==True
            if t.is_stop and not stop_words:
                continue
            # only include non-alpha if alpha_only==False
            if alpha_only and not t.is_alpha:
                continue
            word = t.lemma_ if lemma else t.text
            if lowercase:
                # str.lower returns a new string, so the result must be kept
                word = word.lower()
            tokens.append(word)
        # an entity that runs to the end of the document still needs emitting
        if ent:
            tokens.append(ent)
        tokenized_docs.append(tokens)
    return tokenized_docs

In [None]:
# run the full tokenizer keeping stop words, non-alpha tokens and entity text
tokenize_full(text_data, stop_words=True, alpha_only=False, entities=True)

## Word counts
A very basic way to use a sanitized list of tokens is to do a word count.  This unlocks a lot of insights right off and is an important step in exploratory data analysis in text.

In [None]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en import English
en = English()

def simple_tokenizer(doc, model=en):
    """Tokenize one document (not a list of documents) into lowercase words.

    doc: raw text of a single document
    model: spaCy pipeline used to parse the document
    Returns the lowercased alphabetic tokens that do not look like URLs.
    """
    return [tok.lower_ for tok in model(doc)
            if tok.is_alpha and not tok.like_url]

In [None]:
# data
text_data = [
    "I'm at a meetup for PyData.",
    "I'm learning about Natural Language Processing.",
    "We are studying tokenization, vectorization and modelling.",
    "Check out the course on Github: https://github.com/bpben/nlp_lessons",
]
# one token list per document, using the tokenizer's default model
tokenized = list(map(simple_tokenizer, text_data))

In [None]:
# base python: create and make use of a Counter object
counts = list(map(Counter, tokenized))
print('List of counts:', counts)
# sum together all counts
all_counts = Counter()
for doc_counts in counts:
    all_counts += doc_counts
print(counts)
print('\nCombined count:', all_counts)

In [None]:
# scikit-learn's CountVectorizer
# use our custom tokenizer
cv = CountVectorizer(tokenizer=simple_tokenizer)
# outputs sparse array, want to use a normal numpy array
v = cv.fit_transform(text_data).toarray()
# get_feature_names_out returns the vocabulary of the vectorizer in column order
# (it replaces get_feature_names, which was removed in scikit-learn 1.2)
dict(zip(cv.get_feature_names_out(), v.sum(axis=0)))
# result is the same as above

### Exercise: Sentiment analysis with word counts
Imagine you are a hot dog restaurant owner and you want to analyze a corpus of reviews from diners to see whether people generally think your hot dogs are "good" or "bad".  Specifically, you're going to count up the number of times the words "good" and "bad" appear.  Depending on how you process the text, you will arrive at different conclusions.  Try a couple of ways to see what I mean.

You might also want to think about whether all the reviews are relevant.  Those sorts of choices may also affect your results.  Is there an automatic way you can remove non-relevant reviews?

In [None]:
# sample reviews: "good"/"bad" appear with different casing and in different contexts
reviews = ['These hot dogs are really good.',
          'These hot dogs are really bad.',
          'Good hot dogs!',
          'The hot dogs pair well with a Good Humor bar.',
          "I didn't eat anything, I felt bad."]

## Word counts revisited
Let's remind ourselves how sklearn's CountVectorizer worked (from last week).

In [None]:
# scikit-learn's CountVectorizer
# use our custom tokenizer (simple_tokenizer) instead of sklearn's default
cv = CountVectorizer(tokenizer=simple_tokenizer)

In [None]:
# data
text_data = ["I'm at a workshop with PyData.",
            "I'm learning about Natural Language Processing.",
            "We are studying tokenization, vectorization and modelling.",
            "Check out the workshop on Github: https://github.com/bpben/pydata_nlp_workshop"]
# outputs sparse array, want to use a normal numpy array
v = cv.fit_transform(text_data).toarray()
# get_feature_names_out returns the vocabulary of the vectorizer in column order
# (it replaces get_feature_names, which was removed in scikit-learn 1.2)
dict(zip(cv.get_feature_names_out(), v.sum(axis=0)))

Works as expected.  Why don't we try this on a real dataset?

In [None]:
# pickle is not imported anywhere else in this notebook
import pickle

# you will need to change this to wherever the file is stored
data_location = '../data/movie_reviews_subset.pkl'
# NOTE: unpickling can execute arbitrary code - only load files you trust
with open(data_location, 'rb') as f:
    all_text = pickle.load(f)
# corpora size
print([(k, len(all_text[k])) for k in all_text])
# for simplicity, let's split these into separate sets
# (assumes the dict holds exactly two corpora, negative first - verify
# against the printed keys above)
neg, pos = all_text.values()

In [None]:
# running this on negative reviews
cv = CountVectorizer(tokenizer=simple_tokenizer)
neg_vectors = cv.fit_transform(neg).toarray()
# get_feature_names_out returns the vocabulary of the vectorizer in column order
# (it replaces get_feature_names, which was removed in scikit-learn 1.2)
word_count = dict(zip(cv.get_feature_names_out(), neg_vectors.sum(axis=0)))
# get the top 10 words
sorted(word_count.items(), key=lambda x: x[1], reverse=True)[:10]

In [None]:
# now do it for positive reviews
cv = CountVectorizer(tokenizer=simple_tokenizer)
pos_vectors = cv.fit_transform(pos).toarray()
# get_feature_names_out returns the vocabulary of the vectorizer in column order
# (it replaces get_feature_names, which was removed in scikit-learn 1.2)
word_count = dict(zip(cv.get_feature_names_out(), pos_vectors.sum(axis=0)))
# get the top 10 words
sorted(word_count.items(), key=lambda x: x[1], reverse=True)[:10]

These words aren't particularly informative about the content.  Sklearn's CountVectorizer has some additional options that may lead to somewhat more informative frequent terms.

In [None]:
# repeat the word count per corpus, now filtering by document frequency and
# removing sklearn's built-in English stop words
for corpus in [neg, pos]:
    cv = CountVectorizer(tokenizer=simple_tokenizer, min_df=0.01, max_df=0.9,
                        stop_words='english')
    vectors = cv.fit_transform(corpus).toarray()
    # get_feature_names_out returns the vocabulary of the vectorizer in column order
    # (it replaces get_feature_names, which was removed in scikit-learn 1.2)
    word_count = dict(zip(cv.get_feature_names_out(), vectors.sum(axis=0)))
    # get the top 10 words
    print(sorted(word_count.items(), key=lambda x: x[1], reverse=True)[:10])

This is better, but it seems like we'd have to tweak these thresholds a lot and carefully choose our stop words.  Is there a more standard way to extract the most informative words from documents?

## Term Frequency-Inverse Document Frequency (TF-IDF)
See the slides for more information on this.  In this section we'll show how TF-IDF is essentially just a weighting of the count vectors.  We'll then use sklearn's built-in TfidfVectorizer on our sentiment corpora.

In [None]:
# pandas is not imported anywhere else in this notebook
import pandas as pd

docs = ['The movie was good',
        'The movie was bad',
        'The movie was great']

cv = CountVectorizer(tokenizer=simple_tokenizer)
vecs = cv.fit_transform(docs).toarray()
# we'll use pandas DF for easier display
# (get_feature_names_out replaces get_feature_names, removed in scikit-learn 1.2)
pd.DataFrame(vecs, columns=cv.get_feature_names_out())

You'll notice that `vecs` contains the term frequencies.  If we use sklearn's `TfidfVectorizer`, it will calculate those term counts and then multiply them by the Inverse Document Frequency (IDF).

The formula sklearn uses is a bit different from the textbook:

$$\mathrm{idf}(t) = \log\left(\frac{N+1}{df(t)+1}\right) + 1$$

Where $N$ is the number of documents and $df(t)$ is the number of documents that contain the term $t$.  It also normalizes this value to account for different size vectors (see slides).

In [None]:
# pandas is needed for the DataFrame display below
import pandas as pd

tfidf = TfidfVectorizer(tokenizer=simple_tokenizer)
# we'll use pandas DF for easier display
tfidf_vecs = tfidf.fit_transform(docs).toarray()
# get_feature_names_out replaces get_feature_names (removed in scikit-learn 1.2)
tfidf_df = pd.DataFrame(tfidf_vecs, columns=tfidf.get_feature_names_out())
tfidf_df

You can see that the discriminative words (i.e. bad, good, great) have higher weight than the non-discriminative words.  

We see this at the document level, but is there a way we could get some kind of aggregate measure of discriminative words?

### Exercise: Find the top 3 discriminative words
Use the dataset above to try and identify the words that, across the corpus, are particularly representative of content.

Hint: Think about what a weight of zero versus weight of non-zero means.

In [None]:
def top_tfidf_words(tfidf_df):
    """Return each column's mean over its strictly positive entries.

    tfidf_df: DataFrame of TF-IDF weights, one column per word
    Entries that are not greater than zero are masked out before averaging.
    """
    positive_only = tfidf_df[tfidf_df > 0]
    per_word_mean = positive_only.mean(axis=0)
    return per_word_mean

In [None]:
# average non-zero TF-IDF weight per word for the three toy documents
top_tfidf_words(tfidf_df)

Now let's run that on our movie reviews dataset.

In [None]:
# pandas is needed for the DataFrame built inside the loop
import pandas as pd

for corpus in [neg, pos]:
    # adding in a minimum document frequency, so words need to occur at least somewhat often
    tfidf = TfidfVectorizer(tokenizer=simple_tokenizer, min_df=0.02)
    vectors = tfidf.fit_transform(corpus).toarray()
    # get_feature_names_out replaces get_feature_names (removed in scikit-learn 1.2)
    tfidf_df = pd.DataFrame(vectors, columns=tfidf.get_feature_names_out())
    # get representative words
    tfidf_word_count = top_tfidf_words(tfidf_df)
    # get the top 10 words
    print(tfidf_word_count.sort_values().iloc[-10:])

These are somewhat useful aggregate measures.  But most of the information in TF-IDF is document-specific.


## Topic models: Non-negative Matrix Factorization and Latent Dirichlet Allocation

In [None]:
def display_components(model, word_features, top_display=5):
    """Print the highest-weighted words of every component of a topic model.

    model: object exposing a `components_` matrix (one row per topic)
    word_features: sequence mapping a column index to its word
    top_display: number of words to print per topic
    """
    for topic_number, weights in enumerate(model.components_):
        # column indices ordered from largest to smallest weight
        ranked = weights.argsort()[::-1]
        best = ranked[:top_display]
        print("Topic %d:" % (topic_number))
        print(" ".join(word_features[j] for j in best))

In [None]:
# all_reviews is not defined earlier in this notebook; build it from the two
# corpora loaded above (assumes the topic models should see both - confirm)
all_reviews = list(neg) + list(pos)
# in this case, excluding standard english stop words
tfidf = TfidfVectorizer(tokenizer=simple_tokenizer, stop_words='english')
tfidf_vecs = tfidf.fit_transform(all_reviews)
cv = CountVectorizer(tokenizer=simple_tokenizer, stop_words='english')
count_vecs = cv.fit_transform(all_reviews)

In [None]:
# choose the number of components (topics)
n_components = 10
# basic configuration: both models share the same number of topics
nmf = NMF(n_components=n_components)
lda = LatentDirichletAllocation(n_components=n_components)
# NMF is fit on the tf-idf vectors (same syntax as the vectorizer)
nmf_vecs = nmf.fit_transform(tfidf_vecs)
# LDA is fit on the word counts
lda_vecs = lda.fit_transform(count_vecs)

Both NMF and LDA provide a components matrix which corresponds to the loading of each word on each topic.  Higher values mean the word is more relevant to that topic.

In [None]:
# raw components matrix of the fitted NMF model
print(nmf.components_)

For evaluating performance, both methods use different ways to quantify the loss from using the topic model versus the actual data.  (In the matrix formulation, $UV$ rather than $X$).  For NMF, it's reconstruction error, which is more directly the difference between the matrix decomposition and the actual data.  For LDA, it uses [ELBO](https://en.wikipedia.org/wiki/Evidence_lower_bound), which is too complicated to explain here.  In both, higher values mean worse performance.  They can't be compared to one another, though.

In [None]:
# NMF's reconstruction_err_ next to LDA's bound_ attribute
print(nmf.reconstruction_err_, lda.bound_)

In [None]:
# get_feature_names_out replaces get_feature_names (removed in scikit-learn 1.2)
display_components(nmf, tfidf.get_feature_names_out())

In [None]:
# get_feature_names_out replaces get_feature_names (removed in scikit-learn 1.2)
display_components(lda, cv.get_feature_names_out())

NMF seems to have come up with some reasonable topics, but LDA doesn't seem to work particularly well here.  It may make sense to try some additional token processing and see how that affects what we get out of the topic modelling process.

### Exercise: Tokenization decisions and topic models
Using the tokenizer from week 1 or your own tokenizer, explore how your tokenization decisions upstream might affect your results downstream.

In [None]:
# initialize model
nlp = spacy.load('en_core_web_sm')

def tokenize_full(docs, model=nlp,
                  entities=False,
                  stop_words=False,
                  lowercase=True,
                  alpha_only=True,
                  lemma=True):
    """Full tokenizer with flags for processing steps

    docs: iterable of raw text documents
    model: spaCy pipeline used to parse each document
    entities: If False, replaces each named-entity span with its entity
        type (one label per span) and each URL with 'URL'
    stop_words: If False, removes stop words
    lowercase: If True, lowercases all tokens (the 'URL' and entity-type
        placeholders are left as they are)
    alpha_only: If True, removes tokens containing non-alpha characters
    lemma: If True, lemmatizes words

    Returns a list with one list of string tokens per document.
    """
    tokenized_docs = []
    for d in docs:
        parsed = model(d)
        # token collector
        tokens = []
        # label of the entity span currently being collapsed ('' if none)
        ent = ''
        for t in parsed:
            # only need this if we're replacing entities
            if not entities:
                # 'B'/'I' tags mark tokens inside an entity; an empty tag
                # (no NER in the pipeline) is treated like a regular token
                in_entity = t.ent_iob_ in ('B', 'I') and not t.like_url
                # emit the pending label once the span ends or a new one
                # begins, without swallowing the current token
                if ent and (not in_entity or t.ent_iob_ == 'B'):
                    tokens.append(ent)
                    ent = ''
                # replace URLs
                if t.like_url:
                    tokens.append('URL')
                    continue
                if in_entity:
                    ent = t.ent_type_
                    continue
            # only include stop words if stop_words==True
            if t.is_stop and not stop_words:
                continue
            # only include non-alpha if alpha_only==False
            if alpha_only and not t.is_alpha:
                continue
            word = t.lemma_ if lemma else t.text
            if lowercase:
                # str.lower returns a new string, so the result must be kept
                word = word.lower()
            tokens.append(word)
        # an entity that runs to the end of the document still needs emitting
        if ent:
            tokens.append(ent)
        tokenized_docs.append(tokens)
    return tokenized_docs

In [None]:
# tokenize every review with the default flags, but keep entity text as-is
tokenized = tokenize_full(all_reviews, entities=True)

In [None]:
# if passing a list of tokens to a vectorizer, you can use the following syntax:
# the "tokenizer" hands each pre-tokenized document back unchanged and
# lowercasing is switched off
pretokenized_options = {'tokenizer': lambda doc: doc, 'lowercase': False}
tfidf = TfidfVectorizer(**pretokenized_options)
tfidf_vecs = tfidf.fit_transform(tokenized)
cv = CountVectorizer(**pretokenized_options)
count_vecs = cv.fit_transform(tokenized)

In [None]:
# same number of topics for both models
n_components = 10
nmf = NMF(n_components=n_components)
lda = LatentDirichletAllocation(n_components=n_components)
# NMF is fit on the tf-idf vectors, LDA on the word counts
nmf_vecs = nmf.fit_transform(tfidf_vecs)
lda_vecs = lda.fit_transform(count_vecs)

In [None]:
# get_feature_names_out replaces get_feature_names (removed in scikit-learn 1.2)
display_components(nmf, tfidf.get_feature_names_out())

In [None]:
# get_feature_names_out replaces get_feature_names (removed in scikit-learn 1.2)
display_components(lda, cv.get_feature_names_out())