# Case Study: Natural Language Processing

This notebook walks through how to:
- extract named entities mentioned in review comments
- use a pre-trained text analytics model to classify text
- generate text with a pre-trained language model

In [1]:
!pip install transformers==3.1.0 &> /dev/null
!pip install pyyaml==5.4.1 &> /dev/null

In [None]:
# Data Representation
import numpy as np
import pandas as pd

# Data Modeling
import spacy
import tensorflow
# Load the `en_core_web_sm` spaCy pipeline once; the nlp(...) calls below reuse it.
nlp = spacy.load('en_core_web_sm')

# https://github.com/huggingface/transformers
import transformers


# NOTE(review): random_state is not referenced by any cell shown here — confirm
# whether it is still needed.
random_state = 42
# Raise pandas' display limit to 100 rows.
pd.set_option('display.max_rows', 100)

In [None]:
# Collect one line per library, then emit them in order.
version_lines = [
    f"Transformers version: {transformers.__version__}",
    f"TensorFlow version: {tensorflow.__version__}",
    f"Pandas version: {pd.__version__}",
]
for version_line in version_lines:
    print(version_line)

In [None]:
# Run the spaCy pipeline over a sample sentence and display the parsed result.
doc = nlp(u"Apple is looking at buying a U.K. startup for $1 billion")
doc

In [None]:
# Inspect the Python type of the object returned by nlp(...).
type(doc)

In [None]:
# Iterating the parsed document yields its tokens; print the text of each one.
for tok in doc:
    print(tok.text)

In [None]:
# Re-parse the sample sentence and display the entity spans found in it
# (each span exposes text, character offsets and a label, as printed below).
doc = nlp(u"Apple is looking at buying a U.K. startup for $1 billion")
doc.ents

In [None]:
# Render the parsed sentence with displaCy's 'dep' style, inline in the notebook.
spacy.displacy.render(doc, style='dep', jupyter=True)

In [None]:
# One line per entity: its text, start/end character offsets, and label.
for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

## Task 1: Extract Entities
Let's take the first 50 reviews and extract any named entities mentioned in them.

In [None]:
# Google Drive share link for the reviews file.
link = 'https://drive.google.com/file/d/1-JRyJEw1K9SysORKOCu36uxujjxFBKq5/view?usp=sharing'
# The file id is the second-to-last '/'-separated segment of the share link;
# append it to the download URL prefix.
link_segments = link.split('/')
file_id = link_segments[-2]
path = 'https://drive.google.com/uc?export=download&id=' + file_id

In [None]:
# Read the CSV at `path` into a DataFrame and preview its first 15 rows.
reviews_df = pd.read_csv(path)
reviews_df.head(15)

In [None]:
# (number of rows, number of columns) of the loaded reviews.
reviews_df.shape

In [None]:
# Report the smallest and largest value of the 'date' column.
# NOTE(review): read_csv was called without date parsing, so 'date' is presumably
# a string column and min/max compare lexicographically — fine for ISO dates; confirm.
date_range_message = f"The reviews are from {reviews_df['date'].min()} to {reviews_df['date'].max()}"
print(date_range_message)

#### Subtask 1: Create an entity extractor

In [None]:
# Get the entity
def extract_entities(text):
    """Return the text of every entity the spaCy pipeline finds in ``text``.

    Parameters
    ----------
    text : str
        Raw text to analyse. A missing value (``None`` or float ``NaN``, which
        is how pandas represents an empty CSV cell) is treated as having no
        entities.

    Returns
    -------
    list of str
        Entity texts, in the order they appear in ``doc.ents``.
    """
    # Missing comments reach .apply() as None/NaN; return early so one empty
    # cell does not abort processing of a whole column.
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return []
    doc = nlp(text)
    return [entity.text for entity in doc.ents]

# Get the entity label
def extract_entity_labels(text):
    """Return the label of every entity the spaCy pipeline finds in ``text``.

    Parameters
    ----------
    text : str
        Raw text to analyse. A missing value (``None`` or float ``NaN``, which
        is how pandas represents an empty CSV cell) is treated as having no
        entities.

    Returns
    -------
    list of str
        Entity labels (``label_``), in the order the entities appear in
        ``doc.ents``.
    """
    # Missing comments reach .apply() as None/NaN; return early so one empty
    # cell does not abort processing of a whole column.
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return []
    doc = nlp(text)
    return [entity.label_ for entity in doc.ents]

In [None]:
# Small hand-written sample used to try out the entity extractors
text = [
    'Google amazon texas ten',
    'Amazon AWS rangers Washington',
    'Apple is looking at buying U.K. startup for $1 Billion',
    'Carnegie Mellon University is great',
]
# Single-column frame: one row per sample sentence, column named 'X'
text_df = pd.DataFrame(text, columns=['X'])
text_df

In [None]:
# Run the entity extractor on every demo sentence (one list of entity texts per row).
text_df['X'].apply(extract_entities)

In [None]:
# Same sentences, but return each entity's label instead of its text.
text_df['X'].apply(extract_entity_labels)

#### Subtask 2: Apply entity extractor on data

In [None]:
# Apply the extractor to real data: the first 50 review comments
first_50_comments = reviews_df['comments'].head(50)
first_50_comments.apply(extract_entities)

## Task 2: Classify Text
Please see [Zero-Shot Learning in Modern NLP](https://joeddav.github.io/blog/2020/05/29/ZSL.html)

A few notes on this example:


*   The [zero-shot-classifier](https://huggingface.co/facebook/bart-large-mnli) is a general-purpose pre-trained model. For better results on a specific task, this model should be specialized using an approach like [fine-tuning](https://github.com/huggingface/notebooks/blob/main/transformers_doc/custom_datasets.ipynb)
*   Additional pre-trained models that work with the transformers library can be found via [HuggingFace's model repository](https://huggingface.co/models)

In [None]:
# Build a zero-shot classification pipeline. No model is named here, so the
# library's default for this task is used.
classifier = transformers.pipeline("zero-shot-classification") # you can specify to use GPU with the option, device=0

In [None]:
# Classify one sentence against two candidate labels and display the full result.
# NOTE(review): "langauge" is misspelled in the sample text; it is left untouched
# here because the string is the model's input.
sequence = "Python is the best langauge ever!!!"
candidate_labels = ["negative", "positive"]

classifier(sequence, candidate_labels)

In [None]:
# Keep only the first entry of the result's 'labels' list
# (presumably the highest-scoring label — confirm against the pipeline docs).
classifier('NY Giants Sucks', candidate_labels)['labels'][0]

In [None]:
# Try a lukewarm sentence to see how the scores split between the two labels.
classifier("C is a so-so okay language", candidate_labels)


In [None]:
# Try a sentence that explicitly mentions both candidate labels.
classifier("C is neither positive nor negative", candidate_labels)

#### Subtask 1: Create a sentiment classifier

In [None]:
sentiment_labels = ['positive', 'negative']


def _classify_sentiment(text):
    """Run the zero-shot classifier on ``text``; return None for a missing value."""
    # Missing comments reach .apply() as None/NaN. Return early so one empty
    # cell does not abort a long-running classification of a whole column.
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return None
    return classifier(text, sentiment_labels)


def label_sentiment(text):
    """Return the first label the classifier reports for ``text``.

    Parameters
    ----------
    text : str
        Text to classify against ``sentiment_labels``.

    Returns
    -------
    str or None
        The first entry of the result's ``'labels'`` list, or ``None`` when
        ``text`` is missing (``None`` / ``NaN``).
    """
    result = _classify_sentiment(text)
    return None if result is None else result['labels'][0]


def sentiment_score(text):
    """Return the first score the classifier reports for ``text``.

    Parameters
    ----------
    text : str
        Text to classify against ``sentiment_labels``.

    Returns
    -------
    float or None
        The first entry of the result's ``'scores'`` list, or ``None`` when
        ``text`` is missing (``None`` / ``NaN``).
    """
    result = _classify_sentiment(text)
    return None if result is None else result['scores'][0]

In [None]:
# Small hand-written sample used to try out the sentiment helpers
text = [
    'Google amazon texas ten',
    'Apple is looking at buying U.K. startup for $1 Billion',
    'Carnegie Mellon University is great',
    'NY Giants suck',
    'NY Giants are the worst team',
    "Dallas Cowboys are America's Favorite Team!",
]
# Single-column frame: one row per sample sentence, column named 'X'
text_df = pd.DataFrame(text, columns=['X'])
text_df

In [None]:
# Attach the predicted label and its score as two new columns, then display the frame.
text_df = text_df.assign(
    sentiment=text_df['X'].apply(label_sentiment),
    score=text_df['X'].apply(sentiment_score),
)
text_df

In [None]:
# Label the first 250 review comments; the result keeps the reviews' index.
first_250_comments = reviews_df['comments'].head(250)
reviews_sentiment = first_250_comments.apply(label_sentiment)
reviews_sentiment

In [None]:
# Count how many of the classified reviews received each label.
reviews_sentiment.value_counts()

In [None]:
# Index labels of the reviews whose predicted label is 'negative'
is_negative = reviews_sentiment == 'negative'
negative_listing_indicies = reviews_sentiment.index[is_negative].tolist()
negative_listing_indicies

In [None]:
# Comments of the rows whose index label is in the negative list
is_negative_row = reviews_df.index.isin(negative_listing_indicies)
reviews_df.loc[is_negative_row, 'comments']

In [None]:
# Show the full comment for the negative review at index 14.
# NOTE(review): 14 is hard-coded; presumably this raises KeyError if review 14
# was not labelled 'negative' in the classification run above — confirm.
reviews_df[reviews_df.index.isin(negative_listing_indicies)]['comments'][14]

In [None]:
# Show the full comment for the negative review at index 80.
# NOTE(review): 80 is hard-coded; presumably this raises KeyError if review 80
# was not labelled 'negative' in the classification run above — confirm.
reviews_df[reviews_df.index.isin(negative_listing_indicies)]['comments'][80]

In [None]:
# Show the full comment for the negative review at index 83.
# NOTE(review): 83 is hard-coded; presumably this raises KeyError if review 83
# was not labelled 'negative' in the classification run above — confirm.
reviews_df[reviews_df.index.isin(negative_listing_indicies)]['comments'][83]

In [None]:
# Show the full comment for the negative review at index 132.
# NOTE(review): 132 is hard-coded; presumably this raises KeyError if review 132
# was not labelled 'negative' in the classification run above — confirm.
reviews_df[reviews_df.index.isin(negative_listing_indicies)]['comments'][132]

In [None]:
# Show the full comment for the negative review at index 230.
# NOTE(review): 230 is hard-coded; presumably this raises KeyError if review 230
# was not labelled 'negative' in the classification run above — confirm.
reviews_df[reviews_df.index.isin(negative_listing_indicies)]['comments'][230]

## Task 3: Generate Text
Please see [How to generate text: using different decoding methods for language generation with Transformers](https://huggingface.co/blog/how-to-generate)

In [None]:
# Load the pretrained tokenizer for the "gpt2" checkpoint and display it.
tokenizer = transformers.GPT2Tokenizer.from_pretrained("gpt2")
tokenizer

In [None]:
# Load the pretrained "gpt2" weights into the TensorFlow GPT-2 LM-head model class.
# add the EOS token as PAD token to avoid warnings
model = transformers.TFGPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=tokenizer.eos_token_id)
model

In [None]:
# encode context the generation is conditioned on
# (return_tensors='tf' requests the encoded prompt as a TensorFlow tensor)
input_ids = tokenizer.encode('I enjoy walking with my cute dog', return_tensors='tf')
input_ids

In [None]:
# Inspect the Python type of the encoded prompt.
type(input_ids)

In [None]:
# Decoding settings for beam search with early stopping, gathered in one
# place so they are easy to tweak between runs.
generation_options = dict(
    max_length=50,
    num_beams=5,
    no_repeat_ngram_size=3,
    early_stopping=True,
)
beam_output = model.generate(input_ids, **generation_options)
beam_output

In [None]:
# Header followed by a rule of 100 dashes, then the decoded first output sequence.
rule = 100 * '-'
print("Output:\n" + rule)
generated_text = tokenizer.decode(beam_output[0], skip_special_tokens=True)
print(generated_text)