# start

In [1]:
import numpy as np
import re
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from wordcloud import STOPWORDS

In [2]:
import spacy
nlp = spacy.load('en_core_web_sm')
nlp

<spacy.lang.en.English at 0x1d6325e93f0>

In [3]:
# # Define the filepath
# fpath= "Data/Dracula.txt"
# # Use with open syntax
# with open(fpath) as f:
#     txt = f.read()
# # Report length of the text
# print(f"There are {len(txt)} characters in the full text.\n")

In [4]:
# Steps in the pipeline
nlp.pipe_names


['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [5]:
# Disable ner
nlp_no_ner = spacy.load("en_core_web_sm", disable=['ner'])
# Print active components
nlp_no_ner.pipe_names



['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer']

In [6]:
# define text for demonstration
sample_text = "While running in Central Park, \nI noticed a discarded McDonald's container,surounded by buzzing flies was annoying."
print(sample_text)

While running in Central Park, 
I noticed a discarded McDonald's container,surounded by buzzing flies was annoying.


SpaCy Docs

In [7]:
# Create a doc with the nlp pipeline
doc = nlp(sample_text)
type(doc)

spacy.tokens.doc.Doc

In [8]:
print(doc)
doc

While running in Central Park, 
I noticed a discarded McDonald's container,surounded by buzzing flies was annoying.


While running in Central Park, 
I noticed a discarded McDonald's container,surounded by buzzing flies was annoying.

# tokens in doc

In [9]:
# Printing the first 10 tokens separately
for token in doc:
  print(token)

While
running
in
Central
Park
,


I
noticed
a
discarded
McDonald
's
container
,
surounded
by
buzzing
flies
was
annoying
.


In [10]:
# Slicing a token from the doc
token = doc[1]
token

running

text

In [11]:
print(token.text)

running


lemma

In [12]:
print(token.lemma_)

run


part of speech

In [13]:
print(token.pos_)

VERB


is stop word?

In [14]:
print(token.is_stop)

False


is punctuation? 

In [15]:
print(token.is_punct)

False


is whitespace?

In [16]:
print(token.is_space)

False


## token attributes

In [17]:
import pandas as pd
# Create dictionary for desired attributes for each token
token_data = []
for token in doc:
    token_dict = {
        ".text": token.text,
        ".lemma_": token.lemma_,
        ".pos_": token.pos_,
        ".is_stop": token.is_stop,
        ".is_punct": token.is_punct,
        ".is_space": token.is_space
    }
    token_data.append(token_dict)
# Save dictionary as a dataframe
spacy_df = pd.DataFrame(token_data) 
spacy_df.head(10)

Unnamed: 0,.text,.lemma_,.pos_,.is_stop,.is_punct,.is_space
0,While,while,SCONJ,True,False,False
1,running,run,VERB,False,False,False
2,in,in,ADP,True,False,False
3,Central,Central,PROPN,False,False,False
4,Park,Park,PROPN,False,False,False
5,",",",",PUNCT,False,True,False
6,\n,\n,SPACE,False,False,True
7,I,I,PRON,True,False,False
8,noticed,notice,VERB,False,False,False
9,a,a,DET,True,False,False


# preprocessing with SpaCy

In [18]:
# For loop to remove stopwords
cleaned_tokens = []
# For each token 
for token in doc:
    
    # If the token is a stopword, skip it
    if token.is_stop == True:
        continue 
    
    # Otherwise,
    else: 
        # keep the tokens'.text for the final list of tokens
        cleaned_tokens.append(token.text.lower())
print(cleaned_tokens)

['running', 'central', 'park', ',', '\n', 'noticed', 'discarded', 'mcdonald', 'container', ',', 'surounded', 'buzzing', 'flies', 'annoying', '.']


obtaining lemmas

In [19]:
## Adding onto our preprocessing for loop
# For loop to remove stopwords & punctuation
cleaned_lemmas = []
# For each token 
for token in doc:

    # If the token is a stopword,
    if token.is_stop == True:
        # skip it and move onto next token
        continue 
    
    # if the token is punctuation,
    if token.is_punct == True:
        # skip it and move onto next token
        continue
    # if the token is a whitespace  (spaces, new lines, etc)
    if token.is_space == True:
        # skip it and move onto next token
        continue
    
    # Otherwise,
    else: 
        # # keep the tokens'.text for the final list of tokens
        # cleaned_tokens.append(token.text.lower())
        # keep the tokens's .lemma_ for the final list of tokens
        cleaned_lemmas.append(token.lemma_.lower())
        
print(cleaned_lemmas)

['run', 'central', 'park', 'notice', 'discard', 'mcdonald', 'container', 'surounde', 'buzz', 'fly', 'annoying']


compare cleane text and the lemmas

In [20]:
# Compare text and lemmas
print("Tokenized words:\n", cleaned_tokens,"\n")
print("Lemmatized words:\n", cleaned_lemmas)

Tokenized words:
 ['running', 'central', 'park', ',', '\n', 'noticed', 'discarded', 'mcdonald', 'container', ',', 'surounded', 'buzzing', 'flies', 'annoying', '.'] 

Lemmatized words:
 ['run', 'central', 'park', 'notice', 'discard', 'mcdonald', 'container', 'surounde', 'buzz', 'fly', 'annoying']


preprocessing function

In [21]:
def preprocess_doc(doc, remove_stopwords=True, remove_punct=True, use_lemmas=False):
    """Temporary Fucntion - for Education Purposes (we will make something better below)
    """
    tokens = [ ]
    for token in doc:
        # Check if should remove stopwords and if token is stopword
        if (remove_stopwords == True) and (token.is_stop == True):
            # Continue the loop with the next token
            continue
    
        # Check if should remove stopwords and if token is stopword
        if (remove_punct == True) and (token.is_punct == True):
            continue
    
        # Check if should remove stopwords and if token is stopword
        if (remove_punct == True) and (token.is_space == True):
            continue
    
        ## Determine final form of output list of tokens/lemmas
        if use_lemmas:
            tokens.append(token.lemma_.lower())
        else:
            tokens.append(token.text.lower())
    return tokens

In [22]:
# Convert the text to a doc.
doc = nlp(sample_text)
# Tokenizing, keeping stopwords and punctuatin
dirty_tokens = preprocess_doc(doc, remove_stopwords=False,remove_punct=False)
print(dirty_tokens)

['while', 'running', 'in', 'central', 'park', ',', '\n', 'i', 'noticed', 'a', 'discarded', 'mcdonald', "'s", 'container', ',', 'surounded', 'by', 'buzzing', 'flies', 'was', 'annoying', '.']


In [23]:
# Tokenizing, removing stopwords and punctuation
cleaned_tokens = preprocess_doc(doc, remove_stopwords=True,remove_punct=True)
print(cleaned_tokens)

['running', 'central', 'park', 'noticed', 'discarded', 'mcdonald', 'container', 'surounded', 'buzzing', 'flies', 'annoying']


In [24]:
# Lemmatizing, removing stopwords and punctuation
cleaned_lemmas = preprocess_doc(doc, remove_stopwords=True,remove_punct=True, use_lemmas=True)
print(cleaned_lemmas)

['run', 'central', 'park', 'notice', 'discard', 'mcdonald', 'container', 'surounde', 'buzz', 'fly', 'annoying']


## Batch Preprocessing with SpaCy

In [27]:
# # Example Framework (Not runnable)
# lists_of_texts = [text1, text2, text3]
# processed_texts = []
# for doc in nlp.pipe(list_of_texts):
#     doc_tokens = []
#     for token in doc:
#         # ... the same logic from our preprocess docs function.
#         doc_tokens.append(token.text.lower())
        
#     # Append the list of tokens for current doc to processed_texts
#     processed_texts.append(doc_tokens)

NameError: name 'text1' is not defined

In [28]:
def batch_preprocess_texts(
    texts,
    nlp=None,
    remove_stopwords=True,
    remove_punct=True,
    use_lemmas=False,
    disable=["ner"],
    batch_size=50,
    n_process=-1,
):
    """Efficiently preprocess a collection of texts using nlp.pipe()
    Args:
        texts (collection of strings): collection of texts to process (e.g. df['text'])
        nlp (spacy pipe), optional): Spacy nlp pipe. Defaults to None; if None, it creates a default 'en_core_web_sm' pipe.
        remove_stopwords (bool, optional): Controls stopword removal. Defaults to True.
        remove_punct (bool, optional): Controls punctuation removal. Defaults to True.
        use_lemmas (bool, optional): lemmatize tokens. Defaults to False.
        disable (list of strings, optional): named pipeline elements to disable. Defaults to ["ner"]: Used with nlp.pipe(disable=disable)
        batch_size (int, optional): Number of texts to process in a batch. Defaults to 50.
        n_process (int, optional): Number of CPU processors to use. Defaults to -1 (meaning all CPU cores).
    Returns:
        list of tokens
    """
    # from tqdm.notebook import tqdm
    from tqdm import tqdm
    if nlp is None:
        nlp = spacy.load("en_core_web_sm")
    processed_texts = []
    for doc in tqdm(nlp.pipe(texts, disable=disable, batch_size=batch_size, n_process=n_process)):
        tokens = []
        for token in doc:
            # Check if should remove stopwords and if token is stopword
            if (remove_stopwords == True) and (token.is_stop == True):
                # Continue the loop with the next token
                continue
            # Check if should remove stopwords and if token is stopword
            if (remove_punct == True) and (token.is_punct == True):
                continue
            # Check if should remove stopwords and if token is stopword
            if (remove_punct == True) and (token.is_space == True):
                continue
            
            ## Determine final form of output list of tokens/lemmas
            if use_lemmas:
                tokens.append(token.lemma_.lower())
            else:
                tokens.append(token.text.lower())
        processed_texts.append(tokens)
    return processed_texts

In [29]:
# Default args will produce tokens
tokens = batch_preprocess_texts([sample_text])
tokens = tokens[0]
print(tokens)

1it [00:28, 28.16s/it]

['running', 'central', 'park', 'noticed', 'discarded', 'mcdonald', 'container', 'surounded', 'buzzing', 'flies', 'annoying']





test the function

In [None]:
# Setting use_lemmas = True will produce lemmas
lemmas = batch_preprocess_texts([sample_text], use_lemmas=True)
lemmas = lemmas[0]
print(lemmas)

In [30]:
# Setting use_lemmas = True will produce lemmas
lemmas = batch_preprocess_texts([sample_text], use_lemmas=True)
lemmas = lemmas[0]
print(lemmas)

1it [00:33, 33.65s/it]

['run', 'central', 'park', 'notice', 'discard', 'mcdonald', 'container', 'surounde', 'buzz', 'fly', 'annoying']





# more with SpaCy

In [32]:
sample_text = "While running in Central Park, I noticed that the constant buzzing of flies was annoying. However, I couldn't be too upset as they were likely attracted to the McDonald's food that someone carelessly dropped. I wondered, 'How can they be so uncaring?'"
doc = nlp(sample_text)
doc

While running in Central Park, I noticed that the constant buzzing of flies was annoying. However, I couldn't be too upset as they were likely attracted to the McDonald's food that someone carelessly dropped. I wondered, 'How can they be so uncaring?'

extracting a list of sentences from the doc

In [33]:
# Extracting sentences from doc
sentences = list(doc.sents)
len(sentences)

3

extracting named entities

In [34]:
# Print any named entities in the doc and its label
for ent in doc.ents:
    print(ent.text, ent.label_)

Central Park LOC
McDonald ORG
