## Text Preprocessing

### 0. Create Data Sets

In [2]:
import pandas as pd

In [3]:
# Sample sentences used throughout the notebook. They mix upper/lower case,
# digits, a currency symbol, bracketed source tags, an emoji and an em dash so
# that each cleaning step below has something to act on.
data = [
    "When life gives you lemons, make lemonade! 🙂",
    "She bought 2 lemons for $1 at Maven Market.",
    "A dozen lemons will make a gallon of lemonade. [AllRecipes]",
    "lemon, lemon, lemons, lemon, lemon, lemons",
    "He's running to the market to get a lemon — there's a great sale today.",
    "Does Maven Market carry Eureka lemons or Meyer lemons?",
    "An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",
    "iced tea is my favorite"
]

In [4]:
pd.set_option('display.max_colwidth', None)  # To display full text in DataFrame cells

In [5]:
data_df = pd.DataFrame(data, columns=['sentence'])

In [6]:
data_df

Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! ðŸ™‚"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon â€” there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]"
7,iced tea is my favorite


In [7]:
test = [
    "We're going to start this course with traditional NLP applications.",
    "Then we'll move on to modern NLP theory.",
    "Finally, we'll wrap things up with modern NLP applications."
]

test_series = pd.Series(test)
test_series

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

### 1. Text Preprocessing with Pandas

In [8]:
df = data_df.copy()
df

Unnamed: 0,sentence
0,"When life gives you lemons, make lemonade! ðŸ™‚"
1,She bought 2 lemons for $1 at Maven Market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon â€” there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]"
7,iced tea is my favorite


In [9]:
# lowercase
df['sentence_clean'] = df['sentence'].str.lower()

In [10]:
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! ðŸ™‚","when life gives you lemons, make lemonade! ðŸ™‚"
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for $1 at maven market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade. [allrecipes]
3,"lemon, lemon, lemons, lemon, lemon, lemons","lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon â€” there's a great sale today.,he's running to the market to get a lemon â€” there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]","an arnold palmer is half lemonade, half iced tea. [wikipedia]"
7,iced tea is my favorite,iced tea is my favorite


In [11]:
# remove bracketed source tags such as "[allrecipes]":
# \[.*?\] lazily matches from "[" up to the nearest "]"; the space that
# preceded the bracket is left in place
df['sentence_clean'] = df['sentence_clean'].str.replace(r'\[.*?\]', '', regex=True)
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! ðŸ™‚","when life gives you lemons, make lemonade! ðŸ™‚"
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for $1 at maven market.
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade.
3,"lemon, lemon, lemons, lemon, lemon, lemons","lemon, lemon, lemons, lemon, lemon, lemons"
4,He's running to the market to get a lemon â€” there's a great sale today.,he's running to the market to get a lemon â€” there's a great sale today.
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons?
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]","an arnold palmer is half lemonade, half iced tea."
7,iced tea is my favorite,iced tea is my favorite


In [12]:
# remove punctuation: delete every character that is neither a word character
# (\w) nor whitespace (\s); this also strips symbols such as "$", apostrophes,
# the emoji and the dash, as the output below shows
df['sentence_clean'] = df['sentence_clean'].str.replace(r'[^\w\s]', '', regex=True)
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! ðŸ™‚",when life gives you lemons make lemonade
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for 1 at maven market
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade
3,"lemon, lemon, lemons, lemon, lemon, lemons",lemon lemon lemons lemon lemon lemons
4,He's running to the market to get a lemon â€” there's a great sale today.,hes running to the market to get a lemon theres a great sale today
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",an arnold palmer is half lemonade half iced tea
7,iced tea is my favorite,iced tea is my favorite


In [13]:
# Recap of the three cleaning steps, re-run from the raw `sentence` column:
# lowercase, drop bracketed text, then drop punctuation.
# (The next cell wraps the same steps in a reusable function.)
df['sentence_clean'] = df['sentence'].str.lower()
df['sentence_clean'] = df['sentence_clean'].str.replace(r'\[.*?\]', '', regex=True)
df['sentence_clean'] = df['sentence_clean'].str.replace(r'[^\w\s]', '', regex=True)

In [14]:
def lower_replace(text_series):
    """Lowercase a Series of strings and strip bracketed text and punctuation.

    Steps, applied in order to every element:
      1. convert to lowercase;
      2. delete each "[...]" span (lazy match up to the nearest "]");
      3. delete every character that is neither a word character nor
         whitespace.

    Whitespace is not collapsed or trimmed, so spaces that surrounded removed
    text remain in the result.

    Args:
        text_series: pandas Series of strings.

    Returns:
        A new pandas Series with the cleaned strings.
    """
    bracketed_span = r'\[.*?\]'
    non_word_non_space = r'[^\w\s]'
    return (
        text_series.str.lower()
        .str.replace(bracketed_span, '', regex=True)
        .str.replace(non_word_non_space, '', regex=True)
    )

In [15]:
test_series

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

In [16]:
lower_replace(test_series)

0    were going to start this course with traditional nlp applications
1                               then well move on to modern nlp theory
2             finally well wrap things up with modern nlp applications
dtype: object

In [17]:
df.sentence

0                               When life gives you lemons, make lemonade! ðŸ™‚
1                                She bought 2 lemons for $1 at Maven Market.
2                A dozen lemons will make a gallon of lemonade. [AllRecipes]
3                                 lemon, lemon, lemons, lemon, lemon, lemons
4    He's running to the market to get a lemon â€” there's a great sale today.
5                     Does Maven Market carry Eureka lemons or Meyer lemons?
6              An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]
7                                                    iced tea is my favorite
Name: sentence, dtype: object

In [18]:
lower_replace(df.sentence)

0                              when life gives you lemons make lemonade 
1                              she bought 2 lemons for 1 at maven market
2                         a dozen lemons will make a gallon of lemonade 
3                                  lemon lemon lemons lemon lemon lemons
4    hes running to the market to get a lemon  theres a great sale today
5                  does maven market carry eureka lemons or meyer lemons
6                       an arnold palmer is half lemonade half iced tea 
7                                                iced tea is my favorite
Name: sentence, dtype: object

### 2. Text Preprocessing with spaCy

In [19]:
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! ðŸ™‚",when life gives you lemons make lemonade
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for 1 at maven market
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade
3,"lemon, lemon, lemons, lemon, lemon, lemons",lemon lemon lemons lemon lemon lemons
4,He's running to the market to get a lemon â€” there's a great sale today.,hes running to the market to get a lemon theres a great sale today
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",an arnold palmer is half lemonade half iced tea
7,iced tea is my favorite,iced tea is my favorite


In [21]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [22]:
phrase = df.sentence_clean[0]
phrase

'when life gives you lemons make lemonade '

In [23]:
doc = nlp(phrase)
doc

when life gives you lemons make lemonade 

In [25]:
# tokenize
[token.text for token in doc]

['when', 'life', 'gives', 'you', 'lemons', 'make', 'lemonade']

In [26]:
# lemmatize
[token.lemma_ for token in doc]

['when', 'life', 'give', 'you', 'lemon', 'make', 'lemonade']

In [29]:
# stop words
# NOTE: the value of the next line is discarded — a notebook cell only displays
# its final expression (`norm`), so this stop-word preview never appears.
list(nlp.Defaults.stop_words)[:10]
# keep the lemma of every token that is not flagged as a stop word
norm = [token.lemma_ for token in doc if not token.is_stop]
norm

['life', 'give', 'lemon', 'lemonade']

In [30]:
' '.join(norm)

'life give lemon lemonade'

In [31]:
def token_lemma_nonstop(text):
    """Return the space-joined lemmas of the non-stop-word tokens in ``text``.

    The text is parsed with the notebook-level spaCy pipeline ``nlp``. Stop
    words and whitespace-only tokens are dropped. Punctuation tokens are kept,
    so strip punctuation beforehand if it is unwanted.

    Args:
        text: the string to process.

    Returns:
        A single string containing the kept lemmas separated by one space.
    """
    doc = nlp(text)
    lemmas = [
        token.lemma_
        for token in doc
        # Runs of extra spaces are emitted as their own tokens; keeping them
        # would inject blank "lemmas" (and doubled spaces) into the result.
        if not token.is_stop and not token.is_space
    ]
    return ' '.join(lemmas)

In [32]:
test_series

0    We're going to start this course with traditional NLP applications.
1                               Then we'll move on to modern NLP theory.
2            Finally, we'll wrap things up with modern NLP applications.
dtype: object

In [33]:
test_series.apply(token_lemma_nonstop)

0    go start course traditional NLP application .
1                              modern NLP theory .
2    finally , wrap thing modern NLP application .
dtype: object

In [34]:
df.sentence

0                               When life gives you lemons, make lemonade! ðŸ™‚
1                                She bought 2 lemons for $1 at Maven Market.
2                A dozen lemons will make a gallon of lemonade. [AllRecipes]
3                                 lemon, lemon, lemons, lemon, lemon, lemons
4    He's running to the market to get a lemon â€” there's a great sale today.
5                     Does Maven Market carry Eureka lemons or Meyer lemons?
6              An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]
7                                                    iced tea is my favorite
Name: sentence, dtype: object

In [37]:
lower_replace(df.sentence)

0                              when life gives you lemons make lemonade 
1                              she bought 2 lemons for 1 at maven market
2                         a dozen lemons will make a gallon of lemonade 
3                                  lemon lemon lemons lemon lemon lemons
4    hes running to the market to get a lemon  theres a great sale today
5                  does maven market carry eureka lemons or meyer lemons
6                       an arnold palmer is half lemonade half iced tea 
7                                                iced tea is my favorite
Name: sentence, dtype: object

In [36]:
lower_replace(df.sentence).apply(token_lemma_nonstop)

0                       life give lemon lemonade
1                     buy 2 lemon 1 maven market
2                    dozen lemon gallon lemonade
3            lemon lemon lemon lemon lemon lemon
4        s run market lemon   s great sale today
5    maven market carry eureka lemon meyer lemon
6       arnold palmer half lemonade half ice tea
7                               ice tea favorite
Name: sentence, dtype: object

In [38]:
# Parts of Speech (POS) tagging
phrase2 = lower_replace(df.sentence).apply(token_lemma_nonstop)[0]
phrase2

'life give lemon lemonade'

In [39]:
doc2 = nlp(phrase2)
doc2

life give lemon lemonade

In [41]:
[(token.text, token.pos_) for token in doc2]

[('life', 'NOUN'), ('give', 'VERB'), ('lemon', 'NOUN'), ('lemonade', 'PROPN')]

In [43]:
[(token.text, token.pos_) for token in doc2 if token.pos_ in ['NOUN', 'PROPN']]

[('life', 'NOUN'), ('lemon', 'NOUN'), ('lemonade', 'PROPN')]

In [46]:
nouns = [token.text for token in doc2 if token.pos_ in ['NOUN', 'PROPN']]
nouns

['life', 'lemon', 'lemonade']

In [47]:
' '.join(nouns)

'life lemon lemonade'

In [49]:
def filter_pos(text, pos_list=('NOUN', 'PROPN')):
    """Keep only the tokens of ``text`` whose part-of-speech tag is wanted.

    The text is parsed with the notebook-level spaCy pipeline ``nlp`` and each
    token's coarse tag (``token.pos_``) is checked against ``pos_list``.

    Args:
        text: the string to process.
        pos_list: collection of tags to keep. Any container supporting ``in``
            works (list, tuple, set). The default is an immutable tuple so the
            default value cannot be altered between calls.

    Returns:
        A single string containing the original text of the kept tokens,
        separated by one space.
    """
    doc = nlp(text)
    kept_tokens = [token.text for token in doc if token.pos_ in pos_list]
    return ' '.join(kept_tokens)

In [50]:
test_series.apply(filter_pos)

0    course NLP applications
1                 NLP theory
2    things NLP applications
dtype: object

In [51]:
df

Unnamed: 0,sentence,sentence_clean
0,"When life gives you lemons, make lemonade! ðŸ™‚",when life gives you lemons make lemonade
1,She bought 2 lemons for $1 at Maven Market.,she bought 2 lemons for 1 at maven market
2,A dozen lemons will make a gallon of lemonade. [AllRecipes],a dozen lemons will make a gallon of lemonade
3,"lemon, lemon, lemons, lemon, lemon, lemons",lemon lemon lemons lemon lemon lemons
4,He's running to the market to get a lemon â€” there's a great sale today.,hes running to the market to get a lemon theres a great sale today
5,Does Maven Market carry Eureka lemons or Meyer lemons?,does maven market carry eureka lemons or meyer lemons
6,"An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]",an arnold palmer is half lemonade half iced tea
7,iced tea is my favorite,iced tea is my favorite


In [54]:
df.sentence

0                               When life gives you lemons, make lemonade! ðŸ™‚
1                                She bought 2 lemons for $1 at Maven Market.
2                A dozen lemons will make a gallon of lemonade. [AllRecipes]
3                                 lemon, lemon, lemons, lemon, lemon, lemons
4    He's running to the market to get a lemon â€” there's a great sale today.
5                     Does Maven Market carry Eureka lemons or Meyer lemons?
6              An Arnold Palmer is half lemonade, half iced tea. [Wikipedia]
7                                                    iced tea is my favorite
Name: sentence, dtype: object

In [53]:
lower_replace(df.sentence)

0                              when life gives you lemons make lemonade 
1                              she bought 2 lemons for 1 at maven market
2                         a dozen lemons will make a gallon of lemonade 
3                                  lemon lemon lemons lemon lemon lemons
4    hes running to the market to get a lemon  theres a great sale today
5                  does maven market carry eureka lemons or meyer lemons
6                       an arnold palmer is half lemonade half iced tea 
7                                                iced tea is my favorite
Name: sentence, dtype: object

In [55]:
lower_replace(df.sentence).apply(token_lemma_nonstop)

0                       life give lemon lemonade
1                     buy 2 lemon 1 maven market
2                    dozen lemon gallon lemonade
3            lemon lemon lemon lemon lemon lemon
4        s run market lemon   s great sale today
5    maven market carry eureka lemon meyer lemon
6       arnold palmer half lemonade half ice tea
7                               ice tea favorite
Name: sentence, dtype: object

In [56]:
lower_replace(df.sentence).apply(token_lemma_nonstop).apply(filter_pos)

0                    life lemon lemonade
1                     lemon maven market
2            dozen lemon gallon lemonade
3    lemon lemon lemon lemon lemon lemon
4              s market lemon sale today
5        maven market eureka lemon lemon
6           palmer lemonade half ice tea
7                       ice tea favorite
Name: sentence, dtype: object

In [57]:
# notice that by switching the order of the normalization steps, we get different results
lower_replace(df.sentence).apply(filter_pos).apply(token_lemma_nonstop)

0              life lemon lemonade
1               lemon maven market
2      dozen lemon gallon lemonade
3    lemon lemon lemon lemon lemon
4          market lemon sale today
5        market eureka lemon lemon
6              palmer lemonade tea
7                              tea
Name: sentence, dtype: object