# NLP Exercises: Prepare

In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

import pandas as pd

import acquire

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/brockgreen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/brockgreen/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brockgreen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
article = "Coming into our Data Science program, you will need to know some math and \
stats. However, many of our applicants actually learn in the application process – you \
don’t need to be an expert before applying! Data science is a very accessible field to \
anyone dedicated to learning new skills, and we can work with any applicant to help them \
learn what they need to know. But what “skills” do we mean, exactly? Just what exactly \
are the data science math and stats principles you need to know?', 'What are the main \
math principles you need to know to get into Codeup’s Data Science program?'"

### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

 - Lowercase everything
 - Normalize unicode characters
 - Replace anything that is not a letter, number, whitespace or a single quote.

In [3]:
def basic_clean(str):
    '''
    This function takes in a string and returns the string normalized.
    '''
    # Lowercase everything
    str = str.lower()
    # Normalize unicode characters
    str = unicodedata.normalize('NFKD', str)\
        .encode('ascii', 'ignore')\
        .decode('utf-8', 'ignore')
    # Remove anything that is not a letter, number, or whitespace
    str = re.sub(r"[^a-z0-9\s]", '', str)
    return str

In [4]:
article_clean = basic_clean(article)
len(article_clean)

567

### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [5]:
def tokenize(str):
    '''
    This function takes in a string and returns it tokenized.
    '''
    # Create the tokenizer
    tokenizer = nltk.tokenize.ToktokTokenizer()
    # Use the tokenizer
    str = tokenizer.tokenize(str, return_str=True)
    return str

In [6]:
article_tokens = tokenize(article_clean)
len(article_tokens)

566

### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [7]:
def stem(str):
    '''
    This function takes in a string and returns it stemmed.
    '''
    # Create the porter stemmer
    ps = nltk.porter.PorterStemmer()
    # Use the stemmer on each token
    stems = [ps.stem(word) for word in str.split()]
    # Join the stemmed list of words back into a string
    article_stemmed = ' '.join(stems)
    return article_stemmed, stems

In [8]:
article_stemmed, stems = stem(article_tokens)
article_stemmed

'come into our data scienc program you will need to know some math and stat howev mani of our applic actual learn in the applic process you dont need to be an expert befor appli data scienc is a veri access field to anyon dedic to learn new skill and we can work with ani applic to help them learn what they need to know but what skill do we mean exactli just what exactli are the data scienc math and stat principl you need to know what are the main math principl you need to know to get into codeup data scienc program'

In [9]:
pd.Series(stems).value_counts().head(10)

to        9
need      5
data      4
scienc    4
you       4
what      4
know      4
applic    3
the       3
math      3
Name: count, dtype: int64

### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [10]:
def lemmatize(str):
    '''
    This function takes in a string and returns it lemmatized.
    '''
    # Create the Lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    # Use the lemmatizer on each token
    lemmas = [wnl.lemmatize(word) for word in str.split()]
    # Join the lemmatized list of words back into a string
    article_lemmatized = ' '.join(lemmas)
    return article_lemmatized, lemmas

In [11]:
article_lemmatized, lemmas = lemmatize(article_tokens)

In [12]:
pd.Series(lemmas).value_counts().head(10)

to         9
need       5
what       4
data       4
science    4
you        4
know       4
the        3
math       3
and        3
Name: count, dtype: int64

### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.


This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [13]:
def remove_stopwords(str, extra_words=[], exclude_words=[]):
    '''
    This function takes in a string and returns it after removing stopwords.
    '''
    # Create stopword_list
    stopword_list = stopwords.words('english')
    # Remove 'exclude_words' from stopword_list to keep these in my text
    stopword_list = set(stopword_list) - set(exclude_words)
    # Add in 'extra_words' to stopword_list
    stopword_list = stopword_list.union(set(extra_words))
    # Split words in str
    words = str.split()
    # Create a list of words from my string with stopwords removed and assign to variable
    filtered_words = [w for w in words if w not in stopword_list]
    # Join words in the list back into strings and assign to a variable
    article_without_stopwords = ' '.join(filtered_words)
    return article_without_stopwords


In [14]:
article_without_stopwords = remove_stopwords(article_lemmatized)
print(article_without_stopwords)

coming data science program need know math stats however many applicant actually learn application process dont need expert applying data science accessible field anyone dedicated learning new skill work applicant help learn need know skill mean exactly exactly data science math stats principle need know main math principle need know get codeups data science program


### 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.

In [15]:
news_df = acquire.get_news_articles()
news_df.head()

Grabbing contents for business.
Time to grab contents of business: 1.58 seconds
Grabbing contents for entertainment.
Time to grab contents of entertainment: 1.53 seconds
Grabbing contents for technology.
Time to grab contents of technology: 1.25 seconds
Grabbing contents for sports.
Time to grab contents of sports: 0.93 seconds
Job finished!
It took 0.09 minutes to execute scraping


Unnamed: 0,title,body,category
0,Jet Airways Founder Naresh Goyal allowed home ...,A special PMLA court in Mumbai has allowed Jet...,business
1,"Wipro's staff count falls for 4th quarter, att...",Wipro reported a fall in its employee headcoun...,business
2,"Wipro's revenue in Q2 falls to ₹22,516 cr, pro...",IT major Wipro reported a 0.1% fall in its rev...,business
3,"Will march them off to Tihar, they'll know SC'...",Chief Justice of India (CJI) DY Chandrachud re...,business
4,Goldman Sachs CEO David Solomon quits working ...,"David Solomon, the 61-year-old CEO of investme...",business


### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [16]:
codeup_df = acquire.get_blog_articles()
codeup_df.head()

Unnamed: 0,title,article
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...


### 8. For each dataframe, produce the following columns:

 - title to hold the title
 - original to hold the original article/post content
 - clean to hold the normalized and tokenized original with the stopwords removed.
 - stemmed to hold the stemmed version of the cleaned data.
 - lemmatized to hold the lemmatized version of the cleaned data.

In [17]:
# rename article column to original
news_df.rename(columns={'body': 'original'}, inplace=True)
codeup_df.rename(columns={'article': 'original'}, inplace=True)

# add 'clean' column to each df
news_df['clean'] = news_df.original.apply(basic_clean).apply(tokenize).apply(remove_stopwords)
codeup_df['clean'] = codeup_df.original.apply(basic_clean).apply(tokenize).apply(remove_stopwords)

# add 'stemmed' column to each df
news_df['stemmed'] = news_df.clean.apply(stem)
codeup_df['stemmed'] = codeup_df.clean.apply(stem)

# add 'lemmatized' column to each df
news_df['lemmatized'] = news_df.clean.apply(lemmatize)
codeup_df['lemmatized'] = codeup_df.clean.apply(lemmatize)

news_df.head()

Unnamed: 0,title,original,category,clean,stemmed,lemmatized
0,Jet Airways Founder Naresh Goyal allowed home ...,A special PMLA court in Mumbai has allowed Jet...,business,special pmla court mumbai allowed jet airways ...,(special pmla court mumbai allow jet airway fo...,(special pmla court mumbai allowed jet airway ...
1,"Wipro's staff count falls for 4th quarter, att...",Wipro reported a fall in its employee headcoun...,business,wipro reported fall employee headcount fourth ...,(wipro report fall employe headcount fourth co...,(wipro reported fall employee headcount fourth...
2,"Wipro's revenue in Q2 falls to ₹22,516 cr, pro...",IT major Wipro reported a 0.1% fall in its rev...,business,major wipro reported 01 fall revenue julysepte...,(major wipro report 01 fall revenu julyseptemb...,(major wipro reported 01 fall revenue julysept...
3,"Will march them off to Tihar, they'll know SC'...",Chief Justice of India (CJI) DY Chandrachud re...,business,chief justice india cji dy chandrachud reprima...,(chief justic india cji dy chandrachud reprima...,(chief justice india cji dy chandrachud reprim...
4,Goldman Sachs CEO David Solomon quits working ...,"David Solomon, the 61-year-old CEO of investme...",business,david solomon 61yearold ceo investment banking...,(david solomon 61yearold ceo invest bank firm ...,(david solomon 61yearold ceo investment bankin...


In [18]:
codeup_df.head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,Spotlight on APIDA Voices: Celebrating Heritag...,May is traditionally known as Asian American a...,may traditionally known asian american pacific...,(may tradit known asian american pacif island ...,(may traditionally known asian american pacifi...
1,Women in tech: Panelist Spotlight – Magdalena ...,Women in tech: Panelist Spotlight – Magdalena ...,women tech panelist spotlight magdalena rahn c...,(women tech panelist spotlight magdalena rahn ...,(woman tech panelist spotlight magdalena rahn ...
2,Women in tech: Panelist Spotlight – Rachel Rob...,Women in tech: Panelist Spotlight – Rachel Rob...,women tech panelist spotlight rachel robbinsma...,(women tech panelist spotlight rachel robbinsm...,(woman tech panelist spotlight rachel robbinsm...
3,Women in Tech: Panelist Spotlight – Sarah Mellor,Women in tech: Panelist Spotlight – Sarah Mell...,women tech panelist spotlight sarah mellor cod...,(women tech panelist spotlight sarah mellor co...,(woman tech panelist spotlight sarah mellor co...
4,Women in Tech: Panelist Spotlight – Madeleine ...,Women in tech: Panelist Spotlight – Madeleine ...,women tech panelist spotlight madeleine capper...,(women tech panelist spotlight madelein capper...,(woman tech panelist spotlight madeleine cappe...


### 9. Ask yourself:

 - If your corpus is 493KB, would you prefer to use stemmed or lemmatized text? **Lemmatized**
 - If your corpus is 25MB, would you prefer to use stemmed or lemmatized text? **Lemmatized**
 - If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text? **Stemmed**