In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire
import prepare

#### 1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:
- lowercase everything
- normalize unicode characters
- replace anything that is not a letter, number, whitespace, or single quote with an empty string (i.e. remove it)

In [2]:
def basic_clean(text):
    """
    Apply basic normalization to a string and return the cleaned string.

    Steps, in order:
      1. Decompose unicode characters (NFKD) and drop everything that cannot
         be represented in ASCII (accent marks, symbols, emoji, ...).
      2. Lowercase the result.
      3. Remove every character that is not a lowercase letter, a digit,
         a single quote, or whitespace.

    Lowercasing is done *after* normalization on purpose: NFKD can expand a
    single symbol into uppercase ASCII letters (for example the numero sign
    becomes "No" and the trademark sign becomes "TM"). Lowercasing first
    would leave those capitals in place for the regex below to strip.
    """
    text = unicodedata.normalize('NFKD', text)\
        .encode('ascii', 'ignore')\
        .decode('utf-8', 'ignore')
    text = text.lower()
    # Only a-z, 0-9, the single quote and whitespace survive.
    text = re.sub(r"[^a-z0-9'\s]", '', text)
    return text

#### 2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.


In [3]:
def tokenize(text):
    """
    Tokenize a string with NLTK's Toktok tokenizer.

    The tokenizer is called with ``return_str=True``, so the result is a
    single string rather than a list of tokens.
    """
    toktok = ToktokTokenizer()
    tokenized_text = toktok.tokenize(text, return_str=True)
    return tokenized_text

#### 3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.


In [4]:
def stem(text):
    """
    Return ``text`` with every whitespace-separated word replaced by its
    Porter stem, re-joined with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(map(stemmer.stem, text.split()))

#### 4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.


In [5]:
def lemmatize(text):
    """
    Return ``text`` with every whitespace-separated word replaced by its
    WordNet lemma, re-joined with single spaces.

    The lemmatizer is called with the word only, so its default
    part-of-speech setting applies to every word.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, text.split()))

#### 5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [6]:
def remove_stopwords(text, stopword_list=None, extra_words=None, exclude_words=None):
    """
    Return ``text`` with all stopwords removed.

    The text is split on whitespace, words found in the effective stopword
    set are dropped, and the remaining words are re-joined with single spaces.

    Parameters
    ----------
    text : str
        The text to filter.
    stopword_list : iterable of str, optional
        Base collection of stopwords. When omitted, NLTK's English stopword
        list is loaded at call time (not at definition time, so defining the
        function does not require the corpus to be available and no mutable
        default is shared between calls).
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words that must NOT be removed, even if they appear in the base
        list or in ``extra_words``.

    Returns
    -------
    str
        The filtered text.
    """
    if stopword_list is None:
        stopword_list = stopwords.words('english')

    # A set gives constant-time membership checks instead of scanning a list
    # once per word.
    blocked = set(stopword_list)
    blocked.update(extra_words or [])
    # Exclusions are applied last so they always win.
    blocked.difference_update(exclude_words or [])

    return ' '.join(word for word in text.split() if word not in blocked)

#### 6. Use your data from the acquire module to produce a dataframe of the news articles. Name the dataframe news_df.


In [7]:
# Build the news-article dataframe using the project-local acquire module.
news_df = acquire.get_news_articles()

Reading from local CSV...


#### 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.


In [8]:
# Build the Codeup blog-post dataframe using the project-local acquire module.
codeup_df = acquire.get_blog_articles()

Reading from local CSV...


#### 8. For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.



In [9]:
def nlp_prep(df):
    """
    Return a copy of ``df`` prepared for NLP work.

    The ``content`` column is renamed to ``original`` and three text columns
    are added:
      - ``clean``: ``original`` passed through basic_clean, tokenize and
        remove_stopwords, in that order
      - ``stemmed``: ``clean`` passed through stem
      - ``lemmatized``: ``clean`` passed through lemmatize

    The input dataframe itself is not modified.
    """
    return (
        df.rename(columns={'content': 'original'})
          .assign(
              # Later keyword arguments can see columns created by earlier ones,
              # so ``stemmed`` and ``lemmatized`` read the freshly built ``clean``.
              clean=lambda frame: (frame.original
                                   .apply(basic_clean)
                                   .apply(tokenize)
                                   .apply(remove_stopwords)),
              stemmed=lambda frame: frame.clean.apply(stem),
              lemmatized=lambda frame: frame.clean.apply(lemmatize),
          )
    )

#### News Articles

In [10]:
# Preview the first three raw news rows before any text preparation.
news_df.head(3)

Unnamed: 0,title,author,content,category
0,Rupee hits all-time low of 77.42 against US do...,Apaar Sharma,The Indian rupee fell to an all-time low of 77...,business
1,Bitcoin falls to the lowest level since Januar...,Pragya Swastik,"Bitcoin fell on Monday to as low as $33,266 in...",business
2,Made best possible decision: IndiGo on barring...,Pragya Swastik,IndiGo's CEO Ronojoy Dutta said the airline ma...,business


In [11]:
# Rebind news_df to the prepared frame returned by the notebook's nlp_prep
# (content renamed to original, plus clean/stemmed/lemmatized columns).
news_df = nlp_prep(news_df)
news_df.head()

Unnamed: 0,title,author,original,category,clean,stemmed,lemmatized
0,Rupee hits all-time low of 77.42 against US do...,Apaar Sharma,The Indian rupee fell to an all-time low of 77...,business,indian rupee fell alltime low 7742 us dollar m...,indian rupe fell alltim low 7742 us dollar mon...,indian rupee fell alltime low 7742 u dollar mo...
1,Bitcoin falls to the lowest level since Januar...,Pragya Swastik,"Bitcoin fell on Monday to as low as $33,266 in...",business,bitcoin fell monday low 33266 morning trade ne...,bitcoin fell monday low 33266 morn trade near ...,bitcoin fell monday low 33266 morning trade ne...
2,Made best possible decision: IndiGo on barring...,Pragya Swastik,IndiGo's CEO Ronojoy Dutta said the airline ma...,business,indigo ' ceo ronojoy dutta said airline made b...,indigo ' ceo ronojoy dutta said airlin made be...,indigo ' ceo ronojoy dutta said airline made b...
3,India's biggest IPO of LIC subscribed nearly 3...,Pragya Swastik,"LIC's IPO, India's biggest IPO which opened on...",business,lic ' ipo india ' biggest ipo opened may 4 clo...,lic ' ipo india ' biggest ipo open may 4 close...,lic ' ipo india ' biggest ipo opened may 4 clo...
4,"If I die under mysterious circumstances, nice ...",Ridham Gambhir,"Tesla CEO Elon Musk has tweeted, ""If I die und...",business,tesla ceo elon musk tweeted die mysterious cir...,tesla ceo elon musk tweet die mysteri circumst...,tesla ceo elon musk tweeted die mysterious cir...


#### Codeup Articles

In [12]:
# Preview the raw Codeup blog rows before any text preparation.
codeup_df.head()

Unnamed: 0,title,content
0,From Bootcamp to Bootcamp | A Military Appreci...,"In honor of Military Appreciation Month, join ..."
1,Our Acquisition of the Rackspace Cloud Academy...,"Just about a year ago on April 16th, 2021 we a..."
2,Learn to Code: HTML & CSS on 4/30,HTML & CSS are the design building blocks of a...
3,Learn to Code: Python Workshop on 4/23,"According to LinkedIn, the “#1 Most Promising ..."
4,Coming Soon: Cloud Administration,We’re launching a new program out of San Anton...


In [13]:
# Rebind codeup_df to the prepared frame returned by the notebook's nlp_prep
# (content renamed to original, plus clean/stemmed/lemmatized columns).
codeup_df = nlp_prep(codeup_df)
codeup_df.head()

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,From Bootcamp to Bootcamp | A Military Appreci...,"In honor of Military Appreciation Month, join ...",honor military appreciation month join us disc...,honor militari appreci month join us discuss c...,honor military appreciation month join u discu...
1,Our Acquisition of the Rackspace Cloud Academy...,"Just about a year ago on April 16th, 2021 we a...",year ago april 16th 2021 announced acquisition...,year ago april 16th 2021 announc acquisit rack...,year ago april 16th 2021 announced acquisition...
2,Learn to Code: HTML & CSS on 4/30,HTML & CSS are the design building blocks of a...,html css design building blocks websites inter...,html css design build block websit interact da...,html cs design building block website interact...
3,Learn to Code: Python Workshop on 4/23,"According to LinkedIn, the “#1 Most Promising ...",according linkedin 1 promising job data scienc...,accord linkedin 1 promis job data scienc codeu...,according linkedin 1 promising job data scienc...
4,Coming Soon: Cloud Administration,We’re launching a new program out of San Anton...,launching new program san antonio acquisition ...,launch new program san antonio acquisit racksp...,launching new program san antonio acquisition ...


#### Testing the prepare module

In [14]:
# Reload the raw news data so that prepare.nlp_prep starts from unprocessed input
# (news_df was rebound to the prepared frame earlier in the notebook).
news_df = acquire.get_news_articles()
news_df.head(3)

Reading from local CSV...


Unnamed: 0,title,author,content,category
0,Rupee hits all-time low of 77.42 against US do...,Apaar Sharma,The Indian rupee fell to an all-time low of 77...,business
1,Bitcoin falls to the lowest level since Januar...,Pragya Swastik,"Bitcoin fell on Monday to as low as $33,266 in...",business
2,Made best possible decision: IndiGo on barring...,Pragya Swastik,IndiGo's CEO Ronojoy Dutta said the airline ma...,business


In [15]:
# Run the prepare module's nlp_prep on the reloaded news frame and preview the result.
news_df = prepare.nlp_prep(news_df)
news_df.head(3)

Unnamed: 0,title,author,original,category,clean,stemmed,lemmatized
0,Rupee hits all-time low of 77.42 against US do...,Apaar Sharma,The Indian rupee fell to an all-time low of 77...,business,indian rupee fell alltime low 7742 us dollar m...,indian rupe fell alltim low 7742 us dollar mon...,indian rupee fell alltime low 7742 u dollar mo...
1,Bitcoin falls to the lowest level since Januar...,Pragya Swastik,"Bitcoin fell on Monday to as low as $33,266 in...",business,bitcoin fell monday low 33266 morning trade ne...,bitcoin fell monday low 33266 morn trade near ...,bitcoin fell monday low 33266 morning trade ne...
2,Made best possible decision: IndiGo on barring...,Pragya Swastik,IndiGo's CEO Ronojoy Dutta said the airline ma...,business,indigo ' ceo ronojoy dutta said airline made b...,indigo ' ceo ronojoy dutta said airlin made be...,indigo ' ceo ronojoy dutta said airline made b...


In [16]:
# Reload the raw Codeup blog data so that prepare.nlp_prep starts from unprocessed input
# (codeup_df was rebound to the prepared frame earlier in the notebook).
codeup_df = acquire.get_blog_articles()
codeup_df.head(3)

Reading from local CSV...


Unnamed: 0,title,content
0,From Bootcamp to Bootcamp | A Military Appreci...,"In honor of Military Appreciation Month, join ..."
1,Our Acquisition of the Rackspace Cloud Academy...,"Just about a year ago on April 16th, 2021 we a..."
2,Learn to Code: HTML & CSS on 4/30,HTML & CSS are the design building blocks of a...


In [18]:
# Run the prepare module's nlp_prep on the reloaded blog frame and preview the result.
codeup_df = prepare.nlp_prep(codeup_df)
codeup_df.head(3)

Unnamed: 0,title,original,clean,stemmed,lemmatized
0,From Bootcamp to Bootcamp | A Military Appreci...,"In honor of Military Appreciation Month, join ...",honor military appreciation month join us disc...,honor militari appreci month join us discuss c...,honor military appreciation month join u discu...
1,Our Acquisition of the Rackspace Cloud Academy...,"Just about a year ago on April 16th, 2021 we a...",year ago april 16th 2021 announced acquisition...,year ago april 16th 2021 announc acquisit rack...,year ago april 16th 2021 announced acquisition...
2,Learn to Code: HTML & CSS on 4/30,HTML & CSS are the design building blocks of a...,html css design building blocks websites inter...,html css design build block websit interact da...,html cs design building block website interact...


#### 9. Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?

