In [1]:
import pandas as pd

import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import warnings
warnings.filter="ignore"

import acquire

from bs4 import BeautifulSoup


### Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Replace anything that is not a letter, number, whitespace or a single quote.


In [2]:
urls = ['https://codeup.com/codeups-data-science-career-accelerator-is-here/', 'https://codeup.com/data-science-myths/', 
        'https://codeup.com/data-science-vs-data-analytics-whats-the-difference/', 'https://codeup.com/10-tips-to-crush-it-at-the-sa-tech-job-fair/',
        'https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/']
acquire.get_blog_articles(urls)

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


In [3]:
def basic_clean(text):
    '''
    This function takes in a string of text and cleans it for NLP by:
    - converting all chracters to lowercase
    - normalizing unicode characters
    - removing any characters that are not letters, numbers, single quote, or space
    
    It returns a cleaned text string.
    '''
    
    #lowercase all characters
    text = text.lower()
    
    #normalize unicode characters
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore')\
    .decode('utf-8', 'ignore')
    
    #remove any characters that are not a letter, number, or single quote
    text = re.sub(r"[^a-z0-9'\s]", '', text)
    text = re.sub(r"\n", '', text)
    
    return text

In [4]:
blog = acquire.get_codeup_blog('https://codeup.com/competitor-bootcamps-are-closing-is-the-model-in-danger/')
blog


{'title': 'Competitor Bootcamps Are Closing. Is the Model in Danger?',
 'content': 'Competitor Bootcamps Are Closing. Is the Model in Danger?\n\xa0\n\nIs the programming bootcamp model in danger?\nIn recent news, DevBootcamp and The Iron Yard announced that they are closing their doors. This is big news. DevBootcamp was the first programming bootcamp model and The Iron Yard is a national player with 15 campuses across the U.S. In both cases, the companies cited an unsustainable business model. Does that mean the boot-camp model is dead?\n\ntl;dr “Nope!”\nBootcamps exist because traditional education models have failed to provide students job-ready skills for the 21st century. Students demand better employment options from their education. Employers demand skilled and job ready candidates. Big Education’s failure to meet those needs through traditional methods created the fertile ground for the new business model of the programming bootcamp.\nEducation giant Kaplan and Apollo Education 

In [5]:
blog = basic_clean(blog['content'])
blog

'competitor bootcamps are closing is the model in danger is the programming bootcamp model in dangerin recent news devbootcamp and the iron yard announced that they are closing their doors this is big news devbootcamp was the first programming bootcamp model and the iron yard is a national player with 15 campuses across the us in both cases the companies cited an unsustainable business model does that mean the bootcamp model is deadtldr nopebootcamps exist because traditional education models have failed to provide students jobready skills for the 21st century students demand better employment options from their education employers demand skilled and job ready candidates big educations failure to meet those needs through traditional methods created the fertile ground for the new business model of the programming bootcampeducation giant kaplan and apollo education group owner of university of phoenix bought their way into this new educational model when they purchased the iron yard and 

### Define a function named tokenize. It should take in a string and tokenize all the words in the string.



In [6]:
def tokenize(text):
    '''
    This function takes in a single arguement, a string 
    and prepares it for NLP by tokenizing the words.
    
    It returns a string. 
    '''
    
    #Create the tokenizer object
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    #Use the tokenizer
    text = tokenizer.tokenize(text, return_str = True)
    
    return text

In [7]:

blog = tokenize(blog)
blog

'competitor bootcamps are closing is the model in danger is the programming bootcamp model in dangerin recent news devbootcamp and the iron yard announced that they are closing their doors this is big news devbootcamp was the first programming bootcamp model and the iron yard is a national player with 15 campuses across the us in both cases the companies cited an unsustainable business model does that mean the bootcamp model is deadtldr nopebootcamps exist because traditional education models have failed to provide students jobready skills for the 21st century students demand better employment options from their education employers demand skilled and job ready candidates big educations failure to meet those needs through traditional methods created the fertile ground for the new business model of the programming bootcampeducation giant kaplan and apollo education group owner of university of phoenix bought their way into this new educational model when they purchased the iron yard and 

### Define a function named stem. It should accept some text and return the text after applying stemming to all the words.



In [8]:
def stem(text):
    '''
    This function takes in a string as an arguement
    and stems the words for NLP.
    It returns a single string of the stemmed words. 
    '''
    #create the porter stemmer
    ps = nltk.porter.PorterStemmer()
    
    #Apply the stemmer to each word in string
    stems = [ps.stem(word) for word in text.split()]
    
    #Join the stemmed list of words back into a string
    text_stemmed = ' '.join(stems)
    
    return text_stemmed

In [9]:
text_stemmed = stem(blog)
text_stemmed

'competitor bootcamp are close is the model in danger is the program bootcamp model in dangerin recent news devbootcamp and the iron yard announc that they are close their door thi is big news devbootcamp wa the first program bootcamp model and the iron yard is a nation player with 15 campus across the us in both case the compani cite an unsustain busi model doe that mean the bootcamp model is deadtldr nopebootcamp exist becaus tradit educ model have fail to provid student jobreadi skill for the 21st centuri student demand better employ option from their educ employ demand skill and job readi candid big educ failur to meet those need through tradit method creat the fertil ground for the new busi model of the program bootcampeduc giant kaplan and apollo educ group owner of univers of phoenix bought their way into thi new educ model when they purchas the iron yard and devbootcamp they purchas their competit with the intent to scale up the model unfortun big educ is too habitu to come up 

### Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [10]:
def lemmatize(text):
    '''
    This function takes in a string of text as
    an arguement and lemmatizes the words for NLP.
    It returns a single single string of the lemmatized words.
    '''
    #create the word nest list
    wnl = nltk.stem.WordNetLemmatizer()
    
    lemmas = [wnl.lemmatize(word) for word in text.split()]
    
    text_lemmatized = ' '.join(lemmas)
    
    return text_lemmatized

In [11]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/carlg/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
text_lemmatized = lemmatize(blog)
text_lemmatized

'competitor bootcamps are closing is the model in danger is the programming bootcamp model in dangerin recent news devbootcamp and the iron yard announced that they are closing their door this is big news devbootcamp wa the first programming bootcamp model and the iron yard is a national player with 15 campus across the u in both case the company cited an unsustainable business model doe that mean the bootcamp model is deadtldr nopebootcamps exist because traditional education model have failed to provide student jobready skill for the 21st century student demand better employment option from their education employer demand skilled and job ready candidate big education failure to meet those need through traditional method created the fertile ground for the new business model of the programming bootcampeducation giant kaplan and apollo education group owner of university of phoenix bought their way into this new educational model when they purchased the iron yard and devbootcamp they pu

### Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

- This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.



In [13]:
def remove_stopwords(text, extra_words=[], exclude_words=[]):
    '''
    This function takes in three arguements:
    1. A string
    2. extra_words=[] that should also be removed in addition to the std. stopwords.
    3. exclude_words=[] to signify std. stopwords that should not be removed.
    
    It returns a string with stopwords removed.
    '''
    stopword_list = stopwords.words('english')
    
    if len(extra_words) > 0:
        stopword_list.append(extra_words)
    else:
        stopword_list = stopword_list
        
    if len(exclude_words) > 0:
        stopword_list.remove(exclude_words)
    
    words = text.split()
    
    filtered_words = [word for word in words if word not in stopword_list]
    
    text_without_stopwords = ' '.join(filtered_words)
    
    return text_without_stopwords

In [14]:
text_without_stopwords = remove_stopwords(text_lemmatized, extra_words=['model', 'iron'])
text_without_stopwords

'competitor bootcamps closing model danger programming bootcamp model dangerin recent news devbootcamp iron yard announced closing door big news devbootcamp wa first programming bootcamp model iron yard national player 15 campus across u case company cited unsustainable business model doe mean bootcamp model deadtldr nopebootcamps exist traditional education model failed provide student jobready skill 21st century student demand better employment option education employer demand skilled job ready candidate big education failure meet need traditional method created fertile ground new business model programming bootcampeducation giant kaplan apollo education group owner university phoenix bought way new educational model purchased iron yard devbootcamp purchased competition intent scale model unfortunately big education habituated coming short student bought upstart challenged tried making change run bootcamps big education way sadly theyve closed door realized scaling education challeng

### Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe news_df.



In [15]:
categories = ["business", "sports", "technology", "entertainment", "science", "world"]
news_df = acquire.get_all_news_articles(categories)

In [16]:
news_df

Unnamed: 0,title,content,category
0,China's ex-teacher turned billionaire no more ...,"China's Larry Chen, a former teacher who becam...",business
1,Amazon job posting fuels speculations about pl...,A new job posting by Amazon has fuelled specul...,business
2,"Musk takes a jibe at rival car companies, says...",Tesla CEO and the world's second-richest perso...,business
3,Govt paid Infosys ₹164.5 crore for new Income ...,The government paid ₹164.5 crore to Infosys to...,business
4,Mahua Moitra writes to FM to look into 'over-i...,Lok Sabha MP Mahua Moitra has shared a letter ...,business
...,...,...,...
142,Lebanese lawmakers pick billionaire Najib Mika...,Lebanese lawmakers during parliamentary consul...,world
143,Equatorial Guinea to close UK embassy over san...,Equatorial Guinea's Foreign Minister said that...,world
144,US offers further air support to Afghan troops...,The US will continue to carry out airstrikes a...,world
145,46 Afghan soldiers flee to Pakistan in retreat...,The Pakistani Army on Monday said that 46 Afgh...,world


### Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.



In [17]:
codeup_df = acquire.get_blog_articles(urls)
codeup_df

Unnamed: 0,title,content
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...


### For each dataframe, produce the following columns:

- title to hold the title
- original to hold the original article/post content
- clean to hold the normalized and tokenized original with the stopwords removed.
- stemmed to hold the stemmed version of the cleaned data.
- lemmatized to hold the lemmatized version of the cleaned data.


In [20]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    # create column with text cleaned
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    # basic clean, tokenize, remove_stopwords, and stem text
    df['stemmed'] = df[column].apply(basic_clean).apply(tokenize).apply(remove_stopwords).apply(stem)
    # basic clean, tokenize, remove_stopwords, and lemmatize text
    df['lemmatized'] = df[column].apply(basic_clean).apply(tokenize).apply(remove_stopwords).apply(lemmatize)
    # return only exercise requested columns
    return df[['title', column, 'clean', 'stemmed', 'lemmatized']]

In [21]:
codeup_df['clean'] = prep_article_data(codeup_df, 'content')
codeup_df

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...,Codeup’s Data Science Career Accelerator is Here!,rumor true time arriv codeup offici open appli...,rumor true time arrived codeup officially open...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...,Data Science Myths,dimitri antoni maggi giustdata scienc big data...,dimitri antoniou maggie giustdata science big ...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",Data Science VS Data Analytics: What’s The Dif...,dimitri antonioua week ago codeup launch immer...,dimitri antonioua week ago codeup launched imm...
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...,10 Tips to Crush It at the SA Tech Job Fair,sa tech job fairth third biannual san antonio ...,sa tech job fairthe third biannual san antonio...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamp close model danger program...,competitor bootcamps closing model danger prog...


In [22]:
#stemmed to hold the stemmed version of the cleaned data.
codeup_df['stemmed'] = codeup_df['clean'].apply(lambda x: stem(x))
codeup_df

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,Codeup’s Data Science Career Accelerator is Here!,The rumors are true! The time has arrived. Cod...,Codeup’s Data Science Career Accelerator is Here!,codeup’ data scienc career acceler is here!,rumor true time arrived codeup officially open...
1,Data Science Myths,By Dimitri Antoniou and Maggie Giust\nData Sci...,Data Science Myths,data scienc myth,dimitri antoniou maggie giustdata science big ...
2,Data Science VS Data Analytics: What’s The Dif...,"By Dimitri Antoniou\nA week ago, Codeup launch...",Data Science VS Data Analytics: What’s The Dif...,data scienc VS data analytics: what’ the diffe...,dimitri antonioua week ago codeup launched imm...
3,10 Tips to Crush It at the SA Tech Job Fair,SA Tech Job Fair\nThe third bi-annual San Anto...,10 Tips to Crush It at the SA Tech Job Fair,10 tip to crush It at the SA tech job fair,sa tech job fairthe third biannual san antonio...
4,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,Competitor Bootcamps Are Closing. Is the Model...,competitor bootcamp are closing. Is the model ...,competitor bootcamps closing model danger prog...


In [23]:
news_df['clean'] = prep_article_data(news_df, 'content')
news_df

Unnamed: 0,title,content,category,clean,stemmed,lemmatized
0,China's ex-teacher turned billionaire no more ...,"China's Larry Chen, a former teacher who becam...",business,China's ex-teacher turned billionaire no more ...,china ' larri chen former teacher becam billio...,china ' larry chen former teacher became billi...
1,Amazon job posting fuels speculations about pl...,A new job posting by Amazon has fuelled specul...,business,Amazon job posting fuels speculations about pl...,new job post amazon fuell specul ecommerc majo...,new job posting amazon fuelled speculation eco...
2,"Musk takes a jibe at rival car companies, says...",Tesla CEO and the world's second-richest perso...,business,"Musk takes a jibe at rival car companies, says...",tesla ceo world ' secondrichest person elon mu...,tesla ceo world ' secondrichest person elon mu...
3,Govt paid Infosys ₹164.5 crore for new Income ...,The government paid ₹164.5 crore to Infosys to...,business,Govt paid Infosys ₹164.5 crore for new Income ...,govern paid 1645 crore infosi build new incom ...,government paid 1645 crore infosys build new i...
4,Mahua Moitra writes to FM to look into 'over-i...,Lok Sabha MP Mahua Moitra has shared a letter ...,business,Mahua Moitra writes to FM to look into 'over-i...,lok sabha mp mahua moitra share letter wrote f...,lok sabha mp mahua moitra shared letter wrote ...
...,...,...,...,...,...,...
142,Lebanese lawmakers pick billionaire Najib Mika...,Lebanese lawmakers during parliamentary consul...,world,Lebanese lawmakers pick billionaire Najib Mika...,lebanes lawmak parliamentari consult monday pi...,lebanese lawmaker parliamentary consultation m...
143,Equatorial Guinea to close UK embassy over san...,Equatorial Guinea's Foreign Minister said that...,world,Equatorial Guinea to close UK embassy over san...,equatori guinea ' foreign minist said countri ...,equatorial guinea ' foreign minister said coun...
144,US offers further air support to Afghan troops...,The US will continue to carry out airstrikes a...,world,US offers further air support to Afghan troops...,us continu carri airstrik taliban support afgh...,u continue carry airstrikes taliban support af...
145,46 Afghan soldiers flee to Pakistan in retreat...,The Pakistani Army on Monday said that 46 Afgh...,world,46 Afghan soldiers flee to Pakistan in retreat...,pakistani armi monday said 46 afghan soldier g...,pakistani army monday said 46 afghan soldier g...


In [24]:
#stemmed to hold the stemmed version of the cleaned data.
news_df['stemmed'] = news_df['clean'].apply(lambda x: stem(x))
news_df

Unnamed: 0,title,content,category,clean,stemmed,lemmatized
0,China's ex-teacher turned billionaire no more ...,"China's Larry Chen, a former teacher who becam...",business,China's ex-teacher turned billionaire no more ...,china' ex-teach turn billionair no more a bill...,china ' larry chen former teacher became billi...
1,Amazon job posting fuels speculations about pl...,A new job posting by Amazon has fuelled specul...,business,Amazon job posting fuels speculations about pl...,amazon job post fuel specul about plan to acce...,new job posting amazon fuelled speculation eco...
2,"Musk takes a jibe at rival car companies, says...",Tesla CEO and the world's second-richest perso...,business,"Musk takes a jibe at rival car companies, says...","musk take a jibe at rival car companies, say '...",tesla ceo world ' secondrichest person elon mu...
3,Govt paid Infosys ₹164.5 crore for new Income ...,The government paid ₹164.5 crore to Infosys to...,business,Govt paid Infosys ₹164.5 crore for new Income ...,govt paid infosi ₹164.5 crore for new incom ta...,government paid 1645 crore infosys build new i...
4,Mahua Moitra writes to FM to look into 'over-i...,Lok Sabha MP Mahua Moitra has shared a letter ...,business,Mahua Moitra writes to FM to look into 'over-i...,mahua moitra write to FM to look into 'over-in...,lok sabha mp mahua moitra shared letter wrote ...
...,...,...,...,...,...,...
142,Lebanese lawmakers pick billionaire Najib Mika...,Lebanese lawmakers during parliamentary consul...,world,Lebanese lawmakers pick billionaire Najib Mika...,lebanes lawmak pick billionair najib mikati as...,lebanese lawmaker parliamentary consultation m...
143,Equatorial Guinea to close UK embassy over san...,Equatorial Guinea's Foreign Minister said that...,world,Equatorial Guinea to close UK embassy over san...,equatori guinea to close UK embassi over sanct...,equatorial guinea ' foreign minister said coun...
144,US offers further air support to Afghan troops...,The US will continue to carry out airstrikes a...,world,US offers further air support to Afghan troops...,US offer further air support to afghan troop a...,u continue carry airstrikes taliban support af...
145,46 Afghan soldiers flee to Pakistan in retreat...,The Pakistani Army on Monday said that 46 Afgh...,world,46 Afghan soldiers flee to Pakistan in retreat...,46 afghan soldier flee to pakistan in retreat ...,pakistani army monday said 46 afghan soldier g...


### Ask yourself:

- If your corpus is 493KB, would you prefer to use stemmed or lemmatized text?
    - lemmatized
- If your corpus is 25MB, would you prefer to use stemmed or lemmatized text?
    - either
- If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text?
    - stemmed