In [None]:
# <font color="darkgreen">Data Preperation Exercises</font>

The end result of this exercise should be a file named `prepare.py` that defines the requested functions.

In this exercise we will be defining some functions to prepare textual data. These functions should apply equally well to both the codeup blog articles and the news articles that were previously acquired.





In [8]:
# Imports
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

### <font color ="blue"> 1. Define a function named `basic_clean`. It should take in a string and apply some basic text cleaning to it:
* Lowercase everything
* Normalize unicode characters
* Replace anything that is not a letter, number, whitespace or a single quote.</font>
    

In [9]:
original = "Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
original

"Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [10]:
# Lowercase all letters in the text
article =  original.lower()
article


"paul erdős and george pólya are influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

In [11]:
# Remove inconsistencies in unicode character encoding.
# encode the strings into ASCII bytestrings (ignore non-ASCII characters)
# decode the bytestring into (Unicode) string

article = unicodedata.normalize('NFKD', article)\
.encode('ascii', 'ignore')\
.decode('utf-8', 'ignore')


article


"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [12]:
# remove anything that is not a through z, a number, a single quote, or whitespace

article = re.sub(r"[^a-z0-9'\s]", '', article)
article



"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [15]:
def basic_clean(string):
    '''
    This function takes in a string and
    returns the string normalized.
    '''
    string = unicodedata.normalize('NFKC', string)\
             .encode('ascii', 'ignore')\
             .decode('utf-8', 'ignore')
    string = re.sub(r'[^\w\s]', '', string).lower()
    return string

In [16]:
basic_clean(article)

'paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdoss name contains the hungarian letter o o with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity'

### <font color ="blue"> 2. Define a function named `tokenize`. It should take in a string and tokenize all the words in the string.



In [17]:
# Create the tokenizer
tokenizer = nltk.tokenize.ToktokTokenizer()

# Use the tokenizer
tokenizer.tokenize(article, return_str = True)

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [18]:
#Tokenize function
def tokenize(string):
    '''
    This function takes in a string and
    returns a tokenized string.
    '''
    # Create tokenizer.
    tokenizer = nltk.tokenize.ToktokTokenizer()
    
    # Use tokenizer
    string = tokenizer.tokenize(string, return_str=True)
    
    return string


In [19]:
tokenize(article)

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

### <font color ="blue">3. Define a function named `stem`. It should accept some text and return the text after applying stemming to all the words.



In [20]:
# Create porter stemmer.

ps = nltk.porter.PorterStemmer()

In [21]:
# Apply the stemmer to each word in our string.

stems =[ps.stem(word) for word in article.split()]
stems[:10] 



['paul',
 'erdo',
 'and',
 'georg',
 'polya',
 'are',
 'influenti',
 'hungarian',
 'mathematician',
 'who']

In [22]:
# Join our lists of words into a string again

article_stemmed =  ' '.join(stems)
article_stemmed

"paul erdo and georg polya are influenti hungarian mathematician who contribut a lot to the field erdos' name contain the hungarian letter 'o' 'o' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

In [23]:
#Stem function
def stem(string):
    '''
    This function takes in a string and
    returns a string with words stemmed.
    '''
    # Create porter stemmer.
    ps = nltk.porter.PorterStemmer()
    
    # Use the stemmer to stem each word in the list of words we created by using split.
    stems = [ps.stem(word) for word in string.split()]
    
    # Join our lists of words into a string again and assign to a variable.
    string = ' '.join(stems)
    
    return string

In [24]:
stem(article)

"paul erdo and georg polya are influenti hungarian mathematician who contribut a lot to the field erdos' name contain the hungarian letter 'o' 'o' with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess"

### <font color ="blue"> 4. Define a function named `lemmatize`. It should accept some text and return the text after applying lemmatization to each word.

In [25]:
# Download the first time.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/brandonbryant/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [26]:
# Create the Lemmatizer.

wnl = nltk.stem.WordNetLemmatizer()

In [27]:
# Check lemmatizer. It works.

wnl.lemmatize('influence')

'influence'

In [28]:
# Use the lemmatizer on each word in the list of words we created by using split.

lemmas = [wnl.lemmatize(word) for word in article.split()]
lemmas[:10]

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'are',
 'influential',
 'hungarian',
 'mathematician',
 'who']

In [29]:
# Join our list of words into a string again; assign to a variable to save changes.

article_lemmatized = ' '.join(lemmas)
article_lemmatized

"paul erdos and george polya are influential hungarian mathematician who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

In [30]:
#Lemmatize fucntion
def lemmatize(string):
    '''
    This function takes in string for and
    returns a string with words lemmatized.
    '''
    # Create the lemmatizer.
    wnl = nltk.stem.WordNetLemmatizer()
    
    # Use the lemmatizer on each word in the list of words we created by using split.
    lemmas = [wnl.lemmatize(word) for word in string.split()]
    
    # Join our list of words into a string again and assign to a variable.
    string = ' '.join(lemmas)
    
    return string

In [31]:
lemmatize(article)

"paul erdos and george polya are influential hungarian mathematician who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity"

### <font color ="blue"> 5. Define a function named `remove_stopwords`. It should accept some text and return the text after removing all the stopwords.</font>
- This function should define two optional parameters, `extra_words` and `exclude_words`. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [32]:
# standard English language stopwords list from nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


stopword_list = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/brandonbryant/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
stopword_list[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [34]:
len(stopword_list)

179

In [35]:
# Split words in lemmatized article.

words = article_lemmatized.split()
words[:10]

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'are',
 'influential',
 'hungarian',
 'mathematician',
 'who']

In [36]:
# Create a list of words from my string with stopwords removed and assign to variable.

filtered_words = [word for word in words if word not in stopword_list]
filtered_words[:10]

['paul',
 'erdos',
 'george',
 'polya',
 'influential',
 'hungarian',
 'mathematician',
 'contributed',
 'lot',
 'field']

In [37]:
# Join words in the list back into strings; assign to a variable to keep changes.

article_without_stopwords = ' '.join(filtered_words)
article_without_stopwords

"paul erdos george polya influential hungarian mathematician contributed lot field erdos's name contains hungarian letter 'o' 'o' double acute accent often incorrectly written erdos erdos either mistake typographical necessity"

In [38]:

def remove_stopwords(string, extra_words=[], exclude_words=[]):
    '''
    This function takes in a string, optional extra_words and exclude_words parameters
    with default empty lists and returns a string.
    '''
    # Create stopword_list.
    stopword_list = stopwords.words('english')
    
    # Remove 'exclude_words' from stopword_list to keep these in my text.
    stopword_list = set(stopword_list) - set(exclude_words)

    # Add in 'extra_words' to stopword_list.
    stopword_list = stopword_list.union(set(extra_words))
    
    # Split words in string.
    words = string.split()
    
    # Create a list of words from my string with stopwords removed and assign to variable.
    filtered_words = [word for word in words if word not in stopword_list]
    
    # Join words in the list back into strings and assign to a variable.
    string_without_stopwords = ' '.join(filtered_words)
    
    return string_without_stopwords

In [39]:
remove_stopwords(article)

"paul erdos george polya influential hungarian mathematicians contributed lot field erdos's name contains hungarian letter 'o' 'o' double acute accent often incorrectly written erdos erdos either mistake typographical necessity"

In [68]:
string1 = "hbsshabyh eabah hbwbwkh jknrghrjbgurb ugwbigbahuiwgeag ugauywevfluweibf ueVHFwvuf UEBFYWliY"

In [69]:
remove_stopwords(string1)

'hbsshabyh eabah hbwbwkh jknrghrjbgurb ugwbigbahuiwgeag ugauywevfluweibf ueVHFwvuf UEBFYWliY'

### <font color ="blue"> 6. Use your data from the acquire to produce a dataframe of the news articles. Name the dataframe 'news_df'.

In [49]:
import warnings
warnings.filterwarnings('ignore')

In [70]:
# define categories
categories = ["business", "sports", "technology", "entertainment"]

# use get_all_new_article function from acquire.py file 

news_df = acquire.get_all_news_articles(categories)                                 

In [71]:
news_df.head()

Unnamed: 0,title,content,category
0,India underestimated the coronavirus: Raghuram...,"Speaking about India's second COVID-19 wave, f...",business
1,Air India pilots demand vaccination on priorit...,Indian Commercial Pilots Association (ICPA) on...,business
2,South Korea's richest woman gets fortune worth...,South Korea’s richest woman Hong Ra-hee added ...,business
3,World's biggest jeweller says it will no longe...,"Pandora, the world's biggest jeweller, has sai...",business
4,Samsung pledges ₹37 crore to India to fight CO...,Samsung has pledged $5 million (around ₹37 cro...,business


### <font color ="blue"> 7. Make another dataframe for the Codeup blog posts. Name the dataframe codeup_df.

In [72]:
from acquire import acquire_codeup_blog
codeup_df = acquire_codeup_blog()
codeup_df.head()

Unnamed: 0,title,published_date,blog_image,content
0,Codeup’s Data Science Career Accelerator is Here!,"September 30, 2018",https://codeup.com/wp-content/uploads/2018/10/...,The rumors are true! The time has arrived. Cod...
1,Data Science Myths,"October 31, 2018",https://codeup.com/wp-content/uploads/2018/10/...,By Dimitri Antoniou and Maggie Giust\nData Sci...
2,Data Science VS Data Analytics: What’s The Dif...,"October 17, 2018",https://codeup.com/wp-content/uploads/2018/10/...,"By Dimitri Antoniou\nA week ago, Codeup launch..."
3,10 Tips to Crush It at the SA Tech Job Fair,"August 14, 2018",,SA Tech Job Fair\nThe third bi-annual San Anto...
4,Competitor Bootcamps Are Closing. Is the Model...,"August 14, 2018",,Competitor Bootcamps Are Closing. Is the Model...


### <font color ="blue"> 8. For each dataframe, produce the following columns:</font>

   * 'title' to hold the title
   * 'original' to hold the original article/post content
   * 'clean' to hold the normalized and tokenized original with the stopwords removed.
   * 'stemmed' to hold the stemmed version of the cleaned data.
   * 'lemmatized' to hold the lemmatized version of the cleaned data.



In [73]:
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function take in a df and the string name for a text column with 
    option to pass lists for extra_words and exclude_words and
    returns a df with the text article title, original text, stemmed text,
    lemmatized text, cleaned, tokenized, & lemmatized text with stopwords removed.
    '''
    df['clean'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['stemmed'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(stem)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    df['lemmatized'] = df[column].apply(basic_clean)\
                            .apply(tokenize)\
                            .apply(lemmatize)\
                            .apply(remove_stopwords, 
                                   extra_words=extra_words, 
                                   exclude_words=exclude_words)
    
    return df[['title', column,'clean', 'stemmed', 'lemmatized']]

In [76]:
#use the function defined above for news_df's content column
prep_article_data(news_df,'content', extra_words = ['ha'], exclude_words = ['no']).head()

Unnamed: 0,title,content,clean,stemmed,lemmatized
0,India underestimated the coronavirus: Raghuram...,"Speaking about India's second COVID-19 wave, f...",speaking indias second covid19 wave former rbi...,speak india second covid19 wave former rbi gov...,speaking india second covid19 wave former rbi ...
1,Air India pilots demand vaccination on priorit...,Indian Commercial Pilots Association (ICPA) on...,indian commercial pilots association icpa tues...,indian commerci pilot associ icpa tuesday said...,indian commercial pilot association icpa tuesd...
2,South Korea's richest woman gets fortune worth...,South Korea’s richest woman Hong Ra-hee added ...,south koreas richest woman hong rahee added an...,south korea richest woman hong rahe ad anoth 7...,south korea richest woman hong rahee added ano...
3,World's biggest jeweller says it will no longe...,"Pandora, the world's biggest jeweller, has sai...",pandora worlds biggest jeweller said itll stop...,pandora world biggest jewel said itll stop use...,pandora world biggest jeweller said itll stop ...
4,Samsung pledges ₹37 crore to India to fight CO...,Samsung has pledged $5 million (around ₹37 cro...,samsung pledged 5 million around 37 crore help...,samsung pledg 5 million around 37 crore help i...,samsung pledged 5 million around 37 crore help...


### <font color="blue">9. Ask yourself:</font>

* If your corpus is 493KB, would you prefer to use stemmed or lemmatized text? -Lemmatized
* If your corpus is 25MB, would you prefer to use stemmed or lemmatized text? -Personal preference
* If your corpus is 200TB of text and you're charged by the megabyte for your hosted computational resources, would you prefer to use stemmed or lemmatized text? -Stem