In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

In this exercise we will define some functions to prepare textual data. These functions should apply equally well to both the Codeup blog articles and the news articles that were previously acquired.

In [2]:
df = acquire.get_blog_post()

In [3]:
df

Unnamed: 0.1,Unnamed: 0,body,title
0,0,\nThe rumors are true! The time has arrived. C...,Codeup’s Data Science Career Accelerator is He...
1,1,\nBy Dimitri Antoniou and Maggie Giust\nData S...,Data Science Myths - Codeup
2,2,"\nBy Dimitri Antoniou\nA week ago, Codeup laun...",Data Science VS Data Analytics: What’s The Dif...
3,3,\n10 Tips to Crush It at the SA Tech Job Fair\...,10 Tips to Crush It at the SA Tech Job Fair - ...
4,4,\nCompetitor Bootcamps Are Closing. Is the Mod...,Competitor Bootcamps Are Closing. Is the Model...


In [4]:
# Drop the leftover index column that came along with the data.
# Reassigning (rather than inplace=True) together with errors='ignore' keeps
# this cell safe to run more than once: a second run is a no-op instead of a
# KeyError for the already-removed column.
df = df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
df.title

0    Codeup’s Data Science Career Accelerator is He...
1                          Data Science Myths - Codeup
2    Data Science VS Data Analytics: What’s The Dif...
3    10 Tips to Crush It at the SA Tech Job Fair - ...
4    Competitor Bootcamps Are Closing. Is the Model...
Name: title, dtype: object

In [6]:
df.body

0    \nThe rumors are true! The time has arrived. C...
1    \nBy Dimitri Antoniou and Maggie Giust\nData S...
2    \nBy Dimitri Antoniou\nA week ago, Codeup laun...
3    \n10 Tips to Crush It at the SA Tech Job Fair\...
4    \nCompetitor Bootcamps Are Closing. Is the Mod...
Name: body, dtype: object

1. Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

    - Lowercase everything
    - Normalize unicode characters
    - Remove (replace with an empty string) any character that is not a letter, a number, whitespace, or a single quote.

In [7]:
df_prepped = df.copy()

In [8]:
df_prepped

Unnamed: 0,body,title
0,\nThe rumors are true! The time has arrived. C...,Codeup’s Data Science Career Accelerator is He...
1,\nBy Dimitri Antoniou and Maggie Giust\nData S...,Data Science Myths - Codeup
2,"\nBy Dimitri Antoniou\nA week ago, Codeup laun...",Data Science VS Data Analytics: What’s The Dif...
3,\n10 Tips to Crush It at the SA Tech Job Fair\...,10 Tips to Crush It at the SA Tech Job Fair - ...
4,\nCompetitor Bootcamps Are Closing. Is the Mod...,Competitor Bootcamps Are Closing. Is the Model...


In [9]:
# lowercase characters only
def lowercase(string):
    """Return a copy of `string` with all cased characters converted to lowercase."""
    lowered = string.lower()
    return lowered

In [10]:
ex = 'TA-DA'
lowercase(ex)

'ta-da'

In [11]:
# normalize unicode characters 
def normalize(string):
    """
    Reduce `string` to plain ASCII text.

    The text is first decomposed with NFKD, which splits accented letters into
    a base letter plus combining marks. Encoding to ASCII with errors ignored
    then drops the marks (and every other non-ASCII character), and the
    surviving bytes are decoded back into a str.
    """
    decomposed = unicodedata.normalize('NFKD', string)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [12]:
example = "laéios"
normalize(example)

'laeios'

In [13]:
# remove special characters
def remove_special_chars(string):
    """
    Delete every character that is not an ASCII letter, a digit, a single
    quote, or whitespace.

    Both lowercase and uppercase letters are kept, so the helper is safe to
    call on text that has not been lowercased yet (a lowercase-only character
    class would silently delete capital letters).

    Parameters
    ----------
    string : str
        Text to filter.

    Returns
    -------
    str
        `string` with the disallowed characters removed. Nothing is inserted
        in their place, so removing a character that sat between two spaces
        leaves both spaces behind.
    """
    return re.sub(r"[^a-zA-Z0-9'\s]", '', string)

In [14]:
eg = 'here! you? go!'
remove_special_chars(eg)

'here you go'

In [15]:
# turn every newline into a space, then trim whitespace from both ends
def strip(string):
    """
    Replace each newline character in `string` with a single space and
    return the result with leading and trailing whitespace removed.
    """
    return string.replace('\n', ' ').strip()

# re.sub(r'[\r|\n|\r\n]+', ' ', string)

In [16]:
def basic_clean(string):
    """
    Apply the basic text clean-up steps to `string` and return the result.

    Steps, in order:
      1. lowercase the text
      2. NFKD-normalize and drop anything that cannot be encoded as ASCII
      3. remove every character that is not a-z, 0-9, a single quote or
         whitespace
      4. trim leading/trailing whitespace
      5. turn the remaining newlines into spaces

    Removed characters are not replaced with anything, so a run such as
    ' - ' collapses to two consecutive spaces.
    """
    lowered = string.lower()
    ascii_only = (
        unicodedata.normalize('NFKD', lowered)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
    )
    kept = re.sub(r"[^a-z0-9'\s]", '', ascii_only)
    return kept.strip().replace('\n', ' ')

In [17]:
df_prepped.title.apply(basic_clean)

0    codeups data science career accelerator is her...
1                           data science myths  codeup
2    data science vs data analytics whats the diffe...
3    10 tips to crush it at the sa tech job fair  c...
4    competitor bootcamps are closing is the model ...
Name: title, dtype: object

In [18]:
df_prepped.body.apply(basic_clean)

0    the rumors are true the time has arrived codeu...
1    by dimitri antoniou and maggie giust data scie...
2    by dimitri antoniou a week ago codeup launched...
3    10 tips to crush it at the sa tech job fair sa...
4    competitor bootcamps are closing is the model ...
Name: body, dtype: object

2. Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [19]:
def tokenize(string):
    """
    Tokenize `string` with NLTK's ToktokTokenizer.

    The tokens are returned as one space-separated string (return_str=True)
    rather than a list, e.g. 'Hello, world!' becomes 'Hello , world !'.
    """
    toktok = ToktokTokenizer()
    tokenized = toktok.tokenize(string, return_str=True)
    return tokenized

In [20]:
# test tokenize function
tokenize('Hello, world!')

'Hello , world !'

3. Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [21]:
# Stemming takes the stem of each word
def stem(text):
    """
    Apply Porter stemming to every whitespace-separated word in `text`.

    Parameters
    ----------
    text : str
        Text to stem. It is split on whitespace, so punctuation attached to a
        word is passed to the stemmer as part of that word.

    Returns
    -------
    str
        The stemmed words joined back together with single spaces.
    """
    # Reach PorterStemmer through nltk.stem, the same package lemmatize()
    # uses for WordNetLemmatizer, rather than the top-level `nltk.porter`
    # alias, which is not a documented module path.
    stemmer = nltk.stem.PorterStemmer()
    return ' '.join(stemmer.stem(word) for word in text.split())

In [22]:
# test function 
stem('running')

'run'

4. Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [23]:
def lemmatize(text):
    """
    Lemmatize every whitespace-separated word in `text` with NLTK's
    WordNetLemmatizer and return the lemmas joined by single spaces.

    Each word is passed to the lemmatizer on its own, with no part-of-speech
    argument.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(token) for token in text.split())

In [24]:
# test function
# NOTE: a notebook cell only displays the value of its last expression, so
# the result of the first call is discarded and only lemmatize('was') is
# shown in the output.
lemmatize('running into a house')
lemmatize('was')

'wa'

5. Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [25]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    """
    Remove English stopwords from `string`.

    Parameters
    ----------
    string : str
        Text to filter. It is tokenized first, so punctuation is separated
        from the words and survives as its own token.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words to take out of the standard stopword list, i.e. words that
        should NOT be removed from the text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.

    Notes
    -----
    Matching is exact and case-sensitive: tokens are compared as-is against
    the stopword list, so a capitalized token such as 'I' is kept. Lowercase
    the text first (e.g. with basic_clean) for case-insensitive removal.
    """
    # None defaults avoid sharing one mutable list object across calls.
    extra_words = set(extra_words or ())
    exclude_words = set(exclude_words or ())

    # Tokenize so punctuation is split away from the words before filtering.
    words = tokenize(string).split()

    # Exclusions are applied before additions, so a word that appears in both
    # arguments ends up being treated as a stopword.
    stopword_set = (set(stopwords.words('english')) - exclude_words) | extra_words

    return " ".join(w for w in words if w not in stopword_set)

In [26]:
remove_stopwords("I love the holidays so much!")

'I love holidays much !'

In [27]:
remove_stopwords("I like the snow and sledding is my favorite.")

'I like snow sledding favorite .'

6. Define a function named prep_article that takes in the dictionary representing an article and returns a dictionary that looks like this:

{

    'title': 'the original title',
    
    'original': original,
    
    'stemmed': article_stemmed,
    
    'lemmatized': article_lemmatized,
    
    'clean': article_without_stopwords
    
}

Note that if the original dictionary has a title property, it should remain unchanged (same goes for the category property).

In [28]:
def prep_article(df):
    """
    Add prepared-text columns to a DataFrame of articles.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `body` column holding the raw article text.

    Returns
    -------
    pandas.DataFrame
        The same DataFrame object that was passed in. It is modified in
        place: `body` is dropped and these columns are appended:
          - original:   the untouched body text
          - stemmed:    basic_clean followed by stem
          - lemmatized: basic_clean followed by lemmatize
          - clean:      basic_clean followed by remove_stopwords

    Notes
    -----
    Because `body` is removed, calling this a second time on the same
    DataFrame fails; reload the data first.
    """
    # Clean the raw text once and reuse it for all three derived columns
    # instead of re-running basic_clean over the whole column each time.
    cleaned = df.body.apply(basic_clean)

    df['original'] = df.body
    df['stemmed'] = cleaned.apply(stem)
    df['lemmatized'] = cleaned.apply(lemmatize)
    df['clean'] = cleaned.apply(remove_stopwords)

    # In-place on purpose: callers may rely on their own reference to `df`
    # being updated rather than on the return value.
    df.drop(columns='body', inplace=True)
    return df

In [29]:
df = acquire.get_blog_post()

In [30]:
prep_article(df)

Unnamed: 0.1,Unnamed: 0,title,original,stemmed,lemmatized,clean
0,0,Codeup’s Data Science Career Accelerator is He...,\nThe rumors are true! The time has arrived. C...,the rumor are true the time ha arriv codeup ha...,the rumor are true the time ha arrived codeup ...,rumors true time arrived codeup officially ope...
1,1,Data Science Myths - Codeup,\nBy Dimitri Antoniou and Maggie Giust\nData S...,by dimitri antoni and maggi giust data scienc ...,by dimitri antoniou and maggie giust data scie...,dimitri antoniou maggie giust data science big...
2,2,Data Science VS Data Analytics: What’s The Dif...,"\nBy Dimitri Antoniou\nA week ago, Codeup laun...",by dimitri antoni a week ago codeup launch our...,by dimitri antoniou a week ago codeup launched...,dimitri antoniou week ago codeup launched imme...
3,3,10 Tips to Crush It at the SA Tech Job Fair - ...,\n10 Tips to Crush It at the SA Tech Job Fair\...,10 tip to crush it at the sa tech job fair sa ...,10 tip to crush it at the sa tech job fair sa ...,10 tips crush sa tech job fair sa tech job fai...
4,4,Competitor Bootcamps Are Closing. Is the Model...,\nCompetitor Bootcamps Are Closing. Is the Mod...,competitor bootcamp are close is the model in ...,competitor bootcamps are closing is the model ...,competitor bootcamps closing model danger prog...


In [32]:
# Drop the leftover index column. Reassigning with errors='ignore' (instead of
# inplace=True) means re-running this cell is a no-op rather than a KeyError.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [33]:
df

Unnamed: 0,title,original,stemmed,lemmatized,clean
0,Codeup’s Data Science Career Accelerator is He...,\nThe rumors are true! The time has arrived. C...,the rumor are true the time ha arriv codeup ha...,the rumor are true the time ha arrived codeup ...,rumors true time arrived codeup officially ope...
1,Data Science Myths - Codeup,\nBy Dimitri Antoniou and Maggie Giust\nData S...,by dimitri antoni and maggi giust data scienc ...,by dimitri antoniou and maggie giust data scie...,dimitri antoniou maggie giust data science big...
2,Data Science VS Data Analytics: What’s The Dif...,"\nBy Dimitri Antoniou\nA week ago, Codeup laun...",by dimitri antoni a week ago codeup launch our...,by dimitri antoniou a week ago codeup launched...,dimitri antoniou week ago codeup launched imme...
3,10 Tips to Crush It at the SA Tech Job Fair - ...,\n10 Tips to Crush It at the SA Tech Job Fair\...,10 tip to crush it at the sa tech job fair sa ...,10 tip to crush it at the sa tech job fair sa ...,10 tips crush sa tech job fair sa tech job fai...
4,Competitor Bootcamps Are Closing. Is the Model...,\nCompetitor Bootcamps Are Closing. Is the Mod...,competitor bootcamp are close is the model in ...,competitor bootcamps are closing is the model ...,competitor bootcamps closing model danger prog...


7. Define a function named prepare_article_data that takes in the list of article dictionaries, applies the prep_article function to each one, and returns the transformed data.

In [34]:
def prepare_article_data():
    """
    Load the blog posts through acquire.get_blog_post() and return them after
    running prep_article on the resulting DataFrame.
    """
    blog_posts = acquire.get_blog_post()
    prepared = prep_article(blog_posts)
    return prepared

In [40]:
new_df = prepare_article_data()

In [42]:
new_df

Unnamed: 0.1,Unnamed: 0,title,original,stemmed,lemmatized,clean
0,0,Codeup’s Data Science Career Accelerator is He...,\nThe rumors are true! The time has arrived. C...,the rumor are true the time ha arriv codeup ha...,the rumor are true the time ha arrived codeup ...,rumors true time arrived codeup officially ope...
1,1,Data Science Myths - Codeup,\nBy Dimitri Antoniou and Maggie Giust\nData S...,by dimitri antoni and maggi giust data scienc ...,by dimitri antoniou and maggie giust data scie...,dimitri antoniou maggie giust data science big...
2,2,Data Science VS Data Analytics: What’s The Dif...,"\nBy Dimitri Antoniou\nA week ago, Codeup laun...",by dimitri antoni a week ago codeup launch our...,by dimitri antoniou a week ago codeup launched...,dimitri antoniou week ago codeup launched imme...
3,3,10 Tips to Crush It at the SA Tech Job Fair - ...,\n10 Tips to Crush It at the SA Tech Job Fair\...,10 tip to crush it at the sa tech job fair sa ...,10 tip to crush it at the sa tech job fair sa ...,10 tips crush sa tech job fair sa tech job fai...
4,4,Competitor Bootcamps Are Closing. Is the Model...,\nCompetitor Bootcamps Are Closing. Is the Mod...,competitor bootcamp are close is the model in ...,competitor bootcamps are closing is the model ...,competitor bootcamps closing model danger prog...


In [43]:
# Drop the leftover index column. Reassigning with errors='ignore' (instead of
# inplace=True) means re-running this cell is a no-op rather than a KeyError.
new_df = new_df.drop(columns="Unnamed: 0", errors="ignore")

In [44]:
new_df.head()

Unnamed: 0,title,original,stemmed,lemmatized,clean
0,Codeup’s Data Science Career Accelerator is He...,\nThe rumors are true! The time has arrived. C...,the rumor are true the time ha arriv codeup ha...,the rumor are true the time ha arrived codeup ...,rumors true time arrived codeup officially ope...
1,Data Science Myths - Codeup,\nBy Dimitri Antoniou and Maggie Giust\nData S...,by dimitri antoni and maggi giust data scienc ...,by dimitri antoniou and maggie giust data scie...,dimitri antoniou maggie giust data science big...
2,Data Science VS Data Analytics: What’s The Dif...,"\nBy Dimitri Antoniou\nA week ago, Codeup laun...",by dimitri antoni a week ago codeup launch our...,by dimitri antoniou a week ago codeup launched...,dimitri antoniou week ago codeup launched imme...
3,10 Tips to Crush It at the SA Tech Job Fair - ...,\n10 Tips to Crush It at the SA Tech Job Fair\...,10 tip to crush it at the sa tech job fair sa ...,10 tip to crush it at the sa tech job fair sa ...,10 tips crush sa tech job fair sa tech job fai...
4,Competitor Bootcamps Are Closing. Is the Model...,\nCompetitor Bootcamps Are Closing. Is the Mod...,competitor bootcamp are close is the model in ...,competitor bootcamps are closing is the model ...,competitor bootcamps closing model danger prog...
