In [2]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

from acquire import get_codeup_blog, get_inshorts_articles

# Data Preparation Exercises

## 1

Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Remove (replace with an empty string) anything that is not a letter, number, whitespace, or a single quote.

In [9]:
# A string to work with
text = "HERE is some text: α alpha  β beta | something * else. Someone's pencil."

In [10]:
# Lowercase everything
cleaned = text.lower()
cleaned

"here is some text: α alpha  β beta | something * else. someone's pencil."

In [11]:
# Normalize unicode characters
# NFKD rewrites each character in its decomposed form. Encoding to ASCII with
# 'ignore' then drops every character that has no ASCII representation, and
# decoding turns the bytes back into a str. Characters such as α and β are
# therefore removed outright rather than transliterated.
cleaned = unicodedata.normalize('NFKD', cleaned).encode('ascii', 'ignore').decode('utf-8', 'ignore')
cleaned

"here is some text:  alpha   beta | something * else. someone's pencil."

In [12]:
# Replace special characters
# The character class is negated: it matches every character that is NOT a
# lowercase letter, a digit, a single quote, or whitespace. Each match is
# replaced with the empty string, i.e. removed.
regexp = r"[^a-z0-9'\s]"
cleaned = re.sub(regexp, '', cleaned)
cleaned

"here is some text  alpha   beta  something  else someone's pencil"

In [15]:
# Now let's put it in a function

def basic_clean(text):
    """
    Apply basic cleaning to a string and return the cleaned string.

    Steps:
      1. Normalize unicode with NFKD and drop every character that has no
         ASCII representation.
      2. Lowercase everything.
      3. Remove anything that is not a lowercase letter, a digit, whitespace,
         or a single quote.

    Normalization runs before lowercasing on purpose: some characters have no
    lowercase form of their own but decompose to an uppercase ASCII letter
    (for example U+2102 DOUBLE-STRUCK CAPITAL C decomposes to 'C'). Lowercasing
    first would leave that 'C' uppercase and the final regex would strip it.
    """
    # Decompose, then keep only what survives an ASCII round trip
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text = text.lower()
    # Negated class: matches everything except a-z, 0-9, single quote, whitespace
    regexp = r"[^a-z0-9'\s]"
    return re.sub(regexp, '', text)

In [16]:
# Let's test it
basic_clean(text)

"here is some text  alpha   beta  something  else someone's pencil"

## 2

Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [17]:
# Let's create a function that will create a tokenizer object and tokenize the input

def tokenize(text):
    """
    Tokenize the input string with a ToktokTokenizer.

    The result is returned as a single string (return_str = True) rather
    than as a list of tokens.
    """
    toktok = ToktokTokenizer()
    tokenized = toktok.tokenize(text, return_str = True)
    return tokenized

In [19]:
# Let's test it
tokenize(cleaned)

"here is some text alpha beta something else someone ' s pencil"

## 3

Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

In [20]:
# Let's create a function that applies stemming to the input text

def stem(text):
    """
    Return the text with every whitespace-separated word replaced by the
    stem produced by a PorterStemmer, joined back together with single spaces.
    """
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(map(stemmer.stem, text.split()))

In [21]:
# Let's test it
stem(cleaned)

"here is some text alpha beta someth els someone' pencil"

## 4

Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [22]:
# Let's create a function that will apply lemmatization to the input text

def lemmatize(text):
    """
    Return the text with every whitespace-separated word replaced by the
    lemma produced by a WordNetLemmatizer, joined back together with single spaces.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized_words = []
    for word in text.split():
        lemmatized_words.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmatized_words)

In [23]:
# Let's test it
lemmatize(cleaned)

"here is some text alpha beta something else someone's pencil"

That didn't really change anything. Let's try a different string.

In [29]:
lemmatize("He studies the principles of mathematical mumbo jumbo")

'He study the principle of mathematical mumbo jumbo'

## 5

Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

This function should define two optional parameters, extra_words and exclude_words. These parameters should define any additional stop words to include, and any words that we don't want to remove.

In [30]:
# Let's try to add and remove words from the stopwords list

stopword_list = stopwords.words('english')
stopword_list[ : 20]

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [31]:
# First let's try adding some words

extra_words = [
    'hubba',
    'bubba'
]

# Build the list fresh from the corpus instead of extending it in place, so
# that re-running this cell does not append the extra words a second time
stopword_list = stopwords.words('english') + extra_words
stopword_list[-10 : ]

['wasn',
 "wasn't",
 'weren',
 "weren't",
 'won',
 "won't",
 'wouldn',
 "wouldn't",
 'hubba',
 'bubba']

In [32]:
# Now let's try removing some words

exclude_words = [
    "wouldn't",
    "won't"
]

# Filtering (rather than calling list.remove) keeps this cell re-runnable:
# remove() raises ValueError when the word is already gone from the list
stopword_list = [word for word in stopword_list if word not in exclude_words]

stopword_list[-10 : ]

['shouldn',
 "shouldn't",
 'wasn',
 "wasn't",
 'weren',
 "weren't",
 'won',
 'wouldn',
 'hubba',
 'bubba']

In [37]:
# Now let's create the function to remove all stopwords from the input text

def remove_stopwords(text, extra_words = None, exclude_words = None):
    """
    Return the text with all stopwords removed.

    The base stopword list is the NLTK English stopword corpus.

    Parameters:
      text          - string whose whitespace-separated words are filtered
      extra_words   - optional iterable of additional words to treat as
                      stopwords (a single string is treated as one word)
      exclude_words - optional iterable of words that must NOT be treated as
                      stopwords (a single string is treated as one word)

    exclude_words takes precedence over both the base list and extra_words.
    Excluding a word that is not a stopword is a no-op rather than an error.
    """
    def _as_set(words):
        # None keeps the parameter optional; a bare string is one word, not
        # an iterable of characters
        if words is None:
            return set()
        if isinstance(words, str):
            return {words}
        return set(words)

    # A set gives constant-time membership checks and makes adding/removing
    # words insensitive to duplicates
    stopword_set = set(stopwords.words('english')) | _as_set(extra_words)
    # Set difference never raises for a missing word, and it drops the word
    # even if it appears both in the base list and in extra_words
    stopword_set -= _as_set(exclude_words)

    kept_words = [word for word in text.split() if word not in stopword_set]
    return ' '.join(kept_words)

In [38]:
# Let's test it
remove_stopwords(cleaned)

"text alpha beta something else someone's pencil"

In [39]:
remove_stopwords(cleaned, extra_words = ['alpha', 'beta'], exclude_words = ['here'])

"here text something else someone's pencil"

## 6

Use your data from the acquire module to produce a dataframe of the news articles. Name the dataframe news_df.