In [1]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire

# Exercise 
Define a function named basic_clean. It should take in a string and apply some basic text cleaning to it:

- Lowercase everything
- Normalize unicode characters
- Remove anything that is not a letter, number, whitespace, or a single quote (i.e. replace it with an empty string).

In [2]:
original = "Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
original

"Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [3]:
# Lowercase everything. The character filter applied in a later cell only
# keeps a-z, so uppercase letters would otherwise be stripped out.
string = original.lower()
string

"paul erdős and george pólya are influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

In [4]:
# Normalize unicode characters: NFKD splits an accented letter into its base
# letter plus combining marks; encoding to ASCII with 'ignore' then drops the
# marks (and any other non-ASCII code point), leaving only the base letters.
string = original.lower()
string = (
    unicodedata.normalize('NFKD', string)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
)
string

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [5]:
# Replace anything that is not a letter, number, whitespace or a single quote.
# The negated character class matches every other character, and each match
# is replaced with the empty string (i.e. removed).
string = re.sub(r"[^a-z0-9\s']", '', string)
string

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [6]:
def basic_clean(string:str):
    """Apply basic text cleaning to a string and return the cleaned string.

    Steps, in order:
      1. Lowercase the input.
      2. Normalize unicode with NFKD, then round-trip through ASCII with
         errors ignored, which drops every non-ASCII code point (accents
         included) and keeps the base letters.
      3. Remove every character that is not a lowercase letter, a digit,
         whitespace or a single quote.
    """
    lowered = string.lower()
    # NFKD decomposes e.g. an accented 'o' into 'o' + combining mark; the
    # ASCII encode with 'ignore' then discards the combining mark.
    decomposed = unicodedata.normalize('NFKD', lowered)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    ascii_text = ascii_bytes.decode('utf-8', 'ignore')
    # Anything outside the allowed character class is deleted outright.
    return re.sub(r"[^a-z0-9\s']", '', ascii_text)

# Exercise 
Define a function named tokenize. It should take in a string and tokenize all the words in the string.

In [7]:
clean_text = basic_clean(original)
clean_text

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [8]:
def tokenize(clean_string:str):
    """Tokenize a cleaned string with NLTK's ToktokTokenizer.

    The tokenizer is called with return_str=True, so the result is a single
    string rather than a list of tokens.
    """
    return ToktokTokenizer().tokenize(clean_string, return_str=True)

In [9]:
tokenized_text= tokenize(clean_text)
tokenized_text

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [10]:
type(tokenized_text)

str

# Exercise 
Define a function named stem. It should accept some text and return the text after applying stemming to all the words.

# Exercise 
Define a function named lemmatize. It should accept some text and return the text after applying lemmatization to each word.

In [11]:
wnl = nltk.stem.WordNetLemmatizer()

In [12]:
# Lemmatize each whitespace-separated token of the tokenized text. The result
# is a list of words, not a single string.
lemmas = [wnl.lemmatize(word) for word in tokenized_text.split()]
lemmas

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'are',
 'influential',
 'hungarian',
 'mathematician',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdos',
 "'",
 's',
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 "'",
 'o',
 "'",
 "'",
 'o',
 "'",
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'a',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

In [13]:
def lemmatize(tokenized_string:str):
    """Lemmatize every whitespace-separated word of a tokenized string.

    A fresh WordNetLemmatizer is created on each call. The return value is a
    list of lemmas (one per input word, in order), not a joined string.
    """
    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmas = []
    for token in tokenized_string.split():
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [14]:
lemmatized_text=lemmatize(tokenized_text)
lemmatized_text

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'are',
 'influential',
 'hungarian',
 'mathematician',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdos',
 "'",
 's',
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 "'",
 'o',
 "'",
 "'",
 'o',
 "'",
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'a',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

# Exercise 
Define a function named remove_stopwords. It should accept some text and return the text after removing all the stopwords.

In [15]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yipengjiang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Load NLTK's English stopword list and preview its first ten entries.
stopword_list = stopwords.words('english')
stopword_list[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [17]:
len(lemmatized_text)

51

In [67]:
# Keep only the lemmas that are not in the stopword list, then count how many
# words remain.
stopword_removed_text = [word for word in lemmatized_text if word not in stopword_list]
len(stopword_removed_text)

32

In [76]:
' '.join(stopword_removed_text)

"paul erdos george polya influential hungarian mathematician contributed lot field erdos ' name contains hungarian letter ' ' ' ' double acute accent often incorrectly written erdos erdos either mistake typographical necessity"

In [20]:
def remove_stopwords(lemmatized_string, extra_words=None, exclude_words=None):
    """Remove English stopwords from lemmatized text and return one string.

    Parameters
    ----------
    lemmatized_string : str or iterable of str
        The text to filter. A plain string is split on whitespace; any other
        iterable (e.g. the list returned by ``lemmatize``) is treated as a
        sequence of words.
    extra_words : iterable of str, optional
        Additional words to treat as stopwords.
    exclude_words : iterable of str, optional
        Words that must NOT be removed even though they are in the stopword
        list (or in ``extra_words``).

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    # Filter the argument that was actually passed in; the previous version
    # read the notebook-level variable `lemmatized_text` and ignored its input.
    if isinstance(lemmatized_string, str):
        words = lemmatized_string.split()
    else:
        words = list(lemmatized_string)

    # A set gives constant-time membership checks and makes the add/remove
    # adjustments below straightforward.
    stopword_set = set(stopwords.words('english'))
    stopword_set |= set(extra_words or ())
    stopword_set -= set(exclude_words or ())

    return ' '.join(word for word in words if word not in stopword_set)

In [21]:
stopword_removed_text = remove_stopwords(lemmatized_text)
stopword_removed_text

"paul erdos george polya influential hungarian mathematician contributed lot field erdos ' name contains hungarian letter ' ' ' ' double acute accent often incorrectly written erdos erdos either mistake typographical necessity"

# Exercise 
The remove_stopwords function should define two optional parameters, extra_words and exclude_words. These parameters should specify any additional stop words to remove, and any words that we don't want removed, respectively.

# Exercise 
Define a function named prep_article that takes in the dictionary representing an article and returns a dictionary that looks like this:

{
    'title': 'the original title',
    'original': original,
    'stemmed': article_stemmed,
    'lemmatized': article_lemmatized,
    'clean': article_without_stopwords
}
Note that if the original dictionary has a title property, it should remain unchanged (same goes for the category property).

# Exercise 
Define a function named prepare_article_data that takes in the list of article dictionaries, applies the prep_article function to each one, and returns the transformed data.

In [23]:
def prepare_text(original:str):
    """Run the whole preparation pipeline on a raw string.

    The input is passed through basic_clean, tokenize, lemmatize and
    remove_stopwords, in that order; whatever remove_stopwords returns is
    returned unchanged.
    """
    tokens = tokenize(basic_clean(original))
    return remove_stopwords(lemmatize(tokens))
    

In [24]:
prepare_text(original)

"paul erdos george polya influential hungarian mathematician contributed lot field erdos ' name contains hungarian letter ' ' ' ' double acute accent often incorrectly written erdos erdos either mistake typographical necessity"