## Preprocessing (or making the input data better)
* Helps make for better input data when performing ML or other statistical methods by removing stop words, punctuation, unwanted tokens, or by performing **Lemmatizaiton/Stemming** by shortenning words to their root stems

* It's good to experiment with different approaches

* Examples
    * Tokenizaiton to create a bag of words
    * Lowercasing words
    * Removing punctuation

In [13]:
# Make lowercase, is alpha numeric, removed stopwords (common words 'of' 'is' 'to')
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from collections import Counter

text = """The cat is in the box. The cat likes the box. The box is over the cat."""

tokens = [w for w in word_tokenize(text.lower())   
            if w.isalpha()]

no_stops = [t for t in tokens
            if t not in stopwords.words('english')]

Counter(no_stops).most_common(2)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jeremybrien/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[('cat', 3), ('box', 3)]

In [23]:
# Import WordNetLemmatizer
import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

file = open("../datasets/wiki/wiki_text_bug.txt", "r")
wiki = file.read()
file.close()

# Tokenize the article: tokens
tokens = word_tokenize(wiki)

# Convert the tokens into lowercase: lower_tokens
lower_tokens = [t.lower() for t in tokens]
english_stops = [t for t in tokens
            if t not in stopwords.words('english')]

# Retain alphabetic words: alpha_only
alpha_only = [t for t in lower_tokens if t.isalpha()]

# Remove all stop words: no_stops
no_stops = [t for t in alpha_only if t not in english_stops]

# Instantiate the WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Lemmatize all tokens into a new list: lemmatized
lemmatized = [wordnet_lemmatizer.lemmatize(t) for t in no_stops]

# Create the bag-of-words: bow
bow = Counter(lemmatized)

# Print the 10 most common tokens
print(bow.most_common(10))

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jeremybrien/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[('the', 282), ('a', 199), ('of', 138), ('to', 131), ('in', 103), ('and', 89), ('or', 69), ('is', 67), ('that', 49), ('it', 45)]
