In [1]:
import unicodedata
import re
import json
import os
from requests import get
from bs4 import BeautifulSoup
import acquire

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords


import pandas as pd
from acquire import parse_blog
from acquire import get_article_text
from acquire import get_codeup_blogs
from acquire import get_inshorts_articles
from acquire import prep_text

## Use the `get_article_text` function from `acquire` and store the result in a variable.

In [None]:
# Fetch the article text with acquire's helper; `original` is kept untouched
# while the cleaning steps below work on a separate `article` variable.
original = get_article_text()

In [None]:
# Lowercase everything in the text. This has to happen before the a-z
# character filter further down, which would otherwise delete capital letters.
article = original.lower()

In [None]:
# Show the text after lowercasing.
print(article)

## Remove Accented Characters

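Convert accented (non-ASCII) characters into plain ASCII characters.
1. `unicodedata.normalize` removes inconsistencies in Unicode character encoding; the 'NFKD' form splits an accented character into its base letter plus separate accent marks.
2. `.encode('ascii', 'ignore')` converts the resulting string to the ASCII character set, dropping anything that cannot be represented in ASCII (including the separated accent marks).
3. `.decode` turns the resulting bytes object back into a string.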

In [None]:
# Decompose accented characters (NFKD), drop every non-ASCII piece, and turn
# the resulting bytes back into a str.
article = (
    unicodedata.normalize('NFKD', article)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
)

In [None]:
# Show the text after accented characters were reduced to ASCII.
print(article)

## Remove Special Characters

In [None]:
# Keep only lowercase letters, digits, single quotes, and whitespace;
# every other character is deleted.
article = re.compile(r"[^a-z0-9'\s]").sub('', article)

In [None]:
# Show the text after special characters were removed.
print(article)

## Tokenization
##### Use nltk to tokenize the strings.

In [None]:
# Build the Toktok tokenizer (class imported at the top of the notebook).
tokenizer = ToktokTokenizer()

In [None]:
# Tokenize the cleaned `article` rather than the raw `original`, so this step
# continues the pipeline built in the cells above. The result is only
# displayed; `article` itself is left unchanged for the following cells.
tokenizer.tokenize(article, return_str=True)

## Stemming and Lemmatization
### Stemming
Reducing words to their root stem. The root stem may not always be an official word found in a dictionary.

In [None]:
# Build the Porter stemmer, then stem three forms of the same verb to
# compare the stems it produces.
ps = nltk.porter.PorterStemmer()

tuple(ps.stem(form) for form in ('call', 'called', 'calling'))

In [None]:
# Stem every whitespace-separated word of the cleaned article.
stems = list(map(ps.stem, article.split()))

In [None]:
# Rebuild a single string from the stemmed words, separated by single spaces.
article_stemmed = ' '.join(stems)

In [None]:
# Show the stemmed version of the article.
print(article_stemmed)

In [None]:
# Count how often each stem occurs and show the five most frequent ones.
pd.Series(stems).value_counts().head(5)

### Lemmatization
The base form of a lemmatized word is the root word (lemma). Lemmas will always be present in dictionaries.

In [None]:
# Create lemmatizer object
wnl = nltk.stem.WordNetLemmatizer()

# Print the Porter stem next to the WordNet lemma for a few related words so
# the two techniques can be compared side by side.
# Note: `ps` is the stemmer created in the Stemming section above.
for word in 'study studies come coming eat eatery eating eaters'.split():
    print('stem:', ps.stem(word), '-- lemma:', wnl.lemmatize(word))

In [None]:
# Lemmatize every whitespace-separated word of the cleaned article, then
# rebuild a single space-separated string.
lemmas = list(map(wnl.lemmatize, article.split()))
article_lemmatized = ' '.join(lemmas)

In [None]:
# Show the lemmatized version of the article.
print(article_lemmatized)

## Removing Stopwords
**stopword:** a word that has little to no significance when constructing meaningful features from text.
* Articles, conjunctions, and prepositions are some examples of stopwords.

In [None]:
# Load NLTK's English stopword list.
stopword_list = stopwords.words('english')
# Optional: uncomment the lines below to keep the negations 'no' and 'not'
# in the text instead of treating them as stopwords.
#stopword_list.remove('no')
#stopword_list.remove('not')

In [None]:
# Display the stopword list to inspect which words will be removed.
stopword_list

In [None]:
# Split the cleaned article on whitespace into a list of individual words.
words = article.split()

In [None]:
# Keep only the words that are not in the stopword list.
filtered_words = [w for w in words if w not in stopword_list]

# Report how many words were dropped (word count before minus after).
print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
print('---')

# Rebuild a single space-separated string from the surviving words.
article_without_stopwords = ' '.join(filtered_words)

print(article_without_stopwords)

In [None]:
def basic_clean(string):
    '''
    This function takes in a string and returns the string normalized.

    Steps, in order:
    1. Unicode-normalize (NFKD) and drop every character that cannot be
       represented in ASCII, so accented letters become their base letter.
    2. Lowercase the text.
    3. Remove anything that isn't a-z, a number, a single quote, or whitespace.

    Returns the cleaned string; an empty string stays empty.
    '''
    ascii_text = (
        unicodedata.normalize('NFKD', string)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
    )
    # Lowercase BEFORE filtering: the character class only allows a-z, so
    # filtering first would delete every uppercase letter instead of keeping it.
    lowered = ascii_text.lower()
    # Remove anything that isn't a-z, a number, single quote, or whitespace.
    return re.sub(r"[^a-z0-9'\s]", '', lowered)

In [None]:
def tokenize(string):
    '''
    Tokenize a string with NLTK's ToktokTokenizer.

    The tokens are returned as a single string (return_str=True) rather
    than as a list.
    '''
    return nltk.tokenize.ToktokTokenizer().tokenize(string, return_str=True)

In [None]:
def stem(string):
    '''
    Stem every word of a string with NLTK's Porter stemmer.

    The string is split on whitespace, each word is stemmed, and the stems
    are returned joined by single spaces.
    '''
    stemmer = nltk.porter.PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in string.split())

In [None]:
def lemmatize(string):
    '''
    Lemmatize every word of a string with NLTK's WordNetLemmatizer.

    The string is split on whitespace, each word is lemmatized, and the
    lemmas are returned joined by single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    return ' '.join(map(lemmatizer.lemmatize, string.split()))

In [None]:
def remove_stopwords(string, extra_words=None, exclude_words=None):
    '''
    This function takes in a string and returns it with stopwords removed.

    string: text to filter; it is split on whitespace into words.
    extra_words: optional iterable of additional words to treat as stopwords
        (default: none).
    exclude_words: optional iterable of words to take OUT of the stopword
        list so they are kept in the text (default: none).

    The stopword list starts from NLTK's English list. exclude_words are
    removed from it first and extra_words are added afterwards, so a word
    given in both ends up being treated as a stopword.

    Side effect: prints how many words were removed, followed by '---'.
    Returns the remaining words joined by single spaces.
    '''
    # None sentinels replace the former mutable default lists; passing
    # nothing still means "no extra words" / "no excluded words".
    extra = set(extra_words) if extra_words is not None else set()
    excluded = set(exclude_words) if exclude_words is not None else set()
    # Start from the English stopwords, drop the words to keep, then add the extras.
    stopword_set = (set(stopwords.words('english')) - excluded) | extra
    # Split words in string.
    words = string.split()
    # Keep only the words that are not stopwords.
    filtered_words = [w for w in words if w not in stopword_set]
    print('Removed {} stopwords'.format(len(words) - len(filtered_words)))
    print('---')
    return ' '.join(filtered_words)

In [None]:
# Load the Codeup blog posts with acquire's helper.
# NOTE(review): cached=True presumably reuses a previously saved copy instead
# of scraping again — confirm in acquire.py.
codeup_df = get_codeup_blogs(cached=True)

In [None]:
# Display the Codeup blog data that was just loaded.
codeup_df

In [None]:
# Create a dataframe with the column 'content' dropped and run it
# through the newly created function to see if it performs as 
# expected.

# b = codeup_df.drop(columns='content',inplace = True)

In [None]:
# Run acquire's prep_text on the Codeup posts, passing 'original' as the second argument.
# NOTE(review): presumably 'original' names the column holding the raw text —
# confirm against acquire.prep_text.
prep_text(codeup_df, 'original')

In [None]:
# Load the Inshorts news articles with acquire's helper.
news_df = get_inshorts_articles()

In [None]:
# Display the news data as loaded, before prep_text is applied.
news_df

In [None]:
# Run acquire's prep_text on the news articles, passing 'original' as the second argument.
# NOTE(review): presumably 'original' names the column holding the raw text —
# confirm against acquire.prep_text.
prep_text(news_df, 'original')

In [None]:
# Display news_df again after the prep_text call.
# NOTE(review): this only shows new columns if prep_text modifies the frame in
# place; the return value of the call above is not assigned — verify.
news_df