# _Processing and Understanding Text_
# Text Preprocessing and Wrangling
    1. Removing HTML Tags
    2. Text Tokenization
    3. Removing Accented Characters
    4. Expanding Contractions
    5. Removing Special Characters
    6. Case Conversions
    7. Text Corrections
    8. Stemming
    9. Lemmatization
    10. Removing Stopwords
    11. Building a Text Normalizer

## _Stemming_

## Porter Stemmer

In [30]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()
ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')

('jump', 'jump', 'jump')

In [31]:
ps.stem('lying')

'lie'

In [32]:
ps.stem('strange')

'strang'

## Lancaster Stemmer

In [33]:
from nltk.stem import LancasterStemmer

ls = LancasterStemmer()

ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped')

('jump', 'jump', 'jump')

In [34]:
ls.stem('lying')

'lying'

In [35]:
ls.stem('strange')

'strange'

## Regex based stemmer

In [36]:
from nltk.stem import RegexpStemmer

rs = RegexpStemmer('ing$|s$|ed$', min = 4)

rs.stem('jumping'), rs.stem('jumps'), rs.stem('jumped')

('jump', 'jump', 'jump')

In [37]:
rs.stem('lying')

'ly'

In [38]:
rs.stem('strange')

'strange'

## Snowball Stemmer

In [39]:
from nltk.stem import SnowballStemmer

ss = SnowballStemmer('german')
print('Supported Languages:', SnowballStemmer.languages)

Supported Languages: ('arabic', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'hungarian', 'italian', 'norwegian', 'porter', 'portuguese', 'romanian', 'russian', 'spanish', 'swedish')


In [40]:
# stemming on German words
# autobahnen -> highways
# autobahn -> highway
ss.stem('autobahnen')

'autobahn'

In [41]:
# springen -> jumping
# spring -> jump
ss.stem('springen')

'spring'

In [3]:
# basic function that can be used for stemming text
import nltk
def simple_stemmer(text):
    """Stem each whitespace-separated token of ``text`` with NLTK's Porter stemmer.

    Tokens are produced by ``str.split()``, so punctuation attached to a word
    (e.g. ``"yesterday,"``) stays part of that token. The stemmed tokens are
    re-joined with single spaces and returned as one string.
    """
    stemmer = nltk.porter.PorterStemmer()
    stemmed_tokens = []
    for token in text.split():
        stemmed_tokens.append(stemmer.stem(token))
    return ' '.join(stemmed_tokens)

simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

'My system keep crash hi crash yesterday, our crash daili'

## _Lemmatization_

In [43]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

# lemmatize nouns
print(wnl.lemmatize('cars', 'n'))
print(wnl.lemmatize('men', 'n'))

car
men


In [44]:
# lemmatize verbs
print(wnl.lemmatize('running', 'v'))
print(wnl.lemmatize('ate', 'v'))

run
eat


In [45]:
# lemmatize adjectives
print(wnl.lemmatize('saddest', 'a'))
print(wnl.lemmatize('fancier', 'a'))

sad
fancy


In [46]:
# pos is important or else lemmatization will not be effective
# ineffective lemmatization
print(wnl.lemmatize('ate', 'n'))
print(wnl.lemmatize('fancier', 'v'))

ate
fancier


In [4]:
# SpaCy performs pos tagging and lemmatization w/o us worrying about whether we're using lemmatization properly
import spacy
# Load the small English pipeline with its default components.
# The legacy `parse` / `tag` / `entity` keyword arguments are intentionally not
# passed: they are not part of the current `spacy.load` signature and make the
# call fail on recent spaCy releases.
nlp = spacy.load('en_core_web_sm')
text = 'My system keeps crashing his crashed yesterday, ours crashes daily'

def lemmatize_text(text):
    """Return ``text`` with every token replaced by its spaCy lemma.

    The string is parsed with the module-level ``nlp`` pipeline. Tokens whose
    lemma is the placeholder ``'-PRON-'`` keep their original surface text;
    every other token contributes its ``lemma_``. The pieces are joined with
    single spaces, so punctuation tokens end up space-separated.
    """
    pieces = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            # Keep the pronoun as written instead of the placeholder lemma.
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return ' '.join(pieces)

lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")

'My system keep crash ! his crash yesterday , ours crash daily'

## Removing Stopwords

In [5]:
from nltk.tokenize.toktok import ToktokTokenizer
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')

def remove_stopwords(text, is_lower_case = False):
    """Drop every token of ``text`` that appears in ``stopword_list``.

    ``text`` is split with the module-level ``tokenizer`` and each token is
    stripped of surrounding whitespace. When ``is_lower_case`` is true the
    tokens are compared to the stopword list as-is; otherwise a lowercased
    copy is used for the comparison while the original casing is kept in the
    output. The surviving tokens are returned joined by single spaces.
    """
    def is_stopword(token):
        # Fold case only when the caller has not already lowercased the text.
        probe = token if is_lower_case else token.lower()
        return probe in stopword_list

    stripped_tokens = (raw.strip() for raw in tokenizer.tokenize(text))
    return ' '.join(tok for tok in stripped_tokens if not is_stopword(tok))

remove_stopwords("The, and, if are stopwords, computer is not")

', , stopwords , computer'

## Bringing It All Together: Building a Text Normalizer

In [8]:
import handmade.pickle_jar as pj
import re

def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True, accented_char_removal=True,
                     text_lower_case=True, text_lemmatization = True, special_char_removal=True,
                     stopword_removal=True, remove_digits=True):
    """Run every document in ``corpus`` through the text-normalization pipeline.

    The steps run in a fixed order; each optional step is switched by its flag:
    HTML stripping, accented-character removal, contraction expansion,
    lowercasing, newline collapsing (always), lemmatization, special-character
    removal, whitespace collapsing (always) and stopword removal.

    Args:
        corpus: Iterable of document strings.
        html_stripping: Pass each document through ``pj.strip_html_tags``.
        contraction_expansion: Pass each document through ``pj.expand_contractions``.
        accented_char_removal: Pass each document through ``pj.remove_accented_chars``.
        text_lower_case: Lowercase the document; also forwarded to
            ``remove_stopwords`` as ``is_lower_case``.
        text_lemmatization: Pass each document through ``lemmatize_text``.
        special_char_removal: Pad the characters ``{ } ( ) . ! -`` with spaces,
            then call ``pj.remove_special_characters``.
        stopword_removal: Pass each document through ``remove_stopwords``.
        remove_digits: Forwarded to ``pj.remove_special_characters``; only
            consulted when ``special_char_removal`` is true.

    Returns:
        A new list holding one normalized string per input document, in order.
    """
    # The patterns do not depend on the document, so compile them once up front.
    # Only CR and LF belong in the class: inside [...] a '|' is a literal pipe,
    # not alternation, and would cause '|' characters to be replaced by spaces.
    newline_pattern = re.compile(r'[\r\n]+')
    # '-' sits last so it is a literal hyphen. Written as '(-)' it forms the
    # range '(' to ')' and hyphens would never be isolated.
    special_char_pattern = re.compile(r'([{.()!}-])')
    whitespace_pattern = re.compile(' +')

    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = pj.strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = pj.remove_accented_chars(doc)
        # expand contractions
        if contraction_expansion:
            doc = pj.expand_contractions(doc)
        # lowercase the text
        if text_lower_case:
            doc = doc.lower()
        # collapse runs of newlines / carriage returns into a single space
        doc = newline_pattern.sub(' ', doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and/or digits
        if special_char_removal:
            # insert spaces around special characters so that removing them
            # does not glue the neighbouring words together
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = pj.remove_special_characters(doc, remove_digits=remove_digits)
        # remove extra whitespace
        doc = whitespace_pattern.sub(' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)

        normalized_corpus.append(doc)

    return normalized_corpus

In [16]:
# get sample text
sample_text = pj.sample_text

# use above function
{'Original': sample_text,
 'Processed': normalize_corpus([sample_text])[0]}

{'Original': "US unveils world's most powerful supercomputer, beats China. The US Has unveiled the world's most powerful supercomputer called 'Summit, beating the previous record-holder China's Sunway TaihuLight. With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, which reportedly take up the size of two tennis courts.",
 'Processed': 'us unveil world powerful supercomputer beat china us unveil world powerful supercomputer call summit beat previous record holder chinas sunway taihulight peak performance trillion calculation per second twice fast sunway taihulight capable trillion calculation per second summit server reportedly take size two tennis court'}