# Text preprocessing

In [5]:
import ipytest
import re
# Apply ipytest's default notebook configuration; presumably this is what makes
# the %%run_pytest cell magic used by the test cells available -- TODO confirm.
ipytest.autoconfig()

## Task 1: Tokenization

Split an input text into tokens on whitespace, punctuation, hyphens, and HTML markup. Additionally, lowercase all tokens.

In [56]:
def tokenize(text):
    """Returns a sequence of terms given an input text.

    HTML tags and the punctuation characters , . : ; ? ! ' - are treated as
    token separators, as is any run of whitespace. All tokens are lowercased.

    Args:
        text: The input string to tokenize.

    Returns:
        A list of lowercased tokens in order of appearance. The list is empty
        when the text contains no token characters (e.g. '' or '!!!').
    """
    # Raw string so that regex escapes are not interpreted (or warned about)
    # as Python string escapes. Tags are matched with [^>]* rather than .*? so
    # that a tag spanning several lines is still removed as a whole.
    separators = r"<[^>]*>|[,.:;?!'\-]"
    doc = re.sub(separators, ' ', text)
    # split() without arguments splits on any whitespace run and never yields
    # empty strings, so no separate whitespace-collapsing step is needed.
    return doc.lower().split()

Tests.

In [57]:
%%run_pytest[clean]

def test_whitespace():
    """Tokens separated by single spaces are returned in order."""
    expected = ['aaa', 'bbb', 'ccc']
    assert tokenize('aaa bbb ccc') == expected
    
def test_punctuation():
    """Punctuation marks and apostrophes act as token separators."""
    expected = ['aaa', 'bbb', 'ccc', 'ddd', 'eee', 'ff', 'f']
    assert tokenize('aaa! bbb.ccc,ddd:eee ff\'f') == expected
    
def test_hyphens():
    """Hyphenated words are split and the resulting tokens are lowercased."""
    expected = ['aaa', 'bbb', 'ccc']
    assert tokenize('aaa bbb-Ccc') == expected
    
def test_html():
    """HTML tags are dropped and separate the text around them."""
    markup = 'aaa <bbb>ccc <ddd>eee</ddd></bbb>fff <ggg />'
    expected = ['aaa', 'ccc', 'eee', 'fff']
    assert tokenize(markup) == expected

....                                                                     [100%]
4 passed in 0.02s


## Task 2: Stopwords removal

Remove stopwords from a sequence of tokens, given a set of stopwords.

In [64]:
def remove_stopwords(tokens, stopwords):
    """Removes stopwords from a sequence of tokens.

    Returns a new list holding, in their original order, the tokens that are
    not contained in `stopwords`. Membership is tested on whole tokens.
    """
    def is_kept(tok):
        return tok not in stopwords

    return list(filter(is_kept, tokens))
    

Tests.

In [65]:
%%run_pytest[clean]

def test_no_stopwords():
    """An empty stopword collection leaves the tokens untouched."""
    tokens = ['this', 'is', 'some', 'text']
    expected = ['this', 'is', 'some', 'text']
    assert remove_stopwords(tokens, {}) == expected
    
def test_stopwords():
    """Tokens that appear in the stopword set are removed."""
    tokens = ['this', 'is', 'some', 'text']
    expected = ['some', 'text']
    assert remove_stopwords(tokens, {'is', 'this'}) == expected
    
def test_stopwords2():
    """Tokens that merely contain a stopword as a substring are kept."""
    tokens = ['this', 'isolate', 'otto']
    expected = ['isolate', 'otto']
    assert remove_stopwords(tokens, {'is', 'this', 'to'}) == expected

...                                                                      [100%]
3 passed in 0.02s


## Task 3: Suffix-s stemming

Remove the s-suffix from all terms in a sequence.

In [74]:
def suffix_s_stemmer(terms):
    """Removes the s-suffix from all terms in a sequence.

    Args:
        terms: An iterable of string terms.

    Returns:
        A new list in which every term ending in 's' has that final character
        removed; all other terms (including empty strings) are passed through
        unchanged. The input sequence itself is not modified.
    """
    # endswith() is safe on empty strings, unlike indexing term[-1].
    return [term[:-1] if term.endswith('s') else term for term in terms]

Tests.

In [77]:
%%run_pytest[clean]

def test_stemming():
    """A trailing 's' is stripped; terms without one are left as they are."""
    terms = ['dogs', 'better', 'cats']
    expected = ['dog', 'better', 'cat']
    assert suffix_s_stemmer(terms) == expected

.                                                                        [100%]
1 passed in 0.02s
