## Stop-word and punctuation removal

#### Stop-words

In [None]:
from nltk.corpus import stopwords

# Load the English stop-word list from NLTK's stopwords corpus.
# NOTE(review): presumably requires the corpus to be available locally
# (nltk.download('stopwords')) — confirm in the target environment.
stop_list = stopwords.words('english')

# Display the list.
stop_list

In [None]:
len(stop_list)  # number of entries in the stop list

#### Punctuation

In [None]:
from string import punctuation

punctuation  # a single str holding the ASCII punctuation characters

In [None]:
list(punctuation)  # one single-character string per punctuation mark

In [None]:
# Append every punctuation character to the stop list as its own entry
# (iterating the string yields one character at a time).
stop_list.extend(punctuation)

len(stop_list)

In [None]:
stop_list  # display the stop list, including anything appended so far

#### Adding custom words

In [None]:
# Add project-specific words on top of the standard entries.
stop_list.extend(('rt', 'via'))  # custom list

len(stop_list)

#### Fun with Unicode

Unicode categories: https://en.wikipedia.org/wiki/Unicode_character_property#General_Category

Punctuation categories are labelled as P*

In [None]:
from unicodedata import category

In [None]:
category('A')  # Letter, uppercase

In [None]:
category('a')  # Letter, lowercase

In [None]:
category('.')  # Punctuation, other

In [None]:
dashes = ['‒', '–', '—', '―', '⁓']  # https://en.wikipedia.org/wiki/Dash#Common_dashes

# The ASCII hyphen-minus is not one of the dashes listed above, so this is False.
'-' in dashes

In [None]:
# Print the Unicode general category of each dash
# (punctuation categories start with 'P').
for d in dashes:
    print(category(d))

In [None]:
def is_unicode_punct(token):
    """Return True if *token* consists solely of Unicode punctuation.

    A character counts as punctuation when its Unicode general category
    starts with 'P'. Multi-character tokens such as '...' or '--' are
    punctuation only if every one of their characters is.

    Args:
        token: The token to test, normally a ``str``.

    Returns:
        bool: True when *token* is a non-empty string made up entirely of
        punctuation characters; False otherwise, including for the empty
        string and for non-string values.
    """
    # Reject non-strings and the empty string up front: all() over an empty
    # iterable would otherwise report True.
    if not isinstance(token, str) or not token:
        return False
    # category() accepts exactly one character (it raises TypeError for
    # longer strings), so the token is checked character by character.
    return all(category(char).startswith('P') for char in token)

In [None]:
is_unicode_punct('A')  # False: 'A' is an uppercase letter

In [None]:
is_unicode_punct('.')  # True: '.' is in a punctuation category

In [None]:
is_unicode_punct('HELLOOO')  # False: a multi-letter word is not punctuation

#### Putting everything together

In [None]:
from nltk.tokenize import word_tokenize

# Sample passage to tokenize; the line breaks are part of the string.
text = """Python is a widely used high-level programming
language for general-purpose programming,
created by Guido van Rossum and first released in 1991."""
# text from https://en.wikipedia.org/wiki/Python_(programming_language)

# Tokenize the passage with NLTK's word_tokenize.
tokens = word_tokenize(text)

# Display the resulting tokens.
tokens

In [None]:
# Build the lookup set once: set membership is O(1), whereas scanning the
# list costs O(len(stop_list)) for every token.
stop_set = set(stop_list)

# Drop stop-words and punctuation. Stop-words are compared on the
# lowercased token because the stop-list entries (e.g. the custom 'rt' and
# 'via') are lowercase, so capitalised tokens such as 'The' or 'RT' would
# otherwise slip through. The surviving tokens keep their original casing.
tokens_no_stop = [t for t in tokens
                  if t.lower() not in stop_set and not is_unicode_punct(t)]

tokens_no_stop