In [9]:
# Source article: http://www.bbc.com/news/business-38930699
# Read the article inside a context manager so the file handle is closed
# as soon as the text has been loaded.
with open('ticking_spotify.txt') as article_file:
    text = article_file.read()
# One entry per line of the file, stripped of surrounding whitespace.
# Blank lines (if any) are kept as empty strings, so list indices map
# directly onto line numbers; later cells use sents[1] as the example.
sents = [line.strip() for line in text.splitlines()]

## Tokenization

### Using nltk

Install nltk

In [None]:
!pip3 install nltk -U

Download tokenization model

In [None]:
import nltk
# word_tokenize relies on the Punkt tokenizer data. Recent NLTK releases
# look this data up under the name 'punkt_tab', older ones under 'punkt'.
# Because the install cell upgrades to the latest NLTK, fetch both so the
# tokenization cells work regardless of the installed version.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [10]:
# Tokenize the example sentence (second line of the article).
# As the output shows, word_tokenize splits the clitic "'s" from "It"
# but keeps the hyphenated word 'flat-rate' as a single token.
from nltk import word_tokenize
print(word_tokenize(sents[1]))

['It', "'s", 'amazing', 'to', 'think', 'that', 'just', '10', 'years', 'ago', ',', 'flat-rate', 'digital', 'music', 'streaming', 'services', 'were', 'a', 'mere', 'gleam', 'in', 'the', 'eye', 'of', 'industry', 'executives', '.']


In [11]:
# Same sentence, tokenized with wordpunct_tokenize for comparison.
# As the output shows, punctuation is always split off:
# "It's" -> 'It', "'", 's' and 'flat-rate' -> 'flat', '-', 'rate'.
from nltk import wordpunct_tokenize
print(wordpunct_tokenize(sents[1]))

['It', "'", 's', 'amazing', 'to', 'think', 'that', 'just', '10', 'years', 'ago', ',', 'flat', '-', 'rate', 'digital', 'music', 'streaming', 'services', 'were', 'a', 'mere', 'gleam', 'in', 'the', 'eye', 'of', 'industry', 'executives', '.']


### Filter stopwords

Download stopword corpus

In [None]:
import nltk
# Fetch the stopword corpus that the next cell reads via nltk.corpus.stopwords.
nltk.download('stopwords')

Load the English stopwords and store them in a set

In [19]:
# Import the corpus reader under an alias: binding the resulting set to the
# name `stopwords` would otherwise overwrite the reader itself, and running
# this cell a second time would fail with
# "AttributeError: 'set' object has no attribute 'words'".
from nltk.corpus import stopwords as stopwords_corpus

# `stopwords` is the set of English stopwords used by the cells below;
# a set gives fast membership tests when filtering tokens.
stopwords = set(stopwords_corpus.words('english'))

In [22]:
# Show the contents of the stopword set (sets are unordered, so the order
# of the printed items is arbitrary).
# NOTE(review): the saved output below already contains punctuation marks,
# presumably because the cell that adds string.punctuation (In [21]) was
# executed before this one (In [22]); a fresh top-to-bottom run would show
# only words here.
print(stopwords)

{'further', 'off', '@', 'and', '.', 'having', '%', ',', 'for', "'", 'needn', 'wasn', 'then', 'did', 'isn', 'didn', 'm', 'if', 'of', 'theirs', ';', 'above', ':', 'll', '?', '&', 'mightn', 'while', 'each', 'wouldn', '#', '(', 'until', '\\', 'an', 'ours', 'weren', 'more', 'himself', 'her', 'am', 'what', 'they', 'in', 'its', 'aren', 'there', 'other', 'd', 'own', 't', 'the', 'than', 'no', 'do', 'couldn', '`', 'it', 'has', 'yours', 'because', 'i', 'here', 'why', 'these', 'my', 'who', 'where', 'haven', 'is', 'themselves', 'yourselves', 'through', 'from', 'out', 'very', 'ain', '<', '$', 'to', 'all', '|', 'he', 'been', 'over', 'or', '-', 'will', 'this', 'doing', 'don', 'she', 'that', 'those', 'his', 'our', 'such', 'before', 'between', 'was', 'me', 'be', 'both', 'have', 'shan', '"', 'can', 'by', 'during', 'below', '^', 'myself', 'at', 'once', ')', '+', 'are', '~', 'ma', 'too', 'hasn', 'so', 'them', 'when', 'now', 'on', 'herself', 'won', '_', 'him', 'most', 'a', 'with', 'just', 'not', '}', 'their

Example: filter the stopwords out of a tokenized sentence

In [23]:
# Tokenize the example sentence, then keep only the tokens that are not in
# the stopword set (equivalent to looping over the tokens and printing each
# one that is not a stopword).
# The membership test is case-sensitive: the set holds 'it', so the
# capitalised 'It' is still present in the output.
words = word_tokenize(sents[1])
kept_tokens = [token for token in words if token not in stopwords]
print(kept_tokens)

['It', "'s", 'amazing', 'think', '10', 'years', 'ago', 'flat-rate', 'digital', 'music', 'streaming', 'services', 'mere', 'gleam', 'eye', 'industry', 'executives']


Take punctuation into account

In [18]:
import string
# string.punctuation is a single str containing the ASCII punctuation
# characters; it is displayed here as the cell's result.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

Add the punctuation characters to the stopword set

In [21]:
# Two equivalent ways to extend the set are shown; either one alone is enough.
# Option 1: add the symbols one at a time with set.add
for symbol in string.punctuation:
    stopwords.add(symbol)
# Option 2: add them all in one call with set.update, which accepts any
# iterable (a str is iterated character by character)
stopwords.update(string.punctuation)

### Part-of-speech tagging

Download nltk POS tagging model

In [None]:
import nltk
# nltk.pos_tag needs the averaged-perceptron tagger model. Recent NLTK
# releases look it up as 'averaged_perceptron_tagger_eng', older ones as
# 'averaged_perceptron_tagger'. Because the install cell upgrades to the
# latest NLTK, fetch both so the tagging cells work on either version.
for resource in ('averaged_perceptron_tagger', 'averaged_perceptron_tagger_eng'):
    nltk.download(resource)

In [36]:
# Tokenize the example sentence again; `words` is the input to the tagger below.
words = word_tokenize(sents[1])

In [37]:
# Tag every token with its part of speech; the result is a list of
# (token, tag) tuples, displayed as the cell's output.
nltk.pos_tag(words)

[('It', 'PRP'),
 ("'s", 'VBZ'),
 ('amazing', 'JJ'),
 ('to', 'TO'),
 ('think', 'VB'),
 ('that', 'DT'),
 ('just', 'RB'),
 ('10', 'CD'),
 ('years', 'NNS'),
 ('ago', 'RB'),
 (',', ','),
 ('flat-rate', 'JJ'),
 ('digital', 'JJ'),
 ('music', 'NN'),
 ('streaming', 'NN'),
 ('services', 'NNS'),
 ('were', 'VBD'),
 ('a', 'DT'),
 ('mere', 'JJ'),
 ('gleam', 'NN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('eye', 'NN'),
 ('of', 'IN'),
 ('industry', 'NN'),
 ('executives', 'NNS'),
 ('.', '.')]

In [51]:
# Tokenize every line of the article, tag all of them in one batch call,
# and print each line as space-separated token/TAG pairs.
tokenized_lines = (word_tokenize(line) for line in sents)
for tagged_line in nltk.pos_tag_sents(tokenized_lines):
    pairs = ['{0}/{1}'.format(token, tag) for token, tag in tagged_line]
    print(' '.join(pairs))

The/DT clock/NN is/VBZ ticking/VBG for/IN Spotify/NNP


# spaCy
Industrial-strength Natural Language Processing (NLP) with Python and Cython

Homepage: https://spacy.io

GitHub: https://github.com/explosion/spaCy

### Install spaCy

In [None]:
!pip3 install -U spaCy

### Download language models

In [None]:
!python3 -m spacy.en.download all

### Using spaCy

Loading the language data might take some time...

In [41]:
import spacy

# Load the English pipeline. The short name 'en' is tried first so existing
# setups behave exactly as before; spaCy v3 no longer accepts that shortcut
# and raises OSError, in which case the full package name is used instead.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')

In [49]:
# Run the spaCy pipeline on each line of the article separately and print
# every detected sentence as token/POS pairs, each pair followed by a space.
for line in sents:
    doc = nlp(line)
    for sentence in doc.sents:
        print(''.join('{}/{} '.format(tok, tok.pos_) for tok in sentence))

The/DET clock/NOUN is/VERB ticking/VERB for/ADP Spotify/PROPN 


In [50]:
# Run the spaCy pipeline once over the whole article and let spaCy find the
# sentence boundaries. Line breaks in the raw text come out as tokens of
# their own, tagged SPACE, as the output shows.
doc = nlp(text)
for sentence in doc.sents:
    tagged = ('{}/{} '.format(tok, tok.pos_) for tok in sentence)
    print(*tagged, sep='')

The/DET clock/NOUN is/VERB ticking/VERB for/ADP Spotify/PROPN 
/SPACE 
