# 1. Dividing text into sentences

### Using NLTK

In [1]:
import nltk

In [2]:
# Read the whole sample text. The context manager closes the file handle as
# soon as the read finishes (the original left it open for the kernel's life).
filename = 'data/sherlock_holmes_1.txt'
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()
# Flatten hard line wraps so they do not interfere with sentence splitting.
text = text.replace('\n', ' ')

In [3]:
# Load the English Punkt sentence tokenizer from NLTK's data resources.
# NOTE(review): this resource is a pickle file; newer NLTK releases appear to
# distribute Punkt as `punkt_tab` instead — confirm this path still resolves
# with the installed NLTK version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [4]:
# Split the flattened text into a list of sentence strings and show them.
sentences = tokenizer.tokenize(text)
print(sentences)

['To Sherlock Holmes she is always _the_ woman.', 'I have seldom heard him mention her under any other name.', 'In his eyes she eclipses and predominates the whole of her sex.', 'It was not that he felt any emotion akin to love for Irene Adler.', 'All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind.', 'He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false position.', 'He never spoke of the softer passions, save with a gibe and a sneer.', 'They were admirable things for the observer—excellent for drawing the veil from men’s motives and actions.', 'But for the trained reasoner to admit such intrusions into his own delicate and finely adjusted temperament was to introduce a distracting factor which might throw a doubt upon all his mental results.', 'Grit in a sensitive instrument, or a crack in one of his own high-power lenses, would not be mor

### Using spaCy

In [5]:
import spacy

In [6]:
# Load the `en_core_web_sm` spaCy pipeline; `nlp` is reused by later cells.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Re-read the sample text for the spaCy section. The context manager closes
# the file handle as soon as the read finishes.
filename = 'data/sherlock_holmes_1.txt'
with open(filename, 'r', encoding='utf-8') as file:
    text = file.read()
# Flatten hard line wraps so they do not interfere with sentence splitting.
text = text.replace('\n', ' ')

In [8]:
# Run the spaCy pipeline once and keep only the text of each sentence span.
doc = nlp(text)
sentences = [sentence.text for sentence in doc.sents]
print(sentences)

['To Sherlock Holmes she is always _the_ woman.', 'I have seldom heard him mention her under any other name.', 'In his eyes she eclipses and predominates the whole of her sex.', 'It was not that he felt any emotion akin to love for Irene Adler.', 'All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind.', 'He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false position.', 'He never spoke of the softer passions, save with a gibe and a sneer.', 'They were admirable things for the observer—excellent for drawing the veil from men’s motives and actions.', 'But for the trained reasoner to admit such intrusions into his own delicate and finely adjusted temperament was to introduce a distracting factor which might throw a doubt upon all his mental results.', 'Grit in a sensitive instrument, or a crack in one of his own high-power lenses, would not be mor

### A more complicated example

In [9]:
# Three sentences containing abbreviations ("etc.", "i.e."), a hyphenated
# number and an e-mail address, joined by implicit literal concatenation.
s = (
    'If you’re hungry at midnight, you can go to 7-11, FamilyMart, etc. '
    'I established my own workshop in 2018, i.e., two years ago. '
    'Email me the menu at abc-123@gmai.com'
)

In [10]:
# Sentence-split the tricky example with spaCy.
doc = nlp(s)
sentences = [sentence.text for sentence in doc.sents]
print(sentences)

['If you’re hungry at midnight, you can go to 7-11, FamilyMart, etc.', 'I established my own workshop in 2018, i.e., two years ago.', 'Email me the menu at abc-123@gmai.com']


In [11]:
# Same example through the NLTK Punkt tokenizer loaded earlier, for comparison.
sentences = tokenizer.tokenize(s)
print(sentences)

['If you’re hungry at midnight, you can go to 7-11, FamilyMart, etc.', 'I established my own workshop in 2018, i.e., two years ago.', 'Email me the menu at abc-123@gmai.com']


# 2. Dividing sentences into words – tokenization

### Using NLTK

In [12]:
# Word-level tokenization of `s` with NLTK.
words = nltk.tokenize.word_tokenize(s)
print(words)

['If', 'you', '’', 're', 'hungry', 'at', 'midnight', ',', 'you', 'can', 'go', 'to', '7-11', ',', 'FamilyMart', ',', 'etc', '.', 'I', 'established', 'my', 'own', 'workshop', 'in', '2018', ',', 'i.e.', ',', 'two', 'years', 'ago', '.', 'Email', 'me', 'the', 'menu', 'at', 'abc-123', '@', 'gmai.com']


### Using spaCy

In [13]:
# Word-level tokenization of `s` with spaCy: every item in the Doc is a token.
doc = nlp(s)
words = [token.text for token in doc]
print(words)

['If', 'you', '’re', 'hungry', 'at', 'midnight', ',', 'you', 'can', 'go', 'to', '7', '-', '11', ',', 'FamilyMart', ',', 'etc', '.', 'I', 'established', 'my', 'own', 'workshop', 'in', '2018', ',', 'i.e.', ',', 'two', 'years', 'ago', '.', 'Email', 'me', 'the', 'menu', 'at', 'abc-123@gmai.com']


### Using spaCy for Chinese Segmentation

In [14]:
# Load the `zh_core_web_sm` pipeline and segment a Chinese sample paragraph
# into word tokens.
zh_nlp = spacy.load("zh_core_web_sm")

zh_s = (
    "超級1000系列全英公開賽將於3月16日登場，"
    "昨（22日）籤表出爐，我國世界球后戴資穎仍以第一種子出戰，"
    "尋求個人在全英公開賽的第四座冠軍。印度媒體《滾動》則報導，"
    "「小戴」籤運不錯，如果能穩定發揮，晉級八強不是問題。"
)

doc = zh_nlp(zh_s)
words = [token.text for token in doc]
print(words)

['超級', '1000', '系列', '全', '英', '公開', '賽將', '於', '3月', '16日', '登場', '，', '昨', '（', '22日', '）', '籤表', '出爐', '，', '我', '國', '世界', '球后', '戴', '資穎', '仍', '以', '第一', '種子', '出戰', '，', '尋求', '個人', '在', '全', '英', '公開', '賽', '的', '第四', '座', '冠軍', '。', '印度', '媒體', '《', '滾動', '》', '則', '報導', '，', '「', '小戴', '」', '籤運不錯', '，', '如果', '能', '穩定', '發揮', '，', '晉級', '八', '強', '不是', '問題', '。']


### NLTK's special tokenizer for tweets
#### ref: https://www.nltk.org/api/nltk.tokenize.casual.html

In [15]:
# Sample tweet with a user handle, elongated words and emoticons.
tweet = "@EmpireStateBldg Central Park Tower is reaaaally hiiiigh :-) ^^ <3"

In [16]:
# Tweet-aware tokenization: keep the original casing, shorten over-long
# character runs, and drop @handles from the result.
words = nltk.tokenize.casual.casual_tokenize(
    tweet,
    preserve_case=True,
    reduce_len=True,
    strip_handles=True,
)

print(words)

['Central', 'Park', 'Tower', 'is', 'reaaally', 'hiiigh', ':-)', '^', '^', '<3']


In [17]:
# Same tweet-aware tokenization, this time keeping @handles as tokens.
words = nltk.tokenize.casual.casual_tokenize(
    tweet,
    preserve_case=True,
    reduce_len=True,
    strip_handles=False,
)

print(words)

['@EmpireStateBldg', 'Central', 'Park', 'Tower', 'is', 'reaaally', 'hiiigh', ':-)', '^', '^', '<3']


# Part-of-speech tagging

### Using spaCy

In [18]:
# Repeats the import and `en_core_web_sm` load from the sentence-splitting
# section, rebinding `nlp` to the same pipeline name for this section.
import spacy
nlp = spacy.load("en_core_web_sm")

In [19]:
def ext_pos(result):
    """Pair every token's text with its part-of-speech tag.

    Args:
        result: An iterable of token-like objects exposing ``text`` and
            ``pos_`` attributes (for example a processed spaCy ``Doc``).

    Returns:
        list[tuple[str, str]]: ``(text, pos_)`` tuples in token order.
    """
    # Read both attributes in a single pass. Iterating twice would exhaust a
    # one-shot iterable (e.g. a generator of tokens) on the first pass and
    # silently produce an empty result.
    return [(token.text, token.pos_) for token in result]

In [20]:
# Short example sentence: tag it and show the (word, POS) pairs.
example = "They can fish."
result = nlp(example)
word_pos_tuples = ext_pos(result)

print(word_pos_tuples)

[('They', 'PRON'), ('can', 'AUX'), ('fish', 'VERB'), ('.', 'PUNCT')]


In [21]:
# Rebind `s` to a variant of the earlier example (the last sentence differs)
# and tag it with spaCy; later cells reuse this value of `s`.
s = (
    'If you’re hungry at midnight, you can go to 7-11, FamilyMart, etc. '
    'I established my own workshop in 2018, i.e., two years ago. '
    'We sell various electronic devices, e.g., computers, fans, and heaters.'
)

result = nlp(s)
word_pos_tuples = ext_pos(result)
print(word_pos_tuples)

[('If', 'SCONJ'), ('you', 'PRON'), ('’re', 'VERB'), ('hungry', 'ADJ'), ('at', 'ADP'), ('midnight', 'NOUN'), (',', 'PUNCT'), ('you', 'PRON'), ('can', 'AUX'), ('go', 'VERB'), ('to', 'ADP'), ('7', 'NUM'), ('-', 'SYM'), ('11', 'NUM'), (',', 'PUNCT'), ('FamilyMart', 'PROPN'), (',', 'PUNCT'), ('etc', 'X'), ('.', 'PUNCT'), ('I', 'PRON'), ('established', 'VERB'), ('my', 'PRON'), ('own', 'ADJ'), ('workshop', 'NOUN'), ('in', 'ADP'), ('2018', 'NUM'), (',', 'PUNCT'), ('i.e.', 'X'), (',', 'PUNCT'), ('two', 'NUM'), ('years', 'NOUN'), ('ago', 'ADV'), ('.', 'PUNCT'), ('We', 'PRON'), ('sell', 'VERB'), ('various', 'ADJ'), ('electronic', 'ADJ'), ('devices', 'NOUN'), (',', 'PUNCT'), ('e.g.', 'ADV'), (',', 'PUNCT'), ('computers', 'NOUN'), (',', 'PUNCT'), ('fans', 'NOUN'), (',', 'PUNCT'), ('and', 'CCONJ'), ('heaters', 'NOUN'), ('.', 'PUNCT')]


In [22]:
# POS-tag the Chinese sample paragraph with the `zh_core_web_sm` pipeline.
zh_s = (
    "超級1000系列全英公開賽將於3月16日登場，"
    "昨（22日）籤表出爐，我國世界球后戴資穎仍以第一種子出戰，"
    "尋求個人在全英公開賽的第四座冠軍。印度媒體《滾動》則報導，"
    "「小戴」籤運不錯，如果能穩定發揮，晉級八強不是問題。"
)

zh_nlp = spacy.load("zh_core_web_sm")

result = zh_nlp(zh_s)
word_pos_tuples = ext_pos(result)
print(word_pos_tuples)

[('超級', 'VERB'), ('1000', 'NUM'), ('系列', 'NUM'), ('全', 'DET'), ('英', 'PROPN'), ('公開', 'NOUN'), ('賽將', 'ADV'), ('於', 'ADP'), ('3月', 'NOUN'), ('16日', 'NOUN'), ('登場', 'VERB'), ('，', 'PUNCT'), ('昨', 'NOUN'), ('（', 'PUNCT'), ('22日', 'NUM'), ('）', 'PUNCT'), ('籤表', 'NOUN'), ('出爐', 'VERB'), ('，', 'PUNCT'), ('我', 'PRON'), ('國', 'VERB'), ('世界', 'NOUN'), ('球后', 'NOUN'), ('戴', 'VERB'), ('資穎', 'NOUN'), ('仍', 'ADV'), ('以', 'ADP'), ('第一', 'NUM'), ('種子', 'NOUN'), ('出戰', 'VERB'), ('，', 'PUNCT'), ('尋求', 'NOUN'), ('個人', 'VERB'), ('在', 'ADP'), ('全', 'DET'), ('英', 'PROPN'), ('公開', 'NOUN'), ('賽', 'VERB'), ('的', 'PART'), ('第四', 'NUM'), ('座', 'NUM'), ('冠軍', 'NOUN'), ('。', 'PUNCT'), ('印度', 'PROPN'), ('媒體', 'NOUN'), ('《', 'PUNCT'), ('滾動', 'NOUN'), ('》', 'PUNCT'), ('則', 'VERB'), ('報導', 'NOUN'), ('，', 'PUNCT'), ('「', 'PUNCT'), ('小戴', 'NOUN'), ('」', 'PUNCT'), ('籤運不錯', 'VERB'), ('，', 'PUNCT'), ('如果', 'SCONJ'), ('能', 'VERB'), ('穩定', 'VERB'), ('發揮', 'VERB'), ('，', 'PUNCT'), ('晉級', 'VERB'), ('八', 'NUM'), ('強', 'NUM'),

### Using NLTK

In [23]:
# Tokenize `s` with NLTK, then tag each token with nltk.pos_tag. The tag
# names in the output ('IN', 'PRP', 'NN', ...) differ from spaCy's.
import nltk
words = nltk.tokenize.word_tokenize(s)
words_with_pos = nltk.pos_tag(words)

print(words_with_pos)

[('If', 'IN'), ('you', 'PRP'), ('’', 'VBP'), ('re', 'JJ'), ('hungry', 'NN'), ('at', 'IN'), ('midnight', 'NN'), (',', ','), ('you', 'PRP'), ('can', 'MD'), ('go', 'VB'), ('to', 'TO'), ('7-11', 'CD'), (',', ','), ('FamilyMart', 'NNP'), (',', ','), ('etc', 'FW'), ('.', '.'), ('I', 'PRP'), ('established', 'VBD'), ('my', 'PRP$'), ('own', 'JJ'), ('workshop', 'NN'), ('in', 'IN'), ('2018', 'CD'), (',', ','), ('i.e.', 'FW'), (',', ','), ('two', 'CD'), ('years', 'NNS'), ('ago', 'RB'), ('.', '.'), ('We', 'PRP'), ('sell', 'VBP'), ('various', 'JJ'), ('electronic', 'JJ'), ('devices', 'NNS'), (',', ','), ('e.g.', 'NN'), (',', ','), ('computers', 'NNS'), (',', ','), ('fans', 'NNS'), (',', ','), ('and', 'CC'), ('heaters', 'NNS'), ('.', '.')]


# Word stemming

### Using NLTK

In [24]:
# Sample words used to compare the stemmers in the following cells.
words = [
    'leaf', 'leaves', 'booking', 'writing', 'completed',
    'stemming', 'skies', 'gone', 'goes', 'this',
]

In [25]:
from nltk.stem.snowball import SnowballStemmer

# Snowball stemmer configured for English, applied to every sample word.
stemmer = SnowballStemmer('english')
stemmed_words = list(map(stemmer.stem, words))
print(stemmed_words)

['leaf', 'leav', 'book', 'write', 'complet', 'stem', 'sky', 'gone', 'goe', 'this']


In [26]:
from nltk.stem.porter import PorterStemmer

# Porter stemmer applied to the same sample words for comparison.
porterStemmer = PorterStemmer()
stemmed_words = list(map(porterStemmer.stem, words))
print(stemmed_words)

['leaf', 'leav', 'book', 'write', 'complet', 'stem', 'sky', 'gone', 'goe', 'thi']


In [27]:
from nltk.stem.lancaster import LancasterStemmer

# Lancaster stemmer applied to the same sample words for comparison.
lancasterStemmer = LancasterStemmer()
stemmed_words = list(map(lancasterStemmer.stem, words))
print(stemmed_words)

['leaf', 'leav', 'book', 'writ', 'complet', 'stem', 'ski', 'gon', 'goe', 'thi']


# Combining similar words – lemmatization

In [28]:
# Lemmatize a sample sentence: print each token next to the lemma that the
# spaCy pipeline assigns to it.
import spacy

nlp = spacy.load("en_core_web_sm")

doc = nlp('This product integrates both libraries for downloading and applying patches.')

for token in doc:
    print(token.text, token.lemma_)

This this
product product
integrates integrate
both both
libraries library
for for
downloading download
and and
applying apply
patches patch
. .


# Removing stopwords

In [29]:
# Fetch NLTK's English stopword list (a list of lowercase strings).
# NOTE(review): presumably requires the 'stopwords' corpus to be present in
# the local NLTK data directory — confirm it has been downloaded.
import nltk

stopwords = nltk.corpus.stopwords.words('english')
print(stopwords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [30]:
# Tokenize `s` again so the stopword filter starts from the full token list.
words = nltk.tokenize.word_tokenize(s)

print(words)

['If', 'you', '’', 're', 'hungry', 'at', 'midnight', ',', 'you', 'can', 'go', 'to', '7-11', ',', 'FamilyMart', ',', 'etc', '.', 'I', 'established', 'my', 'own', 'workshop', 'in', '2018', ',', 'i.e.', ',', 'two', 'years', 'ago', '.', 'We', 'sell', 'various', 'electronic', 'devices', ',', 'e.g.', ',', 'computers', ',', 'fans', ',', 'and', 'heaters', '.']


In [31]:
# Drop stopwords case-insensitively. The NLTK list is converted to a set once
# so each membership test is a hash lookup instead of a scan of the whole list.
stopword_set = set(stopwords)
words = [word for word in words if word.lower() not in stopword_set]

print(words)

['’', 'hungry', 'midnight', ',', 'go', '7-11', ',', 'FamilyMart', ',', 'etc', '.', 'established', 'workshop', '2018', ',', 'i.e.', ',', 'two', 'years', 'ago', '.', 'sell', 'various', 'electronic', 'devices', ',', 'e.g.', ',', 'computers', ',', 'fans', ',', 'heaters', '.']


# Getting the dependency parse

In [32]:
# Reference: https://alvinntnu.github.io/python-notes/nlp/nlp-spacy-zh.html

In [33]:
# Example sentence reused by the dependency-parse and entity cells below.
sent = 'I need a ticket to Los Angeles on May 8th.'

In [34]:
doc = nlp(sent)

# One tab-separated row per token: text, POS tag, dependency label, spaCy's
# explanation of that label, and the text of the token's syntactic head.
for token in doc:
    print(
        f'{token.text}\t{token.pos_}\t{token.dep_}\t'
        f'{spacy.explain(token.dep_)}\t{token.head.text}'
    )

I	PRON	nsubj	nominal subject	need
need	VERB	ROOT	None	need
a	DET	det	determiner	ticket
ticket	NOUN	dobj	direct object	need
to	ADP	prep	prepositional modifier	ticket
Los	PROPN	compound	compound	Angeles
Angeles	PROPN	pobj	object of preposition	to
on	ADP	prep	prepositional modifier	need
May	PROPN	compound	compound	8th
8th	NOUN	pobj	object of preposition	on
.	PUNCT	punct	punctuation	need


In [35]:
doc = nlp(sent)

# Print only tokens carrying an entity label; tokens whose numeric entity
# type is 0 are skipped.
for token in doc:
    if token.ent_type == 0:
        continue
    print(token.text, token.ent_type_)

Los GPE
Angeles GPE
May DATE
8th DATE


In [36]:
# Second entity example; `doc` from this cell is reused by the noun-chunk cell.
ner_s = (
    'Apple investors urged to vote against a nearly '
    '$100 million pay package for CEO Tim Cook.'
)

doc = nlp(ner_s)

# Print only tokens carrying an entity label; tokens whose numeric entity
# type is 0 are skipped.
for token in doc:
    if token.ent_type == 0:
        continue
    print(token.text, token.ent_type_)

Apple ORG
nearly MONEY
$ MONEY
100 MONEY
million MONEY
Tim PERSON
Cook PERSON


# Extracting noun chunks

In [37]:
# `doc` is still the parse of `ner_s` from the previous cell; list the text
# of each noun chunk spaCy found in it.
for noun_chunk in doc.noun_chunks:
    print(noun_chunk.text)

Apple investors
a nearly $100 million pay package
CEO
Tim Cook


# Extracting subjects, predicates, and objects of the sentence

In [76]:
sentence = 'I established my own workshop in 2018 before I went to Japan.'
doc = nlp(sentence)

# One tab-separated row per token: text, POS tag, dependency label, spaCy's
# explanation of that label, the head's text, and the head's token index.
for token in doc:
    print(
        f'{token.text}\t{token.pos_}\t{token.dep_}\t'
        f'{spacy.explain(token.dep_)}\t{token.head.text}\t{token.head.i:d}'
    )

I	PRON	nsubj	nominal subject	established	1
established	VERB	ROOT	None	established	1
my	PRON	poss	possession modifier	workshop	4
own	ADJ	amod	adjectival modifier	workshop	4
workshop	NOUN	dobj	direct object	established	1
in	ADP	prep	prepositional modifier	established	1
2018	NUM	pobj	object of preposition	in	5
before	SCONJ	mark	marker	went	9
I	PRON	nsubj	nominal subject	went	9
went	VERB	advcl	adverbial clause modifier	established	1
to	ADP	prep	prepositional modifier	went	9
Japan	PROPN	pobj	object of preposition	to	10
.	PUNCT	punct	punctuation	established	1


In [65]:
# Show each token's subtree: the token plus everything that depends on it,
# in sentence order (see the output below).
for token in doc:
    subtree = list(token.subtree)
    print(token, subtree)

I [I]
established [I, established, my, own, workshop, in, 2018, before, I, went, to, Japan, .]
my [my]
own [own]
workshop [my, own, workshop]
in [in, 2018]
2018 [2018]
before [before]
I [I]
went [before, I, went, to, Japan]
to [to, Japan]
Japan [Japan]
. [.]


In [74]:
# Collect (position, token) pairs for every token tagged VERB; the positions
# are later passed to get_phrase as head indices.
verb_idxs = [(i, token) for i, token in enumerate(doc) if token.pos_ == 'VERB']
print(verb_idxs)

[(1, established), (9, went)]


In [78]:
def get_phrase(doc, head_idx, tag):
    """Return the phrase attached to a head token through a matching dependency.

    Scans ``doc`` in order for the first token whose dependency label
    contains ``tag`` (substring match, so ``'obj'`` matches ``'dobj'``) and
    whose head sits at index ``head_idx``.

    Args:
        doc: Indexable, sliceable sequence of tokens exposing ``dep_``,
            ``head``, ``subtree`` and ``i``.
        head_idx: Index of the head token the phrase must depend on.
        tag: Substring to look for in each token's dependency label.

    Returns:
        The slice of ``doc`` from the first to the last token of the matching
        token's subtree, or ``None`` when no token matches.
    """
    matching = (
        candidate
        for candidate in doc
        if tag in candidate.dep_ and candidate.head.i == head_idx
    )
    first_match = next(matching, None)
    if first_match is None:
        return None

    members = list(first_match.subtree)
    left = members[0].i
    right = members[-1].i + 1
    return doc[left:right]

In [81]:
doc = nlp(sentence)

# For every verb found earlier, print its subject phrase, the verb itself
# and its object phrase (None when get_phrase finds no match).
for verb_position, _verb_token in verb_idxs:
    subject_phrase = get_phrase(doc, verb_position, 'subj')
    object_phrase = get_phrase(doc, verb_position, 'obj')
    print('subject:', subject_phrase)
    print('predicate:', doc[verb_position])
    print('object:', object_phrase)

subject: I
predicate: established
object: my own workshop
subject: I
predicate: went
object: None
