In [59]:
# Sample passage for the segmentation demos: two sentences that contain an
# abbreviation ("Dr.") and a contraction ("There's").
text = (
    "The Dr. heart is a bloom, shoots up through the stoney ground. "
    "There's no room, no space to rent in this town."
)

In [71]:
import spacy

# Load the "en_core_web_sm" pipeline and run it over the sample passage.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Sentence segmentation: materialise the sentence spans found in the document.
sent_seg = list(doc.sents)
print(sent_seg)

# Word segmentation: flatten every sentence into its individual tokens.
word_seg = [tok for sent in doc.sents for tok in sent]
print(word_seg)

[The Dr. heart is a bloom, shoots up through the stoney ground., There's no room, no space to rent in this town.]
[The, Dr., heart, is, a, bloom, ,, shoots, up, through, the, stoney, ground, ., There, 's, no, room, ,, no, space, to, rent, in, this, town, .]


In [26]:
# NLTK examples: the same two segmentations, done with NLTK's tokenizers
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
# nltk.download('punkt')  # left disabled; presumably a one-time resource download -- confirm before a fresh run

# Sentence segmentation
nltk_sentences = sent_tokenize(text)
print(nltk_sentences)

# Word segmentation
nltk_words = word_tokenize(text)
print(nltk_words)

# Token 

['The Dr. heart is a bloom, shoots up through the stoney ground.', "There's no room, no space to rent in this town."]
['The', 'Dr.', 'heart', 'is', 'a', 'bloom', ',', 'shoots', 'up', 'through', 'the', 'stoney', 'ground', '.', 'There', "'s", 'no', 'room', ',', 'no', 'space', 'to', 'rent', 'in', 'this', 'town', '.']


In [34]:
# Tokenizer-only pipeline.
# NOTE: this rebinds `nlp`, so every cell executed after this one uses the blank
# pipeline instead of the loaded "en_core_web_sm" model.
nlp = spacy.blank("en")
# Alternative: nlp = spacy.load("en_core_web_sm")  -> full pipeline (tokenizer, tagger, parser, ner...)

doc2 = nlp("Let's go to N.Y.!")

# One token per line.
for tok in doc2:
    print(tok)

# The whole document, a slice of it, and a single element:
# spaCy Doc, Span and Token types respectively.
for sample in (doc2, doc2[1:3], doc2[0]):
    print(type(sample))

Let
's
go
to
N.Y.
!
<class 'spacy.tokens.doc.Doc'>
<class 'spacy.tokens.span.Span'>
<class 'spacy.tokens.token.Token'>


In [46]:
# Inspect a handful of lexical flags on the first token of `doc2`.
token = doc2[0]

# (label, value) pairs, printed in this order; print() inserts its default
# single-space separator between the label and the value.
token_report = [
    ("Token Value: ", token),
    ("Is Punctuation: ", token.is_punct),
    ("Is Upper: ", token.is_upper),
    ("Is Title: ", token.is_title),
    ("Like Number: ", token.like_num),
]
for label, flag in token_report:
    print(label, flag)

Token Value:  Let
Is Punctuation:  False
Is Upper:  False
Is Title:  True
Like Number:  False


In [50]:
# Extracting students' emails from a file:

# Read the whole file in one call so the text is kept exactly as stored.
# Joining the lines returned by readlines() with " " would prefix every line
# after the first with a stray space, because each line already ends with its
# own newline. The encoding is given explicitly so the result does not depend
# on the platform default.
with open("students.txt", encoding="utf-8") as f:
    students_file = f.read()
print(students_file)

Dayton high school, 8th grade students information
 
 Name	birth day   	email
 -----	------------	------
 Virat   5 June, 1882    virat@kohli.com
 Maria	12 April, 2001  maria@sharapova.com
 Serena  24 June, 1998   serena@williams.com 
 Joe      1 May, 1997    joe@root.com
 
 
 



In [56]:
# Tokenize the roster text (`emails` is the resulting spaCy document) and keep
# every token that spaCy flags as looking like an email address.
emails = nlp(students_file)
email_list = [tok for tok in emails if tok.like_email]

print(email_list)

[virat@kohli.com, maria@sharapova.com, serena@williams.com, joe@root.com]


In [84]:
# Adding a special case for a word: "Gimme"

# Tokenize the sentence with the tokenizer's current rules.
doc3 = nlp("Gimme some extra cheese, please!")

# Token texts before the special case is registered: "Gimme" is expected to come
# out as a single token here, and the goal is to have it split in two.
t = [piece.text for piece in doc3]
print(t)

['Gim', 'me', 'some', 'extra', 'cheese', ',', 'please', '!']


In [85]:
from spacy.symbols import ORTH

# Register a tokenizer exception so that "Gimme" is emitted as "Gim" + "me".
nlp.tokenizer.add_special_case("Gimme", [{ORTH: "Gim"}, {ORTH: "me"}])

# The new rule only affects text tokenized from now on. `doc3` was built before
# the rule was registered, so reusing it would just repeat the old tokens; the
# same sentence is therefore run through `nlp` again.
t2 = [token.text for token in nlp(doc3.text)]
print(t2)  # Now Gimme is "Gim" and "me"

['Gim', 'me', 'some', 'extra', 'cheese', ',', 'please', '!']


In [86]:
# Names of the components in the pipeline currently bound to `nlp`.
# NOTE(review): the recorded output lists a full component set (tagger, parser, ner, ...),
# so it was presumably produced while `nlp` was the loaded "en_core_web_sm" model. With the
# spacy.blank("en") pipeline assigned in an earlier cell the list would presumably differ --
# confirm with Restart & Run All.
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [92]:
### Exercise 1: extract the URLs from a passage

text = '''
Look for data to help you address the question. Governments are good
sources because data from public research is often freely available. Good
places to start include http://www.data.gov/, and http://www.science.
gov/, and in the United Kingdom, http://data.gov.uk/.
Two of my favorite data sets are the General Social Survey at http://www3.norc.org/gss+website/, 
and the European Social Survey at http://www.europeansocialsurvey.org/.
'''

# Keep every token that spaCy flags as URL-like.
# NOTE: tokens are collected verbatim. In the recorded output two URLs keep a
# trailing "." and the address that is split across a line break in the passage
# is cut off at the break.
doc4 = nlp(text)
url_list = [tok for tok in doc4 if tok.like_url]

url_list

[http://www.data.gov/,
 http://www.science,
 http://data.gov.uk/.,
 http://www3.norc.org/gss+website/,
 http://www.europeansocialsurvey.org/.]

In [99]:
### Exercise 2: extract every money amount together with its currency symbol.
# Expected result for the sentence below: two $, 500 €

transactions = "Tony gave two $ to Peter, Bruce gave 500 € to Steve"

doc5 = nlp(transactions)

# Each match is the two-token span "<preceding token> <currency symbol>".
# `token.i > 0` skips a currency symbol that opens the text: it has no preceding
# token, and a slice start of -1 would not address one.
currency_amts = [
    doc5[token.i - 1 : token.i + 1]
    for token in doc5
    if token.is_currency and token.i > 0
]

currency_amts

[two $, 500 €]