In [1]:
import spacy
# Load the 'en_core_web_sm' pipeline once; every later cell reuses this `nlp` object.
nlp = spacy.load('en_core_web_sm')

In [2]:
# Short sample text for the tokenization, sentence-splitting and POS cells below.
# Note there is no space after the first period ("afternoon.We"); the text is kept
# exactly as-is because the recorded outputs depend on it.
doc = nlp("Today's weather is good, very windy and sunny, we have no classes in the afternoon.We have to play basketball tomorrow")

In [3]:
# Iterating over the parsed doc yields its tokens; print them one per line.
for tok in doc:
    print(tok)

Today
's
weather
is
good
,
very
windy
and
sunny
,
we
have
no
classes
in
the
afternoon
.
We
have
to
play
basketball
tomorrow


In [4]:
# Print each sentence the pipeline segmented out of `doc`, one per line.
for sent in doc.sents:
    print(sent)

Today's weather is good, very windy and sunny, we have no classes in the afternoon.
We have to play basketball tomorrow


# Part-of-Speech (POS) Tagging

In [5]:
# Show every token next to its `pos_` tag, formatted as "<token>-<tag>".
for tok in doc:
    tagged = '{}-{}'.format(tok, tok.pos_)
    print(tagged)

Today-NOUN
's-PART
weather-NOUN
is-AUX
good-ADJ
,-PUNCT
very-ADV
windy-ADJ
and-CCONJ
sunny-ADJ
,-PUNCT
we-PRON
have-VERB
no-DET
classes-NOUN
in-ADP
the-DET
afternoon-NOUN
.-PUNCT
We-PRON
have-VERB
to-PART
play-VERB
basketball-NOUN
tomorrow-NOUN


# Entity Recognition

In [6]:
# Sample sentence for the entity-recognition cells below.
# NOTE(review): "univeristy" is misspelled in the sample text; it is runtime input,
# so it is left unchanged here.
doc_2 = nlp("I went to Paris where I met my old friend Jack from univeristy.")

In [7]:
# List every entity found in doc_2 together with its label, as "<entity>-<label>".
for entity in doc_2.ents:
    labelled = '{}-{}'.format(entity, entity.label_)
    print(labelled)

Paris-GPE
Jack-PERSON


In [8]:
from spacy import displacy

# doc_2 was already parsed from this exact sentence in cell In [6], so reuse it
# instead of running the pipeline over the same text a second time.
# style='ent' selects the entity visualizer; jupyter=True requests notebook rendering.
displacy.render(doc_2, style='ent', jupyter=True)

### Case Study: Find all entities in a book

In [18]:
def read_file(file_name, encoding='windows-1252'):
    """Read a whole text file and return its contents as a single string.

    Parameters
    ----------
    file_name : str or path-like
        Path of the file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to ``'windows-1252'``
        (the value this notebook originally hard-coded); pass ``'utf-8'`` or
        another codec for files saved differently.

    Returns
    -------
    str
        The full decoded contents of the file.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file contains bytes that are invalid in ``encoding``.
    """
    # The context manager guarantees the handle is closed even if decoding fails.
    with open(file_name, 'r', encoding=encoding) as file:
        return file.read()

In [19]:
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine until it points at a local copy of the text file.
text = read_file('C:/Users/YapWH/MSVC/NLP/Basic_Remastered/data/A.txt')
# Run the pipeline over the entire file contents; later cells read its sentences and entities.
processed_text = nlp(text)

In [20]:
# Materialize the sentence iterator into a list so it can be counted and sliced.
sentence = list(processed_text.sents)
print(len(sentence))

5483


In [21]:
# Inspect the first five sentences as a quick sanity check of the segmentation.
sentence[:5]

[The Project Gutenberg eBook of Pride and prejudice, by Jane Austen
 
 This eBook is for the use of anyone anywhere in the United States and
 most other parts of the world at no cost and with almost no restrictions
 whatsoever.,
 You may copy it, give it away or re-use it under the terms
 of the Project Gutenberg License included with this eBook or online at
 www.gutenberg.org.,
 If you are not located in the United States, you
 will have to check the laws of the country where you are located before
 using this eBook.
 ,
 Title: Pride and prejudice
 
 Author: Jane Austen
 
 Release Date: November 12, 2022 [eBook #1342]
 ,
 Language]

In [23]:
from collections import Counter

In [24]:
def find_person(doc, top_n=10):
    """Return the most frequently mentioned PERSON entities in ``doc``.

    Parameters
    ----------
    doc : object exposing ``.ents``
        The parsed document to scan (in this notebook, the result of ``nlp(...)``).
        Each item of ``doc.ents`` must provide ``label_`` and ``lemma_``.
    top_n : int, optional
        How many ``(name, count)`` pairs to return. Defaults to 10.

    Returns
    -------
    list of (str, int)
        Entity lemmas labelled ``'PERSON'`` paired with their occurrence counts,
        ordered from most to least frequent. Empty if no such entity exists.
    """
    # Count over the entities of the *argument*. The previous version iterated the
    # notebook-global `processed_text`, so the `doc` parameter was silently ignored.
    people = Counter(
        ent.lemma_ for ent in doc.ents if ent.label_ == 'PERSON'
    )
    return people.most_common(top_n)

# Show the ten most frequent PERSON entities found in the processed book text.
print(find_person(processed_text))

[('Elizabeth', 552), ('Darcy', 337), ('Jane', 253), ('Bennet', 246), ('Wickham', 154), ('Collins', 151), ('Bingley', 117), ('Lady Catherine', 79), ('Gardiner', 78), ('Lizzy', 70)]
