In [2]:
import spacy

In [3]:
nlp = spacy.load('en')

In [4]:
doc = nlp(u"hello world is a primary de-facto line in C.S.")

In [5]:
for t in doc:
    print(t.text, t.pos_, spacy.explain(t.pos_))

hello INTJ interjection
world NOUN noun
is VERB verb
a DET determiner
primary ADJ adjective
de ADJ adjective
- ADJ adjective
facto ADJ adjective
line NOUN noun
in ADP adposition
C.S. PROPN proper noun


In [6]:
from spacy import displacy
displacy.render(doc, style='dep', options={'distance':50})

In [7]:
list(doc.noun_chunks)

[hello world, a primary de-facto line, C.S.]

In [8]:
doc2 = nlp(u"Ansuman is a good boy")

In [9]:
list(doc2.noun_chunks)

[Ansuman, a good boy]

In [10]:
user_text = 'Archaeological excavations in parts of Kathmandu have found evidence of ancient civilizations. The oldest of these findings is a statue, found in Maligaon, that was dated at 185 AD.[9] The excavation of Dhando Chaitya uncovered a brick with an inscription in Brahmi script. Archaeologists believe it is two thousand years old.[9] Stone inscriptions are a ubiquitous element at heritage sites and are key sources for the history of Nepal. The earliest Western reference to Kathmandu appears in an account of Jesuit Fathers Johann Grueber and Albert d\'Orville. In 1661, they passed through Nepal on their way from Tibet to India, and reported that they reached "Cadmendu", the capital of Nepal kingdom.[10] '

In [11]:
import os

In [12]:
with open('UPDATED_NLP_COURSE/test_text.txt') as f:
    user_text = f.read()

In [13]:
doc3 = nlp(user_text)

In [27]:
for ent in doc3.ents:
    print(f'{ent.text:{20}} {spacy.explain(ent.label_)}')

Kathmandu            Countries, cities, states
Maligaon             Countries, cities, states
185                  Numerals that do not fall under another type
Dhando Chaitya       Companies, agencies, institutions, etc.
Brahmi               Countries, cities, states
two thousand years   Absolute or relative dates or periods
Stone                People, including fictional
Nepal                Countries, cities, states
Western              Nationalities or religious or political groups
Kathmandu            Countries, cities, states
Johann Grueber       People, including fictional
Albert d'Orville     People, including fictional
1661                 Absolute or relative dates or periods
Nepal                Countries, cities, states
Tibet                Countries, cities, states
India                Countries, cities, states
Cadmendu             Titles of books, songs, etc.
Nepal                Countries, cities, states


In [16]:
list(doc3.noun_chunks)

[Archaeological excavations,
 parts,
 Kathmandu,
 evidence,
 ancient civilizations,
 these findings,
 a statue,
 Maligaon,
 The excavation,
 Dhando Chaitya,
 a brick,
 an inscription,
 Brahmi script,
 Archaeologists,
 it,
 Stone inscriptions,
 a ubiquitous element,
 heritage sites,
 key sources,
 the history,
 Nepal,
 The earliest Western reference,
 Kathmandu,
 an account,
 Jesuit Fathers Johann Grueber,
 Albert d'Orville,
 they,
 Nepal,
 their way,
 Tibet,
 India,
 they,
 "Cadmendu,
 the capital,
 Nepal]

In [14]:
sents = list(doc3.sents)
for sent in sents:
    print(list(sent.noun_chunks))

[Archaeological excavations, parts, Kathmandu, evidence, ancient civilizations]
[these findings, a statue, Maligaon]
[]
[The excavation, Dhando Chaitya, a brick, an inscription, Brahmi script]
[Archaeologists, it, Stone inscriptions, a ubiquitous element, heritage sites, key sources, the history, Nepal]
[The earliest Western reference, Kathmandu, an account, Jesuit Fathers Johann Grueber, Albert d'Orville]
[they, Nepal, their way, Tibet, India, they, "Cadmendu, the capital, Nepal]


In [64]:
print(doc3.ents)

(Kathmandu, Maligaon, 185, Dhando Chaitya, Brahmi, two thousand years, Stone, Nepal, Western, Kathmandu, Johann Grueber, Albert d'Orville, 1661, Nepal, Tibet, India, Cadmendu, Nepal)


In [56]:
sents[2]

185 AD.[9]

In [38]:
doc3[doc3.ents[1].start-12:doc3.ents[1].end+12]

. The oldest of these findings is a statue, found in Maligaon, that was dated at 185 AD.[9] The excavation of Dhando

In [43]:
def split_on_newLine(doc):
    start = 0
    seen_newLine = False
    
    for word in doc:
        if seen_newLine:
            yield doc[start:word.i]
            start = word.i
            seen_newLine=False
        elif word.text.startswith('\n'):
            seen_newLine = True
    yield doc[start:]


In [67]:
from spacy.pipeline import SentenceSegmenter
spd = SentenceSegmenter(nlp.vocab, strategy=split_on_newLine)
nlp.remove_pipe(spd)

ValueError: [E001] No component '<spacy.pipeline.hooks.SentenceSegmenter object at 0x12268fdd0>' found in pipeline. Available names: ['tagger', 'parser', 'ner', 'sentencizer']

In [59]:
with open('rupi_kaur.txt', 'w+') as f:
    f.write('for you to see beauty here \ndoes not mean\nthere is beauty in me\nit means there is beauty rooted\nso deep within you\nyou can\'t help but \nsee it everywhere \n-rupi kaur')
    

In [62]:
with open('rupi_kaur.txt') as f:
    doc = nlp(f.read())

In [68]:
for t in doc.sents:
    print(t.text)

for you to see beauty here 

does not mean

there is beauty in me

it means there is beauty rooted

so deep within you

you can't help but 

see it everywhere 

-rupi kaur
