In [2]:
import spacy

# Load the small English spaCy pipeline once; `sp` is then called on raw text to get a parsed doc.
sp = spacy.load('en_core_web_sm')

In [7]:
# Parse a short two-sentence text; the resulting doc is reused by the cells that follow.
sentence = sp(
    "my dumbass was writing all the dependencies of the python project in requirements.txt manually. pain."
)

In [8]:
# Echo the parsed doc; it displays as the original text.
sentence

my dumbass was writing all the dependencies of the python project in requirements.txt manually. pain.

In [11]:
# One line per token: surface text, coarse part-of-speech tag, dependency label.
for token in sentence:
    print(f"{token.text} {token.pos_} {token.dep_}")

my PRON poss
dumbass NOUN nsubj
was AUX aux
writing VERB ROOT
all DET predet
the DET det
dependencies NOUN dobj
of ADP prep
the DET det
python PROPN compound
project NOUN pobj
in ADP prep
requirements.txt NOUN pobj
manually ADV advmod
. PUNCT punct
pain NOUN ROOT
. PUNCT punct


In [12]:
# The doc exposes its sentence spans via `.sents`; show each one on its own line.
for sent in sentence.sents:
    print(sent)

my dumbass was writing all the dependencies of the python project in requirements.txt manually.
pain.


In [16]:
# A doc can be indexed by token position; position 4 is the token "all".
sentence[4]

all

In [17]:
# "all" sits in the middle of the first sentence, so it is not flagged as a sentence start.
sentence[4].is_sent_start

False

In [18]:
# The very first token of the doc opens the first sentence.
sentence[0].is_sent_start

True

### Tokenization

In [19]:
# Quotation marks, a contraction and dotted abbreviations exercise the tokenizer rules.
text = sp('"They\'re leaving U.K. for U.S.A."')
print(text)

"They're leaving U.K. for U.S.A."


In [20]:
# List the individual tokens the tokenizer produced, one per line.
for token in text:
    print(token.text)

"
They
're
leaving
U.K.
for
U.S.A.
"


In [21]:
# len() of a doc is its token count; the quote marks and the split-off 're each count as a token.
len(text)

8

#### Detecting entities

In [None]:
# Sample sentence for the entity and noun-chunk cells; print its tokens first.
sentence5 = sp("Manchester United is looking to sign Harry Kane for $90 million")
for token in sentence5:
    print(token.text)

In [24]:
# For each named entity show its text, its label, and spaCy's description of that label.
for ent in sentence5.ents:
    description = spacy.explain(ent.label_)
    print(f"{ent.text} - {ent.label_} - {description}")

Manchester United - ORG - Companies, agencies, institutions, etc.
Harry Kane - PERSON - People, including fictional
$90 million - MONEY - Monetary values, including unit


#### Detecting noun chunks (noun phrases)



In [25]:
# `.noun_chunks` yields noun phrases rather than single nouns; print each phrase.
for chunk in sentence5.noun_chunks:
    print(chunk.text)

Manchester United
Harry Kane


### Stemming
###### Stemming refers to reducing a word to its root form (stem); the stem need not be a real word, e.g. `comput`.

#### Porter Stemmer



In [26]:
import nltk
from nltk.stem.porter import PorterStemmer

In [27]:
# Word forms sharing one root, used to compare the stemmers.
tokens = ["compute", "computer", "computed", "computing"]
stemmer = PorterStemmer()

In [28]:
# Show each word next to the stem the Porter stemmer reduces it to.
for word in tokens:
    print(f"{word} -->{stemmer.stem(word)}")

compute -->comput
computer -->comput
computed -->comput
computing -->comput


#### Snowball Stemmer
##### A slightly improved version of the Porter stemmer; it is usually preferred.


In [29]:
from nltk.stem.snowball import SnowballStemmer
# The Snowball stemmer is language-specific, so the language is passed explicitly.
snowballStemmer = SnowballStemmer(language="english")

In [30]:
# Same comparison as for Porter, this time using the Snowball stemmer.
for word in tokens:
    print(f"{word} --> {snowballStemmer.stem(word)}")

compute --> comput
computer --> comput
computed --> comput
computing --> comput


### Lemmatization

In [31]:
# A sentence with several inflected verb forms, used to show lemmas.
sentence7 = sp("A letter has been written, asking him to be released")

In [32]:
# Pair every token with its lemma (dictionary base form).
for token in sentence7:
    print(f"{token.text} {token.lemma_}")

A a
letter letter
has have
been be
written write
, ,
asking ask
him he
to to
be be
released release
