# spaCy

#### Install spaCy

In [6]:
%%bash
# Install through the same interpreter that the download step invokes
# (`python3 -m spacy ...`), so the package lands in the environment that
# step runs in rather than in whatever Python a bare `pip` resolves to.
python3 -m pip install spacy



You are using pip version 9.0.1, however version 21.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


#### Download models or corpora

In [7]:
%%bash
# Fetch the small English model package; once it is installed and linked,
# it can be loaded with spacy.load('en_core_web_sm').
python3 -m spacy download en_core_web_sm

Collecting https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz (52.2MB)

    Downloading en_core_web_sm-1.2.0/en_core_web_sm-1.2.0.tar.gz


[93m    Linking successful[0m

    /home/ana/.local/lib/python3.5/site-packages/en_core_web_sm/en_core_web_sm-1.2.0
    -->
    /home/ana/.local/lib/python3.5/site-packages/spacy/data/en_core_web_sm

    You can now load the model via spacy.load('en_core_web_sm').



You are using pip version 9.0.1, however version 21.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


#### Define the model

In [1]:
import spacy
# Load the English model package fetched by the download step above.
# NOTE(review): the next code cell rebinds `nlp` before any visible use of
# this object, so this load may be redundant - confirm.
nlp = spacy.load("en_core_web_sm")

#### Process the texts

In [2]:
# Load the model by its full package name: that is the name the download
# step installed and linked, whereas the "en" shortcut only resolves if a
# separate link called "en" has been created.
# The 'ner' component is skipped; nothing in the visible cells reads entities.
nlp = spacy.load("en_core_web_sm", disable=['ner'])


def normalize(text, remove_stopwords, remove_punctuation):
    """Lowercase ``text``, run it through ``nlp`` and return its lemmas.

    Args:
        text: Input string; it is lowercased before being parsed.
        remove_stopwords: When truthy, tokens flagged ``is_stop`` are dropped.
        remove_punctuation: When truthy, tokens flagged ``is_punct`` are dropped.

    Returns:
        A list of whitespace-stripped ``lemma_`` strings in token order,
        with empty lemmas left out.
    """
    doc = nlp(text.lower())

    def keep(token):
        # A token survives unless one of the requested filters matches it.
        if remove_stopwords and token.is_stop:
            return False
        return not (remove_punctuation and token.is_punct)

    stripped = (token.lemma_.strip() for token in doc if keep(token))
    # Lemmas that are empty after stripping carry no content; skip them.
    return [lemma for lemma in stripped if lemma]



In [3]:
# Sample tweet containing a retweet marker, a mention, punctuation, an
# emoticon and a hashtag; the cell displays its normalized lemma list.
tweet = "RT @lOR42wsOEFcv3f: I fall too fast, crash too hard, forgive too easily and care too much... :( #amiright"
normalize(tweet, remove_punctuation=True, remove_stopwords=True)

['rt',
 '@lor42wsoefcv3f',
 'fall',
 'fast',
 'crash',
 'hard',
 'forgive',
 'easily',
 'care',
 'amiright']

In [4]:
# Parse the tweet as written (not lowercased) and print one row per token:
# text, lemma, POS, tag, dependency label, shape, is_alpha, is_stop.
text = nlp(tweet)
for token in text:
    row = (
        token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
        token.shape_, token.is_alpha, token.is_stop,
    )
    print(*row)

RT rt PROPN NNP compound XX True False
@lOR42wsOEFcv3f @lor42wsoefcv3f NOUN NN nsubj @xXXddxxXXXxxdx False False
: : PUNCT : punct : False False
I -PRON- PRON PRP nsubj X True True
fall fall VERB VBP acl xxxx True False
too too ADV RB advmod xxx True True
fast fast ADV RB advmod xxxx True False
, , PUNCT , punct , False False
crash crash NOUN NN acl xxxx True False
too too ADV RB advmod xxx True True
hard hard ADV RB acomp xxxx True False
, , PUNCT , punct , False False
forgive forgive VERB VB aux xxxx True False
too too ADV RB advmod xxx True True
easily easily ADV RB advmod xxxx True False
and and CCONJ CC cc xxx True True
care care VERB VB ROOT xxxx True False
too too ADV RB advmod xxx True True
much much ADV RB dobj xxxx True True
... ... PUNCT NFP punct ... False False
:( :( PUNCT NFP punct :( False False
# # PUNCT NFP punct # False False
amiright amiright ADV RB advmod xxxx True False


In [20]:
from spacy.matcher import Matcher


# Token pattern for a hashtag: a literal '#' token followed by one ASCII token.
hashtag_pattern = [{'ORTH': '#'}, {'IS_ASCII': True}]

matcher = Matcher(nlp.vocab)
# The pattern must be passed as ONE list. Unpacking it with `*` would hand
# each dict to the matcher as a separate (malformed) pattern instead of a
# single two-token sequence.
# NOTE(review): add(key, on_match, *patterns) and Span.merge() below are the
# spaCy 2.x API; newer releases changed both - confirm the installed version.
matcher.add('HASHTAG', None, hashtag_pattern)

doc = nlp('This is a #sentence. Here is another #hashtag. #The #End.')
matches = matcher(doc)

# Collect every matched span before merging anything, so the start/end
# offsets reported by the matcher are all read against the unmodified doc.
hashtags = []
for match_id, start, end in matches:
    hashtags.append(doc[start:end])

# Collapse each '#' + word pair into a single token.
for span in hashtags:
    span.merge()

print([t.text for t in doc])