In [1]:
# Import the English language class

from spacy.lang.en import English

In [2]:
# Create an NLP object
nlp = English()

#### The above code contains
- Processing pipeline
- language-specific rules for tokenization, etc.

In [3]:
# Created for processing a string of text with nlp object
doc = nlp('Hello World!')

[t.text for t in doc]

['Hello', 'World', '!']

In [4]:
span = doc[1:3]
span.text

'World!'

##### Lexical Attributes

In [5]:
doc = nlp('This costs $5.')

print("Index: ", [t.i for t in doc])
print("Text: ", [t.text for t in doc])
print("Alpha: ", [t.is_alpha for t in doc])
print("Punct: ", [t.is_punct for t in doc])
print("Like a Number: ", [t.like_num for t in doc])

Index:  [0, 1, 2, 3, 4]
Text:  ['This', 'costs', '$', '5', '.']
Alpha:  [True, True, False, False, False]
Punct:  [False, False, False, False, True]
Like a Number:  [False, False, False, True, False]


### Statistical Models
- Enables spaCy to predict linguistic attributes in context
    - Parts-of-speech tags
    - Syntactical Dependencies
    - Named Entities
    
- Trained on labeled example texts
- Can be updated with more examples to fine-tune predictions

In [6]:
import spacy

nlp = spacy.load('en_core_web_sm')

- Binary weights
- Vocabulary
- Meta information (language, pipeline)

In [7]:
doc = nlp('He ate the burger')

# Parts of Speech  prediction
[(t.text, t.pos_, t.dep_) for t in doc]

[('He', 'PRON', 'nsubj'),
 ('ate', 'VERB', 'ROOT'),
 ('the', 'DET', 'det'),
 ('burger', 'NOUN', 'dobj')]

In [8]:
# Prediction of Syntactical Dependencies
[(t.text, t.pos_, t.dep_, t.head.text) for t in doc]

[('He', 'PRON', 'nsubj', 'ate'),
 ('ate', 'VERB', 'ROOT', 'ate'),
 ('the', 'DET', 'det', 'burger'),
 ('burger', 'NOUN', 'dobj', 'ate')]

In [9]:
from spacy import displacy

displacy.render(doc)

In [10]:
# Named Entities

doc = nlp('Tata is thinking to buy Land Rover from Britain for $54 billion')

[(t.text, t.label_) for t in doc.ents]

[('Land Rover', 'ORG'), ('Britain', 'GPE'), ('$54 billion', 'MONEY')]

In [11]:
spacy.explain("GPE")

'Countries, cities, states'

### Matcher object

**Why not regular expressions?**
- This matches on doc objects, not just plain string text.
- Matches on token and its attributes
- Uses model's prediction, not simply rule-based
- It matches on parts of speech too (As we saw, go can be assumed as a lang if its not a verb)

In [12]:
Data = ['I like coding in go', 'Please go to the shop', 'Where to start in GO']

pattern = [{
    'LOWER': 'go',
    'POS': {'NOT_IN': ['VERB']}
}]

from spacy.matcher import Matcher
matcher = Matcher(nlp.vocab)
matcher.add('go', [pattern])

In [13]:
docs = [doc for doc in nlp.pipe(Data)]

for doc in docs:
    for idx, start, end in matcher(doc):
        print(doc)

I like coding in go
Where to start in GO


In [14]:
pattern = [
    {'IS_DIGIT': True},
    {'LOWER': 'fifa'},
    {'LOWER': 'world'},
    {'LOWER': 'cup'},
    {'IS_PUNCT': True}
]

In [15]:
doc = nlp('2018 FIFA World Cup: France Won')

In [16]:
matcher = Matcher(nlp.vocab)
matcher.add('fifa', [pattern])
matcher(doc)

[(7778788780550260102, 0, 5)]

In [17]:
doc[0:5]

2018 FIFA World Cup:

In [18]:
pattern = [
    {'LEMMA': 'buy'},
    {'POS': 'DET', 'OP': '?'},
    {'POS': 'NOUN'}
]

In [19]:
doc = nlp('I bought a car. Now i will be buying accessories')
matcher = Matcher(nlp.vocab)
matcher.add('buy', [pattern])
matcher(doc)

[(9457496526477982497, 1, 4), (9457496526477982497, 9, 11)]

In [20]:
doc[1:4], doc[9:12]

(bought a car, buying accessories)

### Data Structures followed by spaCy (1)

- Vocab: stores data shared across multiple documents
- To save memory, spaCy encodes all strings to hash values
- Strings are only stored once in the ```StringStore``` via ```nlp.vocab.strings```
- String store: **lookup table** in both directions

In [21]:
coffee_hash = nlp.vocab.strings['coffee']
coffee_string = nlp.vocab.strings[coffee_hash]

In [22]:
coffee_hash, coffee_string

(3197928453018144401, 'coffee')

In [23]:
doc = nlp('I love coffee')
doc.vocab.strings['coffee']

3197928453018144401

In [24]:
lexeme = nlp.vocab['coffee']

lexeme.text, lexeme.orth, lexeme.is_alpha

('coffee', 3197928453018144401, True)

- Contains the **context-independent** info about the word
    - Word text: ```lexeme.text``` and `lexeme.orth` (the hash)
    - Lexical attributes like `lexeme.is_alpha`
    - **NOT** context-dependent pos tags, dependencies or entity labels

### Data Structures followed by spaCy (2)

- `Doc` and `Span` are very powerful and hold references and relationships of words and sentences
    - **Convert result to strings as late as possible**
    - **Use token attributes if available** - for example `token.i` for token index
    
- Don't forget to pass in the shared `vocab`

### Word Vectors and semantic similarity
- `spaCy` can compare two objects and predict similarity
- `Doc.similarity()`, `Span.similarity()`, `Token.similarity()`
- Take another object and return a similarity score (0 to 1)
- Important: needs a model that has word vectors included, fo example:
    - `en_core_web_md` or `en_core_web_lg` 

In [25]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.0.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0-py3-none-any.whl (47.1 MB)
[K     |████████████████████████████████| 47.1 MB 7.6 kB/s eta 0:00:01     |███████████████████████████████▊| 46.7 MB 982 kB/s eta 0:00:01
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.0.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')


In [26]:
nlp = spacy.load('en_core_web_md')

doc1 = nlp('I like fast food')
doc2 = nlp('I like pizza')
print(doc1.similarity(doc2))

0.8627204117787385


In [27]:
doc = nlp('I like pizza and pasta')
token1 = doc[2]
token2 = doc[4]
token1.similarity(token2)

0.73695457

**How spaCy predicts similarity?**
- Similarity is determined using **word vectors**
- Multi-dimensional meaning representation of words
- Generated using an algo like Word2Vec and lots of text
- Default: cosine similarity, but can be adjusted based on use case.
- `Doc` and `Span` vectors default to average of token vectors
- Short phrases are better than long docs with many irrelevant words

In [28]:
doc1 = nlp('I like cats')
doc2 = nlp('I hate dogs')

doc1.similarity(doc2)

0.912617208179217

## Processing Pipelines

In [31]:
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

In [33]:
nlp = spacy.load('en_core_web_md')
nlp.pipe_names

['tok2vec', 'tagger', 'parser', 'ner', 'attribute_ruler', 'lemmatizer']

### Setting custom attributes

In [36]:
from spacy.tokens import Doc, Token, Span

Doc.set_extension('title', default=None)
Token.set_extension('is_color', default=False)
Span.set_extension('has_color', default=False)

In [38]:
def get_is_color(token):
    colors = ['red', 'yellow', 'blue']
    return token.text in colors

Token.set_extension('is_color', getter=get_is_color, force=True)

doc = nlp('The sky is blue.')
print(doc[3]._.is_color, '-', doc[3].text)

True - blue


#### Processing large volumes of text
- Use `nlp.pipe()` method
- Processes text as a stream, yields `Doc` objects
- Much faster than calling `nlp` on each text

In [39]:
Doc.set_extension('id', default=None)
Doc.set_extension('page_number', default=None)

data = [
    ('This is a text', {'id': 1, 'page_number': 15}),
    ('And another text', {'id': 2, 'page_number': 16})
]

for doc, context in nlp.pipe(data, as_tuples=True):
    doc._.id = context['id']
    doc._.page_number = context['page_number']

In [40]:
# For using only tokenizer
doc1 = nlp("Hello world")

doc2 = nlp.make_doc("Hello world")

In [42]:
[t.pos_ for t in doc1]

['INTJ', 'NOUN']

In [44]:
[t.pos_ for t in doc2]
# NOte this is because we technically disabled the pipeline using make_doc

['', '']

In [48]:
with nlp.disable_pipes('tagger', 'parser', 'senter', 'ner', 'attribute_ruler', 'lemmatizer'):
    doc = nlp("Hello I am anant")
    print(doc)

Hello I am anant


### Custom Training

Check out the docs... Not needed currently

In [50]:
Training_Data = [
    ('How to preorder the iPhone X', {'entities': [(20, 28, 'GADGET')]})
]

In [57]:
import random
for i in range(10):
    random.shuffle(Training_Data)
    for batch in spacy.util.minibatch(Training_Data, size=2):
        for text, annotations in batch:
            # create Example
            doc = nlp.make_doc(Training_Data[0][0])
            example = Example.from_dict(doc, annotations)
            # Update the model
            nlp.update([example], losses=losses, drop=0.3)

        
nlp.to_disk('./')

NameError: name 'Example' is not defined