In [2]:
!pip install spacy

Defaulting to user installation because normal site-packages is not writeable
Collecting spacy
  Downloading spacy-3.7.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.10-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.0 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.4 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.2 kB)
Collecting thinc<8.3.0,

In [8]:
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])


Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m945.4 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m36m0:00:01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


0

In [9]:
# Import the spaCy library
import spacy

# Load the small English language model
nlp = spacy.load("en_core_web_sm")

# Process a sample sentence with the loaded language model
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Loop through each word (token) in the processed sentence
for token in doc:
    # Print the word, its part-of-speech tag, and its syntactic dependency
    print(token.text, token.pos_, token.dep_)

# Explanation of the printed information:
# Text: The original word in the sentence.
# POS: The part-of-speech tag (e.g., noun, verb, adjective).
# Dep: The syntactic dependency, which shows the relationship between words in the sentence.


Apple PROPN nsubj
is AUX aux
looking VERB ROOT
at ADP prep
buying VERB pcomp
U.K. PROPN dobj
startup NOUN dep
for ADP prep
$ SYM quantmod
1 NUM compound
billion NUM pobj


Even though a Doc is processed, e.g. split into individual words and annotated - it still holds all information of the original text, like whitespace characters. You can always get the offset of a token into the original string, or reconstruct the original by joining the tokens and their trailing whitespace. This way, you’ll never lose any information when processing text with spaCy.

# Tokenization 
During processing, spaCy first tokenizes the text, i.e. segments it into words, punctuation and so on. This is done by applying rules specific to each language. For example, punctuation at the end of a sentence should be split off - whereas ‘U.K.’ should remain one token. Each Doc consists of individual tokens, and we can iterate over them:

In [10]:
import spacy  # Import the spaCy library

# Load the small English language model
nlp = spacy.load("en_core_web_sm")

# Process the text to create a Doc object
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Iterate over each token in the Doc and print the token's text
for token in doc:
    print(token.text)


Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion


![image.png](attachment:79234a8b-4843-4b27-b5c5-f99f904e0f7a.png)

![image.png](attachment:aeade746-3622-46a1-bc84-96a2b482a2ac.png)

![image.png](attachment:9b580d6c-9aaf-406c-ac42-7e8e49e949f7.png)

In [11]:
import spacy  # Import the spaCy library

# Load the small English language model
nlp = spacy.load("en_core_web_sm")

# Process the text to create a Doc object
doc = nlp("Coronavirus: Delhi resident tests positive for coronavirus, total 31 people infected in India")

# Iterate over each token in the Doc and print various attributes
for token in doc:
    print(
        token.text,        # The original text of the token
        token.lemma_,      # The base form of the token
        token.pos_,        # The part of speech tag
        token.tag_,        # The detailed part of speech tag
        token.dep_,        # The syntactic dependency
        token.shape_,      # The shape of the token (e.g., "Xxxx" for "Apple")
        token.is_alpha,    # Is the token an alphabetic word?
        token.is_stop      # Is the token a stop word?
    )


Coronavirus coronavirus NOUN NN nsubj Xxxxx True False
: : PUNCT : punct : False False
Delhi Delhi PROPN NNP advmod Xxxxx True False
resident resident NOUN NN compound xxxx True False
tests test NOUN NNS nsubj xxxx True False
positive positive ADJ JJ amod xxxx True False
for for ADP IN prep xxx True True
coronavirus coronavirus NOUN NN pobj xxxx True False
, , PUNCT , punct , False False
total total ADJ JJ ROOT xxxx True False
31 31 NUM CD nummod dd False False
people people NOUN NNS dobj xxxx True False
infected infect VERB VBN acl xxxx True False
in in ADP IN prep xx True True
India India PROPN NNP pobj Xxxxx True False


![image.png](attachment:a62c8dfc-bdb4-4ca2-a446-9aa7261ae5fb.png)

In [12]:
import spacy
from spacy import displacy

# Load the spaCy model (en_core_web_sm)
nlp = spacy.load("en_core_web_sm")

# Create a Doc object containing the sentence to be parsed
doc = nlp("Google, Apple crack down on fake coronavirus apps")

# Display the parsed sentence using displacy.serve()
displacy.serve(doc, style="dep")





Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...



127.0.0.1 - - [24/Jul/2024 16:34:18] "GET / HTTP/1.1" 200 6739
127.0.0.1 - - [24/Jul/2024 16:34:18] "GET /favicon.ico HTTP/1.1" 200 6739


Shutting down server on port 5000.


# Named Entities

![image.png](attachment:882d955b-8683-48ae-8590-c95d43224d71.png)

In [13]:
import spacy  # Import the spaCy library

nlp = spacy.load("en_core_web_sm")  # Load the small English language model

# Create a sentence to process
doc = nlp("Coronavirus: Delhi resident tests positive for coronavirus, total 31 people infected in India")

# Identify named entities in the sentence
for ent in doc.ents:
    # Print the text of the entity, its starting and ending character position, and its label
    print(ent.text, ent.start_char, ent.end_char, ent.label_)


Delhi 13 18 GPE
31 66 68 CARDINAL
India 88 93 GPE


![image.png](attachment:29706522-2c4f-4468-b445-009faaffb550.png)

In [15]:
import spacy
from spacy import displacy

# This line imports the spaCy library, which is a free open-source library for natural language processing (NLP)

text = "Coronavirus: Delhi resident tests positive for coronavirus, total 31 people infected in India"

# This line assigns a string to a variable named text. The string contains a sentence about a coronavirus case in India.

nlp = spacy.load("en_core_web_sm")

# This line loads a pre-trained spaCy model for English named entity recognition (NER). The "en_core_web_sm" model is a small model that is efficient to use for basic tasks.

doc = nlp(text)

# This line applies the loaded model to the text variable and stores the results in a variable named doc. The doc variable now contains a spaCy Doc object, which holds the linguistic analysis of the text.

displacy.serve(doc, style="ent")

# This line displays the named entities found in the text using displacy. The "ent" style argument specifies that we want to see the named entity tags.

# https://spacy.io/api/annotation#named-entities



Using the 'ent' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


![image.png](attachment:2bd71943-f80d-4295-8f2b-1aa7bb647c4e.png)

In [16]:
#!python -m spacy download en_core_web_md
import spacy.cli
spacy.cli.download("en_core_web_md")

import en_core_web_md
nlp = en_core_web_md.load()


Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-md==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.7.1/en_core_web_md-3.7.1-py3-none-any.whl (42.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.8/42.8 MB[0m [31m744.6 kB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:02[0m
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [17]:
import spacy

nlp = spacy.load("en_core_web_md")  # Load the English language model
tokens = nlp("lion bear apple banana fadsfdshds")  # Create a Doc object from the text

for token in tokens:             
  print(token.text, token.has_vector, token.vector_norm, token.is_oov)

# Vector norm: The L2 norm of the token's vector (the square root of the sum of the values squared)
# has vector: Does the token have a vector representation?
# OOV: Out-of-vocabulary


lion True 55.145737 False
bear True 52.114674 False
apple True 43.366478 False
banana True 31.620354 False
fadsfdshds False 0.0 True



![image.png](attachment:9090e0a7-2f17-48f9-9656-dda2e408f237.png)

In [18]:
import spacy  # Import the spaCy library

# Load a larger English language model - "en_core_web_md" is used in the example
# Make sure to download this model before running the code: python -m spacy download en_core_web_md
nlp = spacy.load("en_core_web_md")

# Create a sentence to process
sentence = "lion bear cow apple mango spinach"

# Tokenize the sentence - break it down into individual words
tokens = nlp(sentence)

# Iterate over each token and compare it to other tokens in the sentence
# Print the text of the token, another token for comparison, and their similarity score
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))


lion lion 1.0
lion bear 0.40031397342681885
lion cow 0.4524093568325043
lion apple 0.06742796301841736
lion mango 0.18510109186172485
lion spinach 0.06951921433210373
bear lion 0.40031397342681885
bear bear 1.0
bear cow 0.2781473994255066
bear apple 0.18584339320659637
bear mango 0.14443379640579224
bear spinach 0.0758492723107338
cow lion 0.4524093568325043
cow bear 0.2781473994255066
cow cow 1.0
cow apple 0.25756582617759705
cow mango 0.26287969946861267
cow spinach 0.261837899684906
apple lion 0.06742796301841736
apple bear 0.18584339320659637
apple cow 0.25756582617759705
apple apple 1.0
apple mango 0.6305076479911804
apple spinach 0.5129707455635071
mango lion 0.18510109186172485
mango bear 0.14443379640579224
mango cow 0.26287969946861267
mango apple 0.6305076479911804
mango mango 1.0
mango spinach 0.5483009219169617
spinach lion 0.06951921433210373
spinach bear 0.0758492723107338
spinach cow 0.261837899684906
spinach apple 0.5129707455635071
spinach mango 0.5483009219169617
spin

![image.png](attachment:a6997c01-5f46-4850-ab43-f2dc7ee2f612.png)

![image.png](attachment:6e1336ee-0f36-4bfe-a2ee-bb3780692a8c.png)


![image.png](attachment:370ab2af-1f7a-4e36-8014-e70e86d1bc43.png)