### Installation and some basic examples
Below are a few simple examples to show spaCy's capabilities such as tokenization, visualization, etc. The aim of this section is to show what is possible overall. More advanced examples will follow.

In [1]:
import sys

!{sys.executable} -m pip install spacy
!{sys.executable} -m spacy download en_core_web_sm


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [2]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")

# Tokenization
print("Tokens:")
for token in doc:
    print(token.text)

# Part-of-Speech Tagging
print("\nPart-of-Speech Tags:")
for token in doc:
    print(f"{token.text}: {token.pos_}")

# Named Entity Recognition
print("\nNamed Entities:")
for ent in doc.ents:
    print(f"{ent.text}: {ent.label_}")


Tokens:
Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion

Part-of-Speech Tags:
Apple: PROPN
is: AUX
looking: VERB
at: ADP
buying: VERB
U.K.: PROPN
startup: VERB
for: ADP
$: SYM
1: NUM
billion: NUM

Named Entities:
Apple: ORG
U.K.: GPE
$1 billion: MONEY


In [3]:
from spacy import displacy

doc = nlp("Apple's CEO, Tim Cook, announced the new iPhone 12 at the October event in Cupertino, aiming to boost sales in the upcoming holiday season.")

print("Dependency Parsing:")
for token in doc:
    print(f"{token.text} -> {token.dep_} -> {token.head.text}")

print("\nNamed Entities:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")


Dependency Parsing:
Apple -> poss -> CEO
's -> case -> Apple
CEO -> nsubj -> announced
, -> punct -> CEO
Tim -> compound -> Cook
Cook -> appos -> CEO
, -> punct -> CEO
announced -> ROOT -> announced
the -> det -> iPhone
new -> amod -> iPhone
iPhone -> dobj -> announced
12 -> nummod -> iPhone
at -> prep -> announced
the -> det -> event
October -> compound -> event
event -> pobj -> at
in -> prep -> event
Cupertino -> pobj -> in
, -> punct -> announced
aiming -> advcl -> announced
to -> aux -> boost
boost -> xcomp -> aiming
sales -> dobj -> boost
in -> prep -> boost
the -> det -> season
upcoming -> amod -> season
holiday -> compound -> season
season -> pobj -> in
. -> punct -> announced

Named Entities:
Apple (ORG)
Tim Cook (PERSON)
iPhone (ORG)
12 (CARDINAL)
October (DATE)
Cupertino (GPE)
the upcoming holiday season (DATE)


In [4]:
displacy.render(doc, style="dep", jupyter=True, options={"compact": True, "bg": "#000000", "color": "white", "font": "Source Sans Pro"})
displacy.render(doc, style="ent", jupyter=True, options={"colors": {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}})


In [None]:
from spacy.tokens import Doc
from spacy.language import Language

# Define a simple sentiment analysis function
def simple_sentiment(doc: Doc) -> Doc:
    positive_words = {'good', 'great', 'excellent', 'amazing', 'happy'}
    negative_words = {'bad', 'terrible', 'poor', 'sad', 'horrible'}
    
    sentiment_score = 0
    for token in doc:
        if token.text.lower() in positive_words:
            sentiment_score += 1
        elif token.text.lower() in negative_words:
            sentiment_score -= 1
    doc._.sentiment_score = sentiment_score
    return doc

@Language.component("simple_sentiment")
def simple_sentiment_component(doc):
    return simple_sentiment(doc)

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("simple_sentiment", last=True)

if not Doc.has_extension("sentiment_score"):
    Doc.set_extension("sentiment_score", default=0)

# The positive and negative words cancel each other out, resulting in a sentiment score of 0:
doc = nlp("I had a great day, but the weather was terrible.")
print(f"Sentiment Score: {doc._.sentiment_score}")

doc = nlp("I had a terrible day, but the weather was terrible.")
print(f"Sentiment Score: {doc._.sentiment_score}")

# Here is an example of a positive sentiment despite the presence of a negative word:
doc = nlp("I had a great day, however I felt dissatisfied with life.")
print(f"Sentiment Score: {doc._.sentiment_score}")

Sentiment Score: 0
Sentiment Score: -2
Sentiment Score: 1


### More relevant examples to our task

In [None]:
import spacy
from spacy.tokens import DocBin
from spacy.training.example import Example
import random

# Sample training data (replace with actual data)
TRAIN_DATA = [
    ("Get 20% off on Nike Air Max Shoes! Now only $79.99, was $99.99. Offer valid until March 31, 2025.", 
     {"entities": [(4, 7, "DISCOUNT"), (15, 33, "PRODUCT"), (44, 52, "NEW_PRICE"), (56, 64, "OLD_PRICE"), (76, 96, "VALIDITY")]}),
    ("Save 15% on Adidas Ultraboost! Now $120, was $150. Offer valid until April 15, 2025.",
     {"entities": [(5, 9, "DISCOUNT"), (12, 31, "PRODUCT"), (35, 39, "NEW_PRICE"), (45, 49, "OLD_PRICE"), (63, 83, "VALIDITY")]}),
    ("Limited offer: 30% off on Puma Running Shoes! Buy for just $69.99 instead of $99.99. Valid until May 10, 2025.",
     {"entities": [(15, 18, "DISCOUNT"), (26, 44, "PRODUCT"), (59, 65, "NEW_PRICE"), (77, 83, "OLD_PRICE"), (91, 109, "VALIDITY")]}),
    ("Flash Sale! 50% discount on Ray-Ban Sunglasses. Now at $75, down from $150. Deal expires June 1, 2025.",
     {"entities": [(12, 15, "DISCOUNT"), (28, 46, "PRODUCT"), (55, 58, "NEW_PRICE"), (70, 74, "OLD_PRICE"), (81, 101, "VALIDITY")]},),
    ("Hurry! Grab 10% off on Apple AirPods Pro! Special price: $179, regular price: $199. Available until July 20, 2025.",
     {"entities": [(12, 15, "DISCOUNT"), (23, 40, "PRODUCT"), (57, 61, "NEW_PRICE"), (78, 82, "OLD_PRICE"), (94, 113, "VALIDITY")]},),
    ("Exclusive deal: 25% off on Samsung Galaxy Buds! New price: $112.49, old price: $149.99. Valid through August 5, 2025.",
     {"entities": [(16, 19, "DISCOUNT"), (27, 46, "PRODUCT"), (59, 66, "NEW_PRICE"), (79, 86, "OLD_PRICE"), (94, 116, "VALIDITY")]},),
    ("Special Offer! Buy Sony WH-1000XM5 at 40% discount. Only $210, was $350. Available until September 15, 2025.",
     {"entities": [(38, 41, "DISCOUNT"), (19, 34, "PRODUCT"), (57, 61, "NEW_PRICE"), (67, 71, "OLD_PRICE"), (83, 107, "VALIDITY")]},),
    ("Limited time: 35% off on Bose QuietComfort Earbuds. Get them for $195, down from $300. Valid until October 10, 2025.",
     {"entities": [(14, 17, "DISCOUNT"), (25, 50, "PRODUCT"), (65, 69, "NEW_PRICE"), (81, 85, "OLD_PRICE"), (93, 115, "VALIDITY")]},),
    ("Bla bla bla: 35% off on uga buga. From $420 to $69. Valid until October 40, 2025.",
     {"entities": [(13, 16, "DISCOUNT"), (24, 32, "PRODUCT"), (47, 50, "NEW_PRICE"), (39, 43, "OLD_PRICE"), (64, 80, "VALIDITY")]},),
]

# I tried running the code with the following line, and it was okay, it sometimes gave me an error, 
# but most of the time the main issue was that it did not identify the product at all.
# nlp = spacy.load("en_core_web_sm")

# Create blank English NLP model
# This approach is not perfect. For example, the model classified "unicorn" as validity or new price 
# when I ran the code a few times. It would make some sense if it classified a unicorn as a product,
# since it is a noun, but it did not do that either.
nlp = spacy.blank("en")

# Both of the approaches above can be fine-tuned to improve the model's performance.

# Add Named Entity Recognizer (NER) component if not already present
if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner", last=True)
else:
    ner = nlp.get_pipe("ner")

# Add labels to NER
for _, annotations in TRAIN_DATA:
    for ent in annotations["entities"]:
        ner.add_label(ent[2])

# Prepare training data
optimizer = nlp.initialize()
for i in range(20): 
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], drop=0.3, losses=losses)
    print(f"Iteration {i+1}, Losses: {losses}")

nlp.to_disk("coupon_ner_model")

nlp_test = spacy.load("coupon_ner_model")
doc = nlp_test("Save 15% on your Nike shoes! Now only $85, was $100. Offer valid until April 15, 2025 and if you have a unicorn.")

for ent in doc.ents:
    print(f"{ent.label_}: {ent.text}")


Iteration 1, Losses: {'ner': np.float32(210.40405)}
Iteration 2, Losses: {'ner': np.float32(119.24067)}
Iteration 3, Losses: {'ner': np.float32(65.527145)}
Iteration 4, Losses: {'ner': np.float32(103.26305)}
Iteration 5, Losses: {'ner': np.float32(79.07881)}
Iteration 6, Losses: {'ner': np.float32(49.080917)}
Iteration 7, Losses: {'ner': np.float32(94.15434)}
Iteration 8, Losses: {'ner': np.float32(77.91877)}
Iteration 9, Losses: {'ner': np.float32(68.72293)}
Iteration 10, Losses: {'ner': np.float32(48.44724)}
Iteration 11, Losses: {'ner': np.float32(41.596745)}
Iteration 12, Losses: {'ner': np.float32(34.44004)}
Iteration 13, Losses: {'ner': np.float32(31.737322)}
Iteration 14, Losses: {'ner': np.float32(23.4626)}
Iteration 15, Losses: {'ner': np.float32(29.495712)}
Iteration 16, Losses: {'ner': np.float32(18.167423)}
Iteration 17, Losses: {'ner': np.float32(12.737721)}
Iteration 18, Losses: {'ner': np.float32(11.080767)}
Iteration 19, Losses: {'ner': np.float32(4.852218)}
Iteration 2