In [20]:
import spacy 
nlp=spacy.load("en_core_web_sm")
text='''In the middle part of his career, Einstein made important contributions
to statistical mechanics and quantum theory. Especially notable was his work on the
quantum physics of radiation, in which light consists of particles, subsequently called photons.
With physicist Satyendra Nath Bose, he laid the groundwork for Bose–Einstein statistics.
For much of the last phase of his academic life,
Einstein worked on two endeavors that ultimately proved unsuccessful.
'''

In [21]:
doc=nlp('''In the middle part of his career, Einstein made important contributions
to statistical mechanics and quantum theory. Especially notable was his work on the
quantum physics of radiation, in which light consists of particles, subsequently called photons.
With physicist Satyendra Nath Bose, he laid the groundwork for Bose–Einstein statistics.
For much of the last phase of his academic life,
Einstein worked on two endeavors that ultimately proved unsuccessful.
''')
for token in doc:
    print("Word is : ",token.text)
    print("POS is : ",token.pos,"===",token.pos_,"===",spacy.explain(token.pos_))
    print("Dep is : ",token.dep,"===",token.dep_,"===",spacy.explain(token.dep_))
    print("Tag is : ",token.tag,"===",token.tag_,"===",spacy.explain(token.tag_))
    print("====================================")
    

Word is :  In
POS is :  85 === ADP === adposition
Dep is :  443 === prep === prepositional modifier
Tag is :  1292078113972184607 === IN === conjunction, subordinating or preposition
Word is :  the
POS is :  90 === DET === determiner
Dep is :  415 === det === determiner
Tag is :  15267657372422890137 === DT === determiner
Word is :  middle
POS is :  84 === ADJ === adjective
Dep is :  402 === amod === adjectival modifier
Tag is :  10554686591937588953 === JJ === adjective (English), other noun-modifier (Chinese)
Word is :  part
POS is :  92 === NOUN === noun
Dep is :  439 === pobj === object of preposition
Tag is :  15308085513773655218 === NN === noun, singular or mass
Word is :  of
POS is :  85 === ADP === adposition
Dep is :  443 === prep === prepositional modifier
Tag is :  1292078113972184607 === IN === conjunction, subordinating or preposition
Word is :  his
POS is :  95 === PRON === pronoun
Dep is :  440 === poss === possession modifier
Tag is :  4062917326063685704 === PRP$ === 

In [8]:
for token in doc:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')


In         ADP      IN     conjunction, subordinating or preposition
the        DET      DT     determiner
middle     ADJ      JJ     adjective (English), other noun-modifier (Chinese)
part       NOUN     NN     noun, singular or mass
of         ADP      IN     conjunction, subordinating or preposition
his        PRON     PRP$   pronoun, possessive
career     NOUN     NN     noun, singular or mass
,          PUNCT    ,      punctuation mark, comma
Einstein   PROPN    NNP    noun, proper singular
made       VERB     VBD    verb, past tense
important  ADJ      JJ     adjective (English), other noun-modifier (Chinese)
contributions NOUN     NNS    noun, plural

          SPACE    _SP    whitespace
to         ADP      IN     conjunction, subordinating or preposition
statistical ADJ      JJ     adjective (English), other noun-modifier (Chinese)
mechanics  NOUN     NNS    noun, plural
and        CCONJ    CC     conjunction, coordinating
quantum    NOUN     NN     noun, singular or mass
theor

In [11]:
doc1=nlp("i read a book on NLP")
for token in doc1:
    print(f'{token.text:{10}} {token.pos_:{8}} {token.tag_:{6}} {spacy.explain(token.tag_)}')


i          PRON     PRP    pronoun, personal
read       VERB     VBD    verb, past tense
a          DET      DT     determiner
book       NOUN     NN     noun, singular or mass
on         ADP      IN     conjunction, subordinating or preposition
NLP        PROPN    NNP    noun, proper singular


In [15]:
pos_counts=doc.count_by(spacy.attrs.POS)
for k,v in sorted(pos_counts.items()):
    print(f'{k}. {doc.vocab[k].text:{6}}: {v}')

84. ADJ   : 9
85. ADP   : 13
86. ADV   : 3
87. AUX   : 1
89. CCONJ : 1
90. DET   : 4
92. NOUN  : 18
93. NUM   : 1
95. PRON  : 6
96. PROPN : 7
97. PUNCT : 10
100. VERB  : 6
103. SPACE : 6


In [18]:
import nltk 
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

text2='Moses supposes his  toeses are roses but moses supposes erroneously'
for w,m in nltk.pos_tag(nltk.word_tokenize(text2)):
    print(f'Word : {w} , type : {m} ,  means : {spacy.explain(m)}')

Word : Moses , type : NNS ,  means : noun, plural
Word : supposes , type : VBZ ,  means : verb, 3rd person singular present
Word : his , type : PRP$ ,  means : pronoun, possessive
Word : toeses , type : NNS ,  means : noun, plural
Word : are , type : VBP ,  means : verb, non-3rd person singular present
Word : roses , type : NNS ,  means : noun, plural
Word : but , type : CC ,  means : conjunction, coordinating
Word : moses , type : VBZ ,  means : verb, 3rd person singular present
Word : supposes , type : NNS ,  means : noun, plural
Word : erroneously , type : RB ,  means : adverb


In [23]:
tokenizer=PunktSentenceTokenizer(text)
tokenized=tokenizer.tokenize(text)
tokenized[:5]
for i in tokenized[:5]:
    for w,m in nltk.pos_tag(nltk.word_tokenize(i)):
        print(f'Word : {w} , type : {m} ,  means : {spacy.explain(m)}')

Word : In , type : IN ,  means : conjunction, subordinating or preposition
Word : the , type : DT ,  means : determiner
Word : middle , type : JJ ,  means : adjective (English), other noun-modifier (Chinese)
Word : part , type : NN ,  means : noun, singular or mass
Word : of , type : IN ,  means : conjunction, subordinating or preposition
Word : his , type : PRP$ ,  means : pronoun, possessive
Word : career , type : NN ,  means : noun, singular or mass
Word : , , type : , ,  means : punctuation mark, comma
Word : Einstein , type : NNP ,  means : noun, proper singular
Word : made , type : VBD ,  means : verb, past tense
Word : important , type : JJ ,  means : adjective (English), other noun-modifier (Chinese)
Word : contributions , type : NNS ,  means : noun, plural
Word : to , type : TO ,  means : infinitival "to"
Word : statistical , type : JJ ,  means : adjective (English), other noun-modifier (Chinese)
Word : mechanics , type : NNS ,  means : noun, plural
Word : and , type : CC ,  m

In [24]:
import re 
train_text=state_union.raw("2005-GWBush.txt")
sample_text=state_union.raw("2006-GWBush.txt")

In [25]:
tokenizer=PunktSentenceTokenizer(train_text)
tokenized=tokenizer.tokenize(sample_text)
for i in tokenized[:5]:
    for w,m in nltk.pos_tag(nltk.word_tokenize(i)):
        print(f'Word : {w} , type : {m} ,  means : {spacy.explain(m)}')
        print("==========================================")

Word : PRESIDENT , type : NNP ,  means : noun, proper singular
Word : GEORGE , type : NNP ,  means : noun, proper singular
Word : W. , type : NNP ,  means : noun, proper singular
Word : BUSH , type : NNP ,  means : noun, proper singular
Word : 'S , type : POS ,  means : possessive ending
Word : ADDRESS , type : NNP ,  means : noun, proper singular
Word : BEFORE , type : IN ,  means : conjunction, subordinating or preposition
Word : A , type : NNP ,  means : noun, proper singular
Word : JOINT , type : NNP ,  means : noun, proper singular
Word : SESSION , type : NNP ,  means : noun, proper singular
Word : OF , type : IN ,  means : conjunction, subordinating or preposition
Word : THE , type : NNP ,  means : noun, proper singular
Word : CONGRESS , type : NNP ,  means : noun, proper singular
Word : ON , type : NNP ,  means : noun, proper singular
Word : THE , type : NNP ,  means : noun, proper singular
Word : STATE , type : NNP ,  means : noun, proper singular
Word : OF , type : IN ,  means



In [1]:
import nltk
from nltk.stem.porter import *
p_stemmer=PorterStemmer()

In [4]:
words=['run','runner','running','ran','runs','easily','fairly']
for word in words:
    print(f"{word :{9}} {p_stemmer.stem(word)}")

run       run
runner    runner
running   run
ran       ran
runs      run
easily    easili
fairly    fairli


In [5]:
from nltk.stem.snowball import SnowballStemmer
s_stemmer=SnowballStemmer(language='english')
for word in words:
    print(f"{word :{9}} {s_stemmer.stem(word)}")

run       run
runner    runner
running   run
ran       ran
runs      run
easily    easili
fairly    fair


In [7]:
from nltk.stem import  PorterStemmer , LancasterStemmer
from nltk.tokenize import sent_tokenize , word_tokenize
ps=PorterStemmer()
ls=LancasterStemmer()
words=['is','was','be','been','are','were','being']

In [8]:
for word in words:
    print(ps.stem(word))
    print(ls.stem(word))
    print('====================================')

is
is
wa
was
be
be
been
been
are
ar
were
wer
be
being


In [11]:
sentence='had you booked the air booling yet ? if not try to book it ASAP since booking will beout of books'
words=word_tokenize(sentence)
for w in words:
    print(ps.stem(w))

had
you
book
the
air
bool
yet
?
if
not
tri
to
book
it
asap
sinc
book
will
beout
of
book


In [16]:
print("{0:20}{1:20}{2:20}".format("word","porter stemmer","lncaster stemmer"))
print('----------------------------------------------------')
for w in words:
    print("{0:20}{1:20}{2:20}".format(w,ps.stem(w),ls.stem(w)))

word                porter stemmer      lncaster stemmer    
----------------------------------------------------
had                 had                 had                 
you                 you                 you                 
booked              book                book                
the                 the                 the                 
air                 air                 air                 
booling             bool                bool                
yet                 yet                 yet                 
?                   ?                   ?                   
if                  if                  if                  
not                 not                 not                 
try                 tri                 try                 
to                  to                  to                  
book                book                book                
it                  it                  it                  
ASAP                asap        

In [29]:
import spacy
nlp=spacy.load('en_core_web_sm')
def show_lemma(text):
    for token in text:
        print(f"{token.text:{15}} {token.pos_:{15}} {token.lemma:{25}} {token.lemma_}")

In [30]:
doc2=nlp("I saw eighteen mice today !")
show_lemma(doc2)

I               PRON                  4690420944186131903 I
saw             VERB                 11925638236994514241 see
eighteen        NUM                   9609336664675087640 eighteen
mice            NOUN                  1384165645700560590 mouse
today           NOUN                 11042482332948150395 today
!               PUNCT                17494803046312582752 !


In [34]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
words=['cats','cacti','radii','feet','speech','runner']
for w in words:
    print(f"{w:{15}}{lemmatizer.lemmatize(w)}")

cats           cat
cacti          cactus
radii          radius
feet           foot
speech         speech
runner         runner


In [35]:
print(lemmatizer.lemmatize("meeting",'v'))
print(lemmatizer.lemmatize("meeting",'n'))

meet
meeting


In [36]:
sentence="He was running and eating at same . He has bad habit of swimming after playing long hours in the sun."
panctuation='?!:.,;'
sentence_words=word_tokenize(sentence)
for word in sentence_words:
    if word in panctuation:
        sentence_words.remove(word)
for w in sentence_words:
    print(f"{w:{15}}{lemmatizer.lemmatize(w)}")

He             He
was            wa
running        running
and            and
eating         eating
at             at
same           same
He             He
has            ha
bad            bad
habit          habit
of             of
swimming       swimming
after          after
playing        playing
long           long
hours          hour
in             in
the            the
sun            sun


In [37]:
for w in sentence_words:
    print(f"{w:{15}}{lemmatizer.lemmatize(w,'v')}")

He             He
was            be
running        run
and            and
eating         eat
at             at
same           same
He             He
has            have
bad            bad
habit          habit
of             of
swimming       swim
after          after
playing        play
long           long
hours          hours
in             in
the            the
sun            sun


In [1]:
import spacy
nlp=spacy.load('en_core_web_sm')

In [7]:
doc=nlp("Apple to build a hong kong factory for $6 million")

In [8]:
for token in doc:
    print(token.text,end='|')

Apple|to|build|a|hong|kong|factory|for|$|6|million|

In [9]:
for token in doc.ents:
    print(f"{token.text:{10}} {token.label_:{10}} {spacy.explain(token.label_)}")

Apple      ORG        Companies, agencies, institutions, etc.
hong kong  GPE        Countries, cities, states
$6 million MONEY      Monetary values, including unit


In [10]:
for ent in doc.ents:
    print(ent.text)
    print(ent.label)
    print(ent.label_)
    print(spacy.explain(ent.label_))
    print(ent.start)
    print(ent.end)
    print(ent.start_char)
    print(ent.end_char)
    print("-----------------------------------------")

Apple
383
ORG
Companies, agencies, institutions, etc.
0
1
0
5
-----------------------------------------
hong kong
384
GPE
Countries, cities, states
4
6
17
26
-----------------------------------------
$6 million
394
MONEY
Monetary values, including unit
8
11
39
49
-----------------------------------------


In [14]:
from spacy.tokens import Span
doc=nlp("CPRO to build a U.K. factory for $6 million")

ORG=doc.vocab.strings['ORG']
new_ent=Span(doc,0,1,label=ORG)

doc.ents=list(doc.ents) + [new_ent]

for token in doc.ents:
    print(f"{token.text:{10}} {token.label_:{10}} {spacy.explain(token.label_)}")

CPRO       ORG        Companies, agencies, institutions, etc.
U.K.       GPE        Countries, cities, states
$6 million MONEY      Monetary values, including unit


In [18]:
doc2=nlp("Autonomous cars shift insurance liability toward manufaturers.")
for chunk in doc2.noun_chunks:
    print(chunk.text)
    print(chunk.root.text)
    print(spacy.explain(chunk.root.dep_))
    print(chunk.label)
    print(chunk.label_)
    print(spacy.explain(chunk.label_))
    print(chunk.start)
    print(chunk.end)
    print(chunk.start_char)
    print(chunk.end_char)
    print("------------------------------")

Autonomous cars
cars
nominal subject
3342607623747562680
NP
noun phrase
0
2
0
15
------------------------------
insurance liability
liability
direct object
3342607623747562680
NP
noun phrase
3
5
22
41
------------------------------
manufaturers
manufaturers
object of preposition
3342607623747562680
NP
noun phrase
6
7
49
61
------------------------------


In [2]:
import spacy
nlp=spacy.load('en_core_web_sm')
nlp.vocab['have'].is_stop

True

In [31]:
nlp.vocab['it'].is_stop

True

In [32]:
nlp.vocab['park'].is_stop

False

In [33]:
nlp.Defaults.stop_words

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'both',
 'bottom',
 'btw',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'front',

In [34]:
print(nlp.vocab['btw'].is_stop)

True


In [35]:
nlp.Defaults.stop_words.add('btw')
nlp.vocab['btw'].is_stop=True
print(nlp.vocab['btw'].is_stop)

True


In [37]:
print(nlp.vocab['beyond'].is_stop)
nlp.Defaults.stop_words.remove('beyond')
nlp.vocab['beyond'].is_stop=False
print(nlp.vocab['beyond'].is_stop)

False
False


In [38]:
import nltk 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\yousef_haroon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [41]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words=set(stopwords.words('english'))
print(len(stop_words))
stop_words

198


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [42]:
example="this is sample sentence, And this showing off the stop words filtration."
word_tokens=word_tokenize(example)
word_tokens

['this',
 'is',
 'sample',
 'sentence',
 ',',
 'And',
 'this',
 'showing',
 'off',
 'the',
 'stop',
 'words',
 'filtration',
 '.']

In [43]:
filtered_sentence=[w for w in word_tokens if not w in stop_words]
print(filtered_sentence)

['sample', 'sentence', ',', 'And', 'showing', 'stop', 'words', 'filtration', '.']


In [45]:
ar_sw=nltk.corpus.stopwords.words("arabic")
print(len(ar_sw))
print(ar_sw)

754
['إذ', 'إذا', 'إذما', 'إذن', 'أف', 'أقل', 'أكثر', 'ألا', 'إلا', 'التي', 'الذي', 'الذين', 'اللاتي', 'اللائي', 'اللتان', 'اللتيا', 'اللتين', 'اللذان', 'اللذين', 'اللواتي', 'إلى', 'إليك', 'إليكم', 'إليكما', 'إليكن', 'أم', 'أما', 'أما', 'إما', 'أن', 'إن', 'إنا', 'أنا', 'أنت', 'أنتم', 'أنتما', 'أنتن', 'إنما', 'إنه', 'أنى', 'أنى', 'آه', 'آها', 'أو', 'أولاء', 'أولئك', 'أوه', 'آي', 'أي', 'أيها', 'إي', 'أين', 'أين', 'أينما', 'إيه', 'بخ', 'بس', 'بعد', 'بعض', 'بك', 'بكم', 'بكم', 'بكما', 'بكن', 'بل', 'بلى', 'بما', 'بماذا', 'بمن', 'بنا', 'به', 'بها', 'بهم', 'بهما', 'بهن', 'بي', 'بين', 'بيد', 'تلك', 'تلكم', 'تلكما', 'ته', 'تي', 'تين', 'تينك', 'ثم', 'ثمة', 'حاشا', 'حبذا', 'حتى', 'حيث', 'حيثما', 'حين', 'خلا', 'دون', 'ذا', 'ذات', 'ذاك', 'ذان', 'ذانك', 'ذلك', 'ذلكم', 'ذلكما', 'ذلكن', 'ذه', 'ذو', 'ذوا', 'ذواتا', 'ذواتي', 'ذي', 'ذين', 'ذينك', 'ريث', 'سوف', 'سوى', 'شتان', 'عدا', 'عسى', 'عل', 'على', 'عليك', 'عليه', 'عما', 'عن', 'عند', 'غير', 'فإذا', 'فإن', 'فلا', 'فمن', 'في', 'فيم', 'فيما', 'فيه', 'فيها

In [46]:
ar_sw2=stopwords.words("arabic")
print(len(ar_sw2))
print(ar_sw2)

754
['إذ', 'إذا', 'إذما', 'إذن', 'أف', 'أقل', 'أكثر', 'ألا', 'إلا', 'التي', 'الذي', 'الذين', 'اللاتي', 'اللائي', 'اللتان', 'اللتيا', 'اللتين', 'اللذان', 'اللذين', 'اللواتي', 'إلى', 'إليك', 'إليكم', 'إليكما', 'إليكن', 'أم', 'أما', 'أما', 'إما', 'أن', 'إن', 'إنا', 'أنا', 'أنت', 'أنتم', 'أنتما', 'أنتن', 'إنما', 'إنه', 'أنى', 'أنى', 'آه', 'آها', 'أو', 'أولاء', 'أولئك', 'أوه', 'آي', 'أي', 'أيها', 'إي', 'أين', 'أين', 'أينما', 'إيه', 'بخ', 'بس', 'بعد', 'بعض', 'بك', 'بكم', 'بكم', 'بكما', 'بكن', 'بل', 'بلى', 'بما', 'بماذا', 'بمن', 'بنا', 'به', 'بها', 'بهم', 'بهما', 'بهن', 'بي', 'بين', 'بيد', 'تلك', 'تلكم', 'تلكما', 'ته', 'تي', 'تين', 'تينك', 'ثم', 'ثمة', 'حاشا', 'حبذا', 'حتى', 'حيث', 'حيثما', 'حين', 'خلا', 'دون', 'ذا', 'ذات', 'ذاك', 'ذان', 'ذانك', 'ذلك', 'ذلكم', 'ذلكما', 'ذلكن', 'ذه', 'ذو', 'ذوا', 'ذواتا', 'ذواتي', 'ذي', 'ذين', 'ذينك', 'ريث', 'سوف', 'سوى', 'شتان', 'عدا', 'عسى', 'عل', 'على', 'عليك', 'عليه', 'عما', 'عن', 'عند', 'غير', 'فإذا', 'فإن', 'فلا', 'فمن', 'في', 'فيم', 'فيما', 'فيه', 'فيها

In [47]:
pip install arabicstopwords

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement arabicstopwords (from versions: none)
ERROR: No matching distribution found for arabicstopwords


In [48]:
import arabicstopwords.arabicstopwords as stp

ModuleNotFoundError: No module named 'arabicstopwords'

In [3]:
from spacy.matcher import Matcher
matcher=Matcher(nlp.vocab)

In [11]:
pattern1=[{'LOWER':'solarpower'}]
pattern2=[{'LOWER':'solar'},{'LOWER':'power'}]
pattern3=[{'LOWER':'solar'},{'IS_PUNCT':True},{'LOWER':'power'}]
matcher.add("SolarPower",[pattern1,pattern2,pattern3])

In [17]:
doc=nlp("""Solar power, also known as solar electricity,
           Almost half the solar--power installed in 2022""")
found_matches=matcher(doc)
for a,b,c in found_matches:
    print(f'word id : {a} start at:{b} end at:{c} word is:{doc[b:c]}')

word id : 8656102463236116519 start at:0 end at:2 word is:Solar power
word id : 8656102463236116519 start at:13 end at:16 word is:solar--power


In [9]:
matcher.remove('SolarPower')

In [18]:
pattern1=[{'LOWER':'solarpower'}]
pattern2=[{'LOWER':'solar'},{'IS_PUNCT':True,'OP':'*'},{'LOWER':'power'}]
matcher.add("SolarPower",[pattern1,pattern2])

doc=nlp("""Solar power, also known as solar electricity,
           Almost half the solar--power installed in 2022""")
found_matches=matcher(doc)
for a,b,c in found_matches:
    print(f'word id : {a} start at:{b} end at:{c} word is:{doc[b:c]}')

word id : 8656102463236116519 start at:0 end at:2 word is:Solar power
word id : 8656102463236116519 start at:13 end at:16 word is:solar--power


In [19]:
from spacy.matcher import PhraseMatcher
matcher=PhraseMatcher(nlp.vocab)
with open('x.txt') as f:
    doc=nlp(f.read())
phrase_list=['El Clásico','klasiko','klasik','The Classic','El Clàssic','Barcelona and Real Madrid']
phrase_patterns=[nlp(text) for text in phrase_list]
phrase_patterns

[El Clásico,
 klasiko,
 klasik,
 The Classic,
 El Clàssic,
 Barcelona and Real Madrid]

In [25]:
matcher.add("El_Clásico",phrase_patterns)

matches=matcher(doc)
for a,b,c in matches:
    print(f'word id : {a} start at:{b} end at:{c} word is:{doc[b-3:c+3]}')

word id : 9646648106172704443 start at:39 end at:41 word is:both meaning "The Classic", is


In [1]:
import spacy
from spacy import displacy
nlp=spacy.load("en_core_web_sm")

In [5]:
doc=nlp('Apple is going to build a U.K. factory for $6 million.')
displacy.render(doc,style='dep',jupyter=True,options={'distance':80})

ImportError: cannot import name 'display' from 'IPython.core.display' (C:\Users\yousef_haroon\AppData\Roaming\Python\Python313\site-packages\IPython\core\display.py)

In [6]:
for token in doc:
    print(f"{token.text:{10}} {token.dep_:{10}} {spacy.explain(token.dep_)}")

Apple      nsubj      nominal subject
is         aux        auxiliary
going      ROOT       root
to         aux        auxiliary
build      xcomp      open clausal complement
a          det        determiner
U.K.       compound   compound
factory    dobj       direct object
for        prep       prepositional modifier
$          quantmod   modifier of quantifier
6          compound   compound
million    pobj       object of preposition
.          punct      punctuation


In [8]:
displacy.serve(doc,style='dep')



ImportError: cannot import name 'display' from 'IPython.core.display' (C:\Users\yousef_haroon\AppData\Roaming\Python\Python313\site-packages\IPython\core\display.py)