In [9]:
# English Sentences
sentence_1 = "Hello there! How are you doing today? I hope everything is going well; the weather is nice, isn't it?"
sentence_2 = "She said, 'I’ll be there by 5:00 PM'; however, she arrived at 6:30 instead. Was she late, or was I early?"
sentence_3 = "Despite the warnings—loud and clear—he proceeded with his plan: a risky yet thrilling adventure!"
sentence_4 = "The report stated: 'Inflation rates have surged by 5% this year.' Can you believe it? It's quite alarming!"
sentence_5 = "Walking through the dark alley, he hesitated: was that a shadow moving, or just his imagination playing tricks?"

# Arabic Sentences
sentence_6 = "مرحبًا! كيف كان يومك؟ أتمنى أن يكون كل شيء على ما يرام؛ الطقس رائع، أليس كذلك؟"
sentence_7 = "قالت: 'سأصل الساعة ٥:٠٠ مساءً'؛ لكنها وصلت عند الساعة ٦:٣٠ بدلًا من ذلك! هل تأخرت، أم أنني كنت مبكرًا؟"
sentence_8 = "على الرغم من التحذيرات—الواضحة والصارمة—واصل خطته: مغامرة خطيرة لكنها مثيرة!"
sentence_9 = "ذكرت التقارير: 'ارتفع معدل التضخم بنسبة ٥٪ هذا العام'، هل يمكنك تصديق ذلك؟ إنه أمر مقلق للغاية!"
sentence_10 = "بينما كان يسير في الزقاق المظلم، تردد: هل كان ذلك ظلًا يتحرك، أم أن خياله يخدعه؟"


In [55]:
import pandas as pd

In [56]:
import spacy
import spacy.attrs

nlp = spacy.load('en_core_web_sm')

## Understanding POS, DEP, and TAG in SpaCy

### Key Concepts

- **Use `pos_`** for general POS tagging (e.g., `"NOUN"`, `"VERB"`).
- **Use `dep_`** for syntactic structure analysis (e.g., `"nsubj"`, `"ROOT"`).
- **Use `tag_`** for more precise linguistic information (e.g., `"VBZ"` for 3rd person singular verbs).
- **Use `spacy.explain()`** to get a human-readable explanation of each tag.

### Example Usage
```python
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentense)
for token in doc:
    print(f"Word: {token.text}")
    print(f"  POS: {token.pos_} ({spacy.explain(token.pos_)})")
    print(f"  DEP: {token.dep_} ({spacy.explain(token.dep_)})")
    print(f"  TAG: {token.tag_} ({spacy.explain(token.tag_)})")
    print("===")
```

In [57]:
tokens = nlp(sentence_1)

df = pd.DataFrame({
    "word": [token.text for token in tokens],
    "pos": [{token.pos_} for token in tokens], 
    "pos_explain": [{spacy.explain(token.pos_)} for token in tokens], 
    "dep": [{token.dep_} for token in tokens], 
    "dep_explain": [{spacy.explain(token.dep_)} for token in tokens], 
    "tag": [{token.tag_} for token in tokens], 
    "tag_explain": [{spacy.explain(token.tag_)} for token in tokens], 
})
df

Unnamed: 0,word,pos,pos_explain,dep,dep_explain,tag,tag_explain
0,Hello,{INTJ},{interjection},{ROOT},{root},{UH},{interjection}
1,there,{ADV},{adverb},{advmod},{adverbial modifier},{RB},{adverb}
2,!,{PUNCT},{punctuation},{punct},{punctuation},{.},"{punctuation mark, sentence closer}"
3,How,{SCONJ},{subordinating conjunction},{advmod},{adverbial modifier},{WRB},{wh-adverb}
4,are,{AUX},{auxiliary},{aux},{auxiliary},{VBP},"{verb, non-3rd person singular present}"
5,you,{PRON},{pronoun},{nsubj},{nominal subject},{PRP},"{pronoun, personal}"
6,doing,{VERB},{verb},{ROOT},{root},{VBG},"{verb, gerund or present participle}"
7,today,{NOUN},{noun},{npadvmod},{noun phrase as adverbial modifier},{NN},"{noun, singular or mass}"
8,?,{PUNCT},{punctuation},{punct},{punctuation},{.},"{punctuation mark, sentence closer}"
9,I,{PRON},{pronoun},{nsubj},{nominal subject},{PRP},"{pronoun, personal}"


In [58]:
pos_count = tokens.count_by(spacy.attrs.POS)
for k, v in sorted(pos_count.items()):
    print(f"{k:{5}} ==> {tokens.vocab[k].text :{8}}: {v}")

   84 ==> ADJ     : 1
   86 ==> ADV     : 2
   87 ==> AUX     : 4
   90 ==> DET     : 1
   91 ==> INTJ    : 1
   92 ==> NOUN    : 2
   94 ==> PART    : 1
   95 ==> PRON    : 4
   97 ==> PUNCT   : 5
   98 ==> SCONJ   : 1
  100 ==> VERB    : 3


In [59]:
tag_count = tokens.count_by(spacy.attrs.TAG)
for k, v in sorted(tag_count.items()):
    print(f"{k:{20}} ==> {tokens.vocab[k].text :{8}}: {v}")

  164681854541413346 ==> RB      : 3
 1534113631682161808 ==> VBG     : 2
 2593208677638477497 ==> ,       : 1
 3252815442139690129 ==> UH      : 1
 9188597074677201817 ==> VBP     : 2
10554686591937588953 ==> JJ      : 1
11532473245541075862 ==> :       : 1
12646065887601541794 ==> .       : 3
13656873538139661788 ==> PRP     : 3
13927759927860985106 ==> VBZ     : 3
15267657372422890137 ==> DT      : 1
15308085513773655218 ==> NN      : 3
17524233984504158541 ==> WRB     : 1


In [60]:
dep_count = tokens.count_by(spacy.attrs.DEP)
for k, v in sorted(dep_count.items()):
    print(f"{k:{20}} ==> {tokens.vocab[k].text :{8}}: {v}")

                 398 ==> acomp   : 1
                 400 ==> advmod  : 3
                 405 ==> aux     : 2
                 408 ==> ccomp   : 3
                 415 ==> det     : 1
                 425 ==> neg     : 1
                 428 ==> npadvmod: 1
                 429 ==> nsubj   : 5
                 445 ==> punct   : 5
 8206900633647566924 ==> ROOT    : 3


## SpaCy vs NLTK: Verb Tense Detection  

SpaCy is better than NLTK for identifying whether a verb is **past or present**.  

### Why?  
- SpaCy provides **detailed POS tags** (`VBZ`, `VBP`, `VBD`) that clearly indicate tense.  
- SpaCy understands **context**, making it more accurate for verb analysis.  
- NLTK tags verbs but **lacks context awareness**, making it less reliable for tense detection.  

### Example
```python
nlp = spacy.load("en_core_web_sm")
doc = nlp(sentence)

for token in doc:
    print(
            f"Word:{token.text}, POS:{token.pos_}, TAG:{token.tag_}{spacy.explain(token.tag_)}"
    )
```


In [61]:
doc = nlp(u'I read book now.') 
r = doc[1] 
print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}') 

doc = nlp(u'I read a book on NLP.') 
r = doc[1] 
print(f'{r.text:{10}} {r.pos_:{8}} {r.tag_:{6}} {spacy.explain(r.tag_)}') 

read       VERB     VBP    verb, non-3rd person singular present
read       VERB     VBD    verb, past tense


### NLTK

In [90]:
import nltk
from nltk.corpus import state_union 
from nltk.tokenize import PunktSentenceTokenizer, word_tokenize

nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")  
nltk.download("averaged_perceptron_tagger_eng")
nltk.download("state_union")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\aakam\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aakam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\aakam\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package state_union to
[nltk_data]     C:\Users\aakam\AppData\Roaming\nltk_data...
[nltk_data]   Package state_union is already up-to-date!


True

In [91]:
text = "Moses supposes his toeses are roses but Moses supposes erroneously"

for w, m in nltk.pos_tag(word_tokenize(text)):
    print(f"the word ==> {w:12}\t type => {m:8}\t mean => {spacy.explain(m)}")

the word ==> Moses       	 type => NNS     	 mean => noun, plural
the word ==> supposes    	 type => VBZ     	 mean => verb, 3rd person singular present
the word ==> his         	 type => PRP$    	 mean => pronoun, possessive
the word ==> toeses      	 type => NNS     	 mean => noun, plural
the word ==> are         	 type => VBP     	 mean => verb, non-3rd person singular present
the word ==> roses       	 type => NNS     	 mean => noun, plural
the word ==> but         	 type => CC      	 mean => conjunction, coordinating
the word ==> Moses       	 type => NNP     	 mean => noun, proper singular
the word ==> supposes    	 type => VBZ     	 mean => verb, 3rd person singular present
the word ==> erroneously 	 type => RB      	 mean => adverb


In [97]:
train_text = state_union.raw("2005-GWBush.txt") 
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer_1 = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer_1.tokenize(sample_text)
tokenized[:5]

for i in tokenized[:2]:
    for w, m in nltk.pos_tag(word_tokenize(i)):
        print(f"the word ==> {w:12}\t type => {m:8}\t mean => {spacy.explain(m)}")

the word ==> PRESIDENT   	 type => NNP     	 mean => noun, proper singular
the word ==> GEORGE      	 type => NNP     	 mean => noun, proper singular
the word ==> W.          	 type => NNP     	 mean => noun, proper singular
the word ==> BUSH        	 type => NNP     	 mean => noun, proper singular
the word ==> 'S          	 type => POS     	 mean => possessive ending
the word ==> ADDRESS     	 type => NNP     	 mean => noun, proper singular
the word ==> BEFORE      	 type => IN      	 mean => conjunction, subordinating or preposition
the word ==> A           	 type => NNP     	 mean => noun, proper singular
the word ==> JOINT       	 type => NNP     	 mean => noun, proper singular
the word ==> SESSION     	 type => NNP     	 mean => noun, proper singular
the word ==> OF          	 type => IN      	 mean => conjunction, subordinating or preposition
the word ==> THE         	 type => NNP     	 mean => noun, proper singular
the word ==> CONGRESS    	 type => NNP     	 mean => noun, prope