In [3]:
# nltk.download()
import nltk
from nltk.corpus import stopwords

In [4]:
# Word-level vs. sentence-level tokenization of the same string.
sent = "i love mom , +i llove dad"
print(nltk.word_tokenize(sent))  # list of word / punctuation tokens
print(nltk.sent_tokenize(sent))  # list of sentences (the whole string comes back as one)

['i', 'love', 'mom', ',', '+i', 'llove', 'dad']
['i love mom , +i llove dad']


In [5]:
# Arabic stop-word list from the NLTK stopwords corpus.
stopwords.words("arabic")

['إذ',
 'إذا',
 'إذما',
 'إذن',
 'أف',
 'أقل',
 'أكثر',
 'ألا',
 'إلا',
 'التي',
 'الذي',
 'الذين',
 'اللاتي',
 'اللائي',
 'اللتان',
 'اللتيا',
 'اللتين',
 'اللذان',
 'اللذين',
 'اللواتي',
 'إلى',
 'إليك',
 'إليكم',
 'إليكما',
 'إليكن',
 'أم',
 'أما',
 'أما',
 'إما',
 'أن',
 'إن',
 'إنا',
 'أنا',
 'أنت',
 'أنتم',
 'أنتما',
 'أنتن',
 'إنما',
 'إنه',
 'أنى',
 'أنى',
 'آه',
 'آها',
 'أو',
 'أولاء',
 'أولئك',
 'أوه',
 'آي',
 'أي',
 'أيها',
 'إي',
 'أين',
 'أين',
 'أينما',
 'إيه',
 'بخ',
 'بس',
 'بعد',
 'بعض',
 'بك',
 'بكم',
 'بكم',
 'بكما',
 'بكن',
 'بل',
 'بلى',
 'بما',
 'بماذا',
 'بمن',
 'بنا',
 'به',
 'بها',
 'بهم',
 'بهما',
 'بهن',
 'بي',
 'بين',
 'بيد',
 'تلك',
 'تلكم',
 'تلكما',
 'ته',
 'تي',
 'تين',
 'تينك',
 'ثم',
 'ثمة',
 'حاشا',
 'حبذا',
 'حتى',
 'حيث',
 'حيثما',
 'حين',
 'خلا',
 'دون',
 'ذا',
 'ذات',
 'ذاك',
 'ذان',
 'ذانك',
 'ذلك',
 'ذلكم',
 'ذلكما',
 'ذلكن',
 'ذه',
 'ذو',
 'ذوا',
 'ذواتا',
 'ذواتي',
 'ذي',
 'ذين',
 'ذينك',
 'ريث',
 'سوف',
 'سوى',
 'شتان',
 'عدا',
 'عسى',
 'عل'

In [6]:
# Bind the English word list to its own name instead of rebinding `stopwords`:
# overwriting the imported corpus reader with a list makes this cell (and the
# Arabic stop-word cell) fail with AttributeError when re-run.
english_stopwords = set(stopwords.words("english"))  # set -> fast membership tests
sent = "i myself like them to play football"
# Keep only the tokens that are not English stop words.
[w for w in nltk.word_tokenize(sent) if w not in english_stopwords]

['like', 'play', 'football']

**Stemming and Lemmatization**



In [7]:
from nltk.stem import PorterStemmer,WordNetLemmatizer
from nltk.corpus import wordnet

In [8]:
# Stemming: get the base of a word by cutting off its last part (ing, s, es, ... etc.)
ps = PorterStemmer()
# Lemmatization: get the base of a word by mapping it back to its root form,
# even for irregular cases such as (went >> go), which stemming does not do.
lema = WordNetLemmatizer()

In [9]:
# Regular verb: the stemmer and the lemmatizer give the same result.
print(ps.stem('playing'))
print(lema.lemmatize("playing",wordnet.VERB))  # second argument is the part of speech

play
play


In [10]:
# Irregular forms: only the lemmatizer maps them back to a base word.
print(ps.stem('went')) # stemming fails here: 'went' comes back unchanged
print(lema.lemmatize('went',wordnet.VERB))
print(lema.lemmatize('better',wordnet.ADJ))

went
go
good


In [11]:
# synonyms: words with the same (or nearly the same) meaning
# antonyms: words with the opposite meaning

# All WordNet synsets registered for the word "plan".
syns = wordnet.synsets("plan")
syns

[Synset('plan.n.01'),
 Synset('design.n.02'),
 Synset('plan.n.03'),
 Synset('plan.v.01'),
 Synset('plan.v.02'),
 Synset('plan.v.03'),
 Synset('design.v.04')]

In [12]:
# The first synset object versus its plain string name.
print(syns[0])
print("---------------------")
print(syns[0].name())

Synset('plan.n.01')
---------------------
plan.n.01


In [13]:
# Definition and usage examples of the first synset of "plan".
print(syns[0].definition())
print("---------------------")
print(syns[0].examples())

a series of steps to be carried out or goals to be accomplished
---------------------
['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [14]:
# Lemmas (individual words) that belong to the first synset.
syns[0].lemmas()

[Lemma('plan.n.01.plan'),
 Lemma('plan.n.01.program'),
 Lemma('plan.n.01.programme')]

In [15]:
# Walk every lemma of every synset of "hate": each lemma name is recorded as a
# synonym, and for lemmas that have antonyms the first antonym's name is recorded.
synonyms = []
antonyms = []

for sense in wordnet.synsets("hate"):
  for lemma in sense.lemmas():
    synonyms.append(lemma.name())
    opposites = lemma.antonyms()
    if opposites:
      antonyms.append(opposites[0].name())

In [16]:
# set() collapses names that were collected more than once across synsets.
print(set(synonyms))

{'hate', 'detest', 'hatred'}


In [17]:
# set() collapses antonyms that were collected more than once across synsets.
print(set(antonyms))

{'love'}


**Similarity**

In [18]:
# Wu-Palmer similarity between the synsets "ship.n.01" and "cat.n.01".
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("cat.n.01")
w1.wup_similarity(w2)

0.32

In [19]:
# Wu-Palmer similarity between the synsets "ship.n.01" and "car.n.01".
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("car.n.01")
w1.wup_similarity(w2)

0.6956521739130435

In [20]:
# Wu-Palmer similarity between the synsets "ship.n.01" and "boat.n.01".
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")
w1.wup_similarity(w2)

0.9090909090909091

**Part-of-Speech (POS) Tagging**

In [21]:
import nltk

# NOTE: there is no space after the period in "month.like", so the tokenizer
# keeps "month.like" as a single token and the text is treated as one sentence.
original = "A prison riot left seven members of staff needing hospital treatment this month.like ahmed and ali"

def preparing(text):
  """Split ``text`` into sentences, tokenize each one, and POS-tag its tokens.

  Returns a list with one entry per sentence; each entry is whatever
  ``nltk.pos_tag`` returns for that sentence's word tokens.
  """
  return [
    nltk.pos_tag(nltk.word_tokenize(sentence))
    for sentence in nltk.sent_tokenize(text)
  ]


preparing(original)

[[('A', 'DT'),
  ('prison', 'NN'),
  ('riot', 'NN'),
  ('left', 'VBD'),
  ('seven', 'CD'),
  ('members', 'NNS'),
  ('of', 'IN'),
  ('staff', 'NN'),
  ('needing', 'VBG'),
  ('hospital', 'NN'),
  ('treatment', 'NN'),
  ('this', 'DT'),
  ('month.like', 'NN'),
  ('ahmed', 'VBD'),
  ('and', 'CC'),
  ('ali', 'VB')]]

In [22]:
'''NLTK POS Tag List
Below is a partial list of the part-of-speech tags returned by nltk.pos_tag (Penn Treebank tag set). Each word in the tagger output is paired with one of these tags.

CC: Coordinating conjunction
CD: Cardinal number
DT: Determiner
EX: Existential there
FW: Foreign word
IN: Preposition or subordinating conjunction
JJ: Adjective
JJR and JJS: Comparative and superlative adjective
LS: List item marker
MD: Modal
NN: Singular or mass noun
NNS, NNP, NNPS: Plural noun, singular proper noun, plural proper noun
PDT: Predeterminer
WRB: Wh-adverb
WP$: Possessive wh-pronoun
WP: Wh-pronoun
WDT: Wh-determiner
VBZ: Verb, 3rd person singular present
VBP, VBN, VBG, VBD, VB: Verb forms (non-3rd person singular present, past participle, gerund or present participle, past tense, base form)
UH: Interjection
TO: The word to
RP: Particle
RBS, RB, RBR: Superlative adverb, adverb, comparative adverb
PRP, PRP$: Personal pronoun, possessive pronoun'''

'NLTK POS Tag List\nBelow is a partial list of the part-of-speech tags returned by nltk.pos_tag (Penn Treebank tag set). Each word in the tagger output is paired with one of these tags.\n\nCC: Coordinating conjunction\nCD: Cardinal number\nDT: Determiner\nEX: Existential there\nFW: Foreign word\nIN: Preposition or subordinating conjunction\nJJ: Adjective\nJJR and JJS: Comparative and superlative adjective\nLS: List item marker\nMD: Modal\nNN: Singular or mass noun\nNNS, NNP, NNPS: Plural noun, singular proper noun, plural proper noun\nPDT: Predeterminer\nWRB: Wh-adverb\nWP$: Possessive wh-pronoun\nWP: Wh-pronoun\nWDT: Wh-determiner\nVBZ: Verb, 3rd person singular present\nVBP, VBN, VBG, VBD, VB: Verb forms (non-3rd person singular present, past participle, gerund or present participle, past tense, base form)\nUH: Interjection\nTO: The word to\nRP: Particle\nRBS, RB, RBR: Superlative adverb, adverb, comparative adverb\nPRP, PRP$: Personal pronoun, possessive pronoun'

***Chunking***


Chunking is a natural language processing technique used to group and extract meaningful phrases or chunks from sentences or text. It can help simplify and organize text data for further analysis or processing. Chunking is typically performed using part-of-speech tagging and regular expressions in programming languages like Python. Here are some common chunking expressions and patterns:


You can look up the meaning of the regular-expression symbols used in the patterns below (`?`, `*`, `+`, `.`) on this site:
https://regex101.com/

In [28]:
def chunk(sentence):
  """Print every NP chunk found in one POS-tagged sentence.

  ``sentence`` is a sequence of ``(word, tag)`` pairs. Each chunk labelled
  "NP" by the grammar below is printed on its own line, as its words joined
  by single spaces. Nothing is returned.
  """
  grammar = '''
  NP : {<NNP>*}
       {<DT>?<NN>?<NN>}
       {<NN><NN>}'''
  tree = nltk.RegexpParser(grammar).parse(sentence)
  for node in tree.subtrees():
    # Skip the root and any subtree that is not an NP chunk.
    if node.label() != "NP":
      continue
    print(" ".join(token for token, _tag in node.leaves()))


# POS-tag the sample text, show the tags, then print the NP chunks of each sentence.
original = "A prison riot left seven members of staff needing hospital treatment this month, the BBC learns"
sentences = preparing(original)
print("POS_TAG:",sentences)
print("original:",original)
for sentence in sentences:
  print("______________CHUNK______________")
  chunk(sentence)

POS_TAG: [[('A', 'DT'), ('prison', 'NN'), ('riot', 'NN'), ('left', 'VBD'), ('seven', 'CD'), ('members', 'NNS'), ('of', 'IN'), ('staff', 'NN'), ('needing', 'VBG'), ('hospital', 'NN'), ('treatment', 'NN'), ('this', 'DT'), ('month', 'NN'), (',', ','), ('the', 'DT'), ('BBC', 'NNP'), ('learns', 'NNS')]]
original: A prison riot left seven members of staff needing hospital treatment this month, the BBC learns
______________CHUNK______________
A prison riot
staff
hospital treatment
this month
BBC


**Chinking**
In natural language processing and regular expressions, "chinking" is the process of removing specific chunks from a chunked text or sentence. Chinking is essentially the opposite of chunking, where you identify and extract specific phrases or chunks from a sentence, whereas chinking involves specifying phrases or chunks you want to exclude or remove.

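Chinking is particularly useful when you've applied chunking to a text, but there are specific tokens or patterns within the chunks that you want to exclude from your analysis. In an NLTK `RegexpParser` grammar, a chunk rule is written inside curly braces, `{...}`, while a chink rule is written inside inverted curly braces, `}...{`; the tokens matched by the chink rule are removed from the chunks created by the preceding rule.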

In [29]:
def chink(sentence):
  """Print the NP chunks that remain in one POS-tagged sentence after chinking.

  ``sentence`` is a sequence of ``(word, tag)`` pairs. Each remaining chunk
  labelled "NP" is printed on its own line, as its words joined by single
  spaces. Nothing is returned.
  """
  # First rule  >>> chunk everything in the text.
  # Second rule >>> exclude (chink) the DT / NN tokens from those chunks.
  grammar = '''
  NP : {<.*>+}
       }<DT | NN>+{
       '''
  tree = nltk.RegexpParser(grammar).parse(sentence)
  for node in tree.subtrees():
    # Skip the root and any subtree that is not an NP chunk.
    if node.label() != "NP":
      continue
    print(" ".join(token for token, _tag in node.leaves()))


# POS-tag the sample text, show the tags, then print what is left of each sentence after chinking.
original = "A prison riot left seven members of staff needing hospital treatment this month, the BBC learns"
sentences = preparing(original)
print("POS_TAG:",sentences)
for sentence in sentences:
  print("______________CHUNK______________")
  chink(sentence)

POS_TAG: [[('A', 'DT'), ('prison', 'NN'), ('riot', 'NN'), ('left', 'VBD'), ('seven', 'CD'), ('members', 'NNS'), ('of', 'IN'), ('staff', 'NN'), ('needing', 'VBG'), ('hospital', 'NN'), ('treatment', 'NN'), ('this', 'DT'), ('month', 'NN'), (',', ','), ('the', 'DT'), ('BBC', 'NNP'), ('learns', 'NNS')]]
______________CHUNK______________
left seven members of
needing
,
BBC learns


**Count Vectorizer**

Here we represent each sentence as a vector according to the number of times each word appears in that sentence.

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
cv =  CountVectorizer()
# Two sample sentences; "likes" is repeated on purpose so its count is greater than 1.
x = ["he likes likes likes to play football everyday","he likes and she are the best"]
# Fit the vectorizer on the sentences and return their word counts (a sparse matrix).
cv.fit_transform(x)

<2x11 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [34]:
# Re-fit on the same sentences and convert the sparse count matrix to a dense array.
values = cv.fit_transform(x).toarray()

In [35]:
# Feature names of the fitted vectorizer, used below as the DataFrame column labels.
columns = cv.get_feature_names_out()

In [36]:
import pandas as pd
# One row per sentence, one column per feature name; each cell is a count.
pd.DataFrame(values,columns=columns)

Unnamed: 0,and,are,best,everyday,football,he,likes,play,she,the,to
0,0,0,0,1,1,1,3,1,0,0,1
1,1,1,1,0,0,1,1,0,1,1,0
