## Understanding NLP

In [1]:
!pip install nltk




In [2]:
# Sample text for the NLP walkthrough: a short multi-sentence biography of
# A. P. J. Abdul Kalam. The literal starts with a newline, so the first
# tokenized sentence carries a leading '\n'.
para = """
Avul Pakir Jainulabdeen Abdul Kalam was an Indian aerospace scientist and statesman who served as the president of India from 2002 to 2007.
Born and raised in a Muslim family in Rameswaram, Tamil Nadu, Kalam studied physics and aerospace engineering. He spent the next four decades as a scientist and science administrator, mainly at the Defence Research and Development Organisation (DRDO) and Indian Space Research Organisation (ISRO) and was intimately involved in India's civilian space programme and military missile development efforts. He was known as the "Missile Man of India" for his work on the development of ballistic missile and launch vehicle technology. He also played a pivotal organisational, technical, and political role in Pokhran-II nuclear tests in 1998, India's second such test after the first test in 1974.
Kalam was elected as the president of India in 2002 with the support of both the ruling Bharatiya Janata Party and the then-opposition Indian National Congress. He was widely referred to as the "People's President". He engaged in teaching, writing and public service after his presidency. He was a recipient of several awards, including the Bharat Ratna, India's highest civilian honour.
While delivering a lecture at IIM Shillong, Kalam collapsed and died from an apparent cardiac arrest on 27 July 2015, aged 83. Thousands attended the funeral ceremony held in his hometown of Rameswaram, where he was buried with full state honours. A memorial was inaugurated near his home town in 2017.
"""

In [3]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [4]:
# Download the 'punkt' tokenizer package into the local NLTK data directory.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Download the 'punkt_tab' tokenizer package as well.
# NOTE(review): presumably the tokenizer tables the installed NLTK release
# loads for sent_tokenize/word_tokenize — confirm against the NLTK version.
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
# Split the paragraph into sentences. Despite the singular name, `sentence`
# is a list with one string per sentence.
sentence = nltk.sent_tokenize(para)

In [7]:
# Check the container type returned by sent_tokenize.
type(sentence)

list

In [8]:
# Display the tokenized sentences (original casing and punctuation intact).
sentence

['\nAvul Pakir Jainulabdeen Abdul Kalam was an Indian aerospace scientist and statesman who served as the president of India from 2002 to 2007.',
 'Born and raised in a Muslim family in Rameswaram, Tamil Nadu, Kalam studied physics and aerospace engineering.',
 "He spent the next four decades as a scientist and science administrator, mainly at the Defence Research and Development Organisation (DRDO) and Indian Space Research Organisation (ISRO) and was intimately involved in India's civilian space programme and military missile development efforts.",
 'He was known as the "Missile Man of India" for his work on the development of ballistic missile and launch vehicle technology.',
 "He also played a pivotal organisational, technical, and political role in Pokhran-II nuclear tests in 1998, India's second such test after the first test in 1974.",
 'Kalam was elected as the president of India in 2002 with the support of both the ruling Bharatiya Janata Party and the then-opposition Indian N

In [9]:
# Create the Porter stemmer that the stop-word filtering cell reuses later.
stemmer = PorterStemmer()
# Quick check on a single word: 'running' is reduced to its stem.
stemmer.stem('running')

'run'

In [10]:
# Download the WordNet corpus.
# NOTE(review): presumably the data WordNetLemmatizer looks words up in — confirm.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [11]:
from nltk.stem import WordNetLemmatizer

In [12]:
# Create a WordNet-based lemmatizer.
lemmatizer = WordNetLemmatizer()
# Quick check on a single word: 'gives' is mapped to its lemma.
lemmatizer.lemmatize('gives')

'give'

In [13]:
import re

# Normalize every sentence: each character that is not an ASCII letter
# (digits, punctuation, newlines) is replaced one-for-one by a space, then the
# text is lower-cased. Runs of removed characters therefore leave runs of
# spaces behind; later tokenization splits on whitespace, so they are harmless.
non_letter = re.compile('[^a-zA-Z]')
corpus = [non_letter.sub(" ", raw_sentence).lower() for raw_sentence in sentence]
corpus

[' avul pakir jainulabdeen abdul kalam was an indian aerospace scientist and statesman who served as the president of india from      to      ',
 'born and raised in a muslim family in rameswaram  tamil nadu  kalam studied physics and aerospace engineering ',
 'he spent the next four decades as a scientist and science administrator  mainly at the defence research and development organisation  drdo  and indian space research organisation  isro  and was intimately involved in india s civilian space programme and military missile development efforts ',
 'he was known as the  missile man of india  for his work on the development of ballistic missile and launch vehicle technology ',
 'he also played a pivotal organisational  technical  and political role in pokhran ii nuclear tests in       india s second such test after the first test in      ',
 'kalam was elected as the president of india in      with the support of both the ruling bharatiya janata party and the then opposition indian na

In [14]:
# Download the stop-word lists read via nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
# Build the English stop-word set once, outside the loops, instead of
# re-reading the word list and rebuilding the set for every token.
english_stopwords = set(stopwords.words('english'))

# Print the Porter stem of every token that is not a stop word,
# sentence by sentence.
for cleaned_sentence in corpus:
  for word in nltk.word_tokenize(cleaned_sentence):
    if word not in english_stopwords:
      print(stemmer.stem(word))

avul
pakir
jainulabdeen
abdul
kalam
indian
aerospac
scientist
statesman
serv
presid
india
born
rais
muslim
famili
rameswaram
tamil
nadu
kalam
studi
physic
aerospac
engin
spent
next
four
decad
scientist
scienc
administr
mainli
defenc
research
develop
organis
drdo
indian
space
research
organis
isro
intim
involv
india
civilian
space
programm
militari
missil
develop
effort
known
missil
man
india
work
develop
ballist
missil
launch
vehicl
technolog
also
play
pivot
organis
technic
polit
role
pokhran
ii
nuclear
test
india
second
test
first
test
kalam
elect
presid
india
support
rule
bharatiya
janata
parti
opposit
indian
nation
congress
wide
refer
peopl
presid
engag
teach
write
public
servic
presid
recipi
sever
award
includ
bharat
ratna
india
highest
civilian
honour
deliv
lectur
iim
shillong
kalam
collaps
die
appar
cardiac
arrest
juli
age
thousand
attend
funer
ceremoni
held
hometown
rameswaram
buri
full
state
honour
memori
inaugur
near
home
town


In [16]:
# Bag-of-words: CountVectorizer learns a vocabulary from the cleaned sentences
# and fit_transform returns the document-term count matrix.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
x = cv.fit_transform(corpus)
# Mapping from each vocabulary term to its column index in `x`. Stop words such
# as 'was', 'an' and 'the' are still present: the vectorizer is built with
# default arguments and the raw cleaned sentences are passed in.
cv.vocabulary_


{'avul': 13,
 'pakir': 84,
 'jainulabdeen': 61,
 'abdul': 0,
 'kalam': 64,
 'was': 130,
 'an': 6,
 'indian': 57,
 'aerospace': 2,
 'scientist': 105,
 'and': 7,
 'statesman': 114,
 'who': 133,
 'served': 107,
 'as': 10,
 'the': 124,
 'president': 93,
 'of': 79,
 'india': 56,
 'from': 40,
 'to': 127,
 'born': 18,
 'raised': 96,
 'in': 53,
 'muslim': 73,
 'family': 36,
 'rameswaram': 97,
 'tamil': 118,
 'nadu': 74,
 'studied': 115,
 'physics': 87,
 'engineering': 35,
 'he': 43,
 'spent': 112,
 'next': 77,
 'four': 39,
 'decades': 26,
 'science': 104,
 'administrator': 1,
 'mainly': 68,
 'at': 11,
 'defence': 27,
 'research': 101,
 'development': 29,
 'organisation': 82,
 'drdo': 31,
 'space': 111,
 'isro': 60,
 'intimately': 58,
 'involved': 59,
 'civilian': 23,
 'programme': 94,
 'military': 71,
 'missile': 72,
 'efforts': 32,
 'known': 65,
 'man': 69,
 'for': 38,
 'his': 46,
 'work': 136,
 'on': 80,
 'ballistic': 15,
 'launch': 66,
 'vehicle': 129,
 'technology': 121,
 'also': 5,
 'play

## TF-IDF

In [17]:
# TF-IDF (Term Frequency - Inverse Document Frequency)

from sklearn.feature_extraction.text import TfidfVectorizer

# Toy documents for the TF-IDF demo. They get their own name so that the
# cleaned sentence list held in `corpus` is not overwritten; re-running the
# earlier stemming / bag-of-words cells then still sees the original data.
tfidf_corpus = [
    "I love my dog",
    "I love my cat",
    "You love my dog!"
]

# Learn the vocabulary and IDF weights and build the TF-IDF matrix
# (one row per document, one column per vocabulary term).
v = TfidfVectorizer()
result = v.fit_transform(tfidf_corpus)
# Term -> column index. "I" does not appear: the vectorizer's default token
# pattern only keeps tokens of two or more characters.
v.vocabulary_

{'love': 2, 'my': 3, 'dog': 1, 'cat': 0, 'you': 4}

In [18]:
# print() on the sparse TF-IDF matrix lists every stored entry as
# (row, column) -> weight, i.e. (document index, vocabulary index) -> TF-IDF.
print(result)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 10 stored elements and shape (3, 5)>
  Coords	Values
  (0, 2)	0.5228423068642596
  (0, 3)	0.5228423068642596
  (0, 1)	0.6732546652684398
  (1, 2)	0.4532946552278861
  (1, 3)	0.4532946552278861
  (1, 0)	0.7674945674619879
  (2, 2)	0.39148397136265967
  (2, 3)	0.39148397136265967
  (2, 1)	0.5041068915759233
  (2, 4)	0.6628399823470976


In [19]:
# Show the learned inverse-document-frequency weight of every term.
# get_feature_names_out() lists terms in column order, which is the order of
# v.idf_ as well, so the two arrays can be walked in parallel.
all_feature_names = v.get_feature_names_out()
for term, idf_weight in zip(all_feature_names, v.idf_):
  print(term, idf_weight)

cat 1.6931471805599454
dog 1.2876820724517808
love 1.0
my 1.0
you 1.6931471805599454
