# Text processing with nltk

- Tokenization - converting text data into list of words / list of sentences
- Morphological analysis - converting a word into its root form
    - stemming
    - lemmatization
- PoS Tagging

In [2]:
import nltk

In [3]:
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("averaged_perceptron_tagger")
nltk.download("tagsets")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\anshu\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [4]:
data = "Jakarta , officially the Special Capital Region of Jakarta (Indonesian: Daerah Khusus Ibukota Jakarta), is the capital and largest city of Indonesia. On the northwest coast of the world's most-populous island of Java, it is the centre of economy, culture and politics of Indonesia with a population of 10,770,487 in the city as of 2020."
print(data)

Jakarta , officially the Special Capital Region of Jakarta (Indonesian: Daerah Khusus Ibukota Jakarta), is the capital and largest city of Indonesia. On the northwest coast of the world's most-populous island of Java, it is the centre of economy, culture and politics of Indonesia with a population of 10,770,487 in the city as of 2020.


In [5]:
# sentence tokenization
nltk.sent_tokenize(data)

['Jakarta , officially the Special Capital Region of Jakarta (Indonesian: Daerah Khusus Ibukota Jakarta), is the capital and largest city of Indonesia.',
 "On the northwest coast of the world's most-populous island of Java, it is the centre of economy, culture and politics of Indonesia with a population of 10,770,487 in the city as of 2020."]

In [6]:
#word tokenization
nltk.word_tokenize(data)

['Jakarta',
 ',',
 'officially',
 'the',
 'Special',
 'Capital',
 'Region',
 'of',
 'Jakarta',
 '(',
 'Indonesian',
 ':',
 'Daerah',
 'Khusus',
 'Ibukota',
 'Jakarta',
 ')',
 ',',
 'is',
 'the',
 'capital',
 'and',
 'largest',
 'city',
 'of',
 'Indonesia',
 '.',
 'On',
 'the',
 'northwest',
 'coast',
 'of',
 'the',
 'world',
 "'s",
 'most-populous',
 'island',
 'of',
 'Java',
 ',',
 'it',
 'is',
 'the',
 'centre',
 'of',
 'economy',
 ',',
 'culture',
 'and',
 'politics',
 'of',
 'Indonesia',
 'with',
 'a',
 'population',
 'of',
 '10,770,487',
 'in',
 'the',
 'city',
 'as',
 'of',
 '2020',
 '.']

## Morphological analysis
- converting a word into its format 
    - cars -> car
    - wives -> wife
    - went -> go
    
- stemming - faster, less accurate
- Lemmatization - slower, more accurate

In [7]:
# stemming
from nltk.stem import PorterStemmer
ps = PorterStemmer()
ps.stem("cars")

'car'

In [8]:
ps.stem("boxes")

'box'

In [9]:
ps.stem("wives")

'wive'

In [10]:
#lemmatization
from nltk.stem import WordNetLemmatizer
wd = WordNetLemmatizer()
wd.lemmatize("wives")

'wife'

In [11]:
wd.lemmatize("children")

'child'

In [12]:
wd.lemmatize("went",'v') # v = verb

'go'

## PoS Tagging

In [13]:
data = "I love python programming How about you?"
nltk.pos_tag(nltk.word_tokenize(data))

[('I', 'PRP'),
 ('love', 'VBP'),
 ('python', 'RB'),
 ('programming', 'VBG'),
 ('How', 'WRB'),
 ('about', 'IN'),
 ('you', 'PRP'),
 ('?', '.')]

In [14]:
nltk.help.upenn_tagset('RB')

RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...
