Natural Language Processing

In [1]:
# Sample paragraph used as the input for the walkthrough.
# Written as adjacent string literals, one per sentence; Python joins them
# into a single str, so the value is the same as a one-line literal.
text = (
    "Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry. "
    "The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066. "
    "Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace. "
    "Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."
)
text

"Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry. The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066. Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace. Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."

Segmentation

In [2]:
# Import NLTK, fetch the 'punkt' package, then import the sentence tokenizer.
import nltk
# Logs its progress; when the package is already present it only reports
# that it is up to date.
# NOTE(review): 'punkt' is presumably the data sent_tokenize relies on — confirm.
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\abrah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Split the paragraph held in `text` into a list of sentence strings.
sentences = sent_tokenize(text)
# Bare expression so the notebook displays the resulting list.
sentences

['Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry.',
 'The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066.',
 'Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace.',
 "Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."]

In [4]:
# Pick out a separate element from the sentences: index 2 is the third sentence.
sentences[2]

'Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace.'

In [5]:
# Punctuation removal
import re

# Replace every punctuation character with a single space.
# `\w` and `\s` are Unicode-aware for str patterns, so accented or other
# non-ASCII letters are kept; an ASCII-only class such as [^a-zA-Z0-9] would
# blank them out as if they were punctuation. "_" is listed explicitly
# because `\w` would otherwise keep it.
# NOTE: this rebinds `text` from the full paragraph to one cleaned sentence;
# the tokenization cell that follows reads `text`.
text = re.sub(r"[^\w\s]|_", " ", sentences[2])
text

'Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace '

Tokenization

In [6]:
from nltk.tokenize import word_tokenize

In [8]:
# Split the cleaned sentence currently held in `text` into word tokens.
words = word_tokenize(text)
print(words)

['Queen', 'Camilla', 'was', 'crowned', 'alongside', 'him', 'before', 'a', 'huge', 'parade', 'back', 'to', 'Buckingham', 'Palace']


Removal of stop words

In [10]:
# Fetch the 'stopwords' package, then import the corpus object that exposes it.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abrah\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Remove stop words.
# The English stop-word list is loaded once and kept in a set; it was
# previously re-evaluated for every token in the comprehension.
# Tokens are compared in lower case because the list holds lower-case
# entries, so a capitalised stop word (e.g. "The") is filtered as well.
# Surviving tokens keep their original casing.
english_stopwords = set(stopwords.words('english'))
words = [w for w in words if w.lower() not in english_stopwords]
print(words)

['Queen', 'Camilla', 'crowned', 'alongside', 'huge', 'parade', 'back', 'Buckingham', 'Palace']


In [17]:
# A glimpse of the stop words in NLTK's English corpus (prints the whole list).
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Stemming and Lemmatization

In [18]:
# Fetch the WordNet data used for lemmatization.
nltk.download('wordnet')
# NOTE(review): 'omw-1.4' is presumably a companion package the WordNet
# reader needs in some NLTK versions — confirm.
# Kept as the last statement: its return value is what the cell displays.
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abrah\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\abrah\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [19]:
# Stemming
from nltk.stem.porter import PorterStemmer

# Reduce words to their stems.
# A single stemmer is created and reused for every token instead of
# constructing a new PorterStemmer inside the comprehension.
stemmer = PorterStemmer()
stemmed = [stemmer.stem(w) for w in words]
print(stemmed)

['queen', 'camilla', 'crown', 'alongsid', 'huge', 'parad', 'back', 'buckingham', 'palac']


In [22]:
# Lemmatize
from nltk.stem.wordnet import WordNetLemmatizer

# Reduce words to their root form.
# A single lemmatizer is created and reused for every token instead of
# constructing a new WordNetLemmatizer inside the comprehension.
# lemmatize() is called with the word only (no part-of-speech argument).
lemmatizer = WordNetLemmatizer()
lemmatized = [lemmatizer.lemmatize(w) for w in words]
print(lemmatized)

['Queen', 'Camilla', 'crowned', 'alongside', 'huge', 'parade', 'back', 'Buckingham', 'Palace']


In [25]:
# Better illustration of the difference between stemming and lemmatization
words2 = ['wait', 'waiting', 'studies', 'studying', 'computers']

# Stemming: one stemmer instance is reused for the whole list.
stemmer = PorterStemmer()
stemmed_words2 = [stemmer.stem(w) for w in words2]
print("Stemming Output:{}".format(stemmed_words2))

# Lemmatization: one lemmatizer instance is reused for the whole list.
# lemmatize() is called with the word only (no part-of-speech argument);
# in the recorded output the "-ing" forms come back unchanged.
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(w) for w in words2]
print("Lemmatization Output:{}".format(lemmatized_words))

Stemming Output:['wait', 'wait', 'studi', 'studi', 'comput']
Lemmatization Output:['wait', 'waiting', 'study', 'studying', 'computer']


Part of Speech Tagging

In [26]:
# Fetch the model packages for the tagging and chunking steps.
# NOTE(review): presumably 'averaged_perceptron_tagger' backs pos_tag and
# 'maxent_ne_chunker' backs ne_chunk — confirm.
nltk.download('averaged_perceptron_tagger')
# Kept as the last statement: its return value is what the cell displays.
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\abrah\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\abrah\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [27]:
from nltk import pos_tag

In [29]:
# Tag each token in `words` (the stop-word-filtered list) with its part of
# speech; the result is a list of (token, tag) pairs.
pos_tag(words)

[('Queen', 'NNP'),
 ('Camilla', 'NNP'),
 ('crowned', 'VBD'),
 ('alongside', 'RB'),
 ('huge', 'JJ'),
 ('parade', 'NN'),
 ('back', 'RB'),
 ('Buckingham', 'NNP'),
 ('Palace', 'NNP')]

Named Entity Recognition

In [32]:
from nltk import ne_chunk
# Fetch the 'words' package.
# NOTE(review): presumably required by ne_chunk — confirm.
# Kept as the last statement: its return value is what the cell displays.
nltk.download("words")

[nltk_data] Downloading package words to
[nltk_data]     C:\Users\abrah\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [33]:
# Named-entity recognition on the third sentence, taken from `sentences`
# with its punctuation intact: tokenize -> POS-tag -> chunk into a tree.
sentence_tokens = word_tokenize(sentences[2])
sentence_tagged = pos_tag(sentence_tokens)
ner_tree = ne_chunk(sentence_tagged)
print(ner_tree)

(S
  (PERSON Queen/NNP)
  (PERSON Camilla/NNP)
  was/VBD
  crowned/VBN
  alongside/RB
  him/PRP
  before/IN
  a/DT
  huge/JJ
  parade/NN
  back/RB
  to/TO
  (PERSON Buckingham/NNP Palace/NNP)
  ./.)


In [38]:
# `text` was rebound to a single cleaned sentence earlier in the notebook,
# so the full paragraph is assigned again before running NER over all of it.
text = "Millions of people across the UK and beyond have celebrated the coronation of King Charles III - a symbolic ceremony combining a religious service and pageantry. The ceremony was held at Westminster Abbey, with the King becoming the 40th reigning monarch to be crowned there since 1066. Queen Camilla was crowned alongside him before a huge parade back to Buckingham Palace. Here's how the day of splendour and formality, which featured customs dating back more than 1,000 years, unfolded."

# Same pipeline, one step per line: tokenize -> POS-tag -> entity chunking.
paragraph_tokens = word_tokenize(text)
paragraph_tagged = pos_tag(paragraph_tokens)
ner_tree = ne_chunk(paragraph_tagged)
print(ner_tree)

(S
  Millions/NNS
  of/IN
  people/NNS
  across/IN
  the/DT
  (ORGANIZATION UK/NNP)
  and/CC
  beyond/IN
  have/VBP
  celebrated/VBN
  the/DT
  coronation/NN
  of/IN
  King/NNP
  (PERSON Charles/NNP III/NNP)
  -/:
  a/DT
  symbolic/JJ
  ceremony/NN
  combining/VBG
  a/DT
  religious/JJ
  service/NN
  and/CC
  pageantry/NN
  ./.
  The/DT
  ceremony/NN
  was/VBD
  held/VBN
  at/IN
  (ORGANIZATION Westminster/NNP Abbey/NNP)
  ,/,
  with/IN
  the/DT
  King/NNP
  becoming/VBG
  the/DT
  40th/CD
  reigning/VBG
  monarch/NN
  to/TO
  be/VB
  crowned/VBN
  there/RB
  since/IN
  1066/CD
  ./.
  (PERSON Queen/NNP Camilla/NNP)
  was/VBD
  crowned/VBN
  alongside/RB
  him/PRP
  before/IN
  a/DT
  huge/JJ
  parade/NN
  back/RB
  to/TO
  (PERSON Buckingham/NNP Palace/NNP)
  ./.
  Here/RB
  's/VBZ
  how/WRB
  the/DT
  day/NN
  of/IN
  splendour/NN
  and/CC
  formality/NN
  ,/,
  which/WDT
  featured/VBD
  customs/NNS
  dating/VBG
  back/RB
  more/JJR
  than/IN
  1,000/CD
  years/NNS
  ,/,
  unfolded/

In [39]:
# A second example sentence; note that this rebinds `text` once more.
text = "Twitter CEO Elon Musk arrived at the Staples Center in Los Angeles, California."

# Tokenize -> POS-tag -> entity chunking, one step per line.
example_tokens = word_tokenize(text)
example_tagged = pos_tag(example_tokens)
ner_tree = ne_chunk(example_tagged)
print(ner_tree)

(S
  (PERSON Twitter/NNP)
  (ORGANIZATION CEO/NNP Elon/NNP Musk/NNP)
  arrived/VBD
  at/IN
  the/DT
  (FACILITY Staples/NNP Center/NNP)
  in/IN
  (GPE Los/NNP Angeles/NNP)
  ,/,
  (GPE California/NNP)
  ./.)
