# Natural Language Processing Introduction

In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

import string

from collections import Counter

# You need to run this once!
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/lecturer/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/lecturer/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Tokenization

In [2]:
example_text = \
"""
This is an example paragraph for tokenization. We will implement character and word-based tokenization from scratch using plain Python. 
Let's build vocabularies for both and use them to encode and decode the paragraph.
"""

## Manual Character based Tokenization
* mapping each character to an integer

In [3]:
def character_tokenization(text):
    return list(text)

### Vocabulary
* token to integer mapping

In [4]:
def build_vocab(tokens):
    vocab = {}
    index = 0
    
    for token in tokens:
        if token not in vocab:
            vocab[token] = index
            index += 1
            
    return vocab

In [5]:
char_tokens = character_tokenization(example_text)
char_vocab = build_vocab(char_tokens)

In [6]:
print('Character Tokens:')
print(char_tokens)

print('Character Vocabulary:')
print(char_vocab)

Character Tokens:
['\n', 'T', 'h', 'i', 's', ' ', 'i', 's', ' ', 'a', 'n', ' ', 'e', 'x', 'a', 'm', 'p', 'l', 'e', ' ', 'p', 'a', 'r', 'a', 'g', 'r', 'a', 'p', 'h', ' ', 'f', 'o', 'r', ' ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', '.', ' ', 'W', 'e', ' ', 'w', 'i', 'l', 'l', ' ', 'i', 'm', 'p', 'l', 'e', 'm', 'e', 'n', 't', ' ', 'c', 'h', 'a', 'r', 'a', 'c', 't', 'e', 'r', ' ', 'a', 'n', 'd', ' ', 'w', 'o', 'r', 'd', '-', 'b', 'a', 's', 'e', 'd', ' ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n', ' ', 'f', 'r', 'o', 'm', ' ', 's', 'c', 'r', 'a', 't', 'c', 'h', ' ', 'u', 's', 'i', 'n', 'g', ' ', 'p', 'l', 'a', 'i', 'n', ' ', 'P', 'y', 't', 'h', 'o', 'n', '.', ' ', '\n', 'L', 'e', 't', "'", 's', ' ', 'b', 'u', 'i', 'l', 'd', ' ', 'v', 'o', 'c', 'a', 'b', 'u', 'l', 'a', 'r', 'i', 'e', 's', ' ', 'f', 'o', 'r', ' ', 'b', 'o', 't', 'h', ' ', 'a', 'n', 'd', ' ', 'u', 's', 'e', ' ', 't', 'h', 'e', 'm', ' ', 't', 'o', ' ', 'e', 'n', 'c', 'o', 'd', 'e', ' ', 'a',

## Manual Word based Tokenization
* mapping each word to an integer

In [7]:
def word_tokenization(text):
    #text = ''.join([char if char not in string.punctuation else ' ' for char in text])
    return text.split()

In [8]:
word_tokens = word_tokenization(example_text)
word_vocab = build_vocab(word_tokens)

In [9]:
print('Word Tokens:')
print(word_tokens)

print('Word Vocabulary:')
print(word_vocab)

Word Tokens:
['This', 'is', 'an', 'example', 'paragraph', 'for', 'tokenization.', 'We', 'will', 'implement', 'character', 'and', 'word-based', 'tokenization', 'from', 'scratch', 'using', 'plain', 'Python.', "Let's", 'build', 'vocabularies', 'for', 'both', 'and', 'use', 'them', 'to', 'encode', 'and', 'decode', 'the', 'paragraph.']
Word Vocabulary:
{'This': 0, 'is': 1, 'an': 2, 'example': 3, 'paragraph': 4, 'for': 5, 'tokenization.': 6, 'We': 7, 'will': 8, 'implement': 9, 'character': 10, 'and': 11, 'word-based': 12, 'tokenization': 13, 'from': 14, 'scratch': 15, 'using': 16, 'plain': 17, 'Python.': 18, "Let's": 19, 'build': 20, 'vocabularies': 21, 'both': 22, 'use': 23, 'them': 24, 'to': 25, 'encode': 26, 'decode': 27, 'the': 28, 'paragraph.': 29}


### Encoding and Decoding of Tokens

In [10]:
def token_to_idx(text, vocab):
    return [vocab[token] for token in text]

def idx_to_tokens(indices, vocab):
    return ' '.join([key for key, value in vocab.items() if value in indices])

In [11]:
encoded_word = token_to_idx(word_tokens, word_vocab)
decoded_word = idx_to_tokens(encoded_word, word_vocab)

print('Original Text:')
print(example_text)
print('Encoded Word Indices:')
print(encoded_word)
print('Decoded Word Text:')
print(decoded_word)

Original Text:

This is an example paragraph for tokenization. We will implement character and word-based tokenization from scratch using plain Python. 
Let's build vocabularies for both and use them to encode and decode the paragraph.

Encoded Word Indices:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 5, 22, 11, 23, 24, 25, 26, 11, 27, 28, 29]
Decoded Word Text:
This is an example paragraph for tokenization. We will implement character and word-based tokenization from scratch using plain Python. Let's build vocabularies both use them to encode decode the paragraph.


## Word Tokenization With NLTK

In [12]:
text = 'Few foxes are running towards us!'
tokens = word_tokenize(text)

print('Tokens:', tokens)

Tokens: ['Few', 'foxes', 'are', 'running', 'towards', 'us', '!']


# Stop word removal

In [13]:
stop_words = set(stopwords.words('english'))

filtered_tokens = [token for token in tokens if token.lower() not in stop_words]

print("Filtered Tokens:", filtered_tokens)

Filtered Tokens: ['foxes', 'running', 'towards', 'us', '!']


# Stemming
* Stemming is a text normalization technique that involves reducing words to their base or root form, known as the "stem." 
* The process involves removing suffixes or prefixes from words, aiming to obtain the root form that captures the core meaning
* "running" -> "runs"

In [14]:
ps = PorterStemmer()
stemmed_tokens = [ps.stem(token) for token in filtered_tokens]

print('Stemmed Tokens:', stemmed_tokens)

Stemmed Tokens: ['fox', 'run', 'toward', 'us', '!']


# Lemmatization
* Lemmatization is also a text normalization technique, but it focuses on reducing words to their base or dictionary form, known as the "lemma." 
* Unlike stemming, lemmatization considers the context of words and their part of speech to provide a meaningful transformation.

In [15]:
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

print('Lemmatized Tokens:', lemmatized_tokens)

Lemmatized Tokens: ['fox', 'running', 'towards', 'u', '!']


# Sparse Vectoral Representation
* **Numerical Input for Models:** Machine learning models, including neural networks, require numerical input.
* Vector representations convert textual data into numerical format, enabling models to process and learn from the data.
* **Sparse vectors are representations where the majority of elements are zero, and only a small number of non-zero elements carry meaningful information.**

### Bag-of-Words (BoW) model
*  It represents a document as an unordered set of words, disregarding grammar and word order but keeping track of the frequency of each word.
* **corpus:** collection of text documents

In [39]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

corpus = [
    'This is a simple example.', 
    'Another example for illustration.', 
    'Illustration is important.'
]

vectorizer = CountVectorizer()

X = vectorizer.fit_transform(corpus)

print("BoW Matrix:")
print(X.toarray())

BoW Matrix:
[[0 1 0 0 0 1 1 1]
 [1 1 1 1 0 0 0 0]
 [0 0 0 1 1 1 0 0]]


### TF-IDF representation
* TF-IDF is a text representation technique that takes into account both the frequency of a term in a document (Term Frequency) and its importance in the entire corpus (Inverse Document Frequency). 
* It aims to highlight terms that are distinctive to a document while downweighting common terms.

In [17]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

print('TF-IDF Matrix:')
print(X_tfidf.toarray())

TF-IDF Matrix:
[[0.         0.42804604 0.         0.         0.         0.42804604
  0.5628291  0.5628291 ]
 [0.5628291  0.42804604 0.5628291  0.42804604 0.         0.
  0.         0.        ]
 [0.         0.         0.         0.51785612 0.68091856 0.51785612
  0.         0.        ]]


## Dense Vectoral Representation
*  **Dense vectors are representations where most elements contain non-zero values, and each element typically contributes to the overall representation.**
### Word2Vec representation

In [18]:
from gensim.models import Word2Vec

# word tokenized corpus
corpus = [
    ['natural', 'language', 'processing'], 
    ['word', 'embeddings', 'are', 'interesting']
]

model = Word2Vec(corpus, vector_size=10, window=2, min_count=1, workers=4)

# get the word vectors (embbedings)
word_embeddings = model.wv

print("Word Embedding for 'language':") 
print(word_embeddings['language'])

Word Embedding for 'language':
[-0.08157917  0.04495798 -0.04137076  0.00824536  0.08498619 -0.04462177
  0.045175   -0.0678696  -0.03548489  0.09398508]


# spaCy

In [19]:
import spacy
from spacy import displacy

# You need to run this once!
#!python -m spacy download en

2024-01-09 12:45:04.371749: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-09 12:45:04.373177: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-01-09 12:45:04.394381: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-09 12:45:04.394405: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-09 12:45:04.394420: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

### Named Entity Recognition (NER)

In [46]:
#sentence = "Let's meet with Jonh at Istanbul on this Monday at 9:00 PM"
sentence = "Linux is way better and Windows 11"

nlp = spacy.load('en_core_web_sm')

doc = nlp(sentence)

entities = [(ent.text, ent.label_) for ent in doc.ents]

print('Named Entities:', entities)

Named Entities: [('Windows 11', 'PRODUCT')]


In [47]:
displacy.render(doc, style='ent', jupyter=True, options={'compact': True})

### Dependency Tree Parsing

In [48]:
displacy.render(doc, style='dep', jupyter=True, options={'distance': 120})

### Keyword Extraction using Part of Speech (POS) and NNP tag

In [49]:
#example_text = 'Apple Inc. is planning to open a new store.'
example_text = "Linux is way better and Windows 11"

doc = nlp(example_text)

# Extract keywords (NNP)
keywords = [token.text for token in doc if token.pos_ == 'PROPN']

print('Keywords (NNP):', keywords)

Keywords (NNP): ['Linux', 'Windows']


In [24]:
example_paragraph = """
Natural Language Processing (NLP) is a subfield of artificial intelligence
that focuses on the interaction between computers and humans using natural
language. It encompasses the development of algorithms and models to enable computers to
understand, interpret, and generate human-like text. NLP plays a crucial role in various
applications, including chatbots, sentiment analysis, language translation, and information
retrieval.  In recent years, there has been a tremendous growth in the adoption of NLP
techniques due to advancements in machine learning and deep learning. These techniques
allow NLP models to capture more complex linguistic patterns and nuances, making
them highly effective in tasks such as named entity recognition, text
summarization, and question answering. 
"""

In [25]:
doc = nlp(example_paragraph)
keywords = [token.text for token in doc if token.pos_ == 'PROPN']

print('Keywords (NNP):', keywords)

Keywords (NNP): ['Natural', 'Language', 'Processing', 'NLP', 'NLP', 'NLP', 'NLP']


In [26]:
keyword_counts = Counter(keywords)

most_common_keywords = keyword_counts.most_common(n=2)

print('Most Common Keywords:')

for keyword, count in most_common_keywords:
    print(f"{keyword}: {count} occurrences")

Most Common Keywords:
NLP: 4 occurrences
Natural: 1 occurrences


In [27]:
example_paragraph_2 = """
The latest flagship smartphone, the Galaxy Pro, stands out as a true
marvel in the ever-evolving world of mobile technology. Boasting a sleek and premium
design, the phone's glass back seamlessly curves into a metal frame, providing a
comfortable grip and a sophisticated aesthetic. The vibrant 6.5-inch Super AMOLED display
offers stunning clarity and vibrant colors, making every video and image a visual
delight. The Galaxy Pro is powered by the latest octa-core processor and 8GB of
RAM, ensuring seamless multitasking and smooth performance. The camera setup is
nothing short of impressive, with a triple-lens system that captures sharp and
detailed photos in various lighting conditions. The 4500mAh battery provides all-day
longevity, and the inclusion of fast charging ensures you spend more time enjoying your
device and less time waiting for it to power up. The user interface is intuitive,
running on the latest version of the Galaxy OS, with a host of customizable features that
cater to both tech enthusiasts and casual users alike. Overall, the Galaxy Pro
sets a new standard for flagship smartphones, combining cutting-edge technology
with a refined design for a truly exceptional user experience
"""

In [28]:
doc = nlp(example_paragraph_2)
keywords_2 = [token.text for token in doc if token.pos_ == 'PROPN']

print('Keywords (NNP):', keywords_2)

Keywords (NNP): ['Galaxy', 'Pro', 'Galaxy', 'Pro', 'GB', 'Galaxy', 'OS', 'Galaxy']


In [29]:
keyword_counts_2 = Counter(keywords_2)

most_common_keywords_2 = keyword_counts_2.most_common(n=2)

print('Most Common Keywords:')

for keyword, count in most_common_keywords_2:
    print(f"{keyword}: {count} occurrences")

Most Common Keywords:
Galaxy: 4 occurrences
Pro: 2 occurrences
