<a href="https://colab.research.google.com/github/burrittresearch/natural-language-processing/blob/main/nlp-notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing (NLP) Notes

This notebook includes coding and notes Natural Language Processing (NLP).

Data Source: https://www.gutenberg.org

# Project Workflow
* Define the Problem
* Process Data
* NLP Notes

# Define the Problem
Create notes for NLP

# Process Data

In [1]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy


In [2]:
# Set display options
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 500)
pd.set_option('display.colheader_justify', 'left')
pd.set_option('display.precision', 3)

# Line break utility
str_lb = '\n \n'

In [3]:
# Mount google drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Process Data with spaCy

In [4]:
# Make language object returned by the load() function
nlp = spacy.load('en_core_web_sm')
nlp

<spacy.lang.en.English at 0x7847d0876bf0>

In [5]:
# Data type of the variable
type(nlp)

In [6]:
# Make string object
str_about = ('This is sentence 1'
  ' and this is sentence 1 continuing. But'
  ' now this is sentence 2.'
)

# Explore string object
print(type(str_about))
print(str_about)

# Make word token object of string
token_about = nlp(str_about)

# Explore word tokens
print(token_about)
print(type(token_about))
print(len(token_about))

# Get word token indices
for token in token_about:
  print(token, token.idx)

<class 'str'>
This is sentence 1 and this is sentence 1 continuing. But now this is sentence 2.
This is sentence 1 and this is sentence 1 continuing. But now this is sentence 2.
<class 'spacy.tokens.doc.Doc'>
18
This 0
is 5
sentence 8
1 17
and 19
this 23
is 28
sentence 31
1 40
continuing 42
. 52
But 54
now 58
this 62
is 67
sentence 70
2 79
. 80


In [7]:
# Make list of token objects
lst_token_about = [token for token in token_about]
print(lst_token_about)

[This, is, sentence, 1, and, this, is, sentence, 1, continuing, ., But, now, this, is, sentence, 2, .]


In [8]:
# Make span objects sentences using .sents property
token_about_sents = token_about.sents

# Explore sentence tokens
print(type(token_about_sents))
print(token_about_sents)

# Make list of sentence tokens
lst_token_about_sents = [token for token in token_about_sents]
print(lst_token_about_sents)

# Explore sentence tokens
print(type(lst_token_about_sents))
print(len(lst_token_about_sents))

<class 'generator'>
<generator object at 0x7846db759bc0>
[This is sentence 1 and this is sentence 1 continuing., But now this is sentence 2.]
<class 'list'>
2


In [9]:
# Print each sentence in sentence list
for sentence in lst_token_about_sents:
  print(sentence)

This is sentence 1 and this is sentence 1 continuing.
But now this is sentence 2.


In [10]:
# Print first words of each sentence
for sentence in lst_token_about_sents:
  print(f'{sentence[:3]}')

This is sentence
But now this


# Process Data with spaCy Using Romeo and Juliet from Project Gutenberg

In [11]:
# Download book from Project Gutenberg and save locally
!wget -P '/content/drive/MyDrive/colab-notebooks/github/nlp-notes/input/' \
-O '/content/drive/MyDrive/colab-notebooks/github/nlp-notes/input/romeo-juliet.txt' \
https://www.gutenberg.org/cache/epub/1513/pg1513.txt


--2024-03-13 20:43:05--  https://www.gutenberg.org/cache/epub/1513/pg1513.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 169575 (166K) [text/plain]
Saving to: ‘/content/drive/MyDrive/colab-notebooks/github/nlp-notes/input/romeo-juliet.txt’


2024-03-13 20:43:06 (2.20 MB/s) - ‘/content/drive/MyDrive/colab-notebooks/github/nlp-notes/input/romeo-juliet.txt’ saved [169575/169575]



In [12]:
# Make file object and read into string object
str_path_romeo = '/content/drive/MyDrive/colab-notebooks/github/nlp-notes/input/romeo-juliet.txt'
str_romeo = open(str_path_romeo, 'r')
str_romeo = str_romeo.read()

# Explore string object
print(type(str_romeo))
print(f'{str_romeo}'[:100])

<class 'str'>
﻿The Project Gutenberg eBook of Romeo and Juliet
    
This ebook is for the use of anyone anywhere i


In [13]:
# Make word token object of string
token_romeo = nlp(str_romeo)

# Explore word tokens
print(f'{token_romeo}'[:100])
print(type(token_romeo))
print(len(token_romeo))

# Get word token indices
for token in token_romeo:
  if token.idx <= 100:
    print(token, token.idx)

﻿The Project Gutenberg eBook of Romeo and Juliet
    
This ebook is for the use of anyone anywhere i
<class 'spacy.tokens.doc.Doc'>
41501
﻿The 0
Project 5
Gutenberg 13
eBook 23
of 29
Romeo 32
and 38
Juliet 42

    
 48
This 54
ebook 59
is 65
for 68
the 72
use 76
of 80
anyone 83
anywhere 90
in 99


In [14]:
# Make list of token objects
lst_token_romeo = [token for token in token_romeo]
print(lst_token_romeo[:10])

[﻿The, Project, Gutenberg, eBook, of, Romeo, and, Juliet, 
    
, This]


In [15]:
# Make span objects sentences using .sents property
token_romeo_sents = token_romeo.sents

# Explore sentence tokens
print(type(token_romeo_sents))
print(token_romeo_sents)

# Make list of sentence tokens
lst_token_romeo_sents = [token for token in token_romeo_sents]

# Explore list of sentence tokens
print(lst_token_romeo_sents[:10])
print(type(lst_token_romeo_sents))
print(len(lst_token_romeo_sents))


<class 'generator'>
<generator object at 0x7846db7cf240>
[﻿The Project Gutenberg eBook of Romeo and Juliet
    
, This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever., You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org., If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.

, Title: Romeo and Juliet


Author: William Shakespeare

Release date: November 1, 1998 [eBook #1513]
                Most recently updated: June 27, 2023

Language: English

Credits: the PG Shakespeare Team, a team of about twenty Project Gutenberg volunteers


*** START OF THE PROJECT GUTENBERG EBOOK ROMEO AND JULIET ***




THE TRAGEDY OF ROMEO AND JULIET

by William Shakespeare




Contents

THE PROLOGUE.

, ACT I
Scene I. A public 

In [16]:
# Print each sentence in sentence list
for sentence in lst_token_romeo_sents[:10]:
  print(sentence)

# Print first words of each sentence
for sentence in lst_token_romeo_sents[:10]:
  print(f'{sentence[:3]}')

﻿The Project Gutenberg eBook of Romeo and Juliet
    

This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever.
You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org.
If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.


Title: Romeo and Juliet


Author: William Shakespeare

Release date: November 1, 1998 [eBook #1513]
                Most recently updated: June 27, 2023

Language: English

Credits: the PG Shakespeare Team, a team of about twenty Project Gutenberg volunteers


*** START OF THE PROJECT GUTENBERG EBOOK ROMEO AND JULIET ***




THE TRAGEDY OF ROMEO AND JULIET

by William Shakespeare




Contents

THE PROLOGUE.


ACT I
Scene I. A public place.

Scene II.
A Street.

Scene III.
Room in Capulet’s House

In [17]:
# Filter out stop words
lst_token_romeo_filtered = [token for token in token_romeo if not token.is_stop]
print(lst_token_romeo_filtered[:100])

[﻿The, Project, Gutenberg, eBook, Romeo, Juliet, 
    
, ebook, use, United, States, 
, parts, world, cost, restrictions, 
, whatsoever, ., copy, ,, away, -, use, terms, 
, Project, Gutenberg, License, included, ebook, online, 
, www.gutenberg.org, ., located, United, States, ,, 
, check, laws, country, located, 
, eBook, ., 

, Title, :, Romeo, Juliet, 


, Author, :, William, Shakespeare, 

, Release, date, :, November, 1, ,, 1998, [, eBook, #, 1513, ], 
                , recently, updated, :, June, 27, ,, 2023, 

, Language, :, English, 

, Credits, :, PG, Shakespeare, Team, ,, team, Project, Gutenberg, volunteers, 


, *, *, *, START, PROJECT, GUTENBERG]


In [18]:
# Check lemmatization for filtered list
lst_token_romeo_lemma = [f'{token} {token.lemma_}' for token in lst_token_romeo_filtered]
lst_token_romeo_lemma[:100]

['\ufeffThe \ufeffthe',
 'Project Project',
 'Gutenberg Gutenberg',
 'eBook eBook',
 'Romeo Romeo',
 'Juliet Juliet',
 '\n    \n \n    \n',
 'ebook ebook',
 'use use',
 'United United',
 'States States',
 '\n \n',
 'parts part',
 'world world',
 'cost cost',
 'restrictions restriction',
 '\n \n',
 'whatsoever whatsoever',
 '. .',
 'copy copy',
 ', ,',
 'away away',
 '- -',
 'use use',
 'terms term',
 '\n \n',
 'Project Project',
 'Gutenberg Gutenberg',
 'License License',
 'included include',
 'ebook ebook',
 'online online',
 '\n \n',
 'www.gutenberg.org www.gutenberg.org',
 '. .',
 'located locate',
 'United United',
 'States States',
 ', ,',
 '\n \n',
 'check check',
 'laws law',
 'country country',
 'located locate',
 '\n \n',
 'eBook eBook',
 '. .',
 '\n\n \n\n',
 'Title title',
 ': :',
 'Romeo Romeo',
 'Juliet Juliet',
 '\n\n\n \n\n\n',
 'Author Author',
 ': :',
 'William William',
 'Shakespeare Shakespeare',
 '\n\n \n\n',
 'Release Release',
 'date date',
 ': :',
 'November Nove