<a href="https://colab.research.google.com/github/burrittresearch/natural-language-processing/blob/main/nlp-notes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing (NLP) Notes

This notebook contains code and notes on Natural Language Processing (NLP) using spaCy.

Data Source: https://www.gutenberg.org

# Project Workflow
* Define the Problem
* Process Data
* NLP Notes

# Define the Problem
Create a set of worked notes on basic NLP operations with spaCy.

# Process Data

In [1]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy


In [2]:
# Pandas display settings, applied in one pass from (option, value) pairs
for option_name, option_value in (
  ('display.max_rows', 50),
  ('display.max_columns', 50),
  ('display.width', 500),
  ('display.colheader_justify', 'left'),
  ('display.precision', 3),
):
  pd.set_option(option_name, option_value)

# Separator string (newline, space, newline) for spacing out printed output
str_lb = '\n \n'

In [3]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files under it (google.colab is only importable inside Colab).
# NOTE(review): force_remount=True presumably re-mounts even when the drive
# is already mounted — confirm against the Colab docs.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# NLP Notes

In [4]:
# Load the 'en_core_web_sm' pipeline; spacy.load() returns the Language
# object, which is then called on text to process it
nlp = spacy.load('en_core_web_sm')
nlp

<spacy.lang.en.English at 0x7cfe1489f610>

In [5]:
# Check the concrete class of the loaded pipeline object
type(nlp)

spacy.lang.en.English

In [6]:
# Plain Python string used as the input text
str_about = 'this is about spacy nlp'
print(str_about, type(str_about), sep='\n')

# Calling the pipeline on the string returns a spaCy Doc,
# which prints as the original text
token_str = nlp(str_about)
print(token_str, type(token_str), sep='\n')



this is about spacy nlp
<class 'str'>
this is about spacy nlp
<class 'spacy.tokens.doc.Doc'>


In [7]:
# Collect the Doc's tokens into a list with an explicit loop
# (iterating a Doc yields one token at a time)
lst_token = []
for token in token_str:
  lst_token.append(token)

lst_token

[this, is, about, spacy, nlp]

In [8]:
# Same result as the explicit loop, written as a list comprehension
# (more Pythonic than appending inside a for loop)
lst_token = [token for token in token_str]
lst_token

[this, is, about, spacy, nlp]

In [9]:
# Two-sentence input text, built from adjacent string literals
# (Python joins them into a single str)
str_sent = (
  'This is sentence 1'
  ' and this is sentence 1 continuing. But'
  ' now this is sentence 2.'
)
print(str_sent, type(str_sent), sep='\n')

# Run the pipeline on the text; the result is a spaCy Doc
token_str_sent = nlp(str_sent)
print(token_str_sent, type(token_str_sent), sep='\n')

# len() of a Doc is its number of tokens
print(len(token_str_sent))



This is sentence 1 and this is sentence 1 continuing. But now this is sentence 2.
<class 'str'>
This is sentence 1 and this is sentence 1 continuing. But now this is sentence 2.
<class 'spacy.tokens.doc.Doc'>
18


In [10]:
# Unpack the Doc's .sents property into a list of sentence Spans
lst_span_str_sent = [*token_str_sent.sents]
print(lst_span_str_sent, type(lst_span_str_sent), sep='\n')

# Number of sentences detected
print(len(lst_span_str_sent))

[This is sentence 1 and this is sentence 1 continuing., But now this is sentence 2.]
<class 'list'>
2


In [11]:
# Print each sentence Span on its own line
for sentence in lst_span_str_sent:
  print(sentence)

This is sentence 1 and this is sentence 1 continuing.
But now this is sentence 2.


In [12]:
# Print the first three tokens of each sentence: slicing a sentence Span
# by position returns its leading tokens
for sentence in lst_span_str_sent:
  print(sentence[:3])

This is sentence
But now this


In [13]:
# Print each token with .idx, the character offset at which the token
# starts in the original text
for token in token_str:
  print(token, token.idx)

# Same for the two-sentence Doc: iterating a Doc yields tokens (not
# sentence Spans), so .idx is again a per-token character offset
for sent_token in token_str_sent:
  print(sent_token, sent_token.idx)



this 0
is 5
about 8
spacy 14
nlp 20
This 0
is 5
sentence 8
1 17
and 19
this 23
is 28
sentence 31
1 40
continuing 42
. 52
But 54
now 58
this 62
is 67
sentence 70
2 79
. 80


In [14]:
# Download the plain-text ebook pg1513.txt from Project Gutenberg and save it
# to the mounted Google Drive as romeo-juliet.txt (needs /content/drive mounted).
# NOTE(review): -O already gives the full output path, so the -P directory
# prefix looks redundant here — confirm against the wget manual.
!wget -P '/content/drive/MyDrive/Colab Notebooks/input/' \
-O '/content/drive/MyDrive/Colab Notebooks/input/romeo-juliet.txt' \
https://www.gutenberg.org/cache/epub/1513/pg1513.txt


--2023-09-03 15:55:11--  https://www.gutenberg.org/cache/epub/1513/pg1513.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 169486 (166K) [text/plain]
Saving to: ‘/content/drive/MyDrive/Colab Notebooks/input/romeo-juliet.txt’


2023-09-03 15:55:18 (546 KB/s) - ‘/content/drive/MyDrive/Colab Notebooks/input/romeo-juliet.txt’ saved [169486/169486]



In [15]:
# Read the whole downloaded text file into a single string.
# The with-block closes the file handle as soon as the read finishes, and the
# explicit encoding keeps the result independent of the platform default.
# NOTE(review): assumes the Gutenberg file is UTF-8 — confirm.
with open('/content/drive/MyDrive/Colab Notebooks/input/romeo-juliet.txt', 'r', encoding='utf-8') as f:
  x = f.read()

In [16]:
# Check that the file contents were read in as a single str
type(x)

str