In [None]:
import re
import nltk
import string
import numpy as np

In [None]:
# Sample paragraph that deliberately contains irregular whitespace (spaces, tabs,
# newlines), <b> markup and a URL, giving the cleaning cells something to remove.
text = "     \t\t <b>Natural   language  processing (NLP)<b>    is a branch of   artificial intelligence (AI)   that enables computers to comprehend, generate, and manipulate human language.\n Natural language  processing has the ability to    interrogate the   data with natural   language text or voice.                 \n Check out this article for more information: https://en.wikipedia.org/wiki/Natural_language_processing.\n Natural language processing has its roots in the 1940s. Already in 1940, Alan Turing published an article titled Computing Machinery and Intelligence which proposed what is now called the Turing test as a criterion of intelligence,"
print(text)

**Whitespace Removal**

In [None]:
# Remove leading and trailing whitespace; str.strip() with no arguments
# leaves whitespace inside the string untouched.

text = text.strip()
print(text)

In [None]:
# Replace every run of consecutive whitespace characters with a single space:
# str.split() with no arguments splits on whitespace runs (dropping empty
# pieces) and " ".join(...) stitches the words back together.

text = " ".join(text.split())
print(text)

**URL Removal**

In [None]:
# Regular expression pattern to match URLs:
#   group 1 - scheme (http, ftp or https)
#   group 2 - host made of two or more dot-separated labels
#   group 3 - optional path/query; its final character class omits "." and ",",
#             so sentence punctuation directly after a URL is not consumed
url_pattern = r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?"

# replace URLs with an empty string
text = re.sub(url_pattern, "", text)
print(text)

**HTML Code Removal**

In [None]:
# Regular expression pattern to match HTML tags: "<", then one or more
# characters that are not ">", then the closing ">".
html_pattern = r"<[^>]+>"

# replace HTML tags with an empty string
text = re.sub(html_pattern, "", text)
print(text)

**Tokenization**

In [None]:
# Split the cleaned text into word and punctuation tokens.
# requires punkt package - nltk.download('punkt')
# NOTE(review): newer NLTK releases may ask for 'punkt_tab' instead - confirm
# against the installed version.

from nltk.tokenize import word_tokenize

tokens = word_tokenize(text)
print(tokens)

In [None]:
# Sentence tokenizer: split the cleaned text into a list of sentences.
from nltk.tokenize import sent_tokenize

print(sent_tokenize(text))

**Part-of-Speech Tagging**

In [None]:
# Tag each word token with its part of speech.
# NOTE(review): presumably needs an NLTK tagger model downloaded beforehand - confirm.
from nltk import pos_tag

tagged_tokens = pos_tag(tokens)
print(tagged_tokens)

**Named-Entity Recognition**

In [None]:
# Run named-entity chunking over the POS-tagged tokens. The result is the
# cell's last expression, so the notebook displays it directly.
# NOTE(review): presumably needs NLTK chunker resources downloaded beforehand - confirm.
from nltk import ne_chunk

ne_chunk(tagged_tokens)

**Lowercasing**

In [None]:
# Lowercase every token.
tokens = list(map(str.lower, tokens))
print(tokens)

In [None]:
# Equivalent for loop: builds the lowercased list one token at a time.
# When the cells run in order, `tokens` has already been lowercased by the
# previous cell, so this prints the same list again.

lowercased_tokens = []
for token in tokens:
    lowercased_tokens.append(token.lower())
print(lowercased_tokens)

**Punctuation Removal**

In [None]:
# The punctuation characters that the filtering cells compare tokens against.

print(string.punctuation)

In [None]:
# Exclude punctuation tokens.
# `token not in string.punctuation` is a *substring* test against one long
# string, so multi-character punctuation tokens such as "..." or "--" are not
# substrings of it and slip through. Instead, drop a token when every one of
# its characters is a punctuation character.

punctuation_chars = set(string.punctuation)
tokens = [
    token
    for token in tokens
    if not all(char in punctuation_chars for char in token)
]
print(tokens)

In [None]:
# For-loop version of the punctuation filter.
# A token is dropped when every one of its characters is punctuation. Testing
# `token in string.punctuation` instead would be a substring check and would
# miss multi-character punctuation tokens such as "..." or "--".

punctuation_chars = set(string.punctuation)
punctuation_free_tokens = []
for token in tokens:
    is_punctuation = all(char in punctuation_chars for char in token)
    if not is_punctuation:
        punctuation_free_tokens.append(token)
print(punctuation_free_tokens)

**Stemming**

In [None]:
# Stemming strips common suffixes to get close to a word's base form;
# the result need not be a real word (e.g. sized -> size, flies -> fli).

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

for sample in ('sized', 'sizing', 'flies'):
    print(stemmer.stem(sample))

In [None]:
# Stem every token; the results go to a separate list, `tokens` is unchanged.

stemmed_tokens = list(map(stemmer.stem, tokens))
print(stemmed_tokens)

**Lemmatization**

In [None]:
# Lemmatization maps a word to its base (dictionary) form.

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

for sample in ('sized', 'flies'):
    print(lemmatizer.lemmatize(sample))

In [None]:
# Lemmatize every token, replacing the token list with the result.

tokens = list(map(lemmatizer.lemmatize, tokens))
print(tokens)

**Stopword Removal**

In [None]:
# Load the English stopword list from the NLTK stopwords corpus.
# NOTE(review): presumably requires the 'stopwords' corpus to be downloaded first - confirm.
from nltk.corpus import stopwords

stopwords_list = stopwords.words("english")
print(stopwords_list)

In [None]:
# Languages available in the stopwords corpus (one file id per language).

print(stopwords.fileids())

In [None]:
# Exclude stopwords. Tokens are lowercased for the comparison because the
# stopwords are in lower case. A set makes each membership test a hash lookup
# instead of a scan over the whole stopword list.

stopword_set = set(stopwords_list)
tokens = [token for token in tokens if token.lower() not in stopword_set]
print(tokens)

In [None]:
# Equivalent for loop: keep a token only if its lowercased form is not a stopword.
# When the cells run in order, `tokens` has already been filtered by the
# previous cell, so this prints the same list again.

stopword_free_tokens = []
for token in tokens:
    if token.lower() not in stopwords_list:
        stopword_free_tokens.append(token)
print(stopword_free_tokens)

**Frequent Word Removal**

In [None]:
# Word frequency distribution: counts how often each token occurs.
# `fdist` is the cell's last expression, so the notebook displays it.

from nltk.probability import FreqDist

fdist = FreqDist(tokens)
fdist

In [None]:
# Look up the count of a single word in the frequency distribution.

fdist['natural']

In [None]:
# 15% of the total word count (fdist.N() is the total); this is the cutoff
# value that the frequent-word filter compares each word's count against.

fdist.N() * 0.15

In [None]:
# Exclude words that each make up 15% or more of the total word count.
# The cutoff is computed once up front rather than once per token.

FREQUENT_WORD_SHARE = 0.15
frequency_cutoff = FREQUENT_WORD_SHARE * fdist.N()

tokens = [token for token in tokens if fdist[token] < frequency_cutoff]
print(tokens)

**Spelling Correction**

In [None]:
# Language vocabulary: the word list used as the reference dictionary for
# spelling correction. Print its size and the first 50 entries as a preview.
# NOTE(review): presumably requires the 'words' corpus to be downloaded first - confirm.

from nltk.corpus import words

vocabulary = words.words()
print(len(vocabulary))
print(vocabulary[:50])

In [None]:
# Edit distance: the number of changes required to transform one word into
# another. Compare 'hello' with a few variants (and with itself).

from nltk.metrics import edit_distance

for other in ('helo', 'helloo', 'bye', 'hello'):
    print(edit_distance('hello', other))

In [None]:
# Correct the spelling of each word by replacing it with the vocabulary word
# that has the lowest edit distance to it.
#
# Scanning the whole vocabulary is expensive, so two shortcuts avoid it where
# the answer is already known; neither changes the result:
#   * a token that is itself in the vocabulary has distance 0 to itself, and no
#     other word can tie with 0, so it is kept as is;
#   * a token seen before reuses the correction found the first time.

vocabulary_set = set(vocabulary)  # fast exact-match checks
correction_cache = {}             # misspelled token -> chosen correction

corrected_tokens = []
for token in tokens:
    if token in vocabulary_set:
        corrected_token = token
    elif token in correction_cache:
        corrected_token = correction_cache[token]
    else:
        # Full scan: np.argmin returns the first minimum, so ties resolve to
        # the earliest vocabulary entry.
        distances = np.array([edit_distance(x, token) for x in vocabulary])
        corrected_token = vocabulary[np.argmin(distances)]
        correction_cache[token] = corrected_token
    corrected_tokens.append(corrected_token)
print(corrected_tokens)

**Exercise 10.1** Write a language detector that uses stopwords to identify the language used in a given text.

In [None]:
def detect_language(text):
    """Identify the language used in ``text`` with the help of stopwords.

    Exercise 10.1 stub: the implementation is left to the reader. Until it is
    written, calling this function raises ``NotImplementedError``; defining it
    no longer breaks the cell with a syntax error.
    """
    # TODO: Your Code Here
    raise NotImplementedError("Exercise 10.1: detect_language is not implemented yet")


**Text Analysis**

In [None]:
# file reading

# Pass the encoding explicitly so the result does not depend on the platform's
# default encoding.
# NOTE(review): assumes sherlock.txt is UTF-8 encoded - confirm.
with open('sherlock.txt', 'r', encoding='utf-8') as f:
    raw_text = f.read()

# Show only the beginning; displaying the whole book would flood the output.
raw_text[:500]

In [None]:
# Tokenize the raw text into words and punctuation; preview the first 50 tokens.

from nltk.tokenize import word_tokenize

tokens = word_tokenize(raw_text)
print(tokens[:50])

In [None]:
# Lowercase every token; preview the first 50.

tokens = list(map(str.lower, tokens))
print(tokens[:50])

In [None]:
# Text class: wrap the token list in an nltk Text object for the analysis
# cells that follow. Note that this rebinds `text`, which earlier in the
# notebook held the sample string.

from nltk.text import Text

text = Text(tokens)
text

In [None]:
# Frequency distribution of the words in the text.

text.vocab()

In [None]:
# Frequency of a single word (tokens were lowercased, so query in lowercase).

text.count('sherlock')

In [None]:
# Index of the first occurrence of the word.

text.index('sherlock')

In [None]:
# Display occurrences of a word with their surrounding context
# (`lines=20` caps how many are shown).

text.concordance("sherlock", lines=20)

In [None]:
# Return the word's occurrences with context as a list instead of printing
# them, then display the second entry (index 1).

concordances = text.concordance_list("sherlock")
concordances[1]

In [None]:
# Display words that frequently occur together.

text.collocations()

In [None]:
# Return the collocating words as a list instead of printing them directly.

collocations = text.collocation_list()
print(collocations)

In [None]:
# Words that occur in the same contexts as the specified word.

text.similar('holmes')

In [None]:
# Contexts in which the given word(s) occur frequently; the words are passed as a list.

text.common_contexts(['holmes'])

In [None]:
# Plot the most common words (the argument limits the plot to 30 of them).

text.plot(30)

In [None]:
# Plot where in the text each of the specified words occurs.

text.dispersion_plot(['sherlock', 'holmes', 'watson', 'lestrade'])

**Corpus**

In [None]:
# Gutenberg Corpus: list the file ids of the texts it contains.
# NOTE(review): presumably requires the 'gutenberg' corpus to be downloaded first - confirm.

from nltk.corpus import gutenberg

print(gutenberg.fileids())

In [None]:
# Alice in Wonderland: load the words of 'carroll-alice.txt' from the corpus
# and display them.

alice = gutenberg.words('carroll-alice.txt')
alice

In [None]:
# Wrap the Alice word list in a Text object. This rebinds `text` once more.
text = Text(alice)
text