LAB ASSIGNMENTS — Module 3: Deep Learning and Natural Language Processing — Natural Language Processing Methods

Q1. Locate a free classical book on the Project Gutenberg website, download the ASCII version of the book, tokenize the text using any NLP library, and save the result to a new file.

In [1]:
import requests
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [2]:


def download_book(book_url, timeout=30):
    """Fetch a document over HTTP and return its body as text.

    Args:
        book_url: URL of the resource to download, e.g. the plain-text
            edition of a Project Gutenberg book.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The decoded response body as a ``str``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    response = requests.get(book_url, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the
    # tokenizer as if it were the book.
    response.raise_for_status()
    return response.text

def tokenize_and_clean_text(text):
    """Turn raw text into lowercase word tokens with English stopwords removed.

    The text is split with NLTK's ``word_tokenize``. Tokens that are not
    purely alphanumeric (``str.isalnum``) are discarded, the rest are
    lowercased, and any token found in NLTK's English stopword list is dropped.

    Args:
        text: The text to tokenize.

    Returns:
        A list of the surviving tokens, in their original order.
    """
    raw_tokens = word_tokenize(text)
    english_stopwords = set(stopwords.words('english'))

    cleaned = []
    for token in raw_tokens:
        # Punctuation and mixed tokens such as "n't" or "well-known" fail isalnum().
        if not token.isalnum():
            continue
        lowered = token.lower()
        if lowered in english_stopwords:
            continue
        cleaned.append(lowered)
    return cleaned

def save_tokens_to_file(tokens, output_file):
    """Write tokens to a UTF-8 text file, separated by single spaces.

    Args:
        tokens: Iterable of strings to write.
        output_file: Path of the file to create; existing content is replaced.
    """
    with open(output_file, mode='w', encoding='utf-8') as out:
        serialized = " ".join(tokens)
        out.write(serialized)




In [3]:
if __name__ == '__main__':
    # word_tokenize and the stopword list rely on NLTK data packages that a
    # fresh environment does not have. Fetching them here keeps the notebook
    # runnable after a kernel restart. Newer NLTK releases look for
    # 'punkt_tab', older ones for 'punkt', so both are requested.
    for resource in ('punkt', 'punkt_tab', 'stopwords'):
        nltk.download(resource, quiet=True)

    # Plain-text edition of the book; replace the ID to use a different title.
    # The bare ".../ebooks/71952" address is the HTML landing page, which
    # would put page markup and navigation text into the token list.
    book_url = "https://www.gutenberg.org/ebooks/71952.txt.utf-8"

    # Download the book
    book_text = download_book(book_url)

    # Tokenize and clean the text
    tokens = tokenize_and_clean_text(book_text)

    # Output file for the tokenized text
    output_file = "tokenized_book.txt"

    # Save the tokenized text to the output file
    save_tokens_to_file(tokens, output_file)

Q2. Encode small contrived text documents for the bag-of-words model using scikit-learn and Keras methods.

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences



In [5]:
# Sample text documents
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

# scikit-learn bag of words: one row per document, one column per vocabulary word
vectorizer = CountVectorizer()
X_scikit = vectorizer.fit_transform(documents)

# Keras bag of words: word indices start at 1, so column 0 of the matrix stays empty
tokenizer = Tokenizer()
tokenizer.fit_on_texts(documents)
X_keras = tokenizer.texts_to_matrix(documents, mode='count')

# Printing the scikit-learn and Keras results
print("Using scikit-learn (CountVectorizer):")
print(X_scikit.toarray())
print("\nUsing Keras (Tokenizer):")
print(X_keras)

# Optional: pad the integer-encoded sequences to a common length
# (often needed before feeding text to a neural network).
sequences = tokenizer.texts_to_sequences(documents)
# Take the length from the encoded sequences themselves. The document with the
# most characters is not necessarily the one with the most tokens, and a
# maxlen that is too small makes pad_sequences silently truncate.
max_sequence_length = max(len(sequence) for sequence in sequences)
X_keras_padded = pad_sequences(sequences, maxlen=max_sequence_length)

print("\nUsing Keras with Padding:")
print(X_keras_padded)


Using scikit-learn (CountVectorizer):
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]

Using Keras (Tokenizer):
[[0. 1. 1. 1. 1. 1. 0. 0. 0. 0.]
 [0. 1. 1. 1. 2. 0. 1. 0. 0. 0.]
 [0. 1. 1. 1. 0. 0. 0. 1. 1. 1.]
 [0. 1. 1. 1. 1. 1. 0. 0. 0. 0.]]

Using Keras with Padding:
[[0 1 2 3 5 4]
 [1 4 2 3 6 4]
 [7 1 2 3 8 9]
 [0 2 1 3 5 4]]
