Text Preprocessing

In [1]:
import warnings
warnings.filterwarnings('ignore')

import nltk
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Ensure you have downloaded the necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Sample text
text = """
At eight o'clock on Thursday morning, Arthur didn't feel very good.
The website https://example.com is a URL that should be removed,
as well as HTML tags like <br> or <p>, and punctuation like ! or # or $.
"""

# Lowercase the text
text = text.lower()

# Remove URLs
text = re.sub(r'http[s]?://\S+', '', text)

# Remove HTML tags
text = re.sub(r'<[^>]+>', '', text)

# Remove punctuation
punctuation_table = str.maketrans('', '', string.punctuation)
text = text.translate(punctuation_table)

# Tokenization
tokens = word_tokenize(text)

# Removing stop words
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word not in stop_words]

# Stemming
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

# Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

# Output the results
print("Cleaned Text:", text)
print("Filtered Tokens:", filtered_tokens)
print("Stemmed Tokens:", stemmed_tokens)
print("Lemmatized Tokens:", lemmatized_tokens)

[nltk_data] Downloading package punkt to /home/codespace/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/codespace/nltk_data...


Cleaned Text: 
at eight oclock on thursday morning arthur didnt feel very good
the website  is a url that should be removed
as well as html tags like  or  and punctuation like  or  or 

Filtered Tokens: ['eight', 'oclock', 'thursday', 'morning', 'arthur', 'didnt', 'feel', 'good', 'website', 'url', 'removed', 'well', 'html', 'tags', 'like', 'punctuation', 'like']
Stemmed Tokens: ['eight', 'oclock', 'thursday', 'morn', 'arthur', 'didnt', 'feel', 'good', 'websit', 'url', 'remov', 'well', 'html', 'tag', 'like', 'punctuat', 'like']
Lemmatized Tokens: ['eight', 'oclock', 'thursday', 'morning', 'arthur', 'didnt', 'feel', 'good', 'website', 'url', 'removed', 'well', 'html', 'tag', 'like', 'punctuation', 'like']


Text Feature Extraction

1. Bag-of-Words (BoW)
Bag-of-Words is a simple yet effective method of converting text to numerical representation. It involves two steps: first, creating a vocabulary of all the unique words in the text, and then, measuring the presence of each word in your text data.

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

# Example corpus of text
corpus = [
    'Text of the first document.',
    'Text of the second document made longer.',
    'Number three exists.',
    'This is number four.',
]

# Initialize a CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform the corpus
X = vectorizer.fit_transform(corpus)

# Convert to an array for easy viewing
X_array = X.toarray()

# Output the feature names and the BoW array
print(vectorizer.get_feature_names_out())
print(X_array)

['document' 'exists' 'first' 'four' 'is' 'longer' 'made' 'number' 'of'
 'second' 'text' 'the' 'this' 'three']
[[1 0 1 0 0 0 0 0 1 0 1 1 0 0]
 [1 0 0 0 0 1 1 0 1 1 1 1 0 0]
 [0 1 0 0 0 0 0 1 0 0 0 0 0 1]
 [0 0 0 1 1 0 0 1 0 0 0 0 1 0]]


2. TF-IDF
Term Frequency-Inverse Document Frequency (TF-IDF) is a numerical statistic that reflects how important a word is to a document in a collection or corpus. It increases proportionally with the number of times a word appears in the document but is offset by the frequency of the word in the corpus.

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the corpus
X_tfidf = tfidf_vectorizer.fit_transform(corpus)

# Convert to an array for easy viewing
X_tfidf_array = X_tfidf.toarray()

# Output the feature names and the TF-IDF array
print(tfidf_vectorizer.get_feature_names_out())
print(X_tfidf_array)

['document' 'exists' 'first' 'four' 'is' 'longer' 'made' 'number' 'of'
 'second' 'text' 'the' 'this' 'three']
[[0.4222466  0.         0.53556627 0.         0.         0.
  0.         0.         0.4222466  0.         0.4222466  0.4222466
  0.         0.        ]
 [0.3365971  0.         0.         0.         0.         0.42693074
  0.42693074 0.         0.3365971  0.42693074 0.3365971  0.3365971
  0.         0.        ]
 [0.         0.61761437 0.         0.         0.         0.
  0.         0.48693426 0.         0.         0.         0.
  0.         0.61761437]
 [0.         0.         0.         0.52547275 0.52547275 0.
  0.         0.41428875 0.         0.         0.         0.
  0.52547275 0.        ]]


3. n-gram
An n-gram model considers a sequence of n items from a given sample of text or speech. The items can be phonemes, syllables, letters, words, or base pairs according to the application.

In [4]:
# Initialize a CountVectorizer with n-gram
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))  # Here, we look at unigrams and bigrams

# Fit and transform the corpus
X_ngram = ngram_vectorizer.fit_transform(corpus)

# Output the feature names and the n-gram array
print(ngram_vectorizer.get_feature_names_out())
print(X_ngram.toarray())

['document' 'document made' 'exists' 'first' 'first document' 'four' 'is'
 'is number' 'longer' 'made' 'made longer' 'number' 'number four'
 'number three' 'of' 'of the' 'second' 'second document' 'text' 'text of'
 'the' 'the first' 'the second' 'this' 'this is' 'three' 'three exists']
[[1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 1 1 1 1 0 0 0 0 0]
 [1 1 0 0 0 0 0 0 1 1 1 0 0 0 1 1 1 1 1 1 1 0 1 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1]
 [0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0]]


4. Word2Vec
Word2Vec is a two-layer neural network that processes text by "vectorizing" words. Its input is a text corpus, and its output is a set of vectors: feature vectors for words in that corpus.

In [5]:
from gensim.models import Word2Vec

# Tokenize the corpus
tokenized_corpus = [doc.split() for doc in corpus]

# Train a Word2Vec model
#sg = 0, COB model, sg = 1, skip-gram model
word2vec_model = Word2Vec(tokenized_corpus, vector_size=100, window=5, min_count=1, workers=2,sg = 1)

# Get the vector for a word
word_vector = word2vec_model.wv['document']

# Output the vector for the word 'document'
print(word_vector)

[-1.9442164e-03 -5.2675214e-03  9.4471136e-03 -9.2987325e-03
  4.5039477e-03  5.4041781e-03 -1.4092624e-03  9.0070926e-03
  9.8853596e-03 -5.4750429e-03 -6.0210000e-03 -6.7469729e-03
 -7.8948820e-03 -3.0479168e-03 -5.5940272e-03 -8.3446801e-03
  7.8290224e-04  2.9946566e-03  6.4147436e-03 -2.6289499e-03
 -4.4534765e-03  1.2495709e-03  3.9146186e-04  8.1169987e-03
  1.8280029e-04  7.2315861e-03 -8.2645155e-03  8.4335366e-03
 -1.8889094e-03  8.7011540e-03 -7.6168370e-03  1.7963862e-03
  1.0564864e-03  4.6005251e-05 -5.1032533e-03 -9.2476979e-03
 -7.2642174e-03 -7.9511739e-03  1.9137275e-03  4.7846674e-04
 -1.8131376e-03  7.1201660e-03 -2.4756920e-03 -1.3473093e-03
 -8.9005642e-03 -9.9254129e-03  8.9493981e-03 -5.7539381e-03
 -6.3729975e-03  5.1994072e-03  6.6699935e-03 -6.8316413e-03
  9.5975993e-04 -6.0084737e-03  1.6473436e-03 -4.2892788e-03
 -3.4407973e-03  2.1856665e-03  8.6615775e-03  6.7281104e-03
 -9.6770572e-03 -5.6221043e-03  7.8803329e-03  1.9893574e-03
 -4.2560520e-03  5.98812

5. GloVe
Global Vectors for Word Representation (GloVe) is an unsupervised learning algorithm for obtaining vector representations for words. The model is an extension to Word2Vec and is based on matrix factorization techniques on the word-context matrix.

In [7]:
import numpy as np
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.keyedvectors import KeyedVectors

# You need to downloaded the GloVe pre-trained vectors and unzipped them first
# Due to the size of the pre-trained vectors, please complete this part after class
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

# Load the converted GloVe vectors
glove_model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

# Get the vector for a word
word_vector = glove_model['document']

# Output the vector for the word 'document'
print(word_vector)

[-2.7285e-01 -9.6449e-02  4.1131e-01  3.7925e-01  8.9352e-01  4.5227e-01
  1.9478e-01 -3.6985e-01  5.9704e-01  1.3387e-01  4.2878e-01 -2.8012e-01
  2.0141e-01 -1.9995e-02 -6.2983e-02  7.1399e-01  8.9025e-01 -3.1009e-01
 -1.9911e-01 -4.6591e-01 -8.8145e-01 -5.4318e-01 -5.2839e-01  7.0794e-02
 -3.1042e-01 -9.8677e-01  1.0283e-01  1.6911e-01 -4.4878e-01  1.6171e-01
  3.9394e-01  1.2655e-01 -1.2540e-01 -6.6462e-02 -1.2977e-01 -3.9406e-02
  4.4811e-02 -4.2534e-01  2.6742e-02 -3.8609e-01 -8.4547e-01 -6.4412e-02
  6.8974e-01  2.4521e-01 -7.3434e-01 -7.7389e-01 -1.5336e-01 -2.9057e-01
 -6.8358e-01 -3.8785e-01  1.2230e+00  1.7723e-01  1.6004e-01  8.3723e-01
 -3.1238e-01 -1.3138e+00 -2.6000e-01 -4.8754e-01  1.6751e+00  1.7320e-01
 -2.9494e-01  1.6038e-01 -5.3087e-01 -9.0950e-01  6.7436e-01 -5.2625e-01
 -3.0406e-01  8.5552e-01 -2.6879e-01 -9.0492e-01  3.0380e-01  2.0591e-01
  3.3439e-01 -6.2308e-01  6.4306e-02  2.2179e-01 -9.2076e-02  2.1894e-01
 -1.4015e+00 -4.4588e-02  2.6263e-01  1.5343e-01 -8