<a href="https://colab.research.google.com/github/amyliaw/CS6220-Data-Mining-Project/blob/master/Basic_Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer


In [2]:
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [3]:
# Sample text data
texts = [
    "Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.",
    "Preprocessing text data is an essential step in natural language processing (NLP) tasks such as sentiment analysis, text classification, and machine translation.",
    "NLTK provides easy-to-use tools for text preprocessing, including tokenization, stopword removal, and lemmatization."
]


In [6]:
# Step 1: Tokenization
# at this step, we separate individual words and punctuation from the sentences
tokenized_texts = [word_tokenize(text.lower()) for text in texts]
print(tokenized_texts)

[['natural', 'language', 'processing', '(', 'nlp', ')', 'is', 'a', 'subfield', 'of', 'linguistics', ',', 'computer', 'science', ',', 'and', 'artificial', 'intelligence', 'concerned', 'with', 'the', 'interactions', 'between', 'computers', 'and', 'human', 'language', ',', 'in', 'particular', 'how', 'to', 'program', 'computers', 'to', 'process', 'and', 'analyze', 'large', 'amounts', 'of', 'natural', 'language', 'data', '.'], ['preprocessing', 'text', 'data', 'is', 'an', 'essential', 'step', 'in', 'natural', 'language', 'processing', '(', 'nlp', ')', 'tasks', 'such', 'as', 'sentiment', 'analysis', ',', 'text', 'classification', ',', 'and', 'machine', 'translation', '.'], ['nltk', 'provides', 'easy-to-use', 'tools', 'for', 'text', 'preprocessing', ',', 'including', 'tokenization', ',', 'stopword', 'removal', ',', 'and', 'lemmatization', '.']]


In [8]:
# remove stopwords
stop_words = set(stopwords.words('english'))
filtered_words = [[word for word in text if word not in stop_words] for text in tokenized_texts]
print(filtered_words)

[['natural', 'language', 'processing', '(', 'nlp', ')', 'subfield', 'linguistics', ',', 'computer', 'science', ',', 'artificial', 'intelligence', 'concerned', 'interactions', 'computers', 'human', 'language', ',', 'particular', 'program', 'computers', 'process', 'analyze', 'large', 'amounts', 'natural', 'language', 'data', '.'], ['preprocessing', 'text', 'data', 'essential', 'step', 'natural', 'language', 'processing', '(', 'nlp', ')', 'tasks', 'sentiment', 'analysis', ',', 'text', 'classification', ',', 'machine', 'translation', '.'], ['nltk', 'provides', 'easy-to-use', 'tools', 'text', 'preprocessing', ',', 'including', 'tokenization', ',', 'stopword', 'removal', ',', 'lemmatization', '.']]


In [11]:
# Step 3: Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_texts = [[lemmatizer.lemmatize(word) for word in text] for text in filtered_words]
print(lemmatized_texts)

[['natural', 'language', 'processing', '(', 'nlp', ')', 'subfield', 'linguistics', ',', 'computer', 'science', ',', 'artificial', 'intelligence', 'concerned', 'interaction', 'computer', 'human', 'language', ',', 'particular', 'program', 'computer', 'process', 'analyze', 'large', 'amount', 'natural', 'language', 'data', '.'], ['preprocessing', 'text', 'data', 'essential', 'step', 'natural', 'language', 'processing', '(', 'nlp', ')', 'task', 'sentiment', 'analysis', ',', 'text', 'classification', ',', 'machine', 'translation', '.'], ['nltk', 'provides', 'easy-to-use', 'tool', 'text', 'preprocessing', ',', 'including', 'tokenization', ',', 'stopword', 'removal', ',', 'lemmatization', '.']]


In [13]:
# Print the preprocessed texts
for i, text in enumerate(lemmatized_texts):
    print(f"Preprocessed Text {i+1}: {' '.join(text)}\n")

# Convert preprocessed texts to a bag-of-words representation
vectorizer = CountVectorizer()
bow_matrix = vectorizer.fit_transform([' '.join(text) for text in lemmatized_texts])
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Display the bag-of-words matrix
print("Bag-of-Words Matrix:")
print(bow_df)

Preprocessed Text 1: natural language processing ( nlp ) subfield linguistics , computer science , artificial intelligence concerned interaction computer human language , particular program computer process analyze large amount natural language data .

Preprocessed Text 2: preprocessing text data essential step natural language processing ( nlp ) task sentiment analysis , text classification , machine translation .

Preprocessed Text 3: nltk provides easy-to-use tool text preprocessing , including tokenization , stopword removal , lemmatization .

Bag-of-Words Matrix:
   amount  analysis  analyze  artificial  classification  computer  concerned  \
0       1         0        1           1               0         3          1   
1       0         1        0           0               1         0          0   
2       0         0        0           0               0         0          0   

   data  easy  essential  ...  step  stopword  subfield  task  text  to  \
0     1     0          0 