<a href="https://colab.research.google.com/github/amirrezast/NLP_First_Encounter/blob/main/Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources the cells below rely on: the stop-word lists,
# the WordNet data used by the lemmatizer, and the Punkt tokenizer models.
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("punkt")
# Recent NLTK releases load Punkt from the "punkt_tab" package rather than
# the pickled "punkt" one; fetch both so tokenization works on either.
nltk.download("punkt_tab")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
# Sample text: a short paragraph about ancient Egyptian lore.
# It is a triple-quoted string, so the line breaks below are part of the value.
paragraph = """Ancient Egypt, known for its rich mythology, has a pantheon of gods and goddesses such as Ra, the sun god,
and Isis, the goddess of magic and motherhood. The lore includes the story of Osiris, the god of the afterlife, who was
killed and dismembered by his brother Seth. Osiris's wife, Isis, reassembled his body and resurrected him, allowing him
to become the ruler of the underworld. The pharaohs, believed to be the earthly embodiments of gods, built grand pyramids
and temples to honor their deities. The ancient texts, like the Book of the Dead, provided instructions for navigating
the afterlife. Hieroglyphs, the writing system of ancient Egyptians, depicted these myths and were considered sacred.
The lore of Egypt continues to fascinate and influence modern culture, inspiring countless stories, films, and scholarly
research."""

In [3]:
# Lowercase the whole paragraph. str.lower() returns a new string, so
# `paragraph` itself keeps its original casing.
paragraph_lower = paragraph.lower()

In [4]:
# Sentence tokenization
# Split the original-case paragraph first and lowercase each sentence
# afterwards: the Punkt sentence splitter uses capitalisation as one of its
# cues for sentence boundaries, so it should see the text before case folding.
sentences = [sentence.lower() for sentence in nltk.sent_tokenize(paragraph)]

In [5]:
# Initialize the WordNet lemmatizer and the Porter stemmer
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()

In [6]:
# Lemmatization and stemming

# The English stop-word set is the same for every token, so build it once
# here instead of rebuilding it inside each comprehension's filter.
english_stopwords = set(stopwords.words('english'))

# NOTE: the spelling `lematized_word_sentences` is kept because later cells
# reference the variable by this name.
lematized_word_sentences = []
stemmed_word_sentences = []

for sentence in sentences:
    words = nltk.word_tokenize(sentence)

    # Drop stop words once; the lemmatizer and the stemmer share the result.
    content_words = [word for word in words if word not in english_stopwords]

    # Rebuild each sentence as a space-joined string of normalized tokens.
    lematized_word_sentences.append(
        ' '.join(lemmatizer.lemmatize(word) for word in content_words))
    stemmed_word_sentences.append(
        ' '.join(ps.stem(word) for word in content_words))


In [7]:
# Preview the first five lemmatized sentences
lematized_word_sentences[:5]

['ancient egypt , known rich mythology , pantheon god goddess ra , sun god , isi , goddess magic motherhood .',
 'lore includes story osiris , god afterlife , killed dismembered brother seth .',
 "osiris 's wife , isi , reassembled body resurrected , allowing become ruler underworld .",
 'pharaoh , believed earthly embodiment god , built grand pyramid temple honor deity .',
 'ancient text , like book dead , provided instruction navigating afterlife .']

In [8]:
# Preview the first five stemmed sentences
stemmed_word_sentences[:5]

['ancient egypt , known rich mytholog , pantheon god goddess ra , sun god , isi , goddess magic motherhood .',
 'lore includ stori osiri , god afterlif , kill dismemb brother seth .',
 "osiri 's wife , isi , reassembl bodi resurrect , allow becom ruler underworld .",
 'pharaoh , believ earthli embodi god , built grand pyramid templ honor deiti .',
 'ancient text , like book dead , provid instruct navig afterlif .']

In [9]:
# Bag of Words model built from the stemmed sentences
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=1500)  # vocabulary capped at 1500 terms
bow_sparse = cv.fit_transform(stemmed_word_sentences)
x_bow = bow_sparse.toarray()  # convert the fitted result to a dense array

# Uncomment to print the Bag of Words array
# print("\nBag of Words model array:")
# print(x_bow)

In [10]:
# TF-IDF model built from the lemmatized sentences
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer()
tfidf_sparse = tf.fit_transform(lematized_word_sentences)
x_tfidf = tfidf_sparse.toarray()  # convert the fitted result to a dense array

# Uncomment to print the TF-IDF array
# print("\nTF-IDF model array:")
# print(x_tfidf)