***Exercise 1***

* Implement Lemmatization: Replace the stemming process with lemmatization using WordNetLemmatizer.

In [5]:
import nltk
# Fetch the NLTK data packages the cells below rely on:
#   - 'punkt'     : sentence/word tokenizer models used by word_tokenize
#   - 'stopwords' : per-language stop-word lists (stopwords.words(...))
#   - 'wordnet'   : lexical database behind WordNetLemmatizer
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for
# word_tokenize -- confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')
# Kept as the final bare expression: its return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Chopr\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Chopr\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Chopr\AppData\Roaming\nltk_data...


True

In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Notebook-level resources read by preprocess_text below. They are built once
# here rather than on every call.
stop_words = set(stopwords.words('english'))  # set -> fast membership tests
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn raw English text into a list of lemmatized, stop-word-free tokens.

    Pipeline: lowercase -> replace punctuation with spaces -> tokenize with
    ``word_tokenize`` -> drop tokens present in the notebook-level
    ``stop_words`` set -> lemmatize the survivors with the notebook-level
    ``lemmatizer``.

    Args:
        text: The raw input string.

    Returns:
        A list of lowercase lemmas, in their original order.
    """
    text = text.lower()

    # Replace punctuation with a space rather than deleting it. Deleting glues
    # neighbouring words together ("end.Start" -> "endstart") and turns
    # contractions into tokens the stop-word list does not know
    # ("don't" -> "dont"); with a space they split into pieces ("don", "t")
    # that the stop-word filter can remove.
    text = re.sub(r'[^\w\s]', ' ', text)

    tokens = word_tokenize(text)

    # lemmatize() is called without a part-of-speech tag, so NLTK falls back to
    # its default (noun); verb forms such as "preprocessing" are left as-is.
    return [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

# Sample text: two sentences with mixed case, punctuation and a contraction
# ("Let's"), so every cleaning step has something to act on.
text = "Hello world! This is an example of text preprocessing in Python. Let's clean this text."

# Run the sample through the pipeline and print the resulting token list.
preprocessed_tokens = preprocess_text(text)
print(preprocessed_tokens)

['hello', 'world', 'example', 'text', 'preprocessing', 'python', 'let', 'clean', 'text']


***Exercise 2***

* Handle Different Languages: Modify the code to preprocess text in a different language using the appropriate stopwords and tokenizers.

* Spanish is used here: NLTK's Spanish stopword list, the Spanish Punkt tokenizer model, and the Snowball Spanish stemmer.

In [9]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

# Spanish resources read by preprocess_text below. Note that this rebinds the
# notebook-level name `stop_words` to the Spanish list.
stop_words = set(stopwords.words('spanish'))  # set -> fast membership tests
stemmer = SnowballStemmer('spanish')

def preprocess_text(text):
    """Turn raw Spanish text into a list of stemmed, stop-word-free tokens.

    Pipeline: lowercase -> replace punctuation with spaces -> tokenize with
    ``word_tokenize(..., language='spanish')`` -> drop tokens present in the
    notebook-level ``stop_words`` set -> stem the survivors with the
    notebook-level ``stemmer``.

    Defining this function replaces any earlier ``preprocess_text`` in the
    running kernel.

    Args:
        text: The raw input string.

    Returns:
        A list of lowercase stems, in their original order.
    """
    text = text.lower()

    # Replace punctuation (including the inverted marks) with a space rather
    # than deleting it, so words separated only by punctuation
    # ("fin.Inicio", "bien-estar") are not glued into a single token.
    text = re.sub(r'[^\w\s]', ' ', text)

    tokens = word_tokenize(text, language='spanish')

    return [stemmer.stem(word) for word in tokens if word not in stop_words]

# Spanish sample: includes the inverted exclamation mark, mixed case and
# sentence punctuation so every cleaning step has something to act on.
text = "¡Hola mundo! Este es un ejemplo de procesamiento de texto en Python. Vamos a limpiar este texto."

# Run the sample through the Spanish pipeline and print the stemmed tokens.
preprocessed_tokens = preprocess_text(text)
print(preprocessed_tokens)


['hol', 'mund', 'ejempl', 'proces', 'text', 'python', 'vam', 'limpi', 'text']


***Exercise 3***

* Explore Other Vectorizers: Try using HashingVectorizer and compare it with CountVectorizer and TfidfVectorizer.

In [16]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

# Toy corpus: three short documents with overlapping vocabulary.
documents = [
    "The quick brown fox",
    "Jumps over the lazy dog",
    "The quick dog jumps",
]

# Alternative corpus, kept for experimentation:
# documents = [
#     "Data science is an interdisciplinary field",
#     "It uses scientific methods, processes, algorithms",
#     "To extract knowledge from structured and unstructured data",
#     "Machine learning is a key component of data science"
# ]

# --- CountVectorizer: learns a vocabulary from the corpus, one column per term.
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(documents)
count_dense = count_matrix.toarray()

# --- TfidfVectorizer: fitted on the same corpus, exposes the same feature names.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
tfidf_dense = tfidf_matrix.toarray()

# --- HashingVectorizer: no fit step; tokens are hashed into a fixed number of
# columns (10 here), so there are no feature names to print.
# NOTE(review): with so few columns, different tokens can hash to the same one,
# and under scikit-learn's default sign alternation colliding tokens can cancel
# each other. That would explain the all-zero first row in the recorded output.
# Consider a larger n_features or alternate_sign=False if rows are to be compared.
hashing_vectorizer = HashingVectorizer(n_features=10)
hashing_matrix = hashing_vectorizer.transform(documents)
hashing_dense = hashing_matrix.toarray()

# Report the three dense matrices in the same order as above.
print("CountVectorizer Matrix:")
print(count_dense)
print("Feature Names:", count_vectorizer.get_feature_names_out())

print("\nTfidfVectorizer Matrix:")
print(tfidf_dense)
print("Feature Names:", tfidf_vectorizer.get_feature_names_out())

print("\nHashingVectorizer Matrix:")
print(hashing_dense)


CountVectorizer Matrix:
[[1 0 1 0 0 0 1 1]
 [0 1 0 1 1 1 0 1]
 [0 1 0 1 0 0 1 1]]
Feature Names: ['brown' 'dog' 'fox' 'jumps' 'lazy' 'over' 'quick' 'the']

TfidfVectorizer Matrix:
[[0.5844829  0.         0.5844829  0.         0.         0.
  0.44451431 0.34520502]
 [0.         0.40619178 0.         0.40619178 0.53409337 0.53409337
  0.         0.31544415]
 [0.         0.52682017 0.         0.52682017 0.         0.
  0.52682017 0.40912286]]
Feature Names: ['brown' 'dog' 'fox' 'jumps' 'lazy' 'over' 'quick' 'the']

HashingVectorizer Matrix:
[[ 0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.        ]
 [ 0.          0.57735027  0.         -0.57735027  0.          0.
   0.          0.         -0.57735027  0.        ]
 [ 0.          0.          0.         -0.5         0.          0.5
   0.          0.5        -0.5         0.        ]]
