In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

# Sample sentences for the task
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "The lazy dog sleeps all day long.",
    "A fox is a wild animal and it loves to jump."
]

# The max_df values being compared; min_df and max_features are held fixed.
# scikit-learn interprets an int max_df as an absolute document count and a
# float as a proportion of the corpus, so max_df=1 means "appears in at most
# one document" and is NOT equivalent to max_df=1.0.
max_df_settings = (1, 2, 0.75)

# Build one CountVectorizer per setting, then fit/transform each on the corpus.
count_vectorizers = [
    CountVectorizer(max_df=max_df, min_df=1, max_features=3)
    for max_df in max_df_settings
]
count_matrices = [
    vectorizer.fit_transform(sentences) for vectorizer in count_vectorizers
]

# Keep the per-configuration names available for any cell that refers to them.
count_vectorizer_1, count_vectorizer_2, count_vectorizer_3 = count_vectorizers
count_matrix_1, count_matrix_2, count_matrix_3 = count_matrices

# Display the feature names and their corresponding count vectors for each case.
# The heading is generated from the actual setting so it cannot drift from the
# parameters that were really used.
for position, (max_df, vectorizer, matrix) in enumerate(
    zip(max_df_settings, count_vectorizers, count_matrices)
):
    separator = "\n" if position else ""  # blank line between tables, none before the first
    print(f"{separator}CountVectorizer with max_df={max_df}, min_df=1, max_features=3:")
    print(pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out()))

# TF-IDF weighting of the same corpus, using the vectorizer's default settings.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)

# One row per sentence, one column per vocabulary term.
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_frame = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_terms)

print("\nTF-IDF Matrix:")
print(tfidf_frame)


In [None]:
import nltk
import string
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.tokenize import word_tokenize
from nltk import pos_tag
import spacy

# Download NLTK data (if not already downloaded)
nltk.download('punkt_tab')                       # tokenizer data for sent_tokenize / word_tokenize
nltk.download('stopwords')                       # stopwords.words('english')
nltk.download('averaged_perceptron_tagger')      # tagger model name used by older NLTK releases
# NLTK releases that use 'punkt_tab' load the English PoS tagger from the
# '_eng' resource; without it pos_tag() raises LookupError on a fresh machine.
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')                         # WordNetLemmatizer


In [None]:
# Sample text
# Multi-line passage that the tokenization and NER cells below take as input.
# The literal begins and ends with a newline and keeps its hard line breaks.
# NOTE(review): the "#" before the quoted title in the last sentence looks like
# an extraction artifact (possibly a lost "is") — confirm against the source
# passage before changing it, since any edit alters the tokens produced below.
text = """
Transformer is an exceptional innovation in the field of Deep Learning, contributed by Ashish 
Vaswani et al. (2017), Google. The transformer is the most influential Neural Network model that has 
shown outstanding performance on various NLP tasks including Machine Reading Comprehension, 
Machine translation and sentence classification. Attention mechanism and parallelization are the 
prominent features in the transformers. Consequently, it can facilitate long-range dependencies 
without any gradient vanishing or gradient explosion problems and it overcomes the drawbacks of 
the existing methods such as RNN and LSTM. The transformer is executed with an encoder-decoder 
mechanism and the original article of transformers # “Attention All You Need”.
"""


In [None]:

# a. Word and Sentence Tokenization
# Split the passage two ways: into sentences and into word-level tokens.
# Note that `sentences` rebinds the name used for the vectorizer corpus earlier.
sentences, words = sent_tokenize(text), word_tokenize(text)


In [None]:
# Inspect the word-level tokens returned by word_tokenize(text).
words

In [None]:
# b. Stopwords removal
# Build a set from NLTK's English stopword list for fast membership tests.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

# Compare case-insensitively, but keep each surviving token's original casing.
filtered_words = [token for token in words if token.lower() not in stop_words]


In [None]:
# Inspect the tokens that remain after stopword removal.
filtered_words

In [None]:
# c. Punctuation Removal
# Keep a token only if it contains at least one letter or digit.
# The previous test (`word not in string.punctuation`) was a substring check
# against an ASCII-only string, so multi-character tokens such as `` and ''
# and non-ASCII quotes such as “ ” were not removed. Tokens that merely
# contain punctuation ("al.", "encoder-decoder") are still kept.
filtered_words_no_punctuation = [
    word for word in filtered_words
    if any(character.isalnum() for character in word)
]
filtered_words_no_punctuation

In [None]:
# d. Frequency Distribution
# Count occurrences of each cleaned token (counting is case-sensitive).
fdist = FreqDist()
fdist.update(filtered_words_no_punctuation)
fdist

In [None]:

# Plot Frequency Distribution
# Non-cumulative counts for the most frequent tokens.
top_n_tokens = 30
fdist.plot(top_n_tokens, cumulative=False)
plt.show()

In [None]:

# e. Stemming and Lemmatization
# Stemming: instantiate both stemmers so their outputs can be compared.
porter_stemmer, lancaster_stemmer = PorterStemmer(), LancasterStemmer()

In [None]:

# Stem every cleaned token with each stemmer; results stay aligned by index.
porter_stems = list(map(porter_stemmer.stem, filtered_words_no_punctuation))
lancaster_stems = list(map(lancaster_stemmer.stem, filtered_words_no_punctuation))

In [None]:
# Lemmatization using WordNetLemmatizer
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# NOTE(review): no `pos` argument is supplied, so each token is lemmatized with
# the lemmatizer's default part of speech — confirm that is intended for verbs.
lemmatized_words = list(map(lemmatizer.lemmatize, filtered_words_no_punctuation))

In [None]:

# Output Stems and Lemmas
# Print the first ten entries of each normalized list under its heading.
preview_size = 10
for heading, normalized_tokens in (
    ("\nPorter Stems:", porter_stems),
    ("\nLancaster Stems:", lancaster_stems),
    ("\nLemmatized Words:", lemmatized_words),
):
    print(heading, normalized_tokens[:preview_size])

In [None]:

# g. Named Entity Recognition (NER)
# Run spaCy's pre-trained English pipeline over the raw passage.
nlp = spacy.load('en_core_web_sm')
doc = nlp(text)

# Collect each detected entity as a (surface text, label) pair.
ner = []
for found_entity in doc.ents:
    ner.append((found_entity.text, found_entity.label_))

print("\nNamed Entities:", ner)

In [None]:
# f. PoS Tagging
# Tag the cleaned tokens and preview the first ten (token, tag) pairs.
# NOTE(review): the tagger receives tokens with stopwords and punctuation
# already stripped, so it sees less sentence context than tagging `words`
# would give it — confirm this is the intended input.
pos_tags = pos_tag(filtered_words_no_punctuation)
pos_tag_preview = pos_tags[:10]
print("\nPart-of-Speech Tags:", pos_tag_preview)
