In [None]:
import nltk
import spacy
import string
import matplotlib.pyplot as plt
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from nltk import pos_tag
from wordcloud import WordCloud

# Fetch the NLTK data this notebook relies on: tokenizer models, the English
# stopword list, the perceptron PoS tagger and WordNet.
# NLTK 3.9+ looks the tokenizer and tagger up under the newer ids 'punkt_tab'
# and 'averaged_perceptron_tagger_eng'; the legacy ids are kept so that older
# releases keep working. An id unknown to the installed NLTK is only reported
# as a download error (nltk.download returns False), it does not raise.
NLTK_RESOURCES = (
    'punkt',
    'punkt_tab',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
)
for resource_id in NLTK_RESOURCES:
    nltk.download(resource_id)

# Small English spaCy pipeline; it is applied to the corpus for named-entity
# recognition further down.
nlp = spacy.load('en_core_web_sm')

# Given corpus: the passage every step below (tokenization, filtering,
# stemming/lemmatization, PoS tagging, NER) operates on.
# It is a triple-quoted string, so the line breaks and the trailing spaces in
# front of them are part of the value.
# NOTE(review): the '#' before the quoted paper title in the last sentence looks
# like an extraction artifact - confirm against the original assignment text.
corpus = """Transformer is an exceptional innovation in the field of Deep Learning, contributed by Ashish 
Vaswani et al. (2017), Google. The transformer is the most influential Neural Network model that has 
shown outstanding performance on various NLP tasks including Machine Reading Comprehension, 
Machine translation and sentence classification. Attention mechanism and parallelization are the 
prominent features in the transformers. Consequently, it can facilitate long-range dependencies 
without any gradient vanishing or gradient explosion problems and it overcomes the drawbacks of 
the existing methods such as RNN and LSTM. The transformer is executed with an encoder-decoder 
mechanism and the original article of transformers # “Attention All You Need”."""

# **(a) Word and Sentence Tokenization**
# Both tokenizers read the raw corpus independently: one yields a flat list of
# word/punctuation tokens, the other a list of sentence strings.
word_tokens = word_tokenize(corpus)
sent_tokens = sent_tokenize(corpus)

# Report sentences first, then words.
print("Sentence Tokenization:\n", sent_tokens)
print("\nWord Tokenization:\n", word_tokens)

# **(b) Stopwords Removal**
# A set gives constant-time membership tests against the English stopword list.
stop_words = set(stopwords.words('english'))

# Compare the lower-cased token against the stopword set, but keep the
# surviving tokens in their original casing and order.
filtered_words = list(
    filter(lambda token: token.lower() not in stop_words, word_tokens)
)

print("\nWords after Stopword Removal:\n", filtered_words)

# **(c) Punctuation Removal**
# Keep a token only if it contains at least one letter or digit.
# The previous test, `word not in string.punctuation`, was a substring check
# against a single string: it removed one-character ASCII punctuation but let
# multi-character punctuation tokens (e.g. "''", "...", "--") and non-ASCII
# punctuation such as the curly quotes in the corpus slip through.
# Tokens that mix letters and punctuation ("encoder-decoder", "al.") are kept.
filtered_words = [
    word for word in filtered_words
    if any(char.isalnum() for char in word)
]

print("\nWords after Punctuation Removal:\n", filtered_words)

# **(d) Frequency Distribution and Visualization**
# Count case-insensitively so that e.g. "Transformer" and "transformer" add up
# to one entry instead of splitting their frequency; this matches the
# case-insensitive comparison already used for stopword removal.
word_freq = Counter(word.lower() for word in filtered_words)

# Visualizing using WordCloud: word size is driven by the counts in word_freq.
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(word_freq)

# Explicit figure/axes instead of the implicit pyplot state.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')  # the cloud is an image; ticks carry no meaning
ax.set_title("Word Frequency Distribution")
plt.show()

# **(e) Stemming (Porter and Lancaster) and Lemmatization**
def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag (as returned by pos_tag) to a WordNet PoS code.

    Adjectives -> 'a', verbs -> 'v', adverbs -> 'r'; everything else falls
    back to 'n', which is also WordNetLemmatizer's own default.
    """
    if treebank_tag.startswith('J'):
        return 'a'
    if treebank_tag.startswith('V'):
        return 'v'
    if treebank_tag.startswith('RB'):
        return 'r'
    return 'n'


porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer()
lemmatizer = WordNetLemmatizer()

stemmed_words_porter = [porter_stemmer.stem(word) for word in filtered_words]
stemmed_words_lancaster = [lancaster_stemmer.stem(word) for word in filtered_words]

# Tag the complete token sequence and only then keep the tokens that survived
# filtering. The tagger uses neighbouring words as context, so tagging the
# already-filtered list (stopwords and punctuation removed) degrades the tags.
# filtered_words was obtained from word_tokens by filters that depend only on
# the token's value, so selecting by value keeps the same tokens in the same order.
kept_tokens = set(filtered_words)
pos_tags = [
    (token, tag)
    for token, tag in pos_tag(word_tokens)
    if token in kept_tokens
]

# lemmatize() treats every word as a noun unless a PoS is passed, which leaves
# verbs and adjectives untouched; feed it the tag computed above.
lemmatized_words = [
    lemmatizer.lemmatize(token, _wordnet_pos(tag))
    for token, tag in pos_tags
]

print("\nPorter Stemmer Output:\n", stemmed_words_porter)
print("\nLancaster Stemmer Output:\n", stemmed_words_lancaster)
print("\nLemmatization Output:\n", lemmatized_words)

# **(f) Part-of-Speech (PoS) Tagging**
print("\nPoS Tagging:\n", pos_tags)

# **(g) Named Entity Recognition (NER)**
# Run the spaCy pipeline over the raw corpus (not the filtered tokens) and list
# each detected entity span together with its label.
doc = nlp(corpus)

print("\nNamed Entities:\n")
for ent in doc.ents:
    entity_text, entity_label = ent.text, ent.label_
    print(entity_text, "->", entity_label)


In [None]:
import gensim
import numpy as np
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from sklearn.decomposition import PCA
from gensim.models import Word2Vec

# Sample text: the only input the Word2Vec models below are trained on.
# It is a triple-quoted string, so the line breaks and the trailing spaces in
# front of them are part of the value.
text = """Mickey Mouse, a cheerful and optimistic mouse clad in red shorts and yellow shoes, 
is the iconic mascot of The Walt Disney Company. Debuting in 1928, this spunky character has 
charmed audiences for generations with his adventures and can-do attitude."""

# Tokenization
# 'punkt_tab' is the id NLTK 3.9+ uses for the tokenizer data; 'punkt' is kept
# for older releases. An id unknown to the installed NLTK is only reported as a
# download error, it does not raise.
for resource_id in ('punkt', 'punkt_tab'):
    nltk.download(resource_id)

# Word2Vec expects an iterable of token lists; the whole lower-cased text is
# passed as a single token list.
sentences = [word_tokenize(text.lower())]

# Training is randomized. A fixed seed together with a single worker thread
# makes the printed neighbours repeatable within one interpreter session.
# NOTE(review): per the gensim docs, repeatability across interpreter launches
# additionally needs PYTHONHASHSEED to be set - confirm if that matters here.
W2V_SEED = 42
w2v_params = dict(vector_size=50, window=5, min_count=1, seed=W2V_SEED, workers=1)

# Train Word2Vec Model (Skip-gram)
skipgram_model = Word2Vec(sentences, sg=1, **w2v_params)

# Train Word2Vec Model (CBOW)
cbow_model = Word2Vec(sentences, sg=0, **w2v_params)

print("Skip-gram Example:", skipgram_model.wv.most_similar('mouse'))
print("CBOW Example:", cbow_model.wv.most_similar('mouse'))


In [None]:
def plot_embeddings(model, title, top_n=10):
    """Scatter-plot a 2-D PCA projection of a model's first word vectors.

    Args:
        model: Trained model exposing ``wv.key_to_index`` and ``wv[word]``
            (the Word2Vec models trained in this notebook).
        title: Title shown above the plot.
        top_n: How many vocabulary entries to plot, taken from the start of
            ``model.wv.key_to_index``. Defaults to 10, the previously
            hard-coded value.

    Raises:
        ValueError: If fewer than two words are available, since a
            two-component PCA cannot be fitted on a single sample.
    """
    words = list(model.wv.key_to_index)[:top_n]
    if len(words) < 2:
        raise ValueError(
            f"Need at least 2 words to project with PCA, got {len(words)}."
        )
    word_vectors = np.array([model.wv[word] for word in words])

    # Reduce dimensions using PCA
    pca = PCA(n_components=2)
    reduced_vectors = pca.fit_transform(word_vectors)

    # Plot the vectors on an explicit Axes rather than the implicit pyplot state
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(x=reduced_vectors[:, 0], y=reduced_vectors[:, 1], marker='o', ax=ax)

    # Label every point with the word it represents
    for (x_coord, y_coord), word in zip(reduced_vectors, words):
        ax.text(x_coord, y_coord, word, fontsize=12)

    ax.set_xlabel("PC 1")
    ax.set_ylabel("PC 2")
    ax.set_title(title)
    plt.show()

# Draw one embedding plot per trained model: Skip-gram first, then CBOW.
for embedding_model, plot_title in (
    (skipgram_model, "Word2Vec - Skip-gram Visualization"),
    (cbow_model, "Word2Vec - CBOW Visualization"),
):
    plot_embeddings(embedding_model, plot_title)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Three short documents used as the corpus for the vectorizers below.
sentences = [
    "Mickey Mouse is a famous Disney character.",
    "Walt Disney created Mickey Mouse in 1928.",
    "The Disney Company is known for Mickey Mouse and animated films."
]

# An integer max_df is an absolute document count, a float is a proportion of
# the documents. Each configuration is built, fitted and reported in turn.

# Terms occurring in at most one document.
vectorizer1 = CountVectorizer(max_df=1)
X1 = vectorizer1.fit_transform(sentences).toarray()
print("\nCount Vectorizer (max_df=1):\n", vectorizer1.get_feature_names_out())
print(X1)

# Terms occurring in at most two documents.
vectorizer2 = CountVectorizer(max_df=2)
X2 = vectorizer2.fit_transform(sentences).toarray()
print("\nCount Vectorizer (max_df=2):\n", vectorizer2.get_feature_names_out())
print(X2)

# Terms in at most 75% of the documents and at least one document, with the
# vocabulary capped at three features.
vectorizer3 = CountVectorizer(max_df=0.75, min_df=1, max_features=3)
X3 = vectorizer3.fit_transform(sentences).toarray()
print("\nCount Vectorizer (max_df=0.75, min_df=1, max_features=3):\n", vectorizer3.get_feature_names_out())
print(X3)

# **TF-IDF Implementation**
# Default TfidfVectorizer settings: learn the vocabulary and weights from the
# sentences, then densify the sparse result so it prints as a plain array.
tfidf = TfidfVectorizer()
tfidf_sparse = tfidf.fit_transform(sentences)
X_tfidf = tfidf_sparse.toarray()

print("\nTF-IDF Feature Names:\n", tfidf.get_feature_names_out())
print(X_tfidf)
