In [None]:
import nltk
import re
import numpy as np
import heapq
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

# Download every NLTK resource the notebook needs *before* first use, so the
# notebook survives Restart Kernel -> Run All.
# Recent NLTK releases load the Punkt sentence tokenizer from 'punkt_tab'
# rather than the legacy pickled 'punkt' package; without it sent_tokenize /
# word_tokenize raise LookupError. Both are fetched so that older and newer
# NLTK versions work alike.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# English stopword list as a set for O(1) membership tests during cleaning.
stopwords = set(nltk.corpus.stopwords.words('english'))

# Input text: a small in-notebook corpus (one multi-line string) used by every
# later step. Line breaks inside the literal are part of the string.
text = """Natural Language Processing (NLP) is a fascinating field of artificial intelligence
that focuses on the interaction between computers and humans through natural language.
Machine learning techniques, such as deep learning and reinforcement learning, have significantly
improved NLP applications like machine translation, sentiment analysis, and text summarization.
Stopwords are common words that do not add much meaning to sentences and are often removed during text preprocessing.
Word embeddings, such as Word2Vec, capture the contextual meaning of words and are widely used in NLP tasks."""

# Sentence tokenization: split the raw text into a list of sentence strings
# with NLTK's sent_tokenize (requires the Punkt tokenizer data downloaded above).
sentences = sent_tokenize(text)

# Text preprocessing
# For each sentence: lowercase it, turn every non-word character into a space,
# collapse runs of whitespace, then drop stopwords and re-join the survivors.
cleaned_sentences = []
for raw_sentence in sentences:
    normalized = re.sub(r'\W', ' ', raw_sentence.lower())
    normalized = re.sub(r'\s+', ' ', normalized).strip()
    kept_tokens = filter(lambda token: token not in stopwords, normalized.split())
    cleaned_sentences.append(" ".join(kept_tokens))

print("Dataset: ", cleaned_sentences)
print("Length: ", len(cleaned_sentences))


Dataset:  ['natural language processing nlp fascinating field artificial intelligence focuses interaction computers humans natural language', 'machine learning techniques deep learning reinforcement learning significantly improved nlp applications like machine translation sentiment analysis text summarization', 'stopwords common words add much meaning sentences often removed text preprocessing', 'word embeddings word2vec capture contextual meaning words widely used nlp tasks']
Length:  4


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:

# Fetch NLTK data packages: the Punkt tokenizer ('punkt' and 'punkt_tab') and
# the stopword corpus. The last call is the cell's final expression, so its
# return value is what the cell displays.
# NOTE(review): this cell runs after sent_tokenize has already been called in
# the first cell, and 'punkt'/'stopwords' are downloaded there too -- consider
# moving all downloads to the top of the notebook.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Word frequency calculation
# Tally every token across the cleaned corpus; the dict keeps first-seen order.
word2count = {}
for tokens in map(word_tokenize, cleaned_sentences):
    for token in tokens:
        word2count.setdefault(token, 0)
        word2count[token] += 1

# Extract top 100 frequent words (fewer if the vocabulary is smaller)
freq_words = heapq.nlargest(100, word2count.keys(), key=lambda token: word2count[token])
print(freq_words)
print(word2count)

['nlp', 'learning', 'natural', 'language', 'machine', 'text', 'words', 'meaning', 'processing', 'fascinating', 'field', 'artificial', 'intelligence', 'focuses', 'interaction', 'computers', 'humans', 'techniques', 'deep', 'reinforcement', 'significantly', 'improved', 'applications', 'like', 'translation', 'sentiment', 'analysis', 'summarization', 'stopwords', 'common', 'add', 'much', 'sentences', 'often', 'removed', 'preprocessing', 'word', 'embeddings', 'word2vec', 'capture', 'contextual', 'widely', 'used', 'tasks']
{'natural': 2, 'language': 2, 'processing': 1, 'nlp': 3, 'fascinating': 1, 'field': 1, 'artificial': 1, 'intelligence': 1, 'focuses': 1, 'interaction': 1, 'computers': 1, 'humans': 1, 'machine': 2, 'learning': 3, 'techniques': 1, 'deep': 1, 'reinforcement': 1, 'significantly': 1, 'improved': 1, 'applications': 1, 'like': 1, 'translation': 1, 'sentiment': 1, 'analysis': 1, 'text': 2, 'summarization': 1, 'stopwords': 1, 'common': 1, 'words': 2, 'add': 1, 'much': 1, 'meaning':

In [None]:
# Bag of Words (BoW) - Raw Counts
# The vocabulary is fixed to freq_words, so the matrix columns follow that order.
vectorizer = CountVectorizer(vocabulary=freq_words)
bow_matrix = vectorizer.fit_transform(cleaned_sentences)

# One row per sentence, one labelled column per vocabulary word.
bow_columns = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=bow_matrix.toarray(), columns=bow_columns)

print("\nBag of Words (Count Occurrences):", bow_df, sep="\n")


Bag of Words (Count Occurrences):
   nlp  learning  natural  language  machine  text  words  meaning  \
0    1         0        2         2        0     0      0        0   
1    1         3        0         0        2     1      0        0   
2    0         0        0         0        0     1      1        1   
3    1         0        0         0        0     0      1        1   

   processing  fascinating  ...  removed  preprocessing  word  embeddings  \
0           1            1  ...        0              0     0           0   
1           0            0  ...        0              0     0           0   
2           0            0  ...        1              1     0           0   
3           0            0  ...        0              0     1           1   

   word2vec  capture  contextual  widely  used  tasks  
0         0        0           0       0     0      0  
1         0        0           0       0     0      0  
2         0        0           0       0     0      0  
3   

In [None]:

# Bag of Words (Normalized Counts)
# Each row of raw counts is divided by that row's total, giving per-sentence
# relative frequencies. The sparse matrix is densified once and reused.
bow_counts = bow_matrix.toarray()
row_totals = bow_counts.sum(axis=1, keepdims=True)

# A sentence with no in-vocabulary words has a total of 0; dividing would give
# NaN (0/0) and a RuntimeWarning. Those rows are left as all zeros instead.
bow_normalized = np.divide(
    bow_counts,
    row_totals,
    out=np.zeros(bow_counts.shape, dtype=float),
    where=row_totals != 0,
)
print("\nBoW (Normalized Counts):\n", bow_normalized)

# Small illustrative stop-word set handed to the TF-IDF vectorizer.
stop_words_set = {'down', 'than', 'now', 'against', 'that', 'out', 'did', 'him', 'myself', 'and'}  # Example set

# The set is unpacked into a list before being passed as stop_words.
tfidf_vectorizer = TfidfVectorizer(stop_words=[*stop_words_set])
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_sentences)

# One row per sentence, one column per term learned by the vectorizer.
tfidf_columns = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_matrix.toarray(), columns=tfidf_columns)

print("\nTF-IDF Representation:", tfidf_df, sep="\n")


BoW (Normalized Counts):
 [[0.07142857 0.         0.14285714 0.14285714 0.         0.
  0.         0.         0.07142857 0.07142857 0.07142857 0.07142857
  0.07142857 0.07142857 0.07142857 0.07142857 0.07142857 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.05555556 0.16666667 0.         0.         0.11111111 0.05555556
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.05555556 0.05555556
  0.05555556 0.05555556 0.05555556 0.05555556 0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.        ]
 [0.         0.         0.         0.         0.   

In [None]:
# Convert sentences to binary vectors: entry j of a sentence's vector is 1 when
# freq_words[j] occurs in that sentence and 0 otherwise.
X = []
for sentence in cleaned_sentences:
    # Tokenize once per sentence (not once per vocabulary word) and keep the
    # tokens in a set so each membership test is O(1).
    sentence_tokens = set(word_tokenize(sentence))
    vector = [1 if word in sentence_tokens else 0 for word in freq_words]
    X.append(vector)

# Stack the per-sentence vectors into a (sentences x vocabulary) array.
value = np.asarray(X)
print("\nBinary Word Vector Representation:\n", value)


Binary Word Vector Representation:
 [[1 0 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0]
 [1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0]
 [0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1
  0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  1 1 1 1 1 1 1 1]]


In [None]:
# Tokenization for Word2Vec: the model expects a list of token lists.
tokenized_dataset = [word_tokenize(sentence) for sentence in cleaned_sentences]

# Train Word2Vec model using Skip-Gram approach (sg=1).
# Training is stochastic, so reproducibility is made explicit:
#   * seed=1 is gensim's default, written out here so it is visible and tunable;
#   * workers=1 removes the update-ordering jitter that multi-threaded training
#     introduces (the corpus is tiny, so there is no speed cost).
# NOTE(review): gensim documents that bit-for-bit reproducibility across
# interpreter launches additionally needs PYTHONHASHSEED to be fixed.
word2vec_model = Word2Vec(
    sentences=tokenized_dataset,
    vector_size=100,
    window=5,
    min_count=1,  # keep every word; most occur only once in this small corpus
    workers=1,
    sg=1,
    seed=1,
)

# Look up the trained vector for one sample word, if the model knows it
word = "learning"
keyed_vectors = word2vec_model.wv
if word not in keyed_vectors:
    print(f"\n'{word}' not in vocabulary")
else:
    print(f"\nWord2Vec Embedding for '{word}':\n", keyed_vectors[word])

# Persist the trained model to the working directory
word2vec_model.save("word2vec.model")

# Build a {token: vector} mapping covering the whole model vocabulary
word_embeddings = {
    token: keyed_vectors[token]
    for token in keyed_vectors.index_to_key
}
print("\nAll Word Embeddings:\n", word_embeddings)


Word2Vec Embedding for 'learning':
 [-8.5976161e-03  3.6749146e-03  5.1976210e-03  5.7099299e-03
  7.4425838e-03 -6.2079998e-03  1.1219241e-03  6.0673780e-03
 -2.8449413e-03 -6.2000742e-03 -3.9544734e-04 -8.3563728e-03
 -5.6282119e-03  7.0662415e-03  3.3483698e-03  7.1890610e-03
  6.8065729e-03  7.5120013e-03 -3.7882826e-03 -5.9188076e-04
  2.3362972e-03 -4.5466926e-03  8.4433807e-03 -9.8961275e-03
  6.7436974e-03  2.9207503e-03 -4.9509946e-03  4.4159591e-03
 -1.7549819e-03  6.7268731e-03  9.9961162e-03 -4.3911734e-03
 -5.7961576e-04 -5.7506254e-03  3.8779380e-03  2.7868797e-03
  6.8864268e-03  6.0910569e-03  9.5052719e-03  9.2581669e-03
  7.8865681e-03 -7.0112445e-03 -9.1759032e-03 -3.4382386e-04
 -3.1084430e-03  7.8388304e-03  5.9542791e-03 -1.5460993e-03
  1.5082511e-03  1.8219675e-03  7.8318380e-03 -9.5181232e-03
 -1.8789008e-04  3.5035321e-03 -9.6889702e-04  8.3926311e-03
  9.0175653e-03  6.5336726e-03 -7.4344187e-04  7.6985927e-03
 -8.5201310e-03  3.1937642e-03 -4.6243323e-03 -5