**Apply the bag-of-words approach (raw count occurrence and normalized count occurrence) and TF-IDF to the data, then create word embeddings using Word2Vec.**

In [None]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import numpy as np

# Ensure you have the necessary NLTK resources.
# word_tokenize (used in the Word2Vec cell) relies on the Punkt tokenizer data.
# NOTE(review): 'punkt_tab' is presumably needed by newer NLTK releases and 'punkt'
# by older ones; both are requested so the notebook runs on either - confirm
# against the installed NLTK version.
# nltk.download() reports packages that are already up to date instead of
# re-fetching them, so re-running this cell is cheap.
nltk.download('punkt')
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Sample text data (list of sentences).
# Each string is treated as one document: the vectorizer cells produce one matrix
# row per entry, and the Word2Vec cell tokenizes each entry into one training sentence.
corpus = [
    "Natural Language Processing is fun.",
    "Machine learning is a subfield of artificial intelligence.",
    "Deep learning is a part of machine learning.",
    "NLP techniques include tokenization, stemming, and lemmatization."
]


In [None]:
# Bag-of-words: learn the vocabulary from the corpus first ...
count_vectorizer = CountVectorizer().fit(corpus)

# ... then encode every document as a sparse row of raw term counts
X_count = count_vectorizer.transform(corpus)

# Densify the sparse result so the full document-term matrix can be printed
count_matrix = X_count.toarray()
print("Count Occurrence (Bag-of-Words):\n", count_matrix)

# Column j of the matrix corresponds to the j-th entry of the learned vocabulary
print("\nFeature Names (Words):", count_vectorizer.get_feature_names_out())


Count Occurrence (Bag-of-Words):
 [[0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0]
 [0 1 0 0 0 1 1 0 1 0 1 0 0 1 0 0 0 1 0 0]
 [0 0 1 0 0 0 1 0 2 0 1 0 0 1 1 0 0 0 0 0]
 [1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 1 1]]

Feature Names (Words): ['and' 'artificial' 'deep' 'fun' 'include' 'intelligence' 'is' 'language'
 'learning' 'lemmatization' 'machine' 'natural' 'nlp' 'of' 'part'
 'processing' 'stemming' 'subfield' 'techniques' 'tokenization']


In [None]:
# Normalize word occurrences by converting the count matrix into term frequency:
# every row (document) is divided by its total count so that it sums to 1.
dense_counts = X_count.toarray()

# keepdims=True keeps the totals as an (n_docs, 1) column so the division
# broadcasts across each row. Summing the dense ndarray, rather than calling
# .sum(axis=1) on the sparse matrix, also avoids getting an np.matrix back, which
# would silently make X_normalized an np.matrix as well.
doc_totals = dense_counts.sum(axis=1, keepdims=True)

# A document with no counted tokens has a total of 0. Leave such rows as all
# zeros instead of dividing by zero (which would fill them with NaN and emit a
# RuntimeWarning).
X_normalized = np.divide(
    dense_counts,
    doc_totals,
    out=np.zeros(dense_counts.shape, dtype=float),
    where=doc_totals != 0,
)

print("\nNormalized Count Occurrence (Term Frequency):\n", X_normalized)



Normalized Count Occurrence (Term Frequency):
 [[0.         0.         0.         0.2        0.         0.
  0.2        0.2        0.         0.         0.         0.2
  0.         0.         0.         0.2        0.         0.
  0.         0.        ]
 [0.         0.14285714 0.         0.         0.         0.14285714
  0.14285714 0.         0.14285714 0.         0.14285714 0.
  0.         0.14285714 0.         0.         0.         0.14285714
  0.         0.        ]
 [0.         0.         0.14285714 0.         0.         0.
  0.14285714 0.         0.28571429 0.         0.14285714 0.
  0.         0.14285714 0.14285714 0.         0.         0.
  0.         0.        ]
 [0.14285714 0.         0.         0.         0.14285714 0.
  0.         0.         0.         0.14285714 0.         0.
  0.14285714 0.         0.         0.         0.14285714 0.
  0.14285714 0.14285714]]


In [None]:
# TF-IDF: learn the vocabulary and the inverse-document-frequency weights from the corpus
tfidf_vectorizer = TfidfVectorizer().fit(corpus)

# Encode every document as a sparse row of TF-IDF scores
X_tfidf = tfidf_vectorizer.transform(corpus)

# Densify the sparse result so the full matrix can be printed
tfidf_matrix = X_tfidf.toarray()
print("\nTF-IDF Matrix:\n", tfidf_matrix)

# Column j of the matrix corresponds to the j-th entry of the learned vocabulary
print("\nTF-IDF Feature Names (Words):", tfidf_vectorizer.get_feature_names_out())



TF-IDF Matrix:
 [[0.         0.         0.         0.47633035 0.         0.
  0.30403549 0.47633035 0.         0.         0.         0.47633035
  0.         0.         0.         0.47633035 0.         0.
  0.         0.        ]
 [0.         0.43551643 0.         0.         0.         0.43551643
  0.27798449 0.         0.34336615 0.         0.34336615 0.
  0.         0.34336615 0.         0.         0.         0.43551643
  0.         0.        ]
 [0.         0.         0.40366689 0.         0.         0.
  0.25765535 0.         0.63651122 0.         0.31825561 0.
  0.         0.31825561 0.40366689 0.         0.         0.
  0.         0.        ]
 [0.37796447 0.         0.         0.         0.37796447 0.
  0.         0.         0.         0.37796447 0.         0.
  0.37796447 0.         0.         0.         0.37796447 0.
  0.37796447 0.37796447]]

TF-IDF Feature Names (Words): ['and' 'artificial' 'deep' 'fun' 'include' 'intelligence' 'is' 'language'
 'learning' 'lemmatization' 'mach

In [None]:
# Tokenize the corpus (lower-case each document, then split it into word tokens)
tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]

# Build the vocabulary and train the Word2Vec model in one step.
# Passing the corpus to the constructor already builds the vocabulary and trains
# for `epochs` passes, so no separate .train() call is made: calling it as well
# would train the same model a second time on top of the constructor's training.
# seed + workers=1 make the embeddings repeatable between runs.
# NOTE(review): gensim documents that full determinism on Python 3 also requires a
# fixed PYTHONHASHSEED - confirm if exact reproducibility matters.
word2vec_model = Word2Vec(
    tokenized_corpus,
    vector_size=50,   # dimensionality of each embedding
    window=3,         # context words considered on each side of the target
    min_count=1,      # keep every token; the corpus is tiny
    sg=0,             # 0 = CBOW, 1 = skip-gram
    epochs=10,
    seed=42,
    workers=1,
)

# Get the vector representation (embedding) for a specific word
word_vector = word2vec_model.wv['language']  # Example word: 'language'
print("\nWord2Vec Embedding for 'language':\n", word_vector)

# Get the vector representation for a list of words
words = ['language', 'machine', 'learning']
word_vectors = [word2vec_model.wv[word] for word in words]
print("\nWord2Vec Embeddings for multiple words:", list(zip(words, word_vectors)))





Word2Vec Embedding for 'language':
 [-0.01428453  0.00248143 -0.01435139 -0.00447787  0.00744023  0.01165937
  0.00240805  0.00420501 -0.0082356   0.01444217 -0.01261733  0.00928294
 -0.016431    0.00407175 -0.00994886 -0.00848354 -0.00621368  0.01131507
  0.01157967 -0.00995801  0.00154655 -0.01698448  0.01563551  0.01850453
 -0.00548207  0.00159986  0.00148957  0.01096059 -0.01722168  0.0011757
  0.01374442  0.00445443  0.00224918 -0.01864685  0.016965   -0.01252744
 -0.00597483  0.00698619 -0.00154332  0.00282392  0.00356977 -0.01366348
 -0.01945646  0.01807972  0.01240432 -0.01382146  0.00681484  0.0004075
  0.00950946 -0.01423734]

Word2Vec Embeddings for multiple words: [('language', array([-0.01428453,  0.00248143, -0.01435139, -0.00447787,  0.00744023,
        0.01165937,  0.00240805,  0.00420501, -0.0082356 ,  0.01444217,
       -0.01261733,  0.00928294, -0.016431  ,  0.00407175, -0.00994886,
       -0.00848354, -0.00621368,  0.01131507,  0.01157967, -0.00995801,
        0.00