In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample documents
documents = ["Dog barks loudly", "Cat meows", "Bird sings", "Dog and cat are friends"]

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform documents
BoW_matrix = vectorizer.fit_transform(documents)

# Convert BoW matrix to array
print("Bag of Words (count occurrence):\n", BoW_matrix.toarray())


Bag of Words (count occurrence):
 [[0 0 1 0 0 1 0 1 0 0]
 [0 0 0 0 1 0 0 0 1 0]
 [0 0 0 1 0 0 0 0 0 1]
 [1 1 0 0 1 1 1 0 0 0]]


In [11]:
import numpy as np

# Normalized count occurrence
bow_matrix_dense = BoW_matrix.toarray()
norms = np.linalg.norm(bow_matrix_dense, axis=1)
normalized_bow_matrix = bow_matrix_dense / norms[:, None]

print("\nBag-of-Words (Normalized count occurrence):")
print(normalized_bow_matrix)



Bag-of-Words (Normalized count occurrence):
[[0.         0.         0.57735027 0.         0.         0.57735027
  0.         0.57735027 0.         0.        ]
 [0.         0.         0.         0.         0.70710678 0.
  0.         0.         0.70710678 0.        ]
 [0.         0.         0.         0.70710678 0.         0.
  0.         0.         0.         0.70710678]
 [0.4472136  0.4472136  0.         0.         0.4472136  0.4472136
  0.4472136  0.         0.         0.        ]]


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform documents
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Convert TF-IDF matrix to array
print("TF-IDF Matrix:\n", tfidf_matrix.toarray())


TF-IDF Matrix:
 [[0.         0.         0.61761437 0.         0.         0.48693426
  0.         0.61761437 0.         0.        ]
 [0.         0.         0.         0.         0.6191303  0.
  0.         0.         0.78528828 0.        ]
 [0.         0.         0.         0.70710678 0.         0.
  0.         0.         0.         0.70710678]
 [0.48546061 0.48546061 0.         0.         0.38274272 0.38274272
  0.48546061 0.         0.         0.        ]]


In [6]:
from gensim.models import Word2Vec

# Tokenized sentences
sentences = [doc.split() for doc in documents]

# Train a Word2Vec model
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=2)

# Get the vector for a word
vector = model.wv['Dog']
print("Vector for 'dog':\n", vector)


Vector for 'dog':
 [-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -4.4621765e-03
  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 -4.1406299e-03 -7.6826881e-03
 -1.5