In [7]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec
# Sample text data
text_data = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
    ]
# i) One Hot Encoding
def one_hot_encoding(text_data):
    unique_words = set(" ".join(text_data).split())
    encoded_data = []
    for text in text_data:
        encoded_text = [1 if word in text else 0 for word in unique_words]
        encoded_data.append(encoded_text)
    return np.array(encoded_data)
one_hot_encoded = one_hot_encoding(text_data)
print("One Hot Encoding:")
print(one_hot_encoded)
# ii) Bag of Words (BOW)
vectorizer = CountVectorizer()
bow_features = vectorizer.fit_transform(text_data)
print("\nBag of Words (BOW):")
print(bow_features.toarray())
# iii) n-grams
ngram_vectorizer = CountVectorizer(ngram_range=(1, 2))
ngram_features = ngram_vectorizer.fit_transform(text_data)
print("\nn-grams:")
print(ngram_features.toarray())
# iv) Tf-Idf
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(text_data)
print("\nTf-Idf:")
print(tfidf_features.toarray())
# v) Custom features (e.g., length of documents)
custom_features = np.array([[len(doc)] for doc in text_data])
print("\nCustom Features:")
print(custom_features)
# vi) Word2Vec (Word Embedding)
word2vec_model = Word2Vec([doc.split() for doc in text_data], min_count=1)

# Generate Word2Vec features
word2vec_model = Word2Vec([doc.split() for doc in text_data], min_count=1, vector_size=21)

# Generate Word2Vec features by averaging word vectors in each document
word2vec_features = np.array([
    np.mean([word2vec_model.wv[word] for word in doc.split() if word in word2vec_model.wv], axis=0)
    for doc in text_data
])
print("\nWord2Vec (Word Embedding) Features:")
print(word2vec_features)

One Hot Encoding:
[[0 0 1 1 1 1 0 0 1 0 0 1 0]
 [0 0 1 1 0 1 0 0 1 1 0 1 0]
 [1 0 1 0 0 0 0 1 1 0 1 0 1]
 [1 1 1 0 1 1 1 0 1 0 0 0 0]]

Bag of Words (BOW):
[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]

n-grams:
[[0 0 1 0 1 1 1 1 0 0 0 0 1 1 0 0 0 0 1 0 1 0]
 [0 0 2 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 0 0]
 [1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 1 1 1 1 0 1 0]
 [0 0 1 0 1 1 1 0 1 0 0 0 1 1 0 0 0 0 1 0 0 1]]

Tf-Idf:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]

Custom Features:
[[27]
 [37]
 [26]
 [27]]

Word2Vec (Word Embedding) Features:
[[-0.01430683  0.00804962  0.01949739  0.00651358 -0.01567571 -0.00421082
   0