# A. BASIC WORD EMBEDDINGS WITH TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer  

corpus = [ 
    "Deep learning is fun", 
    "Word embeddings can be learned", 
    "TF-IDF captures word importance", 
    "Embeddings represent words as vectors" 
] 

vectorizer = TfidfVectorizer() 
X = vectorizer.fit_transform(corpus) 

features = vectorizer.get_feature_names_out()
tfidf_matrix = X.toarray() 

print("TF-IDF Feature Names:") 
print(features) 
print("\nTF-IDF Matrix:") 
print(tfidf_matrix) 


# B. GENERATING WORD EMBEDDINGS USING WORD2VEC AND GLOVE

In [1]:
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
corpus = [
    "Natural language processing enables computers to understand human language.",
    "Word embeddings capture semantic relationships between words in a vector space.",
    "Deep learning techniques such as Word2Vec and GloVe are widely used in NLP applications.",
    "This is a sample document for generating word embeddings.",
    "Another example document is provided for demonstration purposes."
]
tokenized_corpus = [simple_preprocess(sentence) for sentence in corpus]
print("Sample tokenized sentences:\n", tokenized_corpus)
print("\nTraining Word2Vec model...")
w2v_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, sg=1, min_count=1, workers=4)

print("\nWord2Vec: Similar words to 'document'")
print(w2v_model.wv.most_similar("document", topn=5))
print("\nLoading GloVe embeddings...")
glove_file = "glove.6B.100d.txt"
glove2word2vec(glove_file, "glove.6B.100d.word2vec.txt")
glove_model = KeyedVectors.load_word2vec_format("glove.6B.100d.word2vec.txt", binary=False)
print("\nGloVe: Similar words to 'document'")
print(glove_model.most_similar("document", topn=5))


Sample tokenized sentences:
 [['natural', 'language', 'processing', 'enables', 'computers', 'to', 'understand', 'human', 'language'], ['word', 'embeddings', 'capture', 'semantic', 'relationships', 'between', 'words', 'in', 'vector', 'space'], ['deep', 'learning', 'techniques', 'such', 'as', 'word', 'vec', 'and', 'glove', 'are', 'widely', 'used', 'in', 'nlp', 'applications'], ['this', 'is', 'sample', 'document', 'for', 'generating', 'word', 'embeddings'], ['another', 'example', 'document', 'is', 'provided', 'for', 'demonstration', 'purposes']]

Training Word2Vec model...

Word2Vec: Similar words to 'document'
[('semantic', 0.25275734066963196), ('example', 0.20082315802574158), ('vec', 0.19600167870521545), ('this', 0.1760866791009903), ('in', 0.17080651223659515)]

Loading GloVe embeddings...


  glove2word2vec(glove_file, "glove.6B.100d.word2vec.txt")


FileNotFoundError: [Errno 2] No such file or directory: 'glove.6B.100d.txt'

In [None]:
!pip install -q gensim
!wget -q http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip glove.6B.100d.txt
import gensim
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
corpus = [
    "Natural language processing enables computers to understand human language.",
    "Word embeddings capture semantic relationships between words in a vector space.",
    "Deep learning techniques such as Word2Vec and GloVe are widely used in NLP applications.",
    "This is a sample document for generating word embeddings.",
    "Another example document is provided for demonstration purposes."
]

tokenized_corpus = [simple_preprocess(sentence) for sentence in corpus]
print("Sample tokenized sentences:\n", tokenized_corpus)

print("\nTraining Word2Vec model...")
w2v_model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, sg=1, min_count=1, workers=4)
print("\nWord2Vec: Similar words to 'document'")
print(w2v_model.wv.most_similar("document", topn=5))

glove2word2vec("glove.6B.100d.txt", "glove.6B.100d.word2vec.txt")
glove_model = KeyedVectors.load_word2vec_format("glove.6B.100d.word2vec.txt", binary=False)
print("\nGloVe: Similar words to 'document'")
print(glove_model.most_similar("document", topn=5))


# C. SENTENCE EMBEDDINGS WITH UNIVERSAL SENTENCE ENCODER(google collab)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
sentences = [
    "This is a sentence.",
    "Another example sentence.",
    "Machine learning is fascinating.",
    "I love natural language processing.",
    "The sky is blue today."
]
print("Loading Universal Sentence Encoder model...")
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
print("Model loaded!")
sentence_embeddings = embed(sentences)
print(f"Sentence Embeddings Shape: {sentence_embeddings.shape}\n")
for i, sentence in enumerate(sentences):
    print(f"Sentence: {sentence}")
    print(f"Embedding vector (first 5 values): {sentence_embeddings[i][:5].numpy()}\n")
