Count Vectorizer

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample corpus (list of text documents)
corpus = [
    "I love natural language processing",
    "Language processing is amazing",
    "I love learning new things in NLP"
]

# Initialize the Count Vectorizer
vectorizer = CountVectorizer()

# Fit the model and transform the text data to count vectors
X = vectorizer.fit_transform(corpus)

# Convert the result to an array for easier visualization
X_array = X.toarray()

# Get feature names (unique words in the corpus)
feature_names = vectorizer.get_feature_names_out()

# Print the results
print("Feature Names:", feature_names)
print("\nCount Vectors:\n", X_array)


Feature Names: ['amazing' 'in' 'is' 'language' 'learning' 'love' 'natural' 'new' 'nlp'
 'processing' 'things']

Count Vectors:
 [[0 0 0 1 0 1 1 0 0 1 0]
 [1 0 1 1 0 0 0 0 0 1 0]
 [0 1 0 0 1 1 0 1 1 0 1]]


TF-IDF

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample corpus (list of text documents)
corpus = [
    "I love natural language processing",
    "Language processing is amazing",
    "I love learning new things in NLP"
]

# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit the model and transform the text data to TF-IDF vectors
X = tfidf_vectorizer.fit_transform(corpus)

# Convert the result to an array for easier visualization
X_array = X.toarray()

# Get feature names (unique words in the corpus)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Print the results
print("Feature Names:", feature_names)
print("\nTF-IDF Vectors:\n", X_array)


Feature Names: ['amazing' 'in' 'is' 'language' 'learning' 'love' 'natural' 'new' 'nlp'
 'processing' 'things']

TF-IDF Vectors:
 [[0.         0.         0.         0.45985353 0.         0.45985353
  0.60465213 0.         0.         0.45985353 0.        ]
 [0.5628291  0.         0.5628291  0.42804604 0.         0.
  0.         0.         0.         0.42804604 0.        ]
 [0.         0.42339448 0.         0.         0.42339448 0.32200242
  0.         0.42339448 0.42339448 0.         0.42339448]]


Word2Vec / GloVe

In [None]:
!pip install gensim



Word2Vec

In [4]:
from gensim.models import Word2Vec

# Sample corpus
sentences = [
    ["I", "love", "natural", "language", "processing"],
    ["Language", "processing", "is", "amazing"],
    ["I", "love", "learning", "new", "things", "in", "NLP"],
    ["Natural", "language", "processing", "is", "fun"]
]

# Train Word2Vec model with CBOW
model = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=0)  # sg=0 for CBOW, sg=1 for Skip-gram

# Get the vector for a word
print("Vector for 'language':\n", model.wv['language'])

# Find similar words
similar_words = model.wv.most_similar("language")
print("\nMost similar words to 'language':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.2f}")


Vector for 'language':
 [-0.01723938  0.00733148  0.01037977  0.01148388  0.01493384 -0.01233535
  0.00221123  0.01209456 -0.0056801  -0.01234705 -0.00082045 -0.0167379
 -0.01120002  0.01420908  0.00670508  0.01445134  0.01360049  0.01506148
 -0.00757831 -0.00112361  0.00469675 -0.00903806  0.01677746 -0.01971633
  0.01352928  0.00582883 -0.00986566  0.00879638 -0.00347915  0.01342277
  0.0199297  -0.00872489 -0.00119868 -0.01139127  0.00770164  0.00557325
  0.01378215  0.01220219  0.01907699  0.01854683  0.01579614 -0.01397901
 -0.01831173 -0.00071151 -0.00619968  0.01578863  0.01187715 -0.00309133
  0.00302193  0.00358008]

Most similar words to 'language':
I: 0.17
learning: 0.16
in: 0.14
processing: 0.13
new: 0.12
Natural: 0.09
natural: 0.03
Language: 0.02
is: 0.01
fun: -0.03


GLOVE

Download the embeddings from Glove official site https://nlp.stanford.edu/projects/glove/
unzip the 6B file and provide the path here bellow.

In [5]:
from gensim.models.keyedvectors import KeyedVectors

# Convert GloVe file format to Word2Vec format (run this once)
glove_file = 'D:/Basudev/genaiprereq/Day2/glove.6B/glove.6B.100d.txt'  # Update with your path
word2vec_output_file = 'glove.6B.100d.word2vec.txt'
from gensim.scripts.glove2word2vec import glove2word2vec
glove2word2vec(glove_file, word2vec_output_file)

# Load the converted model
model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

# Check the vector for a word
print("Vector for 'language':\n", model['language'])

# Find similar words
similar_words = model.most_similar("language")
print("\nMost similar words to 'language':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.2f}")


  glove2word2vec(glove_file, word2vec_output_file)


Vector for 'language':
 [ 0.18519   0.34111   0.36097   0.27093  -0.031335  0.83923  -0.50534
 -0.80062   0.40695   0.82488  -0.98239  -0.6354   -0.21382   0.079889
 -0.29557   0.17075   0.17479  -0.74214  -0.2677    0.21074  -0.41795
  0.027713  0.71123   0.2063   -0.12266  -0.80088   0.22942   0.041037
 -0.56901   0.097472 -0.59139   1.0524   -0.66803  -0.70471   0.69757
 -0.11137  -0.27816   0.047361  0.020305 -0.184    -1.0254    0.11297
 -0.79547   0.41642  -0.2508   -0.3188    0.37044  -0.26873  -0.36185
 -0.096621 -0.029956  0.67308   0.53102   0.62816  -0.11507  -1.5524
 -0.30628  -0.4253    1.8887    0.3247    0.60202   0.81163  -0.46029
 -1.4061    0.80229   0.2019    0.60938   0.063545  0.21925  -0.043372
 -0.36648   0.61308   1.0207   -0.39014   0.1717    0.61272  -0.80342
  0.71295  -1.0938   -0.50546  -0.99668  -1.6701   -0.31804  -0.62934
 -2.0226    0.79405  -0.16994  -0.37627   0.57998   0.16643   0.1356
  0.0943   -0.24154   0.7123   -0.4201    0.24735  -0.94449  -1.0

FastText

In [6]:
from gensim.models import FastText

# Sample corpus (list of tokenized sentences)
sentences = [
    ["I", "love", "natural", "language", "processing"],
    ["Language", "processing", "is", "amazing"],
    ["I", "love", "learning", "new", "things", "in", "NLP"],
    ["Natural", "language", "processing", "is", "fun"]
]

# Train FastText model
model = FastText(sentences, vector_size=50, window=3, min_count=1, sg=1)  # sg=1 for Skip-gram; sg=0 for CBOW

# Example 1: Get the vector for a word
print("Vector for 'language':\n", model.wv['language'])

# Example 2: Find similar words
similar_words = model.wv.most_similar("language")
print("\nMost similar words to 'language':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.2f}")

# Example 3: Handle an out-of-vocabulary (OOV) word
oov_word = "processor"
print(f"\nVector for OOV word '{oov_word}':")
print(model.wv[oov_word])  # FastText can generate embeddings for OOV words based on subwords


Vector for 'language':
 [-4.6425052e-03  3.6429535e-03  1.2540421e-03 -3.6066731e-03
 -3.2773227e-03  2.4448987e-03 -6.6240068e-04  5.1185279e-04
 -2.1971019e-03 -1.1697644e-03  1.7850193e-03 -1.0030718e-03
  3.3278466e-05  1.0310608e-03  3.8224044e-03  1.8119046e-03
  1.2733500e-03  5.3642802e-03  2.2080135e-04 -1.1716852e-03
 -2.3896585e-04 -9.9625357e-04 -1.1096887e-03  3.0506297e-03
  5.1598290e-05 -3.7843618e-04 -5.6672175e-03 -2.2503231e-03
  3.3958235e-03 -2.9582152e-04  4.9308175e-03  5.3104747e-04
  5.3159345e-04 -3.1725504e-06 -1.5835118e-03 -1.6003581e-03
  1.8443600e-03  9.1458380e-04 -2.5605112e-03  3.3275173e-03
 -7.8906858e-04  2.9359653e-03 -6.4219598e-04  1.5274928e-03
  1.5035694e-03 -1.5052463e-04  2.5598896e-03 -1.7102089e-03
 -7.0781645e-04  1.4422407e-03]

Most similar words to 'language':
Language: 0.77
new: 0.28
learning: 0.27
in: 0.16
things: 0.08
natural: 0.08
fun: 0.07
I: 0.05
processing: 0.04
amazing: 0.02

Vector for OOV word 'processor':
[-2.1812057e-03  2

ELMO

##Use Colab with T4

In [9]:

Will try something else

SyntaxError: invalid syntax (4283510776.py, line 1)

BERT

In [11]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Define sentences for embedding
sentences = [
    "I love machine learning.",
    "ELMo embeddings capture syntax and semantics.",
    "Deep learning is awesome!",
    "Natural language processing is fascinating.",
    "The weather is nice today."
]

# Get embeddings for each sentence
with torch.no_grad():
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors='pt')
        outputs = model(**inputs)
        last_hidden_states = outputs.last_hidden_state
        print(f"Embedding for '{sentence}': {last_hidden_states[0][0][:5]}...")  # Display first 5 values for readability


Embedding for 'I love machine learning.': tensor([ 0.0995,  0.2099, -0.1130, -0.2212, -0.4113])...
Embedding for 'ELMo embeddings capture syntax and semantics.': tensor([-0.6196, -0.2122, -0.4904, -0.2664, -0.3653])...
Embedding for 'Deep learning is awesome!': tensor([ 0.2572,  0.0931,  0.0439,  0.0604, -0.2219])...
Embedding for 'Natural language processing is fascinating.': tensor([-0.0247, -0.0657, -0.4309, -0.0512, -0.4532])...
Embedding for 'The weather is nice today.': tensor([ 0.2954, -0.0861, -0.1512, -0.2963, -0.1568])...


GPT

In [13]:
from transformers import GPT2Tokenizer, GPT2Model
import torch

# Load pre-trained GPT-2 model and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2Model.from_pretrained("gpt2")

# Set the padding token to the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

# Sample sentences
sentences = [
    "I love machine learning.",
    "GPT models are powerful.",
    "Natural language processing is fascinating.",
    "The weather is nice today."
]

# Tokenize sentences and convert to tensor format
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

# Generate embeddings
with torch.no_grad():
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state  # [batch_size, sequence_length, hidden_size]

# Display the embeddings
for i, sentence in enumerate(sentences):
    print(f"\nSentence {i+1}: '{sentence}'")
    for j, token_embedding in enumerate(embeddings[i]):
        print(f"Token {j+1} Embedding: {token_embedding[:5]}...")  # Display first 5 values for readability



Sentence 1: 'I love machine learning.'
Token 1 Embedding: tensor([-0.0796, -0.0654, -0.0842, -0.0337, -0.0758])...
Token 2 Embedding: tensor([ 0.0757, -0.0351, -0.4318,  0.4180, -0.1469])...
Token 3 Embedding: tensor([ 0.2298,  0.0530, -0.1179, -0.3026,  0.0565])...
Token 4 Embedding: tensor([-0.5382,  0.2217, -1.8012, -0.6799,  0.2488])...
Token 5 Embedding: tensor([-0.0386, -0.3294, -0.1851, -0.1470, -0.1241])...
Token 6 Embedding: tensor([-0.0819,  0.2561, -0.5680, -0.4453,  0.2434])...

Sentence 2: 'GPT models are powerful.'
Token 1 Embedding: tensor([-0.2023, -0.1084, -0.1990, -0.0576,  0.0202])...
Token 2 Embedding: tensor([ 0.3028, -0.0956,  0.3298,  0.1205,  0.2338])...
Token 3 Embedding: tensor([ 0.0383, -0.3765, -0.8445,  0.0795,  0.5644])...
Token 4 Embedding: tensor([ 0.0440, -0.0018, -1.0083, -0.1281,  0.3954])...
Token 5 Embedding: tensor([ 0.0569, -0.0707, -1.2651, -0.3378,  0.1564])...
Token 6 Embedding: tensor([ 0.1741, -0.1055, -0.2135,  0.1332, -0.2807])...

Sentenc

T5

In [15]:
from transformers import T5Tokenizer, T5EncoderModel
import torch

# Load pre-trained T5 encoder model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5EncoderModel.from_pretrained("t5-small")  # Use T5EncoderModel instead of T5Model

# Sample sentences
sentences = [
    "I love machine learning.",
    "T5 models are powerful.",
    "Natural language processing is fascinating.",
    "The weather is nice today."
]

# Tokenize sentences and convert to tensor format
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

# Generate embeddings
with torch.no_grad():
    outputs = model(**inputs)
    embeddings = outputs.last_hidden_state  # [batch_size, sequence_length, hidden_size]

# Display the embeddings
for i, sentence in enumerate(sentences):
    print(f"\nSentence {i+1}: '{sentence}'")
    for j, token_embedding in enumerate(embeddings[i]):
        print(f"Token {j+1} Embedding: {token_embedding[:5]}...")  # Display first 5 values for readability



Sentence 1: 'I love machine learning.'
Token 1 Embedding: tensor([ 0.1647, -0.0915,  0.1383, -0.0698,  0.1096])...
Token 2 Embedding: tensor([-0.0996,  0.1405, -0.1474, -0.0586,  0.0590])...
Token 3 Embedding: tensor([ 0.3002, -0.0310, -0.1677, -0.1467,  0.0316])...
Token 4 Embedding: tensor([-0.0468, -0.3225, -0.1392,  0.0620, -0.1065])...
Token 5 Embedding: tensor([-0.0811,  0.0449,  0.0255, -0.1082, -0.0470])...
Token 6 Embedding: tensor([ 0.1029,  0.0481, -0.0459, -0.0317, -0.0194])...
Token 7 Embedding: tensor([ 0.2274,  0.0168,  0.2259, -0.2130, -0.2057])...

Sentence 2: 'T5 models are powerful.'
Token 1 Embedding: tensor([-0.0280,  0.3353, -0.0237,  0.0870,  0.0312])...
Token 2 Embedding: tensor([-0.1778,  0.0678,  0.0381, -0.0259, -0.1440])...
Token 3 Embedding: tensor([ 0.1269,  0.0421,  0.0097,  0.0684, -0.0343])...
Token 4 Embedding: tensor([-0.0498, -0.1587, -0.2012,  0.0405, -0.0585])...
Token 5 Embedding: tensor([-0.1183, -0.1588, -0.0993, -0.2670, -0.1317])...
Token 6 E