<a href="https://colab.research.google.com/github/ankit-singh26/NaturalLanguageProcessing/blob/main/Encoders.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# One-Hot Encoding
from sklearn.preprocessing import LabelBinarizer

In [2]:
# Treat every word as its own class label so that binarizing the labels
# produces one indicator row per word.
corpus = ['I', 'love', 'NLP']
encoder = LabelBinarizer()
# fit_transform learns the label set (exposed as encoder.classes_) and
# returns the indicator matrix in a single step.
one_hot = encoder.fit_transform(corpus)

In [3]:
# classes_ names the label behind each column; row k of one_hot has its
# single 1 in the column belonging to corpus[k].
print(encoder.classes_)
print(one_hot)

['I' 'NLP' 'love']
[[1 0 0]
 [0 0 1]
 [0 1 0]]


Bag of Words (CountVectorizer)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Two short documents that share the token "NLP".
docs = ['I love NLP', 'No NLP here']
# Default settings are used; as the printed vocabulary shows, tokens come out
# lowercased and the one-character token "I" is not kept as a feature.
vectorizer = CountVectorizer()
# Learn the vocabulary and build the per-document token-count matrix.
bow = vectorizer.fit_transform(docs)

In [8]:
# Column labels first, then the count matrix (one row per document);
# .toarray() converts the result to a dense array so the full matrix prints.
print(vectorizer.get_feature_names_out())
print(bow.toarray())

['here' 'love' 'nlp' 'no']
[[0 1 1 0]
 [1 0 1 1]]


TF-IDF Encoding

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
# NOTE: `docs` and `vectorizer` rebind the names used by the bag-of-words
# cells above; re-run those cells before reusing their results.
docs = ["I love NLP", "NLP loves me"]
# Default settings; "love" and "loves" end up as separate features in the
# printed vocabulary (no stemming is applied).
vectorizer = TfidfVectorizer()
# Learn the vocabulary and weights, then build the TF-IDF matrix.
tfidf = vectorizer.fit_transform(docs)

In [11]:
# Column labels first, then one row of weights per document. In the output,
# "nlp" (present in both documents) is weighted lower than the words that
# occur in only one document.
print(vectorizer.get_feature_names_out())
print(tfidf.toarray())

['love' 'loves' 'me' 'nlp']
[[0.81480247 0.         0.         0.57973867]
 [0.         0.6316672  0.6316672  0.44943642]]


Word2Vec (using Gensim)

In [1]:
from gensim.models import Word2Vec

In [2]:
# Toy corpus: every sentence is already split into a list of tokens.
sentences = [["I", "love", "NLP"], ["NLP", "is", "awesome"]]
# vector_size=10 keeps the embeddings small enough to print, window=2 limits
# the context to two neighbours, and min_count=1 keeps every word even though
# each occurs only once or twice.
# workers=1 trains on a single thread: with several worker threads the order
# of updates depends on OS scheduling, so re-runs may not reproduce the same
# vectors. (Across interpreter restarts, gensim additionally recommends
# fixing PYTHONHASHSEED.)
model = Word2Vec(sentences, vector_size=10, window=2, min_count=1, workers=1)

In [3]:
# Look up the learned vector for "NLP" (10 values, matching vector_size=10).
print(model.wv["NLP"])

[-0.00536227  0.00236431  0.0510335   0.09009273 -0.0930295  -0.07116809
  0.06458873  0.08972988 -0.05015428 -0.03763372]


Positional Encoding (simplified version)

In [4]:
import numpy as np

In [8]:
def positional_encoding(seq_len, d_model):
    """Build a sinusoidal positional-encoding table.

    Args:
        seq_len: Number of positions (rows) to encode.
        d_model: Number of encoding dimensions (columns).

    Returns:
        A float array of shape ``(seq_len, d_model)``. Even-numbered columns
        hold ``sin(pos / 10000 ** (2k / d_model))`` and odd-numbered columns
        hold the matching ``cos``, where ``k = column // 2``.
    """
    positions = np.arange(seq_len)[:, None]
    columns = np.arange(d_model)[None, :]

    # Columns 2k and 2k + 1 form a sin/cos pair and share one exponent.
    exponents = (2 * (columns // 2)) / d_model
    inverse_wavelengths = 1 / np.power(10000, exponents)
    angles = positions * inverse_wavelengths

    # Fill a fresh table: sine on even columns, cosine on odd columns.
    table = np.empty_like(angles)
    table[:, 0::2] = np.sin(angles[:, 0::2])
    table[:, 1::2] = np.cos(angles[:, 1::2])
    return table

In [9]:
# Encode 10 positions with 16 dimensions each; the result has one row per
# position and one column per dimension.
pos_encoding = positional_encoding(seq_len=10, d_model=16)
print(pos_encoding.shape)  # (10, 16)

(10, 16)


Contextual Embedding (BERT via Transformers)

In [1]:
from transformers import BertTokenizer, BertModel
import torch

In [2]:
# Load pretrained BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [3]:
# Sentence to embed with BERT.
text = "I love NLP"

In [4]:
# Tokenize and get input tensors
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)

In [5]:
# Take the hidden state at sequence position 0 for every item in the batch;
# per the variable name this is meant to be the [CLS] token, used here as a
# single vector for the whole sentence.
cls_embedding = outputs.last_hidden_state[:, 0, :]
print(cls_embedding.shape)  # (1, 768)

torch.Size([1, 768])
