In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Sample corpus
corpus = [
    "I love programming in Python",
    "Python is great for machine learning",
    "I enjoy coding with Python and machine learning"
]

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the corpus to get the TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Display the TF-IDF matrix (as dense array)
print("TF-IDF Matrix:")
print(tfidf_matrix.toarray())

# Show feature names (words)
print("\nFeature Names (Words):")
print(tfidf_vectorizer.get_feature_names_out())

TF-IDF Matrix:
[[0.         0.         0.         0.         0.         0.54645401
  0.         0.         0.54645401 0.         0.54645401 0.32274454
  0.        ]
 [0.         0.         0.         0.4711101  0.4711101  0.
  0.4711101  0.35829137 0.         0.35829137 0.         0.27824521
  0.        ]
 [0.4261835  0.4261835  0.4261835  0.         0.         0.
  0.         0.32412354 0.         0.32412354 0.         0.25171084
  0.4261835 ]]

Feature Names (Words):
['and' 'coding' 'enjoy' 'for' 'great' 'in' 'is' 'learning' 'love'
 'machine' 'programming' 'python' 'with']


In [10]:
import nltk
from nltk.tokenize import word_tokenize

# Sample text
text = "Hello! How are you doing today?"

# Tokenize text
tokens = word_tokenize(text)

print("Tokens:", tokens)

Tokens: ['Hello', '!', 'How', 'are', 'you', 'doing', 'today', '?']


In [11]:
#Example: Subword Tokenization with BPE (Byte Pair Encoding)
#You can use libraries like sentencepiece or tokenizers for subword tokenization. Here’s an example using tokenizers from Hugging Face:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

# Initialize tokenizer and trainer
tokenizer = Tokenizer(BPE())
trainer = BpeTrainer(vocab_size=1000, min_frequency=2)

# Sample sentences for training
corpus = ["Hello, how are you?", "I am learning NLP.", "NLP is fun."]

# Train the tokenizer
tokenizer.train_from_iterator(corpus, trainer=trainer)

# Encode a text sample
output = tokenizer.encode("Hello, how are you?")
print("Encoded Output:", output.tokens)


Encoded Output: ['H', 'e', 'l', 'l', 'o', ',', ' ', 'h', 'o', 'w', ' a', 'r', 'e', ' ', 'y', 'o', 'u', '?']


In [12]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

# Sample corpus
corpus = [
    "low frequency data",
    "high frequency data",
    "more data with high frequency"
]

# Define a function to train BPE tokenizer with different min_frequency values
def train_bpe(min_frequency):
    tokenizer = Tokenizer(BPE())
    trainer = BpeTrainer(vocab_size=50, min_frequency=min_frequency)
    tokenizer.train_from_iterator(corpus, trainer=trainer)
    return tokenizer

# Train tokenizer with min_frequency=2
tokenizer_2 = train_bpe(min_frequency=2)

# Train tokenizer with min_frequency=1 (allowing all pairs)
tokenizer_1 = train_bpe(min_frequency=1)

# Check vocabulary for both models
vocab_2 = tokenizer_2.get_vocab()
vocab_1 = tokenizer_1.get_vocab()

print("Vocabulary with min_frequency=2:", list(vocab_2.keys())[:])  # Show the first 10 tokens
print("Vocabulary with min_frequency=1:", list(vocab_1.keys())[:])  # Show the first 10 tokens


Vocabulary with min_frequency=2: ['i', 'e', 'a', 'qu', 'cy', 'en', 'd', 'at', 'g', 'y', 'ency', 'n', 'w', 'hi', 'u', 'r', 'q', 'f', 't', 'c', ' ', 'h', 're', ' f', 'requ', ' dat', ' data', ' frequency data', ' frequency', 'l', 'm', 'gh', 'o', ' d', ' frequ', 'high']
Vocabulary with min_frequency=1: [' f', 'at', 'u', 'y', 're data', 'n', ' frequ', 'qu', 'gh', ' frequency data', 'a', 'hi', 'ency', 'high frequency data', 'it', 'c', ' dat', 'requ', 'e', 'f', 'd', 'g', ' ', 'h', 't', 'o', 're', ' frequency', 'm', ' data', 'high', ' wit', 'h high frequency', 'cy', ' with high frequency', 'q', 'lo', 'i', 'en', 'mo', 'h high', 'more data', ' d', 'low frequency data', 'r', ' w', 'l', ' high', 'w frequency data', 'w']


In [13]:
# 3. Word Embeddings
# Word embeddings are a type of word representation that allows words to be represented as dense vectors in a continuous vector space. Common algorithms for generating word embeddings include Word2Vec, GloVe, and FastText.

# Example: Using Pre-trained Word Embeddings (GloVe) with Gensim
# You can load pre-trained word embeddings (like GloVe) using the gensim library.

import gensim.downloader as api

# Load pre-trained GloVe embeddings
glove = api.load("glove-wiki-gigaword-100")  # 100-dimensional GloVe vectors

# Check similarity between words
similarity = glove.similarity('king', 'queen')
print(f"Similarity between 'king' and 'queen': {similarity}")

# Find similar words
similar_words = glove.most_similar('king', topn=5)
print("Words similar to 'king':", similar_words)


Similarity between 'king' and 'queen': 0.7507690787315369
Words similar to 'king': [('prince', 0.7682328820228577), ('queen', 0.7507690787315369), ('son', 0.7020888328552246), ('brother', 0.6985775232315063), ('monarch', 0.6977890729904175)]


In [14]:
from gensim.models import Word2Vec

# Sample corpus
corpus = [
    ["hello", "how", "are", "you"],
    ["I", "am", "learning", "NLP"],
    ["NLP", "is", "fun"]
]

# Train Word2Vec model
model = Word2Vec(corpus, vector_size=100, window=5, min_count=1, workers=4)

# Find most similar words to "NLP"
similar_words = model.wv.most_similar("NLP", topn=5)
print("Words similar to 'NLP':", similar_words)


Words similar to 'NLP': [('hello', 0.21617142856121063), ('are', 0.09291722625494003), ('how', 0.027057476341724396), ('you', 0.016134677454829216), ('fun', -0.01083916611969471)]


In [15]:
# 4. Zero-Shot Learning
# Zero-shot learning refers to the ability of a model to perform a task without having seen any examples of that specific task during training. It is a powerful feature for tasks like text classification, where the model can classify text into categories it hasn't been explicitly trained on. Models like GPT-3 and BERT can be used for zero-shot tasks via prompt engineering.

# Example: Zero-Shot Text Classification using Hugging Face's transformers library
# Hugging Face provides a zero-shot classification pipeline using models like BART and RoBERTa.

from transformers import pipeline

# Load a zero-shot classification model
classifier = pipeline("zero-shot-classification")

# Sample text
text = "I love playing soccer on the weekends."

# Define candidate labels
candidate_labels = ["sports", "cooking", "politics", "technology"]

# Perform zero-shot classification
result = classifier(text, candidate_labels)

print("Zero-Shot Classification Result:")
print(result)


No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


Zero-Shot Classification Result:
{'sequence': 'I love playing soccer on the weekends.', 'labels': ['sports', 'technology', 'cooking', 'politics'], 'scores': [0.9956403970718384, 0.0020962031558156013, 0.0013117057969793677, 0.0009516139980405569]}


In [None]:


from transformers import pipeline

# Specify a model explicitly
ner_model = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")

# Sample text
text = "Barack Obama was born in Hawaii."

# Perform NER
entities = ner_model(text)

print("Named Entities:", entities)



No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Named Entities: [{'entity': 'I-PER', 'score': 0.9990103, 'index': 1, 'word': 'Barack', 'start': 0, 'end': 6}, {'entity': 'I-PER', 'score': 0.999342, 'index': 2, 'word': 'Obama', 'start': 7, 'end': 12}, {'entity': 'I-LOC', 'score': 0.99945, 'index': 6, 'word': 'Hawaii', 'start': 25, 'end': 31}]
