In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three short example documents used as the corpus
corpus = [
    "I love programming in Python",
    "Python is great for machine learning",
    "I enjoy coding with Python and machine learning",
]

# Create the vectorizer with default settings, then learn the vocabulary
# and compute the TF-IDF weights for the corpus in a single call
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Heading followed by the matrix converted to a dense array
print("TF-IDF Matrix:", tfidf_matrix.toarray(), sep="\n")

# Heading followed by the feature names (words) known to the vectorizer
print("\nFeature Names (Words):", tfidf_vectorizer.get_feature_names_out(), sep="\n")

In [None]:
import nltk
import ssl

# Download only the tokenizer data that word_tokenize relies on.
#
# Certificate verification is intentionally left enabled: replacing
# ssl._create_default_https_context with the unverified variant would turn off
# HTTPS certificate checks for the rest of the kernel session, not just for
# this download. If a download reports CERTIFICATE_VERIFY_FAILED, repair the
# CA certificates of this Python installation instead of disabling checks.
#
# Explicit resource names are passed because nltk.download() without
# arguments starts the interactive downloader, which blocks
# "Restart Kernel -> Run All".
nltk.download("punkt")      # tokenizer data looked up by older NLTK releases
nltk.download("punkt_tab")  # tokenizer data looked up by newer NLTK releases


In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Sentence to split into tokens
text = "Hello! How are you doing today?"

# Run NLTK's word tokenizer (language spelled out; "english" is its default)
tokens = word_tokenize(text, language="english")

# Show the resulting token list
print("Tokens:", tokens)

In [None]:
# Example: subword tokenization with BPE (Byte Pair Encoding)
# Libraries such as sentencepiece or tokenizers can train subword tokenizers.
# This example uses the tokenizers package from Hugging Face.
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

# Placeholder token for symbols the trained vocabulary cannot represent
UNK_TOKEN = "[UNK]"

# Initialize the tokenizer with an explicit unknown token so that unseen
# symbols are mapped to it at encode time
tokenizer = Tokenizer(BPE(unk_token=UNK_TOKEN))

# Split the input into words/punctuation before BPE runs; without a
# pre-tokenizer every sentence is treated as one "word" and learned merges
# can span spaces
tokenizer.pre_tokenizer = Whitespace()

# The unknown token must be registered as a special token so it is part of
# the trained vocabulary
trainer = BpeTrainer(vocab_size=1000, min_frequency=2, special_tokens=[UNK_TOKEN])

# Sample sentences for training
corpus = ["Hello, how are you?", "I am learning NLP.", "NLP is fun."]

# Train the tokenizer
tokenizer.train_from_iterator(corpus, trainer=trainer)

# Encode a text sample
output = tokenizer.encode("Hello, how are you?")
print("Encoded Output:", output.tokens)


In [None]:
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

# Sample corpus
corpus = [
    "low frequency data",
    "high frequency data",
    "more data with high frequency"
]


def train_bpe(min_frequency, texts=None, vocab_size=50):
    """Train a new BPE tokenizer and return it.

    Args:
        min_frequency: Forwarded to ``BpeTrainer(min_frequency=...)``.
        texts: Iterable of strings to train on. When omitted, the
            module-level ``corpus`` is used, looked up at call time. Pass the
            data explicitly to avoid depending on whichever cell last
            rebound ``corpus``.
        vocab_size: Forwarded to ``BpeTrainer(vocab_size=...)``. Defaults to
            50, the value this notebook uses for its comparison.

    Returns:
        The trained ``Tokenizer`` instance.
    """
    if texts is None:
        # Backward-compatible default: existing calls that pass only
        # min_frequency keep training on the notebook-level corpus.
        texts = corpus

    tokenizer = Tokenizer(BPE())
    trainer = BpeTrainer(vocab_size=vocab_size, min_frequency=min_frequency)
    tokenizer.train_from_iterator(texts, trainer=trainer)
    return tokenizer

# Train one tokenizer that only merges pairs seen at least twice ...
tokenizer_2 = train_bpe(min_frequency=2)

# ... and one that allows every pair (min_frequency=1)
tokenizer_1 = train_bpe(min_frequency=1)

# get_vocab() returns a token -> id mapping for each model
vocab_2 = tokenizer_2.get_vocab()
vocab_1 = tokenizer_1.get_vocab()

# Print the complete vocabularies ordered by token id. The key order of the
# returned mapping is not guaranteed, so sorting keeps the output stable
# between runs and makes the two lists directly comparable.
print("Vocabulary with min_frequency=2:", sorted(vocab_2, key=vocab_2.get))
print("Vocabulary with min_frequency=1:", sorted(vocab_1, key=vocab_1.get))


In [None]:
# 3. Word Embeddings
# Word embeddings are a type of word representation that allows words to be represented as dense vectors in a continuous vector space. Common algorithms for generating word embeddings include Word2Vec, GloVe, and FastText.

# Example: Using Pre-trained Word Embeddings (GloVe) with Gensim
# You can load pre-trained word embeddings (like GloVe) using the gensim library.

import gensim.downloader as api

# Fetch the pre-trained 100-dimensional GloVe vectors through gensim's downloader
glove = api.load("glove-wiki-gigaword-100")

# Similarity score for one pair of words
similarity = glove.similarity(w1='king', w2='queen')
print(f"Similarity between 'king' and 'queen': {similarity}")

# Five entries ranked most similar to the query word
similar_words = glove.most_similar(positive='king', topn=5)
print("Words similar to 'king':", similar_words)


In [None]:
from gensim.models import Word2Vec

# Sample corpus: each sentence is already split into a list of tokens
corpus = [
    ["hello", "how", "are", "you"],
    ["I", "am", "learning", "NLP"],
    ["NLP", "is", "fun"]
]

# Train Word2Vec model.
# Training is randomized, so a fixed seed is supplied and training is limited
# to a single worker thread; with several workers, thread scheduling makes
# the resulting vectors differ from run to run.
# NOTE: gensim's documentation additionally requires a fixed PYTHONHASHSEED
# for fully deterministic results across interpreter launches.
model = Word2Vec(
    corpus,
    vector_size=100,
    window=5,
    min_count=1,  # keep every word; each occurs only once or twice here
    workers=1,
    seed=42,
)

# Find most similar words to "NLP"
similar_words = model.wv.most_similar("NLP", topn=5)
print("Words similar to 'NLP':", similar_words)


In [None]:
# 4. Zero-Shot Learning
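# Zero-shot learning refers to the ability of a model to perform a task without having seen any examples of that specific task during training. It is a powerful feature for tasks like text classification, where the model can classify text into categories it hasn't been explicitly trained on. Large language models like GPT-3 can be used for zero-shot tasks via prompt engineering, while encoder-based models such as BERT, BART, and RoBERTa are typically fine-tuned on natural language inference (NLI) and then score each candidate label as a hypothesis about the text.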

# Example: Zero-Shot Text Classification using Hugging Face's transformers library
# Hugging Face provides a zero-shot classification pipeline using models like BART and RoBERTa.

from transformers import pipeline

# Build the zero-shot classification pipeline with an explicitly chosen model
classifier = pipeline(task="zero-shot-classification", model="facebook/bart-large-mnli")

# Sentence to classify and the labels it may be assigned
text = "I love playing soccer on the weekends."
candidate_labels = ["sports", "cooking", "politics", "technology"]

# Classify the sentence against the candidate labels
result = classifier(text, candidate_labels=candidate_labels)

# Heading on one line, raw pipeline result on the next
print("Zero-Shot Classification Result:", result, sep="\n")


In [None]:


from transformers import pipeline

# Create the NER pipeline, naming the model explicitly instead of relying on
# the task's default
ner_model = pipeline(
    task="ner",
    model="dbmdz/bert-large-cased-finetuned-conll03-english",
)

# Sentence to analyse
text = "Barack Obama was born in Hawaii."

# Run the pipeline on the sentence and show what it returns
entities = ner_model(text)
print("Named Entities:", entities)
