In [None]:
import nltk
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download resources for NLTK
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

# spaCy English model
nlp = spacy.load("en_core_web_sm")

text = "The sun is the primary source of energy for our planet. It provides light and warmth, making life possible. The sun’s rays drive photosynthesis, essential for plant growth. Additionally, the sun influences weather patterns and climate. This celestial body is central to our solar system, giving us daylight and driving the cycles of nature."

# NLTK
try:
    tokens_nltk = word_tokenize(text)
except LookupError:
    nltk.download('punkt')
    tokens_nltk = word_tokenize(text)

lemmatizer = WordNetLemmatizer()
stop_words_nltk = set(stopwords.words("english"))
nltk_processed = [lemmatizer.lemmatize(word.lower()) for word in tokens_nltk if word.lower() not in stop_words_nltk]

# spaCy Preprocessing
doc = nlp(text)
stop_words_spacy = nlp.Defaults.stop_words
spacy_processed = [token.lemma_.lower() for token in doc if token.text.lower() not in stop_words_spacy and not token.is_punct]

# Print results
print("NLTK Processed:", nltk_processed)
print("spaCy Processed:", spacy_processed)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


NLTK Processed: ['natural', 'language', 'processing', 'exciting', 'field', 'artificial', 'intelligence', '.', 'enables', 'machine', 'understand', 'process', 'human', 'language', '.']
spaCy Processed: ['natural', 'language', 'processing', 'exciting', 'field', 'artificial', 'intelligence', 'enable', 'machine', 'understand', 'process', 'human', 'language']


In [14]:
import spacy
from spacy import displacy

# SpaCy's English model with built-in NER
nlp = spacy.load("en_core_web_sm")

text = "Donald John Trump (born June 14, 1946) is an American politician, media personality, and businessman serving as the 47th president of the United States since January 2025. A member of the Republican Party, he previously served as the 45th president from 2017 to 2021."

doc = nlp(text)

# Print entities
print("Named Entities, Phrases, and Concepts:")
for ent in doc.ents:
    print(f"{ent.text} ({ent.label_})")

# Visualization 
displacy.render(doc, style="ent", jupyter=True)


Named Entities, Phrases, and Concepts:
Donald John Trump (PERSON)
June 14, 1946 (DATE)
American (NORP)
47th (ORDINAL)
the United States (GPE)
January 2025 (DATE)
the Republican Party (ORG)
45th (ORDINAL)
2017 (DATE)
2021 (DATE)


In [2]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load pre-trained model and tokenizer (BERT base, uncased)
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Sample text
text = "Donald John Trump (born June 14, 1946) is an American politician, media personality, and businessman serving as the 47th president of the United States since January 2025. A member of the Republican Party, he previously served as the 45th president from 2017 to 2021."

# Tokenize and encode
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)

# Forward pass through the model to get hidden states
with torch.no_grad():
    outputs = model(**inputs)

# Extract last hidden state (word embeddings)
hidden_states = outputs.last_hidden_state

# Print shape of the embeddings (batch_size, sequence_length, hidden_dim)
print("Embeddings shape:", hidden_states.shape)


Embeddings shape: torch.Size([1, 55, 768])


In [None]:
from transformers import pipeline, AutoTokenizer, AutoModel
import torch

# Load sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")

# Sample texts for analysis
texts = [
    "I love being alone", 
    "This music is awful", 
    "The movie was good, not the best but not the worst."
]

# Sentiment analysis
results = sentiment_pipeline(texts)

# Results
for text, result in zip(texts, results):
    print(f"Text: {text}\nSentiment: {result['label']} (Score: {result['score']:.4f})\n")

# Load pre-trained model and tokenizer for comparison
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Tokenize and encode for traditional approach
inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)

# Forward pass through the model to get hidden states
with torch.no_grad():
    outputs = model(**inputs)

# Extract last hidden state (word embeddings)
hidden_states = outputs.last_hidden_state

# Print shape of the embeddings (batch_size, sequence_length, hidden_dim)
print("Embeddings shape:", hidden_states.shape)





model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Error while downloading from https://cdn-lfs.hf.co/distilbert-base-uncased-finetuned-sst-2-english/7c3919835e442510166d267fe7cbe847e0c51cd26d9ba07b89a57b952b49b8aa?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1739896347&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTczOTg5NjM0N319LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9kaXN0aWxiZXJ0LWJhc2UtdW5jYXNlZC1maW5ldHVuZWQtc3N0LTItZW5nbGlzaC83YzM5MTk4MzVlNDQyNTEwMTY2ZDI2N2ZlN2NiZTg0N2UwYzUxY2QyNmQ5YmEwN2I4OWE1N2I5NTJiNDliOGFhP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiJ9XX0_&Signature=aoASncgKWeRj8Smq-Y90cmIBWsMgz69M1KQrJnUwpNGZJrvYsRA93DsAzZYNVIWTaWb79W6Ky4yAJ6owL6vy0h97EgNKE7f9QNorUiiDe9KuesTTN6QmERhBDNsKUmjxrtd3APMm9sFXFez6PKfIBkR6QOmiUiAQdN3Vx96P%7Ens2hTAZTFpNisq-WWEwopjkyKyZouQy0L2KKUe%7EaHeJ0N9ZO6-xusUOWkskJyoEciDviUIyD5cj7K%7EU2q%7EbM7LzfaKDtnRKvTe7C1ZgtpO03xqmzXMsbriqRppeVLCYLy4c-jprmgRKi8RHKsH9s2Yk3wZf1gxo9T

model.safetensors:   4%|3         | 10.5M/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


Text: I love being alone
Sentiment: POSITIVE (Score: 0.9994)

Text: This music is awful
Sentiment: NEGATIVE (Score: 0.9998)

Text: The movie was good, not the best but not the worst.
Sentiment: POSITIVE (Score: 0.9989)

Embeddings shape: torch.Size([3, 15, 768])
