## Figure 1: Tokenization using NLTK

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
sample_text = "Natural language processing (NLP) is an exciting field. It combines linguistics and computer science."
nltk_tokens = word_tokenize(sample_text)
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
nltk_processed = [lemmatizer.lemmatize(token.lower()) for token in nltk_tokens if token.lower() not in stop_words and token.isalpha()]
nltk_processed

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aliha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\aliha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aliha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['natural',
 'language',
 'processing',
 'nlp',
 'exciting',
 'field',
 'combine',
 'linguistics',
 'computer',
 'science']

## Figure 2: Tokenization using spaCy

In [3]:
import spacy
nlp = spacy.load('en_core_web_sm')
doc = nlp(sample_text)
spacy_processed = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha]
spacy_processed

['natural',
 'language',
 'processing',
 'nlp',
 'exciting',
 'field',
 'combine',
 'linguistic',
 'computer',
 'science']

## Figure 3: Named Entity Recognition output using spaCy

In [4]:
import spacy
from spacy import displacy

nlp = spacy.load('en_core_web_sm')
ner_text = "Apple is looking at buying U.K. startup for $1 billion."
doc = nlp(ner_text)

# Print Named Entities
for ent in doc.ents:
    print(f" - {ent.text} ({ent.label_})")

displacy.render(doc, style='ent', jupyter=True)

 - Apple (ORG)
 - U.K. (GPE)
 - $1 billion (MONEY)


## Figure 4: Word Embeddings using BERT

In [5]:
from transformers import AutoTokenizer, AutoModel
import torch

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

sample_sentence = "This is a sample sentence for embedding extraction."
inputs = tokenizer(sample_sentence, return_tensors="pt")
outputs = model(**inputs)
embeddings = outputs.last_hidden_state
embeddings

tensor([[[-0.3291, -0.5040, -0.2125,  ..., -0.6529,  0.0756,  0.8283],
         [-0.6841, -0.8352, -0.4526,  ..., -0.3956,  0.7534,  0.2176],
         [-0.4530, -0.6353,  0.1921,  ..., -0.1519, -0.0703,  0.7846],
         ...,
         [-0.4442, -0.0778, -0.1558,  ..., -0.5368, -0.3660,  0.4046],
         [ 0.5656, -0.0401, -0.7370,  ...,  0.2329, -0.5112, -0.3431],
         [ 0.1879, -0.1038, -0.3582,  ...,  0.6637, -1.0281, -0.1297]]],
       grad_fn=<NativeLayerNormBackward0>)

## Figure 5: Sentiment Analysis Results

In [7]:
from transformers import pipeline
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download("vader_lexicon")
sentiment_pipeline = pipeline("sentiment-analysis")
sentences = [
    "I absolutely love this product! It works perfectly.",
    "The service was terrible. I'm never coming back!",
    "The experience was okay, not the best, but not the worst either."
]
print("Transformer-based Sentiment Analysis:")
for sentence in sentences:
    result = sentiment_pipeline(sentence)
    print(f"Sentence: {sentence}\nResult: {result}\n")
sia = SentimentIntensityAnalyzer()
print("\nTraditional (Vader) Sentiment Analysis:")
for sentence in sentences:
    scores = sia.polarity_scores(sentence)
    sentiment = "Positive" if scores["compound"] > 0 else "Negative" if scores["compound"] < 0 else "Neutral"
    print(f"Sentence: {sentence}\nScores: {scores}\nVader Sentiment: {sentiment}\n")


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\aliha\AppData\Roaming\nltk_data...
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0


Transformer-based Sentiment Analysis:
Sentence: I absolutely love this product! It works perfectly.
Result: [{'label': 'POSITIVE', 'score': 0.9998786449432373}]

Sentence: The service was terrible. I'm never coming back!
Result: [{'label': 'NEGATIVE', 'score': 0.9989019632339478}]

Sentence: The experience was okay, not the best, but not the worst either.
Result: [{'label': 'NEGATIVE', 'score': 0.9642871618270874}]


Traditional (Vader) Sentiment Analysis:
Sentence: I absolutely love this product! It works perfectly.
Scores: {'neg': 0.0, 'neu': 0.358, 'pos': 0.642, 'compound': 0.8746}
Vader Sentiment: Positive

Sentence: The service was terrible. I'm never coming back!
Scores: {'neg': 0.326, 'neu': 0.674, 'pos': 0.0, 'compound': -0.5255}
Vader Sentiment: Negative

Sentence: The experience was okay, not the best, but not the worst either.
Scores: {'neg': 0.128, 'neu': 0.527, 'pos': 0.345, 'compound': 0.5729}
Vader Sentiment: Positive



In [6]:
from transformers import pipeline

sentiment_pipeline = pipeline("sentiment-analysis")
sentences = [
    "I absolutely love this product! It exceeded my expectations.",
    "This is the worst experience I have ever had with a service."
]

for sentence in sentences:
    result = sentiment_pipeline(sentence)
    print(f"Sentence: {sentence}\nResult: {result}\n")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.





Device set to use cuda:0


Sentence: I absolutely love this product! It exceeded my expectations.
Result: [{'label': 'POSITIVE', 'score': 0.9998810291290283}]

Sentence: This is the worst experience I have ever had with a service.
Result: [{'label': 'NEGATIVE', 'score': 0.9997770190238953}]

