In [None]:
!pip install nltk
!pip install wikipedia
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')



In [None]:
# Download spaCy's small English pipeline, which a later cell loads with
# spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Import libraries
import nltk
import wikipedia
import spacy
import pandas as pd
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load spaCy model
# `nlp` is the en_core_web_sm pipeline; the cells below call it on the article
# summary and read tokens, sentences, lemmas and entities from the result.
nlp = spacy.load("en_core_web_sm")

# Initialize NLTK Lemmatizer and Stopwords
lemmatizer = WordNetLemmatizer()
# The English stop-word list is stored as a set because the stop-word removal
# step tests membership once per token. Requires the NLTK 'stopwords' corpus
# to be available locally.
stop_words_nltk = set(stopwords.words('english'))

# Get a random Wikipedia article.
# The retry loop is bounded so that a persistent failure surfaces as an error
# instead of looping forever.
MAX_FETCH_ATTEMPTS = 10
SUMMARY_MAX_CHARS = 500

page = None
for _attempt in range(MAX_FETCH_ATTEMPTS):
    random_title = wikipedia.random(pages=1)
    try:
        # auto_suggest=False requests exactly the title that was drawn; the
        # default suggestion lookup can swap in a different article.
        page = wikipedia.page(random_title, auto_suggest=False)
        break
    except wikipedia.exceptions.DisambiguationError:
        print(f" Disambiguation Error: {random_title} has multiple meanings, trying again.")
    except wikipedia.exceptions.PageError:
        print(f" PageError: No page found for {random_title}, trying again.")

if page is None:
    raise RuntimeError(
        f"Could not fetch a Wikipedia article after {MAX_FETCH_ATTEMPTS} attempts."
    )

# Keep at most the first SUMMARY_MAX_CHARS characters of the summary, with
# newlines flattened to spaces. If the cut would land inside a word, back up to
# the previous space so the excerpt never ends with a word fragment (a fragment
# would otherwise show up as a bogus token in every later step).
full_summary = page.summary.replace("\n", " ")
summary = full_summary[:SUMMARY_MAX_CHARS]
if len(full_summary) > SUMMARY_MAX_CHARS and not full_summary[SUMMARY_MAX_CHARS].isspace():
    summary = summary.rsplit(" ", 1)[0]

# Print the title of the page that was actually fetched, then the excerpt
print(f"📖 Random Wikipedia Article: {page.title}\n")
print(summary)

def _is_punctuation(text):
    """Return True when `text` is non-empty and consists only of ASCII punctuation.

    `text in string.punctuation` is a substring test against one long string,
    so multi-character tokens such as "``", "''" or "..." slip through it.
    Checking character by character handles those tokens as well.
    """
    return bool(text) and all(char in string.punctuation for char in text)


# 1. Word Tokenization
nltk_tokens = word_tokenize(summary)
spacy_doc = nlp(summary)
spacy_tokens = [token.text for token in spacy_doc]

# 2. Sentence Tokenization
nltk_sentences = sent_tokenize(summary)
spacy_sentences = [sent.text for sent in spacy_doc.sents]

# 3. Stopwords Removal
# Stop words and punctuation are replaced by a "(removed)" placeholder rather
# than dropped, so each cleaned list keeps the same length as its token list.
nltk_cleaned = [
    "(removed)" if token.lower() in stop_words_nltk or _is_punctuation(token) else token
    for token in nltk_tokens
]
spacy_cleaned = [
    "(removed)" if token.is_stop or _is_punctuation(token.text) else token.text
    for token in spacy_doc
]

# 4. Lemmatization
# The NLTK lemmatizer is called without a part-of-speech argument, while spaCy
# reads the lemma the pipeline assigned to each token.
nltk_lemmatized = [lemmatizer.lemmatize(token) for token in nltk_tokens]
spacy_lemmatized = [token.lemma_ for token in spacy_doc]

# Display results: every section prints its header, then the spaCy result,
# then the NLTK result for the same processing step.
report_sections = [
    ("\n### 1. WORD TOKENIZE ###", spacy_tokens, nltk_tokens),
    ("\n### 2. SENTENCE TOKENIZE ###", spacy_sentences, nltk_sentences),
    ("\n### 3. STOP WORDS REMOVAL ###", spacy_cleaned, nltk_cleaned),
    ("\n### 4. LEMMATIZATION ###", spacy_lemmatized, nltk_lemmatized),
]

for section_header, from_spacy, from_nltk in report_sections:
    print(section_header)
    print("\n[SPACY OUTPUT]:\n", from_spacy)
    print("\n[NLTK OUTPUT]:\n", from_nltk)



📖 Random Wikipedia Article: Scott Corner, Indiana

Scott Corner is an unincorporated community in Union Township, Randolph County, in the U.S. state of Indiana. In the 19th century, it was the home of several families free African Americans, part of the Cabin Creek Settlement. The name derives from Robert Scott, who was born enslaved in Guilford County, North Carolina in 1770. After gaining his freedom, he moved in 1821 to Wayne County, Indiana, then in 1832 to what would become known as Scott's Corner. The "corner" in the name marks the locatio

### 1. WORD TOKENIZE ###

[SPACY OUTPUT]:
 ['Scott', 'Corner', 'is', 'an', 'unincorporated', 'community', 'in', 'Union', 'Township', ',', 'Randolph', 'County', ',', 'in', 'the', 'U.S.', 'state', 'of', 'Indiana', '.', 'In', 'the', '19th', 'century', ',', 'it', 'was', 'the', 'home', 'of', 'several', 'families', 'free', 'African', 'Americans', ',', 'part', 'of', 'the', 'Cabin', 'Creek', 'Settlement', '.', 'The', 'name', 'derives', 'from', 'Robert

In [None]:
# Show every intermediate result side by side in one DataFrame. The lists have
# different lengths (tokens vs. sentences), so shorter ones are padded with
# empty strings up to the length of the longest list.
from IPython.display import display

comparison_columns = {
    "NLTK Tokens": nltk_tokens,
    "spaCy Tokens": spacy_tokens,
    "NLTK Sentences": nltk_sentences,
    "spaCy Sentences": spacy_sentences,
    "NLTK Cleaned": nltk_cleaned,
    "spaCy Cleaned": spacy_cleaned,
    "NLTK Lemmatized": nltk_lemmatized,
    "spaCy Lemmatized": spacy_lemmatized,
}

max_length = max(len(values) for values in comparison_columns.values())

comparison_df = pd.DataFrame({
    column_name: list(values) + [""] * (max_length - len(values))
    for column_name, values in comparison_columns.items()
})

# Rich (HTML) rendering of the consolidated table
display(comparison_df)

Unnamed: 0,NLTK Tokens,spaCy Tokens,NLTK Sentences,spaCy Sentences,NLTK Cleaned,spaCy Cleaned,NLTK Lemmatized,spaCy Lemmatized
0,Scott,Scott,Scott Corner is an unincorporated community in...,Scott Corner is an unincorporated community in...,Scott,Scott,Scott,Scott
1,Corner,Corner,"In the 19th century, it was the home of severa...","In the 19th century, it was the home of severa...",Corner,Corner,Corner,Corner
2,is,is,"The name derives from Robert Scott, who was bo...","The name derives from Robert Scott, who was bo...",(removed),(removed),is,be
3,an,an,"After gaining his freedom, he moved in 1821 to...","After gaining his freedom, he moved in 1821 to...",(removed),(removed),an,an
4,unincorporated,unincorporated,"The ""corner"" in the name marks the locatio","The ""corner"" in the name marks the locatio",unincorporated,unincorporated,unincorporated,unincorporated
...,...,...,...,...,...,...,...,...
96,the,the,,,(removed),(removed),the,the
97,name,name,,,name,(removed),name,name
98,marks,marks,,,marks,marks,mark,mark
99,the,the,,,(removed),(removed),the,the


In [None]:
# Download the NLTK resources needed for part-of-speech tagging and named-entity
# chunking (pos_tag / ne_chunk in the NER cell).
# Both the plain resource names and the "_eng" / "_tab" variants are requested —
# presumably so the cell works across NLTK versions (TODO confirm).
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('maxent_ne_chunker_tab')
nltk.download('averaged_perceptron_tagger_eng')

In [None]:
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
# Named Entity Recognition (NER) with spaCy
spacy_doc = nlp(summary)

# Group entity texts by label; labels appear in order of first occurrence
entities = {}
for entity in spacy_doc.ents:
    entities.setdefault(entity.label_, []).append(entity.text)

# Print one line per label: the label followed by the list of entity texts
print("\n### 5. NER ###\n")
print("[SPACY OUTPUT]:")
for entity_label, entity_texts in entities.items():
    print(f"{entity_label} {entity_texts}")

# Named Entity Recognition with NLTK: tokenize, POS-tag, then chunk
nltk_tokens = word_tokenize(summary)
tagged_tokens = pos_tag(nltk_tokens)
nltk_ner_tree = ne_chunk(tagged_tokens)

# Children of the tree that carry a `label` attribute are entity chunks; each
# chunk's words are joined into a single string. Children without one are skipped.
nltk_entities = [
    " ".join(word for word, _tag in chunk.leaves())
    for chunk in nltk_ner_tree
    if hasattr(chunk, "label")
]

print("\n[NLTK OUTPUT]:")
print(nltk_entities)


### 5. NER ###

[SPACY OUTPUT]:
PERSON ['Scott Corner', 'Robert Scott']
GPE ['Union Township', 'Randolph County', 'U.S.', 'Indiana', 'Guilford County', 'North Carolina', 'Wayne County', 'Indiana']
DATE ['the 19th century', '1770', '1821', '1832']
NORP ['African Americans']
ORG ['the Cabin Creek Settlement', "Scott's Corner"]

[NLTK OUTPUT]:
['Scott', 'Corner', 'Union Township', 'Randolph County', 'U.S.', 'Indiana', 'African Americans', 'Cabin Creek Settlement', 'Robert Scott', 'Guilford County', 'North Carolina', 'Wayne County', 'Indiana', 'Scott', 'Corner']


In [None]:
# 6. Visualize the named entities using spaCy's displacy
from spacy import displacy
print("\n### 6. NER Visualization ###\n")
# Renders `spacy_doc` (the Doc built in the NER cell) with the entity style;
# jupyter=True asks displacy to display the result inline in the notebook.
displacy.render(spacy_doc, style="ent", jupyter=True)


### 6. NER Visualization ###



In [None]:
# Install necessary libraries
!pip install transformers torch

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# Import required libraries
from transformers import BertTokenizer, BertModel
import torch
from IPython.display import display # Import the display function

# Load the pre-trained BERT model and tokenizer from Hugging Face
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Tokenize the Wikipedia summary text; return_tensors="pt" yields PyTorch tensors
tokens = tokenizer(summary, padding=True, truncation=True, return_tensors="pt")

# Get BERT embeddings (vector representation).
# no_grad: this is inference only, so gradient tracking is switched off.
with torch.no_grad():
    outputs = model(**tokens)
    embeddings = outputs.last_hidden_state  # Last hidden state contains word embeddings

# Drop the leading batch dimension and convert to numpy for display
bert_embeddings = embeddings.squeeze(0).numpy()

# Display embeddings shape
print(f"Shape of BERT Embeddings: {bert_embeddings.shape}")  # (Number of tokens, Hidden size)

# Show the embeddings of the first 10 tokens (built once; the table was
# previously constructed twice with identical code)
embedding_df = pd.DataFrame(bert_embeddings[:10])
display(embedding_df)

Shape of BERT Embeddings: (110, 768)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.473704,0.020425,0.050101,-0.402886,1.237658,0.164903,-0.414399,-0.068226,0.05788,-0.823978,...,-0.082178,-0.363327,-0.061794,-0.939078,0.137301,0.380059,-0.120822,0.575711,0.386995,0.645491
1,-0.212607,0.234216,-0.729757,0.094356,1.702709,-0.159239,0.314746,0.258116,0.072036,-0.251151,...,0.653169,-0.140908,-0.430964,-0.464274,1.088782,-0.119039,-0.721245,-0.434938,0.936795,-0.516231
2,-0.369501,-0.321719,-0.507602,0.15936,2.624837,-0.112645,-0.05443,0.78355,0.280899,0.157043,...,0.050841,0.083484,-0.952098,0.023224,0.022379,-0.716294,-0.639081,-0.199447,-0.072689,0.059277
3,-0.869538,-0.241972,0.191037,-0.034891,1.556679,0.01181,0.737943,0.434373,0.308119,-0.23284,...,-0.085891,-0.032841,-0.474608,-0.957007,0.204024,0.387534,-0.423658,-0.506585,-0.499847,0.865922
4,-0.966981,-0.241068,0.033839,-0.189937,1.481853,-0.041523,0.431998,0.129788,0.566142,-0.872034,...,-0.812576,0.19516,-0.384046,-1.154739,-0.014064,0.799553,0.103096,-0.207342,-0.630534,0.922046
5,-0.169735,-0.192586,-0.066285,-0.624381,1.415682,-0.101363,-0.165758,1.223217,1.045531,-0.584514,...,-0.865056,0.623704,-0.77931,-0.250987,0.010988,-0.848958,0.013339,-0.588625,0.25157,0.037431
6,-0.140233,-0.111836,0.582352,-0.519289,0.893065,0.632491,0.024225,0.750544,0.99398,0.106,...,-0.792195,0.562737,-0.844256,-0.604331,-0.075396,-0.188406,0.322948,-0.513447,-0.097079,-0.075954
7,-1.171454,-0.938656,0.26221,-0.296662,0.157958,0.40761,0.099208,0.45029,0.641413,-0.362522,...,-0.55002,-0.063803,-0.809914,-1.191824,-0.283734,0.515408,0.208543,-0.238811,-0.741369,0.886092
8,-0.586685,0.47059,-0.225686,0.604281,2.077971,0.207615,0.096394,0.711375,-0.854176,-0.789816,...,-0.482081,0.400982,-0.195835,-0.258426,0.297714,-0.606763,0.18783,-0.02734,-0.235447,0.049052
9,-0.755984,-0.298121,0.172369,0.233788,0.555342,-0.12253,0.249715,0.558588,0.132634,-0.760229,...,-0.78081,0.48795,-0.490087,-1.223475,-0.092189,-0.097956,-0.004982,0.615575,-0.489103,0.238879


In [None]:
import pandas as pd

# Encode the Wikipedia summary and keep the ID tensor plus a plain-list copy of
# the first (only) sequence for reuse below
encoded_tokens = tokenizer(summary, padding=True, truncation=True, return_tensors="pt")
input_ids = encoded_tokens["input_ids"]
token_id_list = input_ids[0].tolist()

# Word-piece tokens as produced by tokenize()
print("Tokenized Output:")
print(tokenizer.tokenize(summary))

# Encoded token IDs
print("\nToken IDs:")
print(input_ids)

# Map the whole ID list back to token strings in a single call
decoded_tokens = tokenizer.convert_ids_to_tokens(token_id_list)

# Tokens next to their IDs
tokenization_df = pd.DataFrame({"Token": decoded_tokens, "Token ID": token_id_list})
display(tokenization_df)

Tokenized Output:
['scott', 'corner', 'is', 'an', 'unincorporated', 'community', 'in', 'union', 'township', ',', 'randolph', 'county', ',', 'in', 'the', 'u', '.', 's', '.', 'state', 'of', 'indiana', '.', 'in', 'the', '19th', 'century', ',', 'it', 'was', 'the', 'home', 'of', 'several', 'families', 'free', 'african', 'americans', ',', 'part', 'of', 'the', 'cabin', 'creek', 'settlement', '.', 'the', 'name', 'derives', 'from', 'robert', 'scott', ',', 'who', 'was', 'born', 'enslaved', 'in', 'gui', '##lford', 'county', ',', 'north', 'carolina', 'in', '1770', '.', 'after', 'gaining', 'his', 'freedom', ',', 'he', 'moved', 'in', '1821', 'to', 'wayne', 'county', ',', 'indiana', ',', 'then', 'in', '1832', 'to', 'what', 'would', 'become', 'known', 'as', 'scott', "'", 's', 'corner', '.', 'the', '"', 'corner', '"', 'in', 'the', 'name', 'marks', 'the', 'lo', '##cat', '##io']

Token IDs:
tensor([[  101,  3660,  3420,  2003,  2019,  7754,  2451,  1999,  2586,  3545,
          1010, 13031,  2221,  1010,

Unnamed: 0,Token,Token ID
0,[CLS],101
1,scott,3660
2,corner,3420
3,is,2003
4,an,2019
...,...,...
105,the,1996
106,lo,8840
107,##cat,11266
108,##io,3695


In [None]:
# Run BERT on the already-encoded summary (`tokens`) without gradient tracking
# and take the final layer's hidden states as the token embeddings
with torch.no_grad():
    outputs = model(**tokens)
hidden_states = outputs.last_hidden_state

# Remove the batch dimension and move to numpy for display
bert_embeddings = hidden_states.squeeze(0).numpy()

# Report the array shape: (number of tokens, hidden size)
print(f"Shape of BERT Embeddings: {bert_embeddings.shape}")

# Tabulate the vectors of the first 10 tokens and render them
first_rows = bert_embeddings[:10]
embedding_df = pd.DataFrame(first_rows)
display(embedding_df)

Shape of BERT Embeddings: (110, 768)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.473704,0.020425,0.050101,-0.402886,1.237658,0.164903,-0.414399,-0.068226,0.05788,-0.823978,...,-0.082178,-0.363327,-0.061794,-0.939078,0.137301,0.380059,-0.120822,0.575711,0.386995,0.645491
1,-0.212607,0.234216,-0.729757,0.094356,1.702709,-0.159239,0.314746,0.258116,0.072036,-0.251151,...,0.653169,-0.140908,-0.430964,-0.464274,1.088782,-0.119039,-0.721245,-0.434938,0.936795,-0.516231
2,-0.369501,-0.321719,-0.507602,0.15936,2.624837,-0.112645,-0.05443,0.78355,0.280899,0.157043,...,0.050841,0.083484,-0.952098,0.023224,0.022379,-0.716294,-0.639081,-0.199447,-0.072689,0.059277
3,-0.869538,-0.241972,0.191037,-0.034891,1.556679,0.01181,0.737943,0.434373,0.308119,-0.23284,...,-0.085891,-0.032841,-0.474608,-0.957007,0.204024,0.387534,-0.423658,-0.506585,-0.499847,0.865922
4,-0.966981,-0.241068,0.033839,-0.189937,1.481853,-0.041523,0.431998,0.129788,0.566142,-0.872034,...,-0.812576,0.19516,-0.384046,-1.154739,-0.014064,0.799553,0.103096,-0.207342,-0.630534,0.922046
5,-0.169735,-0.192586,-0.066285,-0.624381,1.415682,-0.101363,-0.165758,1.223217,1.045531,-0.584514,...,-0.865056,0.623704,-0.77931,-0.250987,0.010988,-0.848958,0.013339,-0.588625,0.25157,0.037431
6,-0.140233,-0.111836,0.582352,-0.519289,0.893065,0.632491,0.024225,0.750544,0.99398,0.106,...,-0.792195,0.562737,-0.844256,-0.604331,-0.075396,-0.188406,0.322948,-0.513447,-0.097079,-0.075954
7,-1.171454,-0.938656,0.26221,-0.296662,0.157958,0.40761,0.099208,0.45029,0.641413,-0.362522,...,-0.55002,-0.063803,-0.809914,-1.191824,-0.283734,0.515408,0.208543,-0.238811,-0.741369,0.886092
8,-0.586685,0.47059,-0.225686,0.604281,2.077971,0.207615,0.096394,0.711375,-0.854176,-0.789816,...,-0.482081,0.400982,-0.195835,-0.258426,0.297714,-0.606763,0.18783,-0.02734,-0.235447,0.049052
9,-0.755984,-0.298121,0.172369,0.233788,0.555342,-0.12253,0.249715,0.558588,0.132634,-0.760229,...,-0.78081,0.48795,-0.490087,-1.223475,-0.092189,-0.097956,-0.004982,0.615575,-0.489103,0.238879


In [None]:
!pip install textblob
nltk.download('vader_lexicon')



[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [None]:
from transformers import pipeline
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer

# Load sentiment analysis pipeline from Hugging Face (Transformer-based model).
# The checkpoint is named explicitly: this is the model the pipeline previously
# fell back to by default, and naming it removes the "No model was supplied"
# warning and keeps results stable if the library's default ever changes.
SENTIMENT_MODEL = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
sentiment_analyzer = pipeline("sentiment-analysis", model=SENTIMENT_MODEL)

# Tokenize the Wikipedia summary into individual sentences
sentences = sent_tokenize(summary)

# Perform sentiment analysis using Transformers (Hugging Face);
# each result is a dict read below through its 'label' and 'score' keys
transformer_results = sentiment_analyzer(sentences)

# Perform sentiment analysis using TextBlob (Traditional Lexicon-based Approach);
# one polarity score per sentence
textblob_results = [TextBlob(sentence).sentiment.polarity for sentence in sentences]

# Perform sentiment analysis using NLTK's Vader (Rule-based Approach);
# only the 'compound' score of each sentence is kept
sia = SentimentIntensityAnalyzer()
nltk_results = [sia.polarity_scores(sentence)['compound'] for sentence in sentences]

# Map a numeric sentiment score onto a categorical label
def categorize_sentiment(score):
    """Return "POSITIVE" for scores above zero, "NEGATIVE" for scores below zero, else "NEUTRAL"."""
    if score > 0:
        return "POSITIVE"
    if score < 0:
        return "NEGATIVE"
    return "NEUTRAL"

# Pull the label and confidence out of each Transformers result
transformer_labels = [result['label'] for result in transformer_results]
transformer_scores = [result['score'] for result in transformer_results]

# Turn the numeric TextBlob and NLTK scores into categorical labels
textblob_labels = list(map(categorize_sentiment, textblob_results))
nltk_labels = list(map(categorize_sentiment, nltk_results))

# One row per sentence, with each method's label next to its score
comparison_df = pd.DataFrame({
    "Sentence": sentences,
    "Transformers Sentiment": transformer_labels,
    "Transformers Confidence": transformer_scores,
    "TextBlob Sentiment": textblob_labels,
    "TextBlob Score": textblob_results,
    "NLTK Sentiment": nltk_labels,
    "NLTK Score": nltk_results,
})

display(comparison_df)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Unnamed: 0,Sentence,Transformers Sentiment,Transformers Confidence,TextBlob Sentiment,TextBlob Score,NLTK Sentiment,NLTK Score
0,Scott Corner is an unincorporated community in...,POSITIVE,0.885485,NEUTRAL,0.0,NEUTRAL,0.0
1,"In the 19th century, it was the home of severa...",POSITIVE,0.967384,POSITIVE,0.133333,POSITIVE,0.5106
2,"The name derives from Robert Scott, who was bo...",POSITIVE,0.879812,NEUTRAL,0.0,NEGATIVE,-0.4019
3,"After gaining his freedom, he moved in 1821 to...",POSITIVE,0.893569,NEUTRAL,0.0,POSITIVE,0.7906
4,"The ""corner"" in the name marks the locatio",NEGATIVE,0.978955,NEUTRAL,0.0,NEUTRAL,0.0
