In [38]:
import os

import nltk
import requests
import spacy
from nltk import word_tokenize, pos_tag, ne_chunk

# Fetch News Article
def fetch_news_article(api_key):
    """Return the first US top headline that has both a title and a description.

    Queries NewsAPI's ``/v2/top-headlines`` endpoint and joins the title and
    description of the first usable article with a single space.

    Args:
        api_key: NewsAPI key, sent as the ``apiKey`` query parameter.

    Returns:
        ``"<title> <description>"`` for the first article where both fields
        are non-empty, or ``None`` when the request fails, the body is not
        valid JSON, the API does not report ``status == 'ok'``, or no article
        carries both fields.
    """
    url = 'https://newsapi.org/v2/top-headlines'
    # Let requests build the query string so the key is URL-encoded rather
    # than spliced verbatim into the URL.
    params = {'country': 'us', 'apiKey': api_key}
    try:
        # Without a timeout a stalled connection would hang the cell forever.
        response = requests.get(url, params=params, timeout=10)
        data = response.json()
    except (requests.RequestException, ValueError):
        # Network failure or non-JSON body: report "no article" so the caller's
        # existing None-handling (and its error message) applies.
        return None

    if not isinstance(data, dict) or data.get('status') != 'ok':
        return None

    # `articles` may be absent or null on unexpected payloads.
    for article in data.get('articles') or []:
        title = article.get('title')
        description = article.get('description')
        if title and description:
            return title + " " + description
    return None


# Read the NewsAPI key from the environment so the secret is never stored in
# the notebook. Set it before starting Jupyter, e.g. `export NEWSAPI_KEY=...`.
# NOTE(review): any key that was previously committed in this cell should be
# treated as leaked and rotated.
api_key = os.environ.get('NEWSAPI_KEY')

# Skip the network call entirely when no key is configured.
article = fetch_news_article(api_key) if api_key else None
if article:
    print("Fetched Article:\n", article)
elif not api_key:
    print("No API key found. Set the NEWSAPI_KEY environment variable and re-run this cell.")
else:
    print("Failed to fetch an article. Please check your API key and network connection.")

Fetched Article:
 Simone Biles makes history with second all-around Olympic gymnastics title, 8 years after her first - The Associated Press Simone Biles edged Rebeca Andrade of Brazil during a tense all-around gymnastics final Thursday to become a two-time Olympic champion. Biles’ total of 59.131 was just ahead of Andrade at 57.932. That's one of the closest calls Biles has ever endured at a majo…


In [39]:
# Fetch the NLTK resources used by nltk_ner: tokenizer, POS tagger,
# NE chunker and the word list, in that order.
for _resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker', 'words'):
    nltk.download(_resource)

def nltk_ner(text):
    """Extract named entities from ``text`` using NLTK.

    The text is tokenized, POS-tagged and passed through ``ne_chunk``.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples in the order the
        chunks appear in the resulting tree. The tokens of a multi-word
        entity are joined with single spaces.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    # Only top-level children that are Tree nodes are collected; every other
    # child is skipped.
    return [
        (" ".join(token for token, _ in node.leaves()), node.label())
        for node in chunked
        if isinstance(node, nltk.Tree)
    ]

# Run the NLTK extractor only when an article is available.
if not article:
    print("No article to process.")
else:
    nltk_entities = nltk_ner(article)
    print("NLTK Entities:", nltk_entities)


NLTK Entities: [('Simone', 'PERSON'), ('Biles', 'PERSON'), ('Associated', 'ORGANIZATION'), ('Biles', 'PERSON'), ('Rebeca Andrade', 'PERSON'), ('Brazil', 'GPE'), ('Biles', 'PERSON'), ('Andrade', 'PERSON'), ('Biles', 'PERSON')]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [None]:
# Sanity check: print the spaCy release installed in this kernel.
# spaCy is already imported in the notebook's first cell, so this import is
# redundant (re-importing an already-loaded module is a no-op).
import spacy
print(spacy.__version__)


3.7.5


In [42]:
# Load the 'en_core_web_sm' pipeline once and keep it in a module-level name;
# spacy_ner() reads this global rather than loading the model on every call.
nlp = spacy.load('en_core_web_sm')

def spacy_ner(text):
    """Run the module-level spaCy pipeline ``nlp`` over ``text``.

    Returns:
        A list of ``(entity_text, label)`` tuples, one per span in the
        document's ``ents``, in the order spaCy yields them.
    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_))
    return found

# Run the spaCy extractor only when an article is available.
if not article:
    print("No article to process.")
else:
    spacy_entities = spacy_ner(article)
    print("SpaCy Entities:", spacy_entities)


SpaCy Entities: [('Simone Biles', 'PERSON'), ('second', 'ORDINAL'), ('Olympic gymnastics', 'ORG'), ('8 years', 'DATE'), ('first', 'ORDINAL'), ('Rebeca Andrade', 'PERSON'), ('Brazil', 'GPE'), ('Thursday', 'DATE'), ('two', 'CARDINAL'), ('59.131', 'CARDINAL'), ('Andrade', 'PERSON'), ('57.932', 'CARDINAL'), ('one', 'CARDINAL'), ('Biles', 'PERSON')]


In [43]:
def compare_entities(nltk_entities, spacy_entities):
    """Compare two entity collections as sets.

    Both inputs are converted to sets, so duplicates collapse and items must
    be hashable. Two entries match only when they are equal as whole items
    (for ``(text, label)`` tuples, both text and label must agree).

    Returns:
        A 3-tuple ``(common, only_nltk, only_spacy)`` of sets: items present
        in both inputs, items only in ``nltk_entities``, and items only in
        ``spacy_entities``.
    """
    from_nltk = set(nltk_entities)
    from_spacy = set(spacy_entities)

    return (
        from_nltk.intersection(from_spacy),
        from_nltk.difference(from_spacy),
        from_spacy.difference(from_nltk),
    )

# Compare the two extractors' results only when an article was processed.
if not article:
    print("No entities to compare.")
else:
    (common_entities,
     only_nltk_entities,
     only_spacy_entities) = compare_entities(nltk_entities, spacy_entities)

    print("Common Entities:", common_entities)
    print("Entities only in NLTK:", only_nltk_entities)
    print("Entities only in SpaCy:", only_spacy_entities)


Common Entities: {('Brazil', 'GPE'), ('Rebeca Andrade', 'PERSON'), ('Biles', 'PERSON'), ('Andrade', 'PERSON')}
Entities only in NLTK: {('Associated', 'ORGANIZATION'), ('Simone', 'PERSON')}
Entities only in SpaCy: {('Thursday', 'DATE'), ('57.932', 'CARDINAL'), ('59.131', 'CARDINAL'), ('first', 'ORDINAL'), ('8 years', 'DATE'), ('two', 'CARDINAL'), ('Olympic gymnastics', 'ORG'), ('one', 'CARDINAL'), ('second', 'ORDINAL'), ('Simone Biles', 'PERSON')}
