In [1]:
import nltk
from nltk.corpus import stopwords
from string import punctuation
import wikipedia
import pandas as pd
from collections import Counter

In [2]:
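# One-time setup: uncomment to open the NLTK downloader and fetch the data
# packages this notebook relies on (tokenizer models, stopword list, POS
# tagger, named-entity chunker).
#nltk.download()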

In [3]:
# Let pandas render up to 100 rows before truncating a displayed frame.
pd.set_option("display.max_rows", 100)

In [4]:
# Read the whole article into memory as a single string.
# `text` is pre-bound to None so the name exists even if the read fails.
text = None
# NOTE(review): no encoding is passed, so the platform default is used.
# preprocessText() patches a mis-decoded apostrophe, which suggests the file
# is UTF-8 read with a different default codec -- confirm before changing.
with open('data.txt', 'r') as f:
    text = f.read()

In [5]:
def preprocessText(text):
    """Tokenise text and drop punctuation tokens and English stopwords.

    Args:
        text: raw input string.

    Returns:
        List of token strings in their original order. Case is preserved and
        the stopword comparison is case-sensitive, so a capitalised stopword
        such as "The" is kept.
    """
    tokens = nltk.word_tokenize(text)
    # Normalise apostrophes to ASCII: both the mis-decoded byte sequence the
    # original code handled and the real right single quotation mark, so the
    # result is the same whichever codec the input was read with.
    tokens = [t.replace("â€™", "'").replace("\u2019", "'") for t in tokens]
    # Sets give O(1) membership instead of scanning a list / string per token.
    stops = set(stopwords.words('english'))
    punct = set(punctuation)
    # A token is punctuation when *every* character is a punctuation mark.
    # The previous `token not in punctuation` was a substring test against the
    # punctuation string, so multi-character tokens such as "``", "''" or "--"
    # slipped through.
    return [
        token for token in tokens
        if not all(ch in punct for ch in token) and token not in stops
    ]

In [6]:
# Tokenise the article and strip punctuation and stopwords.
tokens = preprocessText(text)

In [7]:
# Part-of-speech tag the filtered tokens; `tagged` is a list of (word, tag)
# pairs and is reused by every entity-extraction cell below.
tagged = nltk.pos_tag(tokens)
# Display the tagged tokens as the cell output.
tagged

[('Sending', 'VBG'),
 ('astronauts', 'NNS'),
 ('back', 'RB'),
 ('moon', 'VBP'),
 ('one', 'CD'),
 ('top', 'JJ'),
 ('space', 'NN'),
 ('priorities', 'NNS'),
 ('President', 'NNP'),
 ('Trump', 'NNP'),
 ('But', 'CC'),
 ('administration', 'NN'),
 ('wants', 'VBZ'),
 ('accomplish', 'JJ'),
 ('without', 'IN'),
 ('giving', 'VBG'),
 ('NASA', 'NNP'),
 ('additional', 'JJ'),
 ('money', 'NN'),
 ('occur', 'VBP'),
 ('leaves', 'VBZ'),
 ('office', 'NN'),
 ('even', 'RB'),
 ('wins', 'VBZ'),
 ('re-election', 'NN'),
 ('Instead', 'RB'),
 ('aims', 'VBZ'),
 ('give', 'JJ'),
 ('private', 'JJ'),
 ('sector', 'NN'),
 ('greater', 'JJR'),
 ('role', 'NN'),
 ('according', 'VBG'),
 ('budget', 'NN'),
 ('proposal', 'NN'),
 ('released', 'VBN'),
 ('Monday', 'NNP'),
 ('The', 'DT'),
 ('administration', 'NN'),
 ('also', 'RB'),
 ('looking', 'VBG'),
 ('end', 'JJ'),
 ('American', 'JJ'),
 ('payments', 'NNS'),
 ('International', 'NNP'),
 ('Space', 'NNP'),
 ('Station', 'NNP'),
 ('2025', 'CD'),
 ('The', 'DT'),
 ('space', 'NN'),
 ('stati

In [8]:
def customPattern(tagged):
    """Extract capitalised noun phrases from POS-tagged tokens.

    A phrase is a maximal run of tokens whose tag starts with "NN". Tokens
    tagged "IN" are accepted inside a run (never as its first token) so that
    nouns joined by a preposition stay together. Trailing "IN" tokens are
    stripped, and the phrase is kept only if its first character is uppercase.

    Args:
        tagged: iterable of (word, tag) entries, e.g. the result of
            nltk.pos_tag.

    Returns:
        List of phrase strings in order of appearance; duplicates are kept.
    """
    my_entities = []
    entity = []

    def flush():
        # Strip *all* dangling prepositions, not just the last one, so a run
        # like NN IN IN does not end in a preposition.
        while entity and entity[-1][1].startswith("IN"):
            entity.pop()
        phrase = " ".join(e[0] for e in entity)
        # Guard on `phrase` so an empty word cannot raise IndexError.
        if phrase and phrase[0].isupper():
            my_entities.append(phrase)
        del entity[:]

    for tagged_entry in tagged:
        tag = tagged_entry[1]
        if tag.startswith("NN") or (entity and tag.startswith("IN")):
            entity.append(tagged_entry)
        else:
            flush()
    # A phrase was previously emitted only when a non-noun token followed it,
    # so a run reaching the end of the input was silently dropped.
    flush()
    return my_entities

In [9]:
def extractEntities(ne_chunked):
    """Return the text of every subtree found directly under a chunked sentence.

    Args:
        ne_chunked: iterable of chunker output nodes; items that are
            nltk.tree.Tree instances are treated as entities, all other items
            are ignored.

    Returns:
        List of strings, one per subtree, each being that subtree's leaf
        words joined with single spaces.
    """
    return [
        " ".join(word for word, _tag in chunk.leaves())
        for chunk in ne_chunked
        if isinstance(chunk, nltk.tree.Tree)
    ]

In [10]:
# For each custom-pattern entity, print its name followed by the first
# sentence of its Wikipedia summary, or "Unknown" when the lookup fails.
my_entities = customPattern(tagged)
for entity in my_entities:
    # Resolve the summary before printing anything: previously the entity was
    # printed inside the try block, so a failure while reading the summary
    # printed the entity name a second time from the except branch.
    try:
        page = wikipedia.page(entity)
        summary = page.summary.split(".")[0]
    except Exception:
        # Best-effort lookup; `Exception` (not a bare except) keeps
        # KeyboardInterrupt usable for aborting the loop.
        summary = "Unknown"
    print(entity)
    print(summary)
    print("\n")

NASA
The National Aeronautics and Space Administration (NASA ) is an independent agency of the executive branch of the United States federal government responsible for the civilian space program, as well as aeronautics and aerospace research


Monday
Monday is the day of the week between Sunday and Tuesday


New York Times
The New York Times (sometimes abbreviated as The NYT or The Times) is an American newspaper based in New York City with worldwide influence and readership


Oct.
October is the tenth month of the year in the Julian and Gregorian Calendars and the sixth of seven months to have a length of 31 days


Congress
A congress is a formal meeting of the representatives of different nations, constituent states, organizations (such as trade unions, and political parties), or groups


Mr. Trump
Trump is a surname of English and German origin:
a German surname, from a word for "drum"; it is notable as the surname of the American entrepreneurial Trump family, but has an older prese



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


Who
Unknown


Elon Musk
Elon Reeve Musk (; born June 28, 1971) is a South African-born Canadian American business magnate, investor, and engineer


Mars
Mars is the fourth planet from the Sun and the second-smallest planet in the Solar System after Mercury


SEE ALL COMMENTS A NASA spokesman
Unknown


NASA's budget
As a federal agency, the National Aeronautics and Space Administration (NASA) receives its funding from the annual federal budget passed by the United States Congress


Jim Bridenstine Oklahoma congressman
James Frederick Bridenstine (born June 15, 1975) is an American politician who has been the United States Representative for Oklahoma's 1st congressional district, based in Tulsa, since 2013


Whether administration votes
Unknown


Trump administration
Donald Trump was inaugurated as the 45th President of the United States at noon EST on January 20, 2017, succeeding Barack Obama


National Space Council
The National Space Council is a body within the Executive Office of th

In [11]:
def _wikipediaTaggedSummary(entity):
    """Return the POS-tagged tokens of the entity's Wikipedia lead text.

    The lead text is everything before the first "." of the page summary
    (a naive first-sentence split). Returns None when the page lookup or
    summary access raises.
    """
    try:
        page = wikipedia.page(entity)
        summary = page.summary.split(".")[0]
    except Exception:
        # Best-effort lookup: any library/network error means "no page".
        # `Exception` rather than a bare except so KeyboardInterrupt and
        # SystemExit still propagate.
        return None
    return nltk.pos_tag(preprocessText(summary))


def wikipediaClassification(entity):
    """Describe an entity by the NLTK named entities in its Wikipedia lead text.

    Args:
        entity: page title / search string passed to wikipedia.page.

    Returns:
        The entity strings found by nltk.ne_chunk joined with spaces, or
        "Thing" when the lookup fails or nothing is found.
    """
    tagged = _wikipediaTaggedSummary(entity)
    if tagged is None:
        return "Thing"
    res = " ".join(extractEntities(nltk.ne_chunk(tagged)))
    return res or "Thing"


def wikipediaClassificationCustom(entity):
    """Describe an entity by the custom-pattern phrases in its Wikipedia lead text.

    Args:
        entity: page title / search string passed to wikipedia.page.

    Returns:
        The phrases found by customPattern joined with spaces, or "Thing"
        when the lookup fails or nothing is found.
    """
    tagged = _wikipediaTaggedSummary(entity)
    if tagged is None:
        return "Thing"
    res = " ".join(customPattern(tagged))
    return res or "Thing"

In [12]:
def processEntities(ne_chunked):
    """Build a comparison table of entity classifications.

    For every nltk.tree.Tree item in `ne_chunked`, records the entity text,
    the label assigned by the NLTK chunker, and the two Wikipedia-based
    descriptions. Items that are not trees are skipped.

    Args:
        ne_chunked: iterable of chunker output nodes (e.g. from nltk.ne_chunk).

    Returns:
        pandas.DataFrame with one row per entity and the columns "Entity",
        "Nltk-based Class", "Wiki-based Class using nltk entities" and
        "Wiki-based Class using custom entities".
    """
    columns = ["Entity", "Nltk-based Class", "Wiki-based Class using nltk entities", "Wiki-based Class using custom entities"]
    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and appending row by row copied the whole frame
    # on every iteration.
    records = []
    for entity in ne_chunked:
        if not isinstance(entity, nltk.tree.Tree):
            continue
        entity_text = " ".join(word for word, tag in entity.leaves())
        records.append({"Entity": entity_text,
                        "Nltk-based Class": entity.label(),
                        "Wiki-based Class using nltk entities": wikipediaClassification(entity_text),
                        "Wiki-based Class using custom entities": wikipediaClassificationCustom(entity_text)})
    # Passing `columns` keeps the column order and yields an empty, correctly
    # labelled frame when no entity was found.
    return pd.DataFrame(records, columns=columns)

In [13]:
# Chunk the tagged tokens into named entities; binary=False is passed
# explicitly (the table below shows typed labels such as PERSON/ORGANIZATION/GPE).
ne_chunked = nltk.ne_chunk(tagged, binary=False)
# Display the per-entity comparison table; this calls Wikipedia for each entity.
processEntities(ne_chunked)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "html5lib")

  markup_type=markup_type))


Unnamed: 0,Entity,Nltk-based Class,Wiki-based Class using nltk entities,Wiki-based Class using custom entities
0,Trump,PERSON,Thing,Thing
1,NASA,ORGANIZATION,National Aeronautics Space Administration NASA...,National Aeronautics Space Administration NASA
2,American,GPE,Thing,Thing
3,International Space,ORGANIZATION,International Space Station,International Space Station ISS space station
4,NASA,ORGANIZATION,National Aeronautics Space Administration NASA...,National Aeronautics Space Administration NASA
5,New York Times,GPE,New York Times Times American New York City,New York Times NYT Times American newspaper Ne...
6,Congress,ORGANIZATION,Thing,Thing
7,Mr. Trump,PERSON,Trump English German German American Trump Uni...,Trump surname German German surname word Trump...
8,RELATED,ORGANIZATION,Related American WB,WB network
9,No One Owns Moon,ORGANIZATION,Neil Alden Armstrong American Moon,Neil Alden Armstrong August


In [14]:
def counts(entities):
    """Tally entities and return (entity, count) pairs, most frequent first.

    Entities with equal counts keep the order in which they were first seen.
    """
    # most_common() with no argument is a stable descending sort by count.
    return Counter(entities).most_common()

In [15]:
# Frequency of each phrase found by the custom noun-phrase pattern.
counts(customPattern(tagged))

[('NASA', 3),
 ('Trump administration', 3),
 ('Congress', 2),
 ('February', 2),
 ('International Space Station', 2),
 ('Monday', 1),
 ('New York Times', 1),
 ('Oct.', 1),
 ('Mr. Trump', 1),
 ("NASA's", 1),
 ('One Owns Moon', 1),
 ('Make Money', 1),
 ('NOV.', 1),
 ("Falcon Heavy Roar Thunder Carries SpaceX's Ambition Into Orbit FEB.", 1),
 ("Google Lunar X Prize's Race Moon", 1),
 ('Nobody Won JAN.', 1),
 ('RECENT COMMENTS Bill February', 1),
 ('Predictably Trump regime', 1),
 ('NASA funding', 1),
 ('Climate change denier', 1),
 ('Navy pilot As line', 1),
 ('Who', 1),
 ('Elon Musk', 1),
 ('Mars', 1),
 ('SEE ALL COMMENTS A NASA spokesman', 1),
 ("NASA's budget", 1),
 ('Jim Bridenstine Oklahoma congressman', 1),
 ('Whether administration votes', 1),
 ('National Space Council', 1),
 ('Vice President Mike Pence', 1),
 ('Verge outlets', 1),
 ('Republicans', 1),
 ('Bigfoot Senator Ted Cruz Republican Texas chairman Senate Subcommittee Space Science Competitiveness',
  1),
 ('Wednesday Federal

In [16]:
# Re-chunk with the default arguments (rebinds `ne_chunked`) and tally the
# entity strings found by the NLTK chunker, for comparison with the
# custom-pattern counts.
ne_chunked = nltk.ne_chunk(tagged)
counts(extractEntities(ne_chunked))

[('NASA', 10),
 ('Trump', 4),
 ('Congress', 3),
 ('International Space', 2),
 ('Mr. Trump', 2),
 ('Google Lunar', 2),
 ('American', 1),
 ('New York Times', 1),
 ('RELATED', 1),
 ('No One Owns Moon', 1),
 ('Make Money', 1),
 ('Into Orbit', 1),
 ('Nobody Won', 1),
 ('RECENT', 1),
 ('COMMENTS', 1),
 ('Bill', 1),
 ('Climate', 1),
 ('Elon Musk', 1),
 ('Mars', 1),
 ('SEE', 1),
 ('Jim Bridenstine Oklahoma', 1),
 ('Senate', 1),
 ('Whether', 1),
 ('National Space Council', 1),
 ('Mike Pence', 1),
 ('Plankton Aerosol Cloud Ocean Ecosystem', 1),
 ('Verge', 1),
 ('Bigfoot Senator Ted Cruz Republican Texas', 1),
 ('Senate Subcommittee Space Science Competitiveness', 1),
 ('White House', 1),
 ('Scott Pace', 1),
 ('Mr.', 1),
 ('Mr. Pace', 1),
 ('United States', 1),
 ('Washington Post', 1),
 ('ISS', 1),
 ('Tommy Sanford', 1),
 ('Commercial Spaceflight Federation', 1),
 ('Mr. Sanford', 1),
 ('International Space Station', 1),
 ('John Elbon', 1),
 ('Boeing', 1),
 ('Moon Express Astrobotic', 1),
 ('Blue 