In [1]:
# open the new dataset

import codecs, nltk

dataset = codecs.open("dataset.tsv", "r", "utf-8").read().strip().split("\n")


# how to quickly find an article from the dataset
for k in range(len(dataset)):
    article = dataset[k]
    if "Trump" in article and "Hillary" in article:
        print (article.split("\t")[1])
        print (k)
        break


Hillary Clinton diagnosed with pneumonia, cancels California campaign trip, 'Ellen' appearance
4


In [2]:
# named entity recognition

article = dataset[4].split("\t")[3]

# first step you tokenize (read documentation to know the input of NER)
article = nltk.word_tokenize(article)

# you use the pos-tagger (it gives you back a list of tuples (word,pos))
pos_article = nltk.pos_tag(article)

# then you use the NER library
ner = nltk.ne_chunk(pos_article)

print (ner)

(S
  Dr./NNP
  Lisa/NNP
  Bardack/NNP
  ,/,
  Clinton’s/NNP
  personal/JJ
  doctor/NN
  since/IN
  2001/CD
  ,/,
  released/VBN
  a/DT
  statement/NN
  through/IN
  the/DT
  (PERSON Clinton/NNP)
  campaign/NN
  which/WDT
  said/VBD
  the/DT
  former/JJ
  secretary/NN
  of/IN
  state/NN
  had/VBD
  been/VBN
  diagnosed/VBN
  with/IN
  pneumonia/NN
  during/IN
  a/DT
  follow-up/JJ
  examination/NN
  regarding/VBG
  her/PRP$
  prolonged/JJ
  cough/NN
  ./.
  Dr./NNP
  (PERSON Lisa/NNP R./NNP Bardack/NNP)
  ,/,
  M.D./NNP
  ,/,
  (PERSON Clinton/NNP)
  's/POS
  doctor/NN
  ,/,
  says/VBZ
  the/DT
  (ORGANIZATION Democratic/JJ)
  nominee/NN
  has/VBZ
  pneumonia/VBN
  ./.
  Full/JJ
  statement/NN
  :/:
  pic.twitter.com/qloLbhjdZy/NN
  (PERSON Clinton/NNP)
  has/VBZ
  been/VBN
  “advised/VBN
  to/TO
  rest/VB
  and/CC
  modify/VB
  her/PRP$
  schedule/NN
  ,/,
  ”/NN
  and/CC
  was/VBD
  put/VBN
  on/IN
  antibiotics/NNS
  on/IN
  Friday/NNP
  ,/,
  (PERSON Bardack/NNP)
  said/VBD
  ./.
  

In [3]:
ner = [x for x in ner if type(x) == nltk.tree.Tree]
print (ner)

[Tree('PERSON', [('Clinton', 'NNP')]), Tree('PERSON', [('Lisa', 'NNP'), ('R.', 'NNP'), ('Bardack', 'NNP')]), Tree('PERSON', [('Clinton', 'NNP')]), Tree('ORGANIZATION', [('Democratic', 'JJ')]), Tree('PERSON', [('Clinton', 'NNP')]), Tree('PERSON', [('Bardack', 'NNP')]), Tree('PERSON', [('Video', 'NNP')]), Tree('PERSON', [('Clinton', 'NNP')]), Tree('PERSON', [('Nick', 'NNP'), ('Merrill', 'NNP')]), Tree('GPE', [('Cappaqua', 'NNP')]), Tree('ORGANIZATION', [('PDT', 'NNP')]), Tree('ORGANIZATION', [('HRC', 'NNP')]), Tree('ORGANIZATION', [('Reuters', 'NNPS')]), Tree('PERSON', [('Clinton', 'NNP')]), Tree('GPE', [('California', 'NNP')]), Tree('PERSON', [('San', 'NNP'), ('Francisco', 'NNP')]), Tree('GPE', [('Los', 'NNP'), ('Angeles', 'NNP')]), Tree('GPE', [('Las', 'NNP'), ('Vegas', 'NNP')]), Tree('GPE', [('Nevada', 'NNP')]), Tree('ORGANIZATION', [('Democratic', 'JJ')]), Tree('PERSON', [('Jim', 'NNP'), ('Jatras', 'NNP')]), Tree('GSP', [('US', 'NNP')]), Tree('PERSON', [('Clinton’s', 'NNP')]), Tree('

In [4]:
import tagme
# Set the authorization token for subsequent calls.
tagme.GCUBE_TOKEN = "0ff2cd5e-79d7-4dac-b721-186a1a8df1ca-843339462"

article = dataset[4].split("\t")[3]

annotated_article = tagme.annotate(article)


In [5]:
# check the type
print (type(annotated_article))

<class 'tagme.AnnotateResponse'>


In [7]:
# read the documentation to learn other commands

for ann in annotated_article.get_annotations(0.1):
    print (ann)

Dr -> Democratic-Republican Party (score: 0.11574618518352509)
Clinton -> Bill Clinton (score: 0.18557091057300568)
statement -> Press release (score: 0.13940580189228058)
Clinton -> Hillary Clinton (score: 0.3210620880126953)
Clinton campaign -> Hillary Clinton presidential campaign, 2008 (score: 0.36386874318122864)
secretary of state -> United States Secretary of State (score: 0.25825050473213196)
pneumonia -> Pneumonia (score: 0.4892668128013611)
examination -> Physical examination (score: 0.24207685887813568)
cough -> Cough medicine (score: 0.2726736068725586)
M.D -> Maryland (score: 0.14633125066757202)
Clinton -> Bill Clinton (score: 0.18490441143512726)
doctor -> Physician (score: 0.11644203960895538)
Democratic -> Democratic Party (United States) (score: 0.22528484463691711)
nominee -> Candidate (score: 0.10345222800970078)
pneumonia -> Pneumonia (score: 0.29946744441986084)
twitter -> Twitter (score: 0.5765565633773804)
twitter.com -> Twitter (score: 0.40020275115966797)
Clin

In [8]:
# test with this
sent = tagme.annotate("Yesterday I watched the debate between Clinton and Sanders.")

# Print annotations with a score higher than 0.1
for ann in sent.get_annotations(0.1):
    print (ann)

# why is it still making mistakes?

debate -> United States presidential election debates (score: 0.1599879115819931)
Clinton -> Bill Clinton (score: 0.22683759033679962)
Sanders -> Bernie Sanders (score: 0.19481973350048065)


In [10]:
# computing entity relatedness
rels = tagme.relatedness_title(("Hillary Clinton", "Bernie Sanders"))
print ("Hillary and Bernie have a semantic relation of", rels.relatedness[0].rel)

rels = tagme.relatedness_title(("Bill Clinton", "Bernie Sanders"))
print ("Bill and Bernie have a semantic relation of", rels.relatedness[0].rel)

rels = tagme.relatedness_title(("Bill Clinton", "Hillary Clinton"))
print ("Bill and Hillary have a semantic relation of", rels.relatedness[0].rel)

Hillary and Bernie have a semantic relation of 0.7044501304626465
Bill and Bernie have a semantic relation of 0.6134611368179321
Bill and Hillary have a semantic relation of 0.7114993929862976


In [6]:
#homework: extract the most popular NERs (not entities!) from the entire corpus

import codecs, nltk

dataset = codecs.open("dataset.tsv", "r", "utf-8").read().strip().split("\n")


from collections import Counter

box = []

# how to quickly find an article from the dataset
for k in range(len(dataset)):
    article = dataset[k]
    if "Trump" in article and "Hillary" in article:
        article = dataset[4].split("\t")[3]
        
        # first step you tokenize (read documentation to know the input of NER)
        article = nltk.word_tokenize(article)

        # you use the pos-tagger (it gives you back a list of tuples (word,pos))
        pos_article = nltk.pos_tag(article)

        # then you use the NER library
        ner = nltk.ne_chunk(pos_article)
        
        ner = [x[0][0] for x in ner if type(x) == nltk.tree.Tree]
        
        for entity in ner:
            box.append(entity)
        


KeyboardInterrupt: 

In [7]:
count = Counter(box)

print (count.most_common(30))

[('Clinton', 913), ('Democratic', 332), ('Clinton’s', 249), ('Pneumonia', 249), ('Bernie', 249), ('HRC', 166), ('Jatras', 166), ('HillaryHealth', 166), ('Twitter', 166), ('HillarysHealth', 166), ('Donald', 166), ('Hillary', 166), ('Trump', 166), ('Lisa', 83), ('Bardack', 83), ('Video', 83), ('Nick', 83), ('Cappaqua', 83), ('PDT', 83), ('Reuters', 83), ('California', 83), ('San', 83), ('Los', 83), ('Las', 83), ('Nevada', 83), ('Jim', 83), ('US', 83), ('Campaign', 83), ('Media', 83), ('Democrats', 83)]
