### Importing standard modules

In [1]:
#Reading input
import gzip, warc

#Preprocess and NER Tag
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.tag.stanford import StanfordNERTagger 
from segtok.segmenter import split_single 

#Elasticsearch 
import elasticsearch,requests,os

#Other utilities (not necessarily required)
import numpy as np
#from tqdm import tqdm
import time

In [2]:
# Flair is currently imported mainly for its built-in embeddings; those could also be downloaded separately.
import flair, torch
flair.device = torch.device('cpu')  # Force Flair onto the CPU even when a GPU is available

from flair.data import Sentence
from flair.models import SequenceTagger

I1203 22:15:14.150814 14656 file_utils.py:39] PyTorch version 1.2.0+cu92 available.
I1203 22:15:14.334442 14656 modeling_xlnet.py:194] Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


### Reading and extracting info from warc file

In [3]:
# Convert every WARC record's payload from HTML to plain text with BeautifulSoup.
# The three lists are index-aligned: cleantexts[i], url_list[i] and doc_ids[i]
# all describe the same record.
# They are created before the file is opened so that a failed open cannot leave
# stale lists from an earlier run of this cell in the kernel.
cleantexts = []
url_list = []
doc_ids = []

with gzip.open('../data/sample.warc.gz', mode='rb') as gzf:
    for i, record in enumerate(warc.WARCFile(fileobj=gzf)):
        # The first record of the archive is skipped
        # (presumably the warcinfo metadata record -- TODO confirm).
        if i == 0:
            continue

        # NOTE(review): the payload is parsed as-is. The "HTTP/1.1 200" and header-date
        # entities in the NER outputs of this notebook suggest it still contains the
        # HTTP response headers -- confirm, and strip them before parsing if so.
        cleantexts.append(BeautifulSoup(record.payload.read(), 'lxml').text)
        url_list.append(record.header.get('WARC-Target-URI'))
        # .get() yields None for records without a WARC-TREC-ID header.
        doc_ids.append(record.header.get('WARC-TREC-ID'))

        # To iterate faster on a subset of the documents, stop early here, e.g.:
        #if len(cleantexts) >= 560:
        #    break

In [4]:
# Number of texts extracted from the WARC file (one per record kept)
len(cleantexts)

1464

### Additional filtering of text

### Named Entity Recognition (NER) - Different taggers

#### NLTK way of NER (fastest so far and reasonable?)

In [26]:
''' 
Before using this method for the first time, run the following to download the required NLTK data:

import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
'''

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marvi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\marvi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\marvi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\marvi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

In [30]:
# Time NLTK NER on the first document: tokenize -> POS-tag -> ne_chunk, keeping only
# the chunks that carry a label (the named-entity chunks).
start = time.time()
l = {(' '.join(c[0] for c in chunk), chunk.label() ) for chunk in ne_chunk(pos_tag(word_tokenize(cleantexts[0]))) if hasattr(chunk, 'label') }
end = time.time()
print(end - start)  # elapsed wall-clock seconds
# Display the resulting set of (entity text, entity label) pairs
l

1.0169801712036133


{('CDATA', 'ORGANIZATION'),
 ('Flash Player', 'PERSON'),
 ('France', 'GPE'),
 ('Luke Morton', 'PERSON'),
 ('Ocular Professor', 'PERSON'),
 ('Photos', 'PERSON'),
 ('Pool', 'PERSON'),
 ('Roy Tanck', 'PERSON'),
 ('SWFObject', 'ORGANIZATION'),
 ('Tunis Tunisia', 'PERSON'),
 ('Unix', 'ORGANIZATION'),
 ('WordPress', 'ORGANIZATION'),
 ('allowScriptAccess', 'ORGANIZATION'),
 ('xtWP Cumulus Flash', 'ORGANIZATION')}

In [5]:
def tag_with_NLTK(text):
    """Run NLTK's tokenize -> POS-tag -> ne_chunk pipeline on `text`.

    Returns a set of (entity text, entity label) tuples. The entity text is
    the first element of each item in a labelled chunk, joined by single spaces.
    """
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    found = set()
    for node in chunked:
        # Only nodes exposing `label` are entity chunks; everything else is skipped.
        if not hasattr(node, 'label'):
            continue
        surface = ' '.join(leaf[0] for leaf in node)
        found.add((surface, node.label()))
    return found
    

#### Stanford NER

In [4]:
# Paths to the Stanford NER jar and the English 3-class classifier, relative to this notebook
jar = '../stanford-ner-2018/stanford-ner.jar'
model = '../stanford-ner-2018/classifiers/english.all.3class.distsim.crf.ser.gz'

# Prepare the NER tagger with the English model
ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

# Tag every document (disabled). NOTE: `label` is not defined in this comprehension,
# so the line below would raise a NameError if it were uncommented as-is.
#tagged_docs = [ner_tagger.tag(word_tokenize(text)) for text in cleantexts if label != "O"]

# Same, with a progress bar (requires the tqdm import, which is commented out at the top)
#tagged_docs = [ner_tagger.tag(word_tokenize(text)) for text in tqdm(cleantexts)]

In [31]:
# Time Stanford NER on the word tokens of the second document
start = time.time()
l1 = ner_tagger.tag(word_tokenize(cleantexts[1]))
end = time.time()
print(end - start)  # elapsed wall-clock seconds

# Stanford tagging, word based: print every token whose label is not "O"
for word,label in l1:
    if label != "O":
        print(word,label)

2.8749115467071533
France LOCATION
France LOCATION
Tunis LOCATION
Tunisia LOCATION
Roy PERSON
Tanck PERSON
Luke PERSON
Morton PERSON


In [5]:
# Stanford tagging, sentence based: the sentences produced by split_single are handed
# to the tagger in place of word tokens; items whose label is not "O" are printed.
l2 = ner_tagger.tag(split_single(cleantexts[1]))
for word,label in l2:
    if label != "O":
        print(word,label)

France LOCATION
France LOCATION


In [5]:
def tag_with_stanford(text, level="word",
                      jar='../stanford-ner-2018/stanford-ner.jar',
                      model='../stanford-ner-2018/classifiers/english.all.3class.distsim.crf.ser.gz'):
    """Tag `text` with the Stanford NER tagger and return the entities found.

    Parameters
    ----------
    text : str
        Text to tag.
    level : {"word", "sent"}
        "word" tags the output of `word_tokenize(text)`; "sent" passes the
        sentences from `split_single(text)` to the tagger as its items.
    jar : str
        Path to the Stanford NER jar.
    model : str
        Path to the classifier model (default: the English 3-class model).

    Returns
    -------
    set of (str, str)
        (item, label) pairs for every tagged item whose label is not "O".

    Raises
    ------
    ValueError
        If `level` is neither "word" nor "sent".
    """
    # Validate first so an unsupported `level` fails with a clear message instead
    # of an UnboundLocalError on `tagged_text` after the tagger has been set up.
    if level == "word":
        split_text = word_tokenize
    elif level == "sent":  # Sentence based
        split_text = split_single
    else:
        raise ValueError('level must be "word" or "sent", got %r' % (level,))

    # Prepare NER tagger (default: English model with 3 classes)
    ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')
    tagged_text = ner_tagger.tag(split_text(text))

    # Keep only the items that were tagged as an entity.
    return {(word, label) for word, label in tagged_text if label != "O"}

# Example: word-level Stanford entities for the second document
# (pass "sent" as the second argument for sentence-level tagging)
tags = tag_with_stanford(cleantexts[1])#,"sent")
tags

{('Luke', 'PERSON'),
 ('Morton', 'PERSON'),
 ('Roy', 'PERSON'),
 ('Tanck', 'PERSON'),
 ('Tunis', 'LOCATION'),
 ('Tunisia', 'LOCATION')}

#### Flair NER

In [5]:
# Pick one of the pre-trained Flair NER models; the first use of a model downloads it into the Flair cache.
#tagger = SequenceTagger.load('ner')  # 4-class model with a large number of layers
#tagger = SequenceTagger.load('ner-fast') # Smaller model with half the dimensions of the one above; faster on CPU
tagger = SequenceTagger.load('ner-ontonotes-fast') # Fast version of the 18-class NER model

2019-11-29 14:50:37,294 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/models-v0.4/release-ner-ontonotes-fast-0/en-ner-ontonotes-fast-v0.4.pt not found in cache, downloading to C:\Users\marvi\AppData\Local\Temp\tmp26txz74x


100%|██████████████████████████████████████████████████████████████| 1331379415/1331379415 [05:32<00:00, 3998194.53B/s]


2019-11-29 14:56:10,530 copying C:\Users\marvi\AppData\Local\Temp\tmp26txz74x to cache at C:\Users\marvi\.flair\models\en-ner-ontonotes-fast-v0.4.pt
2019-11-29 14:56:12,092 removing temp file C:\Users\marvi\AppData\Local\Temp\tmp26txz74x
2019-11-29 14:56:12,216 loading file C:\Users\marvi\.flair\models\en-ner-ontonotes-fast-v0.4.pt


In [15]:
# Split the second document into sentences and wrap each non-empty one in a Flair Sentence.
sentences = [Sentence(sent, use_tokenizer=True) for sent in split_single(cleantexts[1]) if len(sent) > 0]
# Variant that passes use_tokenizer=False instead:
#sentences = [Sentence(sent, use_tokenizer=False) for sent in split_single(cleantexts[1]) if len(sent) > 0]

In [10]:
# Number of Flair sentences built from the document
len(sentences)

105

In [16]:
# Measured cost of tagging one document, per model and tokenizer setting:
# "ner", with tokenizer:    324 s, 9-10 GB of memory
# "ner", without tokenizer: 258 s, 5-6 GB of memory

# Probably the best option for now:
# "ner-fast", with tokenizer:    88 s, 3.5 GB of memory
# "ner-fast", without tokenizer: 71 s, 3.5 GB of memory

# "ner-ontonotes-fast", with tokenizer:    68 s, 6 GB of memory
# "ner-ontonotes-fast", without tokenizer: 49 s, about 3 GB of memory (uncertain)

# Time the prediction over all sentences of the document
start = time.time()
tag_test = tagger.predict(sentences)
end = time.time()
print(end - start)  # elapsed wall-clock seconds

2019-11-29 15:02:12,623 Ignore 2 sentence(s) with no tokens.
67.73403835296631


In [33]:
# Results of the "ner-fast" model: print every NER span of every sentence
for sentence in sentences:
    for entity in sentence.get_spans("ner"):
        print(entity)

"ISC-span [7]: "GMT
"ISC-span [7]: "GMT
LOC-span [2]: "France"
ORG-span [1]: "News"
LOC-span [1]: "France"
ORG-span [1]: "la"
LOC-span [1]: "Tunis"
LOC-span [1]: "Tunisia"
ORG-span [7,8]: "Roy Tanck"
PER-span [10,11]: "Luke Morton"
ORG-span [13,14]: "Flash Player"
LOC-span [3]: "'photobabble'"
ORG-span [3]: "Ocular"
ORG-span [8]: "WordPress"


In [89]:
# Print the NER entity dictionaries of every sentence that has at least one entity
for sent in sentences:
    found_entities = sent.to_dict(tag_type='ner')["entities"]
    if len(found_entities) > 0:
        print(found_entities)

[{'text': 'GMT', 'start_pos': 32, 'end_pos': 35, 'type': 'MISC', 'confidence': 0.999241828918457}]
[{'text': 'GMT', 'start_pos': 35, 'end_pos': 38, 'type': 'MISC', 'confidence': 0.9993075132369995}]
[{'text': 'France', 'start_pos': 5, 'end_pos': 11, 'type': 'LOC', 'confidence': 0.9995354413986206}]
[{'text': 'News', 'start_pos': 0, 'end_pos': 4, 'type': 'ORG', 'confidence': 0.8299650549888611}]
[{'text': 'France', 'start_pos': 0, 'end_pos': 6, 'type': 'LOC', 'confidence': 0.9995354413986206}]
[{'text': 'la', 'start_pos': 0, 'end_pos': 2, 'type': 'ORG', 'confidence': 0.574532687664032}]
[{'text': 'Tunis', 'start_pos': 0, 'end_pos': 5, 'type': 'LOC', 'confidence': 0.9505420923233032}]
[{'text': 'Tunisia', 'start_pos': 0, 'end_pos': 7, 'type': 'LOC', 'confidence': 0.9992771744728088}]
[{'text': 'Roy Tanck', 'start_pos': 32, 'end_pos': 41, 'type': 'PER', 'confidence': 0.5057555437088013}, {'text': 'Luke Morton', 'start_pos': 46, 'end_pos': 57, 'type': 'PER', 'confidence': 0.941348731517791

In [14]:
# Results of "ner-ontonotes-fast" without the tokenizer
# (presumably use_tokenizer=False, judging by the token spans in the output -- confirm)
for sentence in sentences:
    for entity in sentence.get_spans("ner"):
        print(entity)

CARDINAL-span [1,2]: "HTTP/1.1 200"
DATE-span [3,4,5]: "10 Feb 2012"
TIME-span [6]: "22:50:40"
DATE-span [2,3,4,5]: "Thu, 19 Nov 1981"
TIME-span [6]: "08:52:00"
GPE-span [2]: "France"
CARDINAL-span [5]: "2012-02-10"
PERSON-span [1]: "shaggyshoo"
GPE-span [1]: "France"
TIME-span [5]: "16:22:52"
DATE-span [1,2,3]: "February 10th, 2012"
CARDINAL-span [1]: "one"
GPE-span [1]: "Tunis"
GPE-span [1]: "Tunisia"
ORG-span [2]: "Cumulus"
PERSON-span [7,8]: "Roy Tanck"
PERSON-span [10,11]: "Luke Morton"
TIME-span [4]: "9/1/10"


In [21]:
# Results of "ner-ontonotes-fast" with the tokenizer
# (presumably use_tokenizer=True, judging by the token spans in the output -- confirm)
for sentence in sentences:
    for entity in sentence.get_spans("ner"):
        print(entity)

DATE-span [5,6,7]: "10 Feb 2012"
TIME-span [8,9]: "22:50:40 GMT"
PERSON-span [3]: "Thu"
DATE-span [5,6,7]: "19 Nov 1981"
TIME-span [8,9]: "08:52:00 GMT"
GPE-span [1]: "France"
TIME-span [5]: "16:22:52"
CARDINAL-span [7]: "1"
PERSON-span [1]: "shaggyshoo"
DATE-span [5,6,7,8,9]: "11 / 2 / 08"
CARDINAL-span [7]: "234"
CARDINAL-span [7]: "60"
GPE-span [3]: "france"
GPE-span [1]: "France"
TIME-span [5]: "16:22:52"
DATE-span [1,2,3,4]: "February 10th , 2012"
CARDINAL-span [1]: "one"
GPE-span [1]: "Tunis"
GPE-span [1]: "Tunisia"
ORG-span [2]: "Cumulus"
PRODUCT-span [3]: "Flash"
PERSON-span [7,8]: "Roy Tanck"
PERSON-span [10,11]: "Luke Morton"
CARDINAL-span [53]: "200"
PERCENT-span [172,173]: "3A %"
PERCENT-span [180,181]: "2F2008 %"
PERCENT-span [190,191]: "27tag-link-1033 %"
CARDINAL-span [198]: "27515"
PERCENT-span [212,213,214,215,216,217,218,219,220,221]: "12.3513513514pt % 3B % 27 % 3E2008 % 3C %"
PERCENT-span [226,227]: "0A %"
PERCENT-span [236,237,238,239]: "3A % 2F %"
PERCENT-span [24

### Use elasticsearch

In [6]:
# Start the local Elasticsearch server before executing this cell
def do_elasticsearch(query="Vrije Universiteit",domain="localhost:9200",print_output=False,extended_info=False):
    """Look up `query` in the Elasticsearch instance at `domain`.

    With extended_info=True the `/freebase/label/_search` endpoint is queried
    directly (up to 1000 results) and the raw list found under hits -> hits in
    the JSON response is returned (an empty list when those keys are absent).

    Otherwise the result of `elasticsearch.search(domain, query)` is returned
    unchanged; it must support `.items()` when print_output is set (it appears
    to map a Freebase id to a set of labels -- confirm against that helper).
    When `print_output` is true each (id, labels) pair is printed first.
    `print_output` has no effect when `extended_info` is true.
    """
    if not extended_info:
        matches = elasticsearch.search(domain, query)
        if print_output:
            for freebase_id, names in matches.items():
                print(freebase_id, names)
        return matches

    endpoint = 'http://%s/freebase/label/_search' % domain
    payload = requests.get(endpoint, params={'q': query, 'size': 1000}).json()
    outer_hits = payload.get('hits', {})
    return outer_hits.get('hits', [])


# Example lookup with the default query, printing every (id, labels) pair returned
id_labels = do_elasticsearch(print_output=True)

/m/01vyty {'Vrije Universiteit', 'Vrije University', 'Vrije universiteit Amsterdam', 'vrije universiteit, amsterdam', 'Vrije Universiteit Amsterdam', 'VRIJE UNIVERSITEIT'}
/m/0225gp {'Vrije', 'Vrije Universiteit brussel', 'Vrije Universiteit', 'Vrije UNIVERSITEIT brussel', 'Vrije Universiteit Brussels', 'Vrije universiteit Brussel', 'Vrije universiteit brussel', 'Vrije University Brussel', 'Vrije Universiteit Brussel'}
/m/05md62 {'Vrije Universiteit'}
/m/07y10c {'Vrije Universiteit Amsterdam'}
/m/0225ty {'Vrije Universiteit Brussel'}
/m/02nbdg {'Universiteit Maastricht'}
/m/0lvng {'universiteit leiden', 'Universiteit Leiden', 'Universiteit van Leiden'}
/m/05c1q2 {'Erasmus Universiteit', 'Erasmus Universiteit Rotterdam', 'Universiteit van Amsterdam'}
/m/07w6r {'Universiteit Utrecht', 'Universiteit van Utrecht'}
/m/0419j9 {'Open Universiteit Nederland', 'Open Universiteit in the Netherlands', 'Open universiteit', 'Open Universiteit'}
/m/03gf8x {'Universiteit Antwerpen'}
/m/0556m4 {'Unive

In [16]:
# Query the label-search endpoint directly; this is the same request that
# do_elasticsearch(..., extended_info=True) issues.
domain = "localhost:9200"
url = 'http://%s/freebase/label/_search' % domain
query = "Vrije Universiteit"
response = requests.get(url, params={'q': query, 'size':1000})

In [17]:
# NOTE: rebinding `response` to the decoded JSON makes this cell non-idempotent --
# running it a second time fails because the decoded object has no .json().
response = response.json()
# Display the list of hits (an empty list when the keys are missing)
response.get('hits', {}).get('hits', [])
# The '_score' field of a hit holds that hit's score

[{'_index': 'cw12',
  '_type': 'label',
  '_id': 'AWeJWGvhywBEYNF7f9KO',
  '_score': 9.265863,
  '_source': {'label': 'Vrije Universiteit', 'resource': '/m/01vyty'}},
 {'_index': 'cw12',
  '_type': 'label',
  '_id': 'AWeJRNWFywBEYNF7LJ1v',
  '_score': 8.682977,
  '_source': {'resource': '/m/01vyty', 'label': 'Vrije Universiteit'}},
 {'_index': 'cw12',
  '_type': 'label',
  '_id': 'AWeJTiOHywBEYNF7ZOp2',
  '_score': 8.682977,
  '_source': {'resource': '/m/0225gp', 'label': 'Vrije Universiteit'}},
 {'_index': 'cw12',
  '_type': 'label',
  '_id': 'AWeJTAM8ywBEYNF7WFhE',
  '_score': 8.682977,
  '_source': {'resource': '/m/05md62', 'label': 'Vrije Universiteit'}},
 {'_index': 'cw12',
  '_type': 'label',
  '_id': 'AWeJTAM8ywBEYNF7WFhF',
  '_score': 8.682977,
  '_source': {'resource': '/m/01vyty', 'label': 'VRIJE UNIVERSITEIT'}},
 {'_index': 'cw12',
  '_type': 'label',
  '_id': 'AWeJTiOHywBEYNF7ZOp6',
  '_score': 8.340757,
  '_source': {'resource': '/m/07y10c',
   'label': 'Vrije Universiteit

In [19]:
# Inspect the first hit; append ["_source"]["resource"] to get only its Freebase id.
# Relies on `response` already holding the decoded JSON from the previous cell.
l = response.get('hits', {}).get('hits', [])
l[0]#["_source"]["resource"]

{'_index': 'cw12',
 '_type': 'label',
 '_id': 'AWeJWGvhywBEYNF7f9KO',
 '_score': 9.265863,
 '_source': {'label': 'Vrije Universiteit', 'resource': '/m/01vyty'}}

### Use trident

### Apply some heuristics and predict

In [7]:
# Uses the GloVe embeddings built into Flair (downloaded on first use).
# Not a good choice for the final system -- only used here for testing.
from flair.embeddings import WordEmbeddings
glove_embedding = WordEmbeddings('glove')

import torch  # already imported in the Flair setup cell near the top
# Cosine similarity between two 1-D embedding vectors (dim=0)
cosine_similarity = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
best_id_key = None

# Alternative without torch: the sklearn cosine similarity
#from sklearn.metrics.pairwise import cosine_similarity
#print(cosine_similarity(df, df))

# Hyperparameter: how many Elasticsearch results to compare
k = 5

# Which tagger the prediction loop uses.
# NOTE: this rebinds `tagger`, which the Flair cells use for the loaded SequenceTagger;
# after running this cell those Flair cells no longer work until the model is reloaded.
# NOTE(review): the prediction loop only has branches for "NLTK" and "Stanford".
tagger = "NLTK"
#tagger = "Stanford"
#tagger = "Flair"

I1202 17:59:51.701412 10852 utils.py:422] loading Word2VecKeyedVectors object from C:\Users\marvi\.flair\embeddings\glove.gensim
I1202 17:59:52.370649 10852 utils.py:461] loading vectors from C:\Users\marvi\.flair\embeddings\glove.gensim.vectors.npy with mmap=None
I1202 17:59:52.549958 10852 utils.py:494] setting ignored attribute vectors_norm to None
I1202 17:59:52.551953 10852 utils.py:428] loaded C:\Users\marvi\.flair\embeddings\glove.gensim


In [9]:

# Entity linking over all documents: tag each text, look every entity up in
# Elasticsearch and write one "<doc id>\t<entity>\t<freebase id>" line per match
# to predictions.txt.

# Fail before touching predictions.txt when the configured tagger has no branch
# below; otherwise `set_entities_tag` would be unbound (or stale from a previous
# run of this cell).
if tagger not in ("NLTK", "Stanford"):
    raise ValueError('Unsupported tagger %r; expected "NLTK" or "Stanford"' % (tagger,))

start = time.time()

# The context manager guarantees the file is flushed and closed even when
# tagging or the Elasticsearch lookup raises part-way through the run.
with open("predictions.txt", "w", encoding="utf-8") as predictions_file:

    #Document level
    for doc_idx, text in enumerate(cleantexts):
        print(doc_idx)
        doc_key = doc_ids[doc_idx]

        if tagger == "NLTK":
            set_entities_tag = tag_with_NLTK(text)
        else:  # "Stanford"
            set_entities_tag = tag_with_stanford(text)

        #Go through each entity found
        for word, label in set_entities_tag:

            if len(word) < 1:
                continue

            #Make embedding of entity. Naive method: only the embedding of the first
            #token is used, and the embedding should probably not come from Flair.
            entity = Sentence(word)
            glove_embedding.embed(entity)
            emb_ner_word = entity[0].embedding

            #Only entities whose embedding is not the zero vector are looked up
            if not np.count_nonzero(emb_ner_word):
                continue

            id_labels = do_elasticsearch(word)
            if len(id_labels) == 0:
                continue

            #Only taking the first result
            best_id_key = next(iter(id_labels))

            #Disabled alternative: take the top k results and pick the id whose
            #labels have the highest average cosine similarity to the entity embedding.
            #
            #highest_value = -1
            #for i,(freebase_id,freebase_labels) in enumerate(id_labels.items()):
            #    sum_cos_sim = 0
            #    for query_result_name in freebase_labels:
            #        name = Sentence(query_result_name)
            #        glove_embedding.embed(name)
            #        emb_name = name[0].embedding
            #        cos_sim_value = cosine_similarity(emb_ner_word,emb_name).item()
            #        sum_cos_sim += cos_sim_value
            #    avg_cos_sim = sum_cos_sim / len(freebase_labels)
            #    if avg_cos_sim > highest_value:
            #        best_id_key = freebase_id
            #        highest_value = avg_cos_sim
            #    if i == k:
            #        break

            predictions_file.write(doc_key + '\t' + word + '\t' + best_id_key + "\n")

end = time.time()
print(end - start)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

### Compute score

In [10]:

# To run the scorer manually in a terminal:
#python score.py ../data/sample.annotations.tsv ./predictions.txt

# Run the scorer through the shell and redirect its output to results.txt.
# os.system returns the command's exit status, which is what this cell displays.
# NOTE(review): `python` is resolved by the shell and may not be the interpreter
# this kernel is running on.
os.system("python score.py ../data/sample.annotations.tsv ./predictions.txt > ./results.txt ")

0