Commit

fine tune paths and readme

amagge committed Jun 28, 2018
1 parent 9a9908b commit fbd9729
Showing 8 changed files with 46 additions and 116 deletions.
2 changes: 0 additions & 2 deletions .gitignore
@@ -101,12 +101,10 @@ ENV/
.mypy_cache/

# Custom project stuff
data/
pkl/
.pkl
runs/
.vscode
model/
resources/
trash/
out/
92 changes: 24 additions & 68 deletions README.md
@@ -1,75 +1,31 @@
# ner-topo-ff
Named Entity Recognizer (NER) for entity extraction using a feedforward deep neural network and distance supervision
# semeval-ffnn-baseline
This project presents a baseline system for Task 12, i.e. the Named Entity Recognition (NER) and Concept Resolution subtasks, using a 2-layer feedforward neural network.

Requirements:
tensorflow
numpy
argparse
gensim

To run:
python ff_model.py

Argparse prompt

usage: ff_model.py [-h] [--train TRAIN] [--test TEST] [--val VAL]
[--dist DIST] [--pubdir PUBDIR] [--outdir OUTDIR]
[--emb_loc EMB_LOC] [--embvocab EMBVOCAB]
[--hid_dim HID_DIM] [--lrn_rate LRN_RATE]
[--feat_cap FEAT_CAP] [--feat_dict FEAT_DICT]
[--dropout DROPOUT] [--window_size WINDOW_SIZE]
[--dist_epochs DIST_EPOCHS] [--train_epochs TRAIN_EPOCHS]
[--eval_interval EVAL_INTERVAL] [--n_classes {2,3}]
[--batch_size BATCH_SIZE] [--restore RESTORE] [--save SAVE]

optional arguments:
-h, --help show this help message and exit
--train _TRAIN_ train file location
--test _TEST_ test file location
--val _VAL_ val file location
--dist _DIST_ distance supervision files dir.
--pubdir _PUBDIR_ pubmed files dir containing the production set.
--outdir _OUTDIR_ Output dir for ffmodel annotated pubmed files.
--emb_loc _EMB_LOC_ word2vec embedding location
--embvocab _EMBVOCAB_ load top n words in word emb
--hid_dim _HID_DIM_ dimension of hidden layers
--lrn_rate _LRN_RATE_ learning rate
--feat_cap _FEAT_CAP_ Capitalization feature
--feat_dict _FEAT_DICT_ Dictionary feature
--dropout _DROPOUT_ dropout probability
--window_size _WINDOW_SIZE_ context window size - 3/5/7
--dist_epochs _DIST_EPOCHS_ number of distsup epochs
--train_epochs _TRAIN_EPOCHS_ number of train epochs
--eval_interval _EVAL_INTERVAL_ evaluate once in _ epochs
--n_classes _{2,3}_ number of classes
--batch_size _BATCH_SIZE_ batch size of training
--restore _RESTORE_ path of saved model
--save _SAVE_ path to save model
Dependencies:
1) ```python```
2) ```geonames-services``` for disambiguation and normalization

Input files:
Requirements:
1) A directory containing BRAT-annotated files, i.e. corpus files with article texts (.txt) and their corresponding annotation files (.ann). You can extract the training files from the provided dataset and place the .ann and .txt files in the ```data/train``` directory.
2) A file containing word embeddings, i.e. word vectors that can be loaded with gensim. You can download word embeddings trained on PubMed and Wikipedia articles from http://bio.nlplab.org/ and place the .bin file in the ```resources``` directory (see the sketch below).
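
A minimal sketch, not part of this commit, of loading such a .bin file with gensim's KeyedVectors (the filename matches the training command further down and is otherwise an assumption):
```
from gensim.models.keyedvectors import KeyedVectors

# Load the binary word2vec file; this can take a while and several GB of RAM.
emb = KeyedVectors.load_word2vec_format(
    "resources/wikipedia-pubmed-and-PMC-w2v.bin", binary=True)
print(emb.vector_size)       # embedding dimensionality
print(emb["influenza"][:5])  # first few components of one word vector
```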

Annotated input is expected as a file with one token per line along with its annotation (B/I/O or I/O), separated by a tab, as in the example below; a small reader sketch follows the training steps.
Install dependencies:
```
pip install --upgrade -r requirements.txt
```

To train the model:
1) Create the files required for training by running the following command:
```
Overall O
, O
these O
results O
indicate O
widespread O
human-to-animal O
transmission O
of O
pandemic O
( O
H1N1 O
) O
2009 O
influenza O
viruses O
in O
South B
Korea I
. O
python gen_training_files.py -t data/train -e resources/wikipedia-pubmed-and-PMC-w2v.bin -o resources/
```

2) Train the model by running the following command:
```
python ffnn_train.py
```

3) To annotate files using the trained model, run the following command:
```
python ffnn_run.py
```
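
For reference, a minimal sketch, not part of this commit, of reading token/label files in the format shown in the example above (the path follows the val/test defaults in ffnn_train.py and is an assumption):
```
def read_io_file(path):
    '''Yield (token, label) pairs from a file with one "token<TAB>label" per line.'''
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            # split on whitespace so either tabs or spaces work
            token, label = line.split()
            yield token, label

for token, label in read_io_file("data/io/train-io.txt"):
    print(token, label)
```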
3 changes: 3 additions & 0 deletions data/.gitignore
@@ -0,0 +1,3 @@
# ignore everything except .gitignore just to keep data directory
*
!.gitignore
13 changes: 0 additions & 13 deletions ffnn_train.py
@@ -123,13 +123,6 @@ def evaluate(tokens, instances, labels, write_result=False):
saver.restore(sess, save_loc)
print("Model from {} restored.".format(save_loc))
evaluate(test_t, test_v, test_l, True)
# load the pubmed files for annotation pubdir
# pub_files = [f for f in listdir(args.pubdir) if isfile(join(args.pubdir, f))]
# for _, pubfile in enumerate(pub_files):
# pub_t, pub_v = get_input_pub(args, word_emb, join(args.pubdir, pubfile))
# prediction = sess.run(model.pred, feed_dict={model.input_x: np.asarray(pub_v),
# model.dropout: 1.0})
# write_pred_and_entities(args, pub_t, prediction, pubfile.replace(".txt", ""))

def main():
'''Main method : parse input arguments and train'''
@@ -141,12 +134,6 @@ def main():
help='test file location')
parser.add_argument('--val', type=str, default='data/io/val-io.txt',
help='val file location')
parser.add_argument('--dist', type=str, default='data/dist/',
help='distance supervision files dir.')
parser.add_argument('--pubdir', type=str, default='data/pubmed/',
help='pubmed files dir containing production set. ')
parser.add_argument('--outdir', type=str, default='out/pubmed/',
help='Output dir for ffmodel annotated pubmed files.')
# Word Embeddings
parser.add_argument('--emb_loc', type=str, default="model/word-embeddings.pkl",
help='word2vec embedding location')
17 changes: 7 additions & 10 deletions gen_training_files.py
@@ -6,8 +6,8 @@
import random
import re
import sys
from os import listdir
from os.path import join
from os import listdir, makedirs
from os.path import join, exists
import codecs
import numpy as np
from gensim.models import Word2Vec
@@ -107,8 +107,6 @@ def load_test_data(train_dir):
def create_embeddings(args):
'''Create embeddings object and dump pickle for use in subsequent models'''
vocab = load_train_data(args.train_corpus)
test_vocab = load_test_data(args.test_corpus)
vocab = vocab.union(test_vocab)
print("Total vocab:", len(vocab))
print("Loading word embeddings:", args.emb_loc)
unk_words = set()
@@ -120,6 +118,8 @@ def create_embeddings(args):
except KeyError:
unk_words.add(word)
print("Number of unknown words:", len(unk_words))
if not exists(args.out_dir):
makedirs(args.out_dir)
# Dump dictionary pickle to disk
print("Dumping training files to", args.out_dir)
pickle.dump(wemb_dict, open(join(args.out_dir, WORDEMB_FILENAME), "wb"))
@@ -134,14 +134,11 @@ def main():
'''Main method : parse input arguments and train'''
parser = argparse.ArgumentParser()
# Input and Output paths
parser.add_argument('--train_corpus', type=str, default='data/train/',
parser.add_argument('-t', '--train_corpus', type=str, default='data/train/',
help='path to dir where training corpus files are stored')
parser.add_argument('--test_corpus', type=str, default='data/test/',
help='path to dir where training corpus files are stored')
parser.add_argument('--emb_loc', type=str,
default="data/PMC-w2v.bin",
parser.add_argument('-e', '--emb_loc', type=str,
help='path to the word2vec embedding location')
parser.add_argument('--out_dir', type=str, default='model/',
parser.add_argument('-o', '--out_dir', type=str, default='resources/',
help='output file containing minimal vocabulary')
args = parser.parse_args()
print(args)
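
With the output directory now created on demand, a minimal sketch, not part of this commit, of loading the dumped word-embedding dictionary back for inspection (the filename is an assumption based on the --emb_loc default in ffnn_train.py):
```
import pickle

# Load the {word: vector} dictionary written by gen_training_files.py.
with open("resources/word-embeddings.pkl", "rb") as handle:
    wemb_dict = pickle.load(handle)
print(len(wemb_dict), "words kept in the reduced vocabulary")
```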
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
tensorflow==1.5.0
numpy==1.14.0
gensim==2.1.0
3 changes: 3 additions & 0 deletions resources/.gitignore
@@ -0,0 +1,3 @@
# ignore everything except .gitignore just to keep resources directory
*
!.gitignore
29 changes: 6 additions & 23 deletions utils.py
@@ -1,14 +1,15 @@
"""Utility functions for loading datasets and computing performance"""
from __future__ import print_function
import os, codecs, io
import sys
from os.path import join, isfile
from random import random

import codecs
import cPickle as pickle
import re
import sys
from os.path import isfile, join

import numpy as np
from gensim.models.keyedvectors import KeyedVectors
import requests
from gensim.models.keyedvectors import KeyedVectors
from requests.utils import quote

GEONAMES_URL = "http://localhost:8091/location?location="
@@ -114,24 +115,6 @@ def tokenize_document(doc_path):
doc_vocab.add(word)
return doc_tokens, doc_vocab

def write_annotations(annotations, doc_path):
'''Write annotations to file'''
with open(doc_path, 'r') as myfile:
doc_text = myfile.read()
doc_vocab = set()
# Split sentences into words and create Token objects
doc_tokens = []
words = re.split(SPLIT_REGEX, doc_text)
words = [word.strip() for word in words if word.strip() != ""]
current_offset = 0
for word in words:
word_offset = doc_text.index(word, current_offset)
current_offset = current_offset + len(word)
doc_token = Token(word, word_offset, len(word), "O")
doc_tokens.append(doc_token)
doc_vocab.add(word)
return doc_tokens, doc_vocab

def get_namedentities(args, tokens, prediction):
'''Get list of named entities'''
assert len(tokens) == len(prediction)
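
The README lists ```geonames-services``` for disambiguation and normalization, and utils.py queries it through the GEONAMES_URL constant above; a minimal sketch, not part of this commit and assuming the service returns JSON, of issuing such a query:
```
import requests
from requests.utils import quote

GEONAMES_URL = "http://localhost:8091/location?location="

# Query the local geonames-services instance for a place name and print the
# raw response; the response schema is not shown in this commit.
response = requests.get(GEONAMES_URL + quote("South Korea"))
print(response.json())
```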
