Commit

fine tune paths and readme

amagge committed Jun 28, 2018
1 parent 9a9908b commit fbd9729
Showing 8 changed files with 46 additions and 116 deletions.
2 changes: 0 additions & 2 deletions .gitignore
@@ -101,12 +101,10 @@ ENV/
.mypy_cache/

# Custom project stuff
data/
pkl/
.pkl
runs/
.vscode
model/
resources/
trash/
out/
92 changes: 24 additions & 68 deletions README.md
@@ -1,75 +1,31 @@
# ner-topo-ff
Named Entity Recognizer (NER) for entity extraction using a feedforward deep neural network and distance supervision
# semeval-ffnn-baseline
This project presents a baseline system for Task 12, i.e. the Named Entity Recognition (NER) and Concept Resolution subtasks, using a 2-layer feedforward neural network.

Requirements:
tensorflow
numpy
argparse
gensim

To run:
python ff_model.py

Argparse prompt

usage: ff_model.py [-h] [--train TRAIN] [--test TEST] [--val VAL]
[--dist DIST] [--pubdir PUBDIR] [--outdir OUTDIR]
[--emb_loc EMB_LOC] [--embvocab EMBVOCAB]
[--hid_dim HID_DIM] [--lrn_rate LRN_RATE]
[--feat_cap FEAT_CAP] [--feat_dict FEAT_DICT]
[--dropout DROPOUT] [--window_size WINDOW_SIZE]
[--dist_epochs DIST_EPOCHS] [--train_epochs TRAIN_EPOCHS]
[--eval_interval EVAL_INTERVAL] [--n_classes {2,3}]
[--batch_size BATCH_SIZE] [--restore RESTORE] [--save SAVE]

optional arguments:
-h, --help show this help message and exit
--train _TRAIN_ train file location
--test _TEST_ test file location
--val _VAL_ val file location
--dist _DIST_ distance supervision files dir.
--pubdir _PUBDIR_ pubmed files dir containing the production set.
--outdir _OUTDIR_ Output dir for ffmodel annotated pubmed files.
--emb_loc _EMB_LOC_ word2vec embedding location
--embvocab _EMBVOCAB_ load top n words in word emb
--hid_dim _HID_DIM_ dimension of hidden layers
--lrn_rate _LRN_RATE_ learning rate
--feat_cap _FEAT_CAP_ Capitalization feature
--feat_dict _FEAT_DICT_ Dictionary feature
--dropout _DROPOUT_ dropout probability
--window_size _WINDOW_SIZE_ context window size - 3/5/7
--dist_epochs _DIST_EPOCHS_ number of distsup epochs
--train_epochs _TRAIN_EPOCHS_ number of train epochs
--eval_interval _EVAL_INTERVAL_ evaluate once in _ epochs
--n_classes _{2,3}_ number of classes
--batch_size _BATCH_SIZE_ batch size of training
--restore _RESTORE_ path of saved model
--save _SAVE_ path to save model
Dependencies:
1) ```python```
2) ```geonames-services``` for disambiguation and normalization

Input files:
Requirements:
1) A directory containing BRAT-annotated files, i.e. corpus files with article texts (.txt) and their corresponding annotation files (.ann). You can extract the training files from the provided dataset and place the .ann and .txt files in the ```data/train``` directory.
2) A file containing word embeddings, i.e. word vectors that can be loaded with gensim. You can download word embeddings trained on PubMed and Wikipedia articles from http://bio.nlplab.org/ and place the .bin file in the ```resources``` directory (see the sketch below).
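
A minimal sketch, not part of this commit, of loading such a .bin file with gensim's KeyedVectors (the filename matches the training command further down and is otherwise an assumption):
```
from gensim.models.keyedvectors import KeyedVectors

# Load the binary word2vec file; this can take a while and several GB of RAM.
emb = KeyedVectors.load_word2vec_format(
    "resources/wikipedia-pubmed-and-PMC-w2v.bin", binary=True)
print(emb.vector_size)       # embedding dimensionality
print(emb["influenza"][:5])  # first few components of one word vector
```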

Annotated input is expected as a file with one token per line along with its annotation (B/I/O or I/O), separated by a tab, as in the example below; a small reader sketch follows the training steps.
Install dependencies:
```
pip install --upgrade -r requirements.txt
```

To train the model:
1) Create the files required for training by running the following command:
```
Overall O
, O
these O
results O
indicate O
widespread O
human-to-animal O
transmission O
of O
pandemic O
( O
H1N1 O
) O
2009 O
influenza O
viruses O
in O
South B
Korea I
. O
python gen_training_files.py -t data/train -e resources/wikipedia-pubmed-and-PMC-w2v.bin -o resources/
```

2) Train the model by running the following command:
```
python ffnn_train.py
```

3) To annotate files using the trained model, run the following command:
```
python ffnn_run.py
```
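
For reference, a minimal sketch, not part of this commit, of reading token/label files in the format shown in the example above (the path follows the val/test defaults in ffnn_train.py and is an assumption):
```
def read_io_file(path):
    '''Yield (token, label) pairs from a file with one "token<TAB>label" per line.'''
    with open(path) as handle:
        for line in handle:
            line = line.strip()
            if not line:
                continue
            # split on whitespace so either tabs or spaces work
            token, label = line.split()
            yield token, label

for token, label in read_io_file("data/io/train-io.txt"):
    print(token, label)
```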
3 changes: 3 additions & 0 deletions data/.gitignore
@@ -0,0 +1,3 @@
# ignore everything except .gitignore just to keep data directory
*
!.gitignore
13 changes: 0 additions & 13 deletions ffnn_train.py
@@ -123,13 +123,6 @@ def evaluate(tokens, instances, labels, write_result=False):
saver.restore(sess, save_loc)
print("Model from {} restored.".format(save_loc))
evaluate(test_t, test_v, test_l, True)
# load the pubmed files for annotation pubdir
# pub_files = [f for f in listdir(args.pubdir) if isfile(join(args.pubdir, f))]
# for _, pubfile in enumerate(pub_files):
# pub_t, pub_v = get_input_pub(args, word_emb, join(args.pubdir, pubfile))
# prediction = sess.run(model.pred, feed_dict={model.input_x: np.asarray(pub_v),
# model.dropout: 1.0})
# write_pred_and_entities(args, pub_t, prediction, pubfile.replace(".txt", ""))

def main():
'''Main method : parse input arguments and train'''
@@ -141,12 +134,6 @@ def main():
help='test file location')
parser.add_argument('--val', type=str, default='data/io/val-io.txt',
help='val file location')
parser.add_argument('--dist', type=str, default='data/dist/',
help='distance supervision files dir.')
parser.add_argument('--pubdir', type=str, default='data/pubmed/',
help='pubmed files dir containing production set. ')
parser.add_argument('--outdir', type=str, default='out/pubmed/',
help='Output dir for ffmodel annotated pubmed files.')
# Word Embeddings
parser.add_argument('--emb_loc', type=str, default="model/word-embeddings.pkl",
help='word2vec embedding location')
17 changes: 7 additions & 10 deletions gen_training_files.py
@@ -6,8 +6,8 @@
import random
import re
import sys
from os import listdir
from os.path import join
from os import listdir, makedirs
from os.path import join, exists
import codecs
import numpy as np
from gensim.models import Word2Vec
@@ -107,8 +107,6 @@ def load_test_data(train_dir):
def create_embeddings(args):
'''Create embeddings object and dump pickle for use in subsequent models'''
vocab = load_train_data(args.train_corpus)
test_vocab = load_test_data(args.test_corpus)
vocab = vocab.union(test_vocab)
print("Total vocab:", len(vocab))
print("Loading word embeddings:", args.emb_loc)
unk_words = set()
@@ -120,6 +118,8 @@ def create_embeddings(args):
except KeyError:
unk_words.add(word)
print("Number of unknown words:", len(unk_words))
if not exists(args.out_dir):
makedirs(args.out_dir)
# Dump dictionary pickle to disk
print("Dumping training files to", args.out_dir)
pickle.dump(wemb_dict, open(join(args.out_dir, WORDEMB_FILENAME), "wb"))
@@ -134,14 +134,11 @@ def main():
'''Main method : parse input arguments and train'''
parser = argparse.ArgumentParser()
# Input and Output paths
parser.add_argument('--train_corpus', type=str, default='data/train/',
parser.add_argument('-t', '--train_corpus', type=str, default='data/train/',
help='path to dir where training corpus files are stored')
parser.add_argument('--test_corpus', type=str, default='data/test/',
help='path to dir where training corpus files are stored')
parser.add_argument('--emb_loc', type=str,
default="data/PMC-w2v.bin",
parser.add_argument('-e', '--emb_loc', type=str,
help='path to the word2vec embedding location')
parser.add_argument('--out_dir', type=str, default='model/',
parser.add_argument('-o', '--out_dir', type=str, default='resources/',
help='output file containing minimal vocabulary')
args = parser.parse_args()
print(args)
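
With the output directory now created on demand, a minimal sketch, not part of this commit, of loading the dumped word-embedding dictionary back for inspection (the filename is an assumption based on the --emb_loc default in ffnn_train.py):
```
import pickle

# Load the {word: vector} dictionary written by gen_training_files.py.
with open("resources/word-embeddings.pkl", "rb") as handle:
    wemb_dict = pickle.load(handle)
print(len(wemb_dict), "words kept in the reduced vocabulary")
```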
3 changes: 3 additions & 0 deletions requirements.txt
@@ -0,0 +1,3 @@
tensorflow==1.5.0
numpy==1.14.0
gensim==2.1.0
3 changes: 3 additions & 0 deletions resources/.gitignore
@@ -0,0 +1,3 @@
# ignore everything except .gitignore just to keep resources directory
*
!.gitignore
29 changes: 6 additions & 23 deletions utils.py
@@ -1,14 +1,15 @@
"""Utility functions for loading datasets and computing performance"""
from __future__ import print_function
import os, codecs, io
import sys
from os.path import join, isfile
from random import random

import codecs
import cPickle as pickle
import re
import sys
from os.path import isfile, join

import numpy as np
from gensim.models.keyedvectors import KeyedVectors
import requests
from gensim.models.keyedvectors import KeyedVectors
from requests.utils import quote

GEONAMES_URL = "http://localhost:8091/location?location="
@@ -114,24 +115,6 @@ def tokenize_document(doc_path):
doc_vocab.add(word)
return doc_tokens, doc_vocab

def write_annotations(annotations, doc_path):
'''Write annotations to file'''
with open(doc_path, 'r') as myfile:
doc_text = myfile.read()
doc_vocab = set()
# Split sentences into words and create Token objects
doc_tokens = []
words = re.split(SPLIT_REGEX, doc_text)
words = [word.strip() for word in words if word.strip() != ""]
current_offset = 0
for word in words:
word_offset = doc_text.index(word, current_offset)
current_offset = current_offset + len(word)
doc_token = Token(word, word_offset, len(word), "O")
doc_tokens.append(doc_token)
doc_vocab.add(word)
return doc_tokens, doc_vocab

def get_namedentities(args, tokens, prediction):
'''Get list of named entities'''
assert len(tokens) == len(prediction)
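
The README lists ```geonames-services``` for disambiguation and normalization, and utils.py queries it through the GEONAMES_URL constant above; a minimal sketch, not part of this commit and assuming the service returns JSON, of issuing such a query:
```
import requests
from requests.utils import quote

GEONAMES_URL = "http://localhost:8091/location?location="

# Query the local geonames-services instance for a place name and print the
# raw response; the response schema is not shown in this commit.
response = requests.get(GEONAMES_URL + quote("South Korea"))
print(response.json())
```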
