In [82]:
'''
BT 2 - Data Collection.ipynb
Author: Jingchuan Shi
Acknowledgments: Asst. Prof. Ahmed Qureshi
Created 2019/9/7, last modified 2019/9/9 at University of Alberta.
All Rights Reserved.
'''

# Load relevant modules.
import os
import stanfordnlp
from tika import parser
from nltk.corpus import wordnet as wn
# Pipeline settings: request only the tokenizer, the POS tagger and the
# lemmatizer, which are the processors the extraction step relies on.
config = dict(processors='tokenize,pos,lemma')
nlp = stanfordnlp.Pipeline(**config)

Use device: cpu
---
Loading: tokenize
With settings: 
{'model_path': '/Users/ferax/stanfordnlp_resources/en_ewt_models/en_ewt_tokenizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: pos
With settings: 
{'model_path': '/Users/ferax/stanfordnlp_resources/en_ewt_models/en_ewt_tagger.pt', 'pretrain_path': '/Users/ferax/stanfordnlp_resources/en_ewt_models/en_ewt.pretrain.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
---
Loading: lemma
With settings: 
{'model_path': '/Users/ferax/stanfordnlp_resources/en_ewt_models/en_ewt_lemmatizer.pt', 'lang': 'en', 'shorthand': 'en_ewt', 'mode': 'predict'}
Building an attentional Seq2Seq model...
Using a Bi-LSTM encoder
Using soft attention for LSTM.
Finetune all embeddings.
[Running seq2seq lemmatizer with edit classifier]
Done loading processors!
---


In [84]:
# Files that store verbs and information about the data collection process.
# Please modify the master path to your own. Every other path is derived from it
# with os.path.join, so the master path works with or without a trailing slash.
master_path = '/Users/ferax/bin/'
core_source_path = os.path.join(master_path, 'BTverblist_core.txt')  # List of pre-labelled words.
new_source_path = os.path.join(master_path, 'BTverblist_new.txt')  # List of new words to be extracted and analyzed.
# Directory of all papers and books that are used as corpus. The trailing slash
# is kept so that joined file paths stay identical to the ones already recorded
# in the finished-papers list.
directory = os.path.join(master_path, 'Paper_source/')
done_source_path = os.path.join(master_path, 'BTdone_paper_list.txt')  # List of papers and books that are already extracted.
progress_path = os.path.join(master_path, 'BTgain_progress.txt')  # Tracks the progress of data extraction.

# Read the word lists and the record of finished papers from disk.
# Each file holds one entry per line; only the line terminator is removed so the
# entries compare equal to what was originally written.
with open(core_source_path, 'r') as f:
    core_verbs = [line.rstrip('\n') for line in f]
with open(new_source_path, 'r') as g:
    verbs = [line.rstrip('\n') for line in g]
old_size = len(verbs)
print('\nCurrent data size: ', end='')  # Amount of words before the current extraction session.
print(old_size)
with open(done_source_path, 'r') as h:
    done_papers = [line.rstrip('\n') for line in h]

# Build the work queue: every entry of the corpus directory that is not yet in
# the finished list. os.listdir returns names in arbitrary order, so they are
# sorted to make the processing order (and the progress log) reproducible.
# A set is used only for the membership test; done_papers itself stays a list.
done_lookup = set(done_papers)
candidate_paths = (os.path.join(directory, name) for name in sorted(os.listdir(directory)))
files = [path for path in candidate_paths if path not in done_lookup]

# The main extraction procedure

# xpos tags that are accepted as verbs.
VERB_TAGS = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}


def _mark_done(path):
    """Append `path` to the finished-papers file so it is not analyzed again."""
    with open(done_source_path, 'a') as done_file:
        done_file.write(path)
        done_file.write('\n')


# Every verb known so far (pre-labelled + previously extracted). A set gives
# constant-time uniqueness checks instead of scanning three lists per token.
known_verbs = set(core_verbs)
known_verbs.update(verbs)

for file_path in files:
    # Compare the real extension, case-insensitively, rather than the last
    # three characters of the path.
    extension = os.path.splitext(file_path)[1].lower()
    if extension == '.pdf':  # PDFs are parsed with tika.
        raw = parser.from_file(file_path)
        # .get tolerates a result without a 'content' entry
        # (presumably tika found no extractable text — TODO confirm).
        raw_text = raw.get('content')
    elif extension == '.txt':  # Plain texts are read directly.
        with open(file_path, 'r') as text_file:
            raw_text = text_file.read()
    else:
        continue  # Unsupported file type: neither processed nor marked as done.

    # Nothing to analyze (no content, empty file or whitespace only):
    # record the file as finished and move on.
    if raw_text is None or not raw_text.strip():
        _mark_done(file_path)
        continue

    # Re-join words hyphenated across line breaks, then flatten line breaks.
    raw_text = raw_text.replace('-\n', '').replace('\n', ' ')

    # Raw texts are tokenized, pos-tagged and lemmatized by StanfordNLP.
    stanford_obj = nlp(raw_text)

    # Collect lower-cased lemmas tagged as verbs. Only verbs that are not yet
    # known are kept, in order of first appearance.
    new_verbs = []
    for sent in stanford_obj.sentences:
        for token in sent.words:
            if token.lemma is None:
                continue
            lemma = token.lemma.lower()
            if token.xpos in VERB_TAGS and lemma not in known_verbs:
                known_verbs.add(lemma)
                new_verbs.append(lemma)

    # Write the newly extracted words to file and to the in-memory list.
    with open(new_source_path, 'a') as verb_file:
        for verb in new_verbs:
            verb_file.write(verb)
            verb_file.write('\n')
    verbs.extend(new_verbs)

    # The current paper or book is added to the finished list and will not be analyzed again.
    _mark_done(file_path)

    # Report progress.
    gain = len(verbs) - old_size
    print('\nLoaded paper {}.'.format(file_path))
    print('Current data size: ', end='')
    print(len(verbs), end='')
    print('. {} new verbs acquired.'.format(gain))
    old_size = len(verbs)

    # Write the amount of all words so far and the amount of newly extracted
    # words to the progress tracking file.
    with open(progress_path, 'a') as progress_file:
        progress_file.write(str(len(verbs)) + ' ' + str(gain) + '\n')

# Validate extracted words: keep only those for which WordNet returns at least
# one verb synset.
valid_verb = [candidate for candidate in verbs if wn.synsets(candidate, pos = 'v')]
valid_count = len(valid_verb)
print('# of verbs identified by wordnet: ' + str(valid_count))


Current data size: 6124
Verbs identified by wordnet: 3079
