In [1]:
from bs4 import BeautifulSoup
from bs4 import BeautifulStoneSoup
import nltk.data
import re
import glob
import pickle
import pprint
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

# Punkt sentence splitter shared by the extraction helpers in this notebook.
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')

# NLTK's English stopword list, extended with punctuation-like tokens.
stopwords = nltk.corpus.stopwords.words('english') + [
    ',', '.', '!', '``', "''", '?', "'s", ';', '$', ':', "'", '_', ')', '(',
]

# Porter stemmer applied to tokens before they are compared with indicator stems.
stemmer = PorterStemmer()

In [2]:
def get_futureWork(all_text):
    """Collect the sentences of ``all_text`` that contain the text "future".

    Every sentence produced by ``sent_detector`` is lower-cased, has its
    newlines turned into spaces, is stripped, and has each "- " sequence
    removed before it is searched.

    Returns:
        One string in which every matching sentence is preceded by a
        newline; the empty string when no sentence matches.
    """
    keyword = re.compile("future")
    normalized = (
        raw.lower().replace('\n', ' ').strip().replace("- ", "")
        for raw in sent_detector.tokenize(all_text.strip())
    )
    return "".join("\n" + sentence for sentence in normalized if keyword.search(sentence))

def get_futureWork_plus(all_text):
    """Collect the sentences of ``all_text`` containing "future" or "plan to".

    Sentences are normalized first: lower-cased, newlines turned into
    spaces, stripped, and every "- " sequence removed. A sentence that
    contains both phrases is still added only once.

    Returns:
        One string in which every selected sentence is preceded by a
        newline; the empty string when nothing is selected.
    """
    future_re = re.compile("future")
    plan_re = re.compile("plan to")

    kept = []
    for raw in sent_detector.tokenize(all_text.strip()):
        sentence = raw.lower().replace('\n', ' ').strip().replace("- ", "")
        if future_re.search(sentence) or plan_re.search(sentence):
            kept.append("\n" + sentence)

    return "".join(kept)

def get_futureWork_extended(all_text, indicators):
    """Collect the sentences of ``all_text`` that contain an indicator stem.

    Each sentence is normalized (lower-cased, newlines turned into spaces,
    stripped, "- " sequences removed) and word-tokenized. Tokens found in
    the module-level ``stopwords`` list are ignored; the remaining tokens
    are stemmed with the module-level ``stemmer`` and the sentence is kept
    as soon as one stem is found in ``indicators``.

    Args:
        all_text: text to split into sentences.
        indicators: container of stems to look for (tested with ``in``).

    Returns:
        One string in which every selected sentence is preceded by a
        newline; the empty string when nothing is selected.
    """
    def _stem(word):
        # Stemming is skipped only if the module-level stemmer is falsy.
        return stemmer.stem(word) if stemmer else word

    kept = []
    for raw in sent_detector.tokenize(all_text.strip()):
        sentence = raw.lower().replace('\n', ' ').strip().replace("- ", "")
        lowered = [word.lower() for word in nltk.word_tokenize(sentence)]
        content_words = (word for word in lowered if word not in stopwords)
        # any() stops at the first hit, so each sentence is added at most once.
        if any(_stem(word) in indicators for word in content_words):
            kept.append("\n" + sentence)

    return "".join(kept)

def get_abstract(all_text):
    """Normalize every sentence of ``all_text`` and join them line by line.

    Each sentence produced by ``sent_detector`` is lower-cased, has its
    newlines turned into spaces, is stripped, and has each "- " sequence
    removed.

    Returns:
        One string in which every sentence is preceded by a newline; the
        empty string when the tokenizer yields no sentences.
    """
    pieces = []
    for raw in sent_detector.tokenize(all_text.strip()):
        flattened = raw.lower().replace('\n', ' ').strip()
        pieces.append("\n" + flattened.replace("- ", ""))
    return "".join(pieces)

# Collects the text of the bodyText elements that follow a node, up to the next section header.
def get_bodyText(starterPointer):
    """Concatenate the text of the ``bodyText`` nodes after ``starterPointer``.

    The walk follows ``find_next()`` from ``starterPointer`` and stops at
    the first node named ``sectionHeader`` or when there are no more nodes.
    ``starterPointer`` itself is never inspected or included.

    Args:
        starterPointer: node to start after, or ``None``.

    Returns:
        A single space followed by the collected texts in document order;
        just a single space when ``starterPointer`` is ``None`` or nothing
        is collected.
    """
    if starterPointer is None:
        return " "

    chunks = [" "]
    node = starterPointer.find_next()
    while node is not None and node.name != 'sectionHeader':
        if node.name == 'bodyText':
            chunks.append(node.get_text())
        node = node.find_next()

    return "".join(chunks)

def prev_bodyText(starterPointer):
    """Concatenate ``starterPointer``'s text with the ``bodyText`` before it.

    The walk follows ``find_previous()`` from ``starterPointer`` and stops
    at the first node named ``sectionHeader`` or when there are no more
    nodes. Texts are appended in the order they are visited, i.e. the
    nearest preceding ``bodyText`` comes first.

    Args:
        starterPointer: node to start from, or ``None``.

    Returns:
        The text of ``starterPointer`` followed by the collected texts;
        a single space when ``starterPointer`` is ``None``.
    """
    if starterPointer is None:
        return " "

    chunks = [starterPointer.get_text()]
    node = starterPointer.find_previous()
    while node is not None and node.name != 'sectionHeader':
        if node.name == 'bodyText':
            chunks.append(node.get_text())
        node = node.find_previous()

    return "".join(chunks)


In [3]:
# How many of the papers we have labels for (summary, machine translation,
# dependency parsing) are also present in the cleaned XML data?
cleanData = glob.glob("../cleanXMLdataV2/*.out")
labelData = glob.glob("../aan/aan_mds/papers_text/*")

# Cleaned XML files are identified by their file name up to the first '.'.
cd_files = [path.rsplit('/', 1)[-1].partition('.')[0] for path in cleanData]
# Label files are identified by their bare file name.
ld_files = [path.rsplit('/', 1)[-1] for path in labelData]


In [4]:
# Sanity check: number of label files vs. number of cleaned XML files.
print (len(ld_files), len(cd_files))

(380, 20399)


In [5]:
# Paper ids that have a label file and also a cleaned XML file.
common_files = set(ld_files) & set(cd_files)

In [6]:
import nltk.data

# Punkt sentence tokenizer used by print_sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


def print_sentences(some_text):
    """Split ``some_text`` into sentences and return them as a list.

    Despite the name, nothing is printed. Sentences whose stripped length
    is five characters or fewer are dropped; the remaining ones have their
    newlines replaced by spaces and are stripped.
    """
    return [
        piece.replace('\n', ' ').strip()
        for piece in tokenizer.tokenize(some_text)
        if len(piece.strip()) > 5
    ]

In [7]:
def parse_aan_metadata(metadata_path='../aan/aan_mds/metadata.txt'):
    """Read the tab-separated AAN metadata file into two lookup tables.

    Each usable line must have at least three tab-separated fields, read
    as: label (field 0), paper id (field 1), title (field 2). Any further
    fields are ignored.

    Args:
        metadata_path: location of the metadata file. Defaults to the path
            this notebook has always used, so existing calls without
            arguments behave as before.

    Returns:
        A tuple ``(paperID2label, paperID2title)`` of dicts keyed by paper
        id. When a paper id occurs more than once, the last line wins.
    """
    paperID2label = {}
    paperID2title = {}

    with open(metadata_path, 'r') as f:
        for line in f:
            # Drop the line terminator first; otherwise it stays glued to the
            # last field and ends up inside any row built from that value.
            item = line.rstrip('\r\n').split('\t')
            if len(item) < 3:
                # Blank or truncated line: nothing usable to index.
                continue
            paperID2label[item[1]] = item[0]
            paperID2title[item[1]] = item[2]

    return paperID2label, paperID2title
        

In [8]:
import random

# Per-paper index: paperId -> [abstract, future work, introduction, conclusion].
FW_ABS_Index = {}

# Stems handed to get_futureWork_extended to pick future-work sentences.
indicators = ['futur', 'work', 'plan', 'improv', 'explor', 'approach', 'perform', 'research', 'evalu', 'extend', 'would']
output_pickle = 'FutureWorkAndAbstractPickle_expanded.pk'
input_directory = "../cleanXMLdataV2/*.out"
read_files = glob.glob(input_directory)
paperID2label, paperID2title = parse_aan_metadata()

for xmlFile in read_files:
    # File name without its four-character ".out" suffix.
    paperId = xmlFile.split("/")[-1][0:-4]
    if paperId not in common_files:
        # Only look at the papers with labeled information.
        continue

    # Restrict the run to papers labelled 'M' (presumably machine translation,
    # given the output file name below -- confirm against metadata.txt).
    # .get() also skips papers that are missing from the metadata file
    # instead of aborting the whole loop with a KeyError.
    if paperID2label.get(paperId) != 'M':
        continue

    if paperId not in FW_ABS_Index:
        # Placeholders, overwritten below for every section that is found.
        FW_ABS_Index[paperId] = ["Smaple Abstract", "Sample Futurework", "Sample Intro", "Sample Con"]

    with open(xmlFile, 'r') as f:
        xmlData = f.read()
    soup = BeautifulSoup(xmlData, 'xml')

    # Text believed to be the conclusion section; future work is mined from it.
    bodyConText = ""
    for header in soup.findAll('sectionHeader'):
        section = header['genericHeader']

        if section == 'abstract':
            firstBody = header.find_next('bodyText')
            if firstBody is not None:
                paperAbs = get_abstract(firstBody.get_text())
            else:
                paperAbs = " "
            FW_ABS_Index[paperId][0] = paperAbs

        if section == 'introduction':
            FW_ABS_Index[paperId][2] = get_abstract(get_bodyText(header))

        if section == 'conclusions':
            confidence = float(header['confidence'])

            # Only a confidently detected header is stored as the conclusion.
            if confidence > 0.95:
                firstBody = header.find_next('bodyText')
                if firstBody is not None:
                    paperCon = get_abstract(firstBody.get_text())
                else:
                    paperCon = " "
                FW_ABS_Index[paperId][3] = paperCon

            # For a low-confidence header take the text before it, otherwise
            # the text after it, as the span to search for future work.
            if confidence < 0.95:
                bodyConText = get_abstract(prev_bodyText(header))
            else:
                bodyConText = get_abstract(get_bodyText(header))

    sentences = print_sentences(bodyConText)
    # `sentences` is a list: test it for emptiness rather than comparing it
    # with 0 (always true on Python 2, a TypeError on Python 3).
    if sentences:
        print (paperId)
        # Strip the title so a stray line terminator cannot split a row.
        title = paperID2title[paperId].strip()
        # One tab-separated row per sentence. The file is opened in append
        # mode, so re-running this cell adds the same rows again.
        with open('MachineTranslation_annotation_file.txt', 'a+') as f:
            for sent in sentences:
                ascii_sent = sent.encode('ascii', 'ignore').strip()
                if not isinstance(ascii_sent, str):
                    # Python 3: encode() returned bytes; convert back to text
                    # so it can be concatenated with the other fields.
                    ascii_sent = ascii_sent.decode('ascii')
                f.write(paperId + "\t" + title + "\t" + ascii_sent + "\n")

    futureWorkText = get_futureWork_extended(bodyConText, indicators)
    FW_ABS_Index[paperId][1] = futureWorkText

# Writing the dictionary to a pickle file (left disabled):
# output = open(output_pickle, 'wb')
# pickle.dump(FW_ABS_Index, output)
# output.close()


E89-1038
C86-1117
H01-1007
W05-0712
C80-1064
W07-0724
C86-1153
P84-1072
W06-3121
W06-3123
W05-0812
P03-1005
P00-1006
W07-0717
P98-1070
C00-2162
H91-1026
P07-1089
C92-2101
C94-2178
C92-4203
C86-1100
N06-1058
W03-0318
P99-1067
W01-1406
P06-2014
C80-1067
P02-1044
P03-1010
C94-1048
J93-1006
W06-1628
H05-1085
W07-0734
W01-0808
D07-1104
H92-1052
P96-1023
C88-1017
P06-1065
P06-1011
C00-2092
P98-2139
P98-2160
J00-2004
D07-1103
P07-2017
P98-1017
P08-1114
P03-1040
C86-1025
P93-1004
W05-1506
C88-2154
W03-0413
J99-1003
P97-1046
W06-1609
H01-1062
P07-1111
E93-1062
P03-1041
P84-1105
N06-1014
P98-1036
C96-2119
C02-1065
W03-0311
P06-1009
W97-0311
P04-1078
W05-0825
P01-1067
D07-1091
N03-1003
P91-1022
P99-1011
W03-0310
P03-1012
P99-1028
W08-0405
P98-1069
P98-1004
C90-3044
W00-0507
W01-1402
C82-1034
W07-0414
C04-1154
J97-3002
C00-1019
P07-2026
H05-1011
P91-1021
P88-1019
C92-2081
C92-3168
W07-0730
E03-1007
H93-1038
P94-1012
P06-1002
P95-1032
C92-3164
P98-1117
C90-3057
C96-1054
C88-1016
D07-1092
W05-0817
P

In [63]:
# parse_aan_metadata returns the pair (paperID2label, paperID2title); unpack
# both so paperID2label stays a dict instead of being rebound to the tuple.
paperID2label, paperID2title = parse_aan_metadata()

In [16]:
# Peek at `sent`. It is not defined in this cell, so this relies on a value
# left in the kernel by an earlier cell (presumably the loop that writes the
# annotation file -- confirm) and fails on a fresh kernel.
sent.strip()

u'the multilingual lexical sample task in senseval-3 featured english ambiguous words that were to be tagged with their most appropriate hindi translation.'

In [66]:
#FW_ABS_Index