In [1]:
# import modules
import nltk
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os
import copy
from nltk.stem.wordnet import WordNetLemmatizer
import csv

In [2]:
# Import the spaCy English class and create the module-level `nlp` object used
# by getSource. The data directory is taken from the SPACY_DATA environment
# variable when it is set, otherwise spaCy's LOCAL_DATA_DIR is used.
# NOTE(review): `spacy.en` looks like the import path of old spaCy releases -
# confirm the pinned spaCy version before upgrading.
from spacy.en import English, LOCAL_DATA_DIR
data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
nlp = English(data_dir=data_dir)

In [3]:
# Read the list of reporting verbs and parse the two HTML exports.
# NOTE(review): read_csv treats the first CSV row as a header, so that row is
# not part of `verbs` - confirm the file really has a header row.
verbList = pd.read_csv('ReportingVerbList2.csv').values
# Flatten the table cell by cell and remove the spaces inside every entry.
# A real list (not a lazy map object) is built because `verbs` is iterated
# once per sentence later on.
verbs = [item.replace(" ", "") for sublist in verbList for item in sublist]
# BeautifulSoup reads the whole file while the object is constructed, so the
# handles can be closed right after parsing instead of leaking.
with open('NYTPakDrone2004_2013_1.HTML', 'r') as html_file:
    html1 = BeautifulSoup(html_file, 'html.parser')
with open('NYTPakDrone2004_2013_2.HTML', 'r') as html_file:
    html2 = BeautifulSoup(html_file, 'html.parser')

In [36]:
def findTextFromHtml(html):
    """Extract one metadata dict per article from a parsed HTML export.

    The text of every non-empty ``<div>`` is read in document order. A div
    whose text contains 'DOCUMENTS' starts a new article; the next three divs
    are stored as PUBLICATION, DATE and HEADLINE. Of the following divs the
    longest one becomes ARTICLETEXT, and a div containing '<COLUMN>:' fills
    that column (BYLINE up to LOAD-DATE). The article is added to the result
    when a div containing 'Copyright' is reached.

    Args:
        html: parsed document exposing ``findAll('div')`` whose items have a
            ``text`` attribute (for example a BeautifulSoup object).

    Returns:
        list of dicts keyed by the CSV column names. ARTICLENO is the 1-based
        running number of the article; DATE is formatted 'mm/dd/YYYY' or is ''
        when the date line cannot be parsed.
    """
    import datetime

    def change_date_format(line):
        # Only the first three words ("Month day year") are parsed; commas
        # and anything after the year (e.g. the weekday) are ignored.
        line = ' '.join(line.replace(',', '').split()[:3])
        try:
            return datetime.datetime.strptime(line, '%B %d %Y').strftime('%m/%d/%Y')
        except ValueError:
            # Not a recognisable date: leave the column empty.
            return ''

    # Columns of the final CSV file. The article body is written to 'ARTICLETEXT'.
    CSV_columns = ['ARTICLENO', 'PUBLICATION', 'DATE', 'HEADLINE', 'BYLINE', 'SECTION', 'LENGTH', 'DATELINE',
                   'URL', 'GRAPHIC', 'LANGUAGE', 'PUBLICATION-TYPE', 'SUBJECT', 'COMPANY', 'ORGANIZATION',
                   'PERSON', 'CITY', 'COUNTRY', 'REGION', 'LOAD-DATE', 'ARTICLETEXT']
    # Columns that appear in the document as "<NAME>: value" lines.
    labelled_columns = CSV_columns[CSV_columns.index('BYLINE'):-1]

    # Build a new list instead of removing items from the list being iterated:
    # removing in place skips the element that follows every removed one, so
    # runs of empty divs used to survive and shift the positional fields.
    text = [div.text.replace('\n', '').replace('\t', '')
            for div in html.findAll('div') if div.text != ""]

    res = []
    article_counter = 0
    new_art_i = 0
    max_len_i = 0
    new_article = None      # no article has been started yet
    already_saved = False   # guards against appending the same article twice
    for i, line in enumerate(text):
        if 'DOCUMENTS' in line:
            new_art_i = i
            max_len_i = 0
            article_counter += 1
            new_article = {key: '' for key in CSV_columns}
            new_article['ARTICLENO'] = article_counter
            already_saved = False
            continue
        if new_article is None:
            # Content before the first 'DOCUMENTS' marker belongs to no article.
            continue
        offset = i - new_art_i
        if offset == 1:
            new_article['PUBLICATION'] = line
            continue
        if offset == 2:
            new_article['DATE'] = change_date_format(line)
            continue
        if offset == 3:
            new_article['HEADLINE'] = line
            continue
        # The longest div seen so far in this article is taken as its body.
        if len(line) > max_len_i:
            max_len_i = len(line)
            new_article['ARTICLETEXT'] = line
        for column in labelled_columns:
            if column + ':' in line:
                new_article[column] = line.split(column + ':')[1].strip()
        # The dict is appended by reference, so fields found after the
        # copyright line still end up in the stored article.
        if 'Copyright' in line and not already_saved:
            res.append(new_article)
            already_saved = True
    return res


class IndexedText(object):
    """Token list with an index keyed by each token's normalised form.

    A token is normalised by stemming it, lower-casing the stem and then
    lemmatising the result as a verb (see ``_stem``).
    """

    def __init__(self, stemmer, lemmatizer, text):
        self._text = text
        self._stemmer = stemmer
        self._lemmatizer = lemmatizer
        # normalised form -> positions of the tokens having that form
        pairs = ((self._stem(token), position) for position, token in enumerate(text))
        self._index = nltk.Index(pairs)

    def _stem(self, word):
        """Return the normalised form of ``word``."""
        stemmed = self._stemmer.stem(word)
        return self._lemmatizer.lemmatize(stemmed.lower(), pos='v')

    def concordance(self, word):
        """Return ``[text joined by spaces]`` if ``word`` occurs in the text, else ``[]``."""
        if not self._index[self._stem(word)]:
            return []
        return [' '.join(self._text)]

    
def Searcher(texts,verbs):
    """Find which of the given verbs occur in each sentence.

    Args:
        texts: sequence of sentences (strings).
        verbs: iterable of verbs to look for.

    Returns:
        list of ``[sentence number (1-based), sentence rebuilt from its
        tokens, verb]`` entries, one per sentence/verb match.
    """
    lemmatizer = WordNetLemmatizer()
    porter = nltk.PorterStemmer()
    hits = []
    for sentence_no, sentence in enumerate(texts, 1):
        # index the tokens of this sentence by their normalised form
        indexed = IndexedText(porter, lemmatizer, nltk.word_tokenize(sentence))
        for verb in verbs:
            hits.extend([sentence_no, context, verb]
                        for context in indexed.concordance(verb))
    return hits


def sentenceSplitter(texto,verbs):
    """Split a text into sentences and search them for the given verbs.

    The stripped text is tokenised with the NLTK resource
    'tokenizers/punkt/english.pickle'; the sentences are handed to
    ``Searcher`` and its result is returned unchanged.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return Searcher(punkt.tokenize(texto.strip()), verbs)


def findFullSubj(fullSubj,tok):
    """Expand a token inside ``fullSubj`` into the words of its whole subtree.

    The first occurrence of ``tok.orth_`` in ``fullSubj`` is replaced, in
    place, by the words of ``tok`` and all of its descendants: for every token
    its left children come first, then the token, then its right children.

    Children used to be positioned with ``list.index(word)``, which finds the
    first occurrence of a word; when a word was repeated inside the phrase
    (e.g. "American officials and Pakistani officials") modifiers ended up
    next to the wrong token. The in-order traversal below does not depend on
    looking words up by value.

    Args:
        fullSubj: list of words that contains ``tok.orth_``; modified in place.
        tok: token exposing ``orth_``, ``lefts`` and ``rights``.

    Raises:
        ValueError: if ``tok`` has children but ``tok.orth_`` is not in
            ``fullSubj``.
    """
    def _flatten(node):
        # words of the subtree rooted at `node`: lefts, node, rights
        words = []
        for child in node.lefts:
            words.extend(_flatten(child))
        words.append(node.orth_)
        for child in node.rights:
            words.extend(_flatten(child))
        return words

    subtree_words = _flatten(tok)
    if len(subtree_words) == 1:
        # no children: nothing to insert, the list is left untouched
        return
    position = fullSubj.index(tok.orth_)
    fullSubj[position:position + 1] = subtree_words
    

def getSource(verbs,searcherRes):
    """Attach a source (subject phrase) to every [sentence no, sentence, verb] result.

    Each sentence is parsed with the module-level spaCy object ``nlp``. For
    tokens that share the verb's lemma, the dependency heads are followed
    upwards and the first nsubj/nsubjpass child found on a ROOT token is
    taken as the source.

    Args:
        verbs: not used inside this function.
        searcherRes: list of lists; element 1 is read as the sentence text
            and element 2 as the verb (the shape appended by ``Searcher``).

    Returns:
        A deep copy of ``searcherRes`` in which every kept row has a fourth
        element: the source words joined by spaces, or u'NO SOURCE'. Rows in
        which the verb's lemma matched only tokens tagged NN/NNS (or matched
        no token at all) are removed. A u'NO SOURCE' row takes over the
        source of the closest earlier row that has one, when such a row exists.
    """
    
    # work on a copy so the caller's list is not modified
    resWithSource = copy.deepcopy(searcherRes)
    for ind, result in enumerate(searcherRes):
        text = nlp(result[1])
        verb = nlp(unicode(result[2]))
        # Keeps the lemma of the last token of the parsed verb.
        # NOTE(review): if the parsed verb has no tokens, verbLemma keeps the
        # value from the previous result (or is undefined for the first one).
        for t in verb:verbLemma = t.lemma_
        source = None
        # stays True unless a token with the verb's lemma is tagged other than NN/NNS
        notVB = True
        for token in text:            
            if verbLemma == token.lemma_:
                if token.tag_ not in ['NN','NNS']:
                    notVB = False
                else:
                    # same lemma but tagged as a noun: not a reporting verb here
                    continue
                startToken = token
                reps=0
                # Climb from the matched token towards the sentence root; the
                # reps counter bounds the walk (a ROOT without a subject would
                # otherwise loop, since the walk continues from token.head).
                while True:
                    if reps>10: break
                    # NOTE(review): token starts out as startToken itself, so
                    # this check appears to require a token that is its own
                    # head - confirm whether the verb's children were meant
                    # to be inspected here.
                    if (token.dep_==u'nsubj' or token.dep_==u'nsubjpass') and token.head == startToken:
                        source = token
                        break
                    if token.dep_==u'ROOT':
                        # at the root: take the first child labelled as a subject
                        branches = [t for t in token.lefts]+[t for t in token.rights]
                        for branch in branches:
                            if branch.dep_== u'nsubj' or branch.dep_==  u'nsubjpass':
                                source = branch
                                break
                        if source:
                            break
                    token = token.head
                    reps+=1
        if notVB:
            # marker for rows that are filtered out after the loop
            resWithSource[ind].append(u'DELETE')
        else:
            if source:
                # PRP / DT subjects (presumably pronouns and determiners) carry
                # no name of their own, so they are treated as missing sources
                if source.tag_ == u'PRP' or source.tag_ == u'DT':
                    resWithSource[ind].append(u'NO SOURCE')
                else:
                    # expand the subject token into its full phrase
                    FullSubj=[source.orth_,]
                    findFullSubj(FullSubj,source)
                    resWithSource[ind].append(' '.join(FullSubj))
            else:
                resWithSource[ind].append(u'NO SOURCE')
    resWithSource = [x for x in resWithSource if x[3] != u'DELETE']
    # Back-fill missing sources from the nearest preceding row that has one;
    # rows are updated in order, so a filled row can feed the rows after it.
    for ind, res in enumerate(resWithSource):
        if res[3] == u'NO SOURCE':
            for revers in reversed(range(ind)):
                if resWithSource[revers][3] != u'NO SOURCE':
                    res[3] = resWithSource[revers][3]
                    break
    return resWithSource


def clear_sentence(text):
    """Trim a sentence to its alphabetic core and tidy the spacing before commas.

    Leading and trailing characters are dropped until the sentence starts and
    ends with an alphabetic character, then whitespace directly in front of
    every comma is removed.

    Args:
        text: the sentence as a string.

    Returns:
        The cleaned sentence; '' when ``text`` is empty or contains no
        alphabetic character (this case used to raise IndexError).
    """
    start, end = 0, len(text)
    # Move the bounds inwards instead of re-slicing the string on every step;
    # the bound checks make the loops stop on strings without any letter.
    while start < end and not text[start].isalpha():
        start += 1
    while end > start and not text[end - 1].isalpha():
        end -= 1
    trimmed = text[start:end]
    return ','.join(part.rstrip() for part in trimmed.split(','))


def create_CSV_for_html(html, saving_file_name):
    """Build the sentence-level report for one parsed HTML file and save it as CSV.

    For every article returned by ``findTextFromHtml`` the article text is
    split into sentences, searched for the module-level ``verbs`` and each hit
    is given a source by ``getSource``. The hits are then joined with the
    article metadata on ARTICLENO.

    Args:
        html: parsed HTML document, passed on to ``findTextFromHtml``.
        saving_file_name: path of the CSV file to write (UTF-8, no index).
    """
    listo = findTextFromHtml(html)

    # One row per (article, sentence, verb) hit. Rows are numbered with the
    # article's own ARTICLENO, not its position in `listo`: the two differ as
    # soon as an article is missing from (or repeated in) the list, and the
    # merge below would then attach the metadata of another article.
    formattedSourceRep = []
    for texto in listo:
        seRes = sentenceSplitter(texto['ARTICLETEXT'], verbs)
        for row in getSource(verbs, seRes):
            formattedSourceRep.append([texto['ARTICLENO']] + row)

    # Article metadata without the (large) article body.
    dfListo = pd.DataFrame(listo)
    # axis is passed by keyword; the positional form is rejected by newer pandas
    dfListo = dfListo.drop('ARTICLETEXT', axis=1)

    dfReport = pd.DataFrame.from_records(
        formattedSourceRep,
        columns=['ARTICLENO', 'SENTENCENO', 'SENTENCE', 'REPORTEDVERB', 'SOURCE'])
    # how='right': keep every hit, even if its article metadata is missing
    dfMerged = dfListo.merge(dfReport, how='right', on='ARTICLENO')
    dfMerged['NOWORDS'] = ""
    dfMerged['SENTENCE'] = dfMerged['SENTENCE'].apply(clear_sentence)
    dfMerged.to_csv(saving_file_name, index=False, encoding='utf-8')

### Working part

In [None]:
if __name__ == '__main__':
    # write one result CSV per parsed HTML file
    for parsed_html, csv_name in ((html1, 'result_html1.csv'),
                                  (html2, 'result_html2.csv')):
        create_CSV_for_html(parsed_html, csv_name)

### Function to concatenate the CSV files created for each HTML file

In [None]:
# Every CSV file found in `path` is concatenated, except the names listed in
# `exclude`. Keep unrelated CSV files out of that folder or add them to `exclude`.
def concatenate_all_CSV(path='./', exclude=('all.csv', 'ReportingVerbList2.csv')):
    """Concatenate the CSV files of a folder into ``all.csv`` in that folder.

    Args:
        path: folder that is scanned for ``*.csv`` files and that receives
            ``all.csv``.
        exclude: file names that are never read. By default this is the
            output file itself (so a re-run does not append the previous
            result to the new one) and the reporting-verb list, which this
            notebook reads from the same folder.

    The files are read in sorted name order, columns whose name contains
    'Unnamed' are dropped, and the index is renumbered from 0.
    """
    files = sorted(
        f for f in os.listdir(path)
        if os.path.isfile(os.path.join(path, f))
        and os.path.splitext(f)[1] == '.csv'
        and f not in exclude)
    # Read relative to `path` (not the current directory) and concatenate
    # once instead of growing a frame inside the loop.
    frames = [pd.read_csv(os.path.join(path, f)) for f in files]
    df = pd.concat(frames) if frames else pd.DataFrame()
    unnamed = [col for col in df.columns if 'Unnamed' in str(col)]
    df = df.drop(unnamed, axis=1).reset_index(drop=True)
    df.to_csv(os.path.join(path, 'all.csv'))
    
# Build all.csv only when the notebook/script itself is executed, like the
# result-writing cell above, so importing this code does not touch the disk.
if __name__ == '__main__':
    concatenate_all_CSV()