# Example GDD 📼

In [1]:
import yaml
import psycopg2
import requests
import pandas as pd
import numpy as np

from utils import connect_db, get_dams

In [2]:
# Get dam names
dams = get_dams()

# Database connection
df = connect_db()
df.head()


Unnamed: 0,docid,sentid,wordidx,words,poses,ners
0,5705014ccf58f18a4c0d6d61,80,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[The, authors, caution, that, projected, incre...","[DT, NNS, VBP, IN, VBN, VBN, NN, IN, NN, NN, M...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,5705014ccf58f18a4c0d6d61,81,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[-LRB-, Water, Resources, Research, ,, doi, :1...","[-LRB-, NNP, NNP, NNP, ,, FW, FW, :, NN, CD, ,...","[O, ORGANIZATION, ORGANIZATION, ORGANIZATION, ..."
2,5705014ccf58f18a4c0d6d61,82,"[1, 2, 3, 4]","[American, Geophysical, Union, .]","[JJ, NNP, NNP, .]","[ORGANIZATION, ORGANIZATION, ORGANIZATION, O]"
3,5705014ccf58f18a4c0d6d61,83,"[1, 2, 3, 4]","[All, Rights, Reserved, .]","[DT, NNS, VBN, .]","[O, O, O, O]"
4,57ebf766cf58f1a12a8e3d34,1,"[1, 2, 3, 4, 5, 6, 7]","[RIVER, RESEARCH, AND, APPLICATIONS, River, Re...","[NN, NN, CC, NNS, NN, NNP, .]","[O, O, O, O, O, O, O]"


## Utility Functions

In [4]:
def remap_sent(sent): return ' '.join(sent)


def n_sents(idx, df):
    ''' Returns the surrounding sentences in rel to dataframe'''
    start = idx
    end = idx
    if idx > 0:
        start = idx-1
    if idx < len(df):
        end = idx+1
    return(start, end)


def n_upper(token, sentence):
    ''' returns uppercase tokens surrounding term '''
    span = ''
    idx = sentence.split().index(token)
    while idx > 0:
        idx = idx - 1
        if sentence.split()[idx][0].isupper():
            if len(span) > 1:
                span = sentence.split()[idx] + ' ' + span
            else:
                span = span + sentence.split()[idx]
        else:
            return span
    return span


def sentence_elements (docid,sentid):
    sentence_df = df[(df['docid'] == docid) & (df['sentid']== int(sentid))]
    return sentence_df

def clean_intext_references(row):
    '''returns cleaned row (sentence elements) as df, removing in-text references'''
    '''only addresses in-text references formated as (NAME(s) DATE)'''
    print ((row['words'].iloc[0]))  # for testing, can be commented/removed


    #First check to make sure sentence has '-LRB-', '-RRB-' and ners 'DATE' 
    if '-LRB-' in (row['poses'].iloc[0]) and '-RRB-' in (row['poses'].iloc[0]) \
    and 'DATE' in (row['ners'].iloc[0]):
        #list of lrb and rrb indices turned into a dataframe
        list_lrb_i = [i for i, j in enumerate(row['poses'].iloc[0]) if j == '-LRB-']
        list_rrb_i = [i for i, j in enumerate(row['poses'].iloc[0]) if j == '-RRB-']
        if len(list_lrb_i)==len(list_rrb_i):
            print ('\n' + str(len(list_lrb_i)) + " sets of parenthesis to investigate")  #for testing, can be commented/removed
            #extract lists of words, poses, and ners and create df based on common list index
            rb_df = pd.DataFrame({'lrb': list_lrb_i, 'rrb': list_rrb_i})
            sentence_df = pd.DataFrame({'words':(row['words'].iloc[0]), 'poses':(row['poses'].iloc[0]), \
                                        'ners': (row['ners'].iloc[0])})

            for rb in rb_df.itertuples():
                if sentence_df['ners'][rb.rrb-1]== 'DATE' and \
                ([sentence_df['words'][rb.lrb:rb.rrb+1].isin(['et','al.'])] or \
                 [sentence_df['ners'][rb.lrb:rb.rrb+1].isin(['PERSON'])]):
                    #drop indices of an in-text reference
                    print ('dropping index range: ' + str(rb.lrb) + ':' + str(rb.rrb+1))  # For testing, can be commented/removed
                    sentence_df = sentence_df.drop(sentence_df.index[(rb.lrb):(rb.rrb+1)])

            #rebuild cleaned df row                
            cleanDf = pd.DataFrame({'docid': row['docid'].iloc[0], 'sentid': row['sentid'].iloc[0], \
                                    'words':[sentence_df['words'].tolist()], 'poses':[sentence_df['poses'].tolist()], \
                                    'ners':[sentence_df['ners'].tolist()] })    

            return cleanDf
        else:
            return row
    else:
        return row


## Remove in-text references from a sentence

In [5]:
#Example that has two sets of parenthesis, one dropped due to in-text reference critera
row = sentence_elements(docid='57ebf766cf58f1a12a8e3d34', sentid=89)

clean_intext_references(row)

['Volumetric', 'change', 'estimates', 'using', '`', 'Structure-from-Motion', "'", 'photogrammetric', 'topography', 'analysis', '-LRB-', 'cf.', 'Westoby', 'et', 'al.', ',', '2012', '-RRB-', 'from', 'a', 'novel', 'aerial', 'imaging', 'system', 'indicate', 'that', 'a', 'total', 'of', '6.1', '×', '106', 'm3', 'of', 'sediment', 'had', 'moved', 'downstream', 'from', 'both', 'of', 'the', 'reservoir-sediment', 'de', '-', 'posits', 'as', 'of', 'spring', '2013', '-LRB-', 'A.', 'C.', 'Ritchie', ',', 'unpublished', 'data', '-RRB-', '.']

2 sets of parenthesis to investigate
dropping index range: 10:18


Unnamed: 0,docid,ners,poses,sentid,words
0,57ebf766cf58f1a12a8e3d34,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[JJ, NN, NNS, VBG, ``, NNP, '', JJ, NN, NN, IN...",89,"[Volumetric, change, estimates, using, `, Stru..."


## Finding Candidate Passages

In [21]:
cand_dam = np.zeros((len(df),), dtype=int)
cand_stream = np.zeros((len(df),), dtype=int)

for idx, i in enumerate(df['docid']):
    doc, sentid, wordidx, words, poses, ners = df.loc[idx]
    
    if 'Dam' in words or 'dam' in words and 'DATE' in ners:
        cand_dam[idx] = 1
    if 'stream' in words or 'Stream' in words or 'River' in words or 'river' in words:
        cand_stream[idx] = 1

# add to df
df['cand_stream'] = cand_stream
df['cand_dam'] = cand_dam

print('Candidate Dam sentences: %s' %np.unique(cand_dam, return_counts=True)[1][1])
print('Candidate Stream sentences: %s' %np.unique(cand_stream, return_counts=True)[1][1])

Candidate Dam sentences: 3150
Candidate Stream sentences: 13512


## Labeling functions

These are based loosely on Snorkel LFs. These are intended to filter down the candidates 
```python

CandidateExtractor(Dam_Removal_Year, [ngrams, ngrams], [DateMatcher(), DictionaryMatch(d=rm)])

def LF_timeframe(c):
    ''' LF to ensure the dam removal is within a timeframe'''
    try: 
        c = int(c.year.get_span())
        if c > 1890 and c < 2020:
            return 1
        else: return 0
    except:
        return 0
```

In [23]:
def dam_extract(sent):
    ''' Attempt to extract the name of the dam based on uppercase prior tokens '''
    for i in sent.split():
        if 'Dam' == i:
            term = 'Dam'
        if i == 'dam':
            term = 'dam'
    return n_upper(term, sent)


def removal_present(series):
    rm = ['remove', 'removal', 'breach', 'destroyed', 'destroy', 'failed', 
          'removed', 'breached', 'removing', 'post-dam', 'demolition', 'demolish',
          'demolished', 'razing', 'razed', 'raze']
    
    doc, sentid, wordidx, words, poses, ners, *_ = series
    
    for i in rm:
        if i in words:
            return 1
    return 0


def dam_nearby(sent):
    for i in sent.split():
        if i in dams['name'].tolist():
            return 1 
    return 0


removal_present(df.iloc[6])

1

#### Flagging Sample 

In [24]:
flagged_dam = np.zeros((len(df),), dtype=int)
flagged_stream = np.zeros((len(df),), dtype=int)

# Get candidate dams 
for idx, i in df[df['cand_dam'] == 1].iterrows():
    start, end = n_sents(idx, df)
    
    for sent in [start, idx, end]:
        if removal_present(df.iloc[sent]) == 1:
            flagged_dam[idx] = 1

# Get candidate rivers
for idx, i in df[df['cand_stream'] == 1].iterrows():
    start, end = n_sents(idx, df)
    
    for sent in [start, idx, end]:
        if dam_nearby(remap_sent(df.iloc[sent]['words'])) == 1:
            flagged_stream[idx] = 1


# Add to dataframe
df['flagged_dam'] = flagged_dam
df['flagged_stream'] = flagged_stream

# Display totals
print('Dam Flagged: %s labels found' %df[df['flagged_dam'] == 1].shape[0])
print('River Flagged: %s labels found' %df[df['flagged_stream'] == 1].shape[0])

sample_flagged_dam = df['words'][df['flagged_dam']==1].iloc[5]
sample_flagged_stream = df['words'][df['flagged_stream']==1].iloc[5]
print('Sample Dam Sentence: %s' %remap_sent(sample_flagged_dam))
print('Sample Stream Sentence: %s' % remap_sent(sample_flagged_stream))

Dam Flagged: 1290 labels found
River Flagged: 9 labels found
Sample Dam Sentence: Dam removal is becoming an increasingly common component of river restoration -LRB- Grant , 2001 ; Pizzuto , 2002 ; Graf , 2003 -RRB- .
Sample Stream Sentence: When signiﬁcance levels for statistical tests were provided , they are reported along with the conclusion of S or NS -LRB- signiﬁcant , non-signiﬁcant , respectively -RRB- Study Location Scale -LRB- # streams evaluated -RRB- land use Heterogeneity measure Signiﬁcance of heterogeneity effect Beisel et al. -LRB- 1998 -RRB- Harper et al. -LRB- 1997 -RRB- Minshall & Robinson -LRB- 1998 -RRB- Robson & Chester -LRB- 1999 -RRB- Buffagni et al. -LRB- 2000 -RRB- Brown -LRB- 2003 -RRB- Boyero & Bosch -LRB- 2004 -RRB- Urban et al. -LRB- 2006 -RRB- Northern France Ireland ; Czech Republic ID , U.S.A. Hobart , Tasmania North Italy NH , U.S.A. Panama CT , U.S.A. Reach and sub-reach -LRB- four streams -RRB- Forest Reach -LRB- multiple ` rivers ' -RRB- Forest Reac

### Test Removal Year Extraction

REDO THIS

In [None]:
for cand in [40:90]:
    doc, sentid, wordidx, words, poses, ners, *_ = df.loc[cand]
    if 'dam' in words or 'Dam' in words:
        dam = dam_extract(remap_sent(df['words'].iloc[cand]))
        try:
            if dam != '':
                dates = []
                for idx, i in enumerate(ners): 
                    if i == 'DATE':
                        dates.append(words[idx])
                if len(dates) > 0: print(dam, dates, remap_sent(df['words'].iloc[cand]), doc)
        except Exception as e:
            print(e)