# Example GDD 📼

In [1]:
import yaml
import psycopg2
import requests
import pandas as pd
import numpy as np

from utils import connect_db, get_dams

In [19]:
# Get dam names
dams = get_dams()

# Database connection
df = connect_db()
df.head(5)

Sentences: 147287


Unnamed: 0,docid,sentid,wordidx,words,poses,ners
0,5705014ccf58f18a4c0d6d61,1,"[1, 2, 3, 4]","[Eos, ,, Vol, .]","[NNS, ,, NNP, .]","[O, O, O, O]"
1,5705014ccf58f18a4c0d6d61,2,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[94, ,, No., 10, ,, 5, March, 2013, PAGE, 104,...","[CD, ,, NN, CD, ,, CD, NNP, CD, NN, CD, JJ, JJ...","[NUMBER, O, O, NUMBER, O, DATE, DATE, DATE, O,..."
2,5705014ccf58f18a4c0d6d61,3,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[Most, studies, of, solute, transport, through...","[JJS, NNS, IN, JJ, NN, IN, NNS, VBP, VBN, IN, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O]"
3,5705014ccf58f18a4c0d6d61,4,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[However, ,, small, impoundments, ,, such, as,...","[RB, ,, JJ, NNS, ,, JJ, IN, DT, VBN, IN, NN, C...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,5705014ccf58f18a4c0d6d61,5,"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14...","[As, these, small, systems, mature, ,, the, im...","[IN, DT, JJ, NNS, VBP, ,, DT, NNS, VBP, IN, NN...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


## Utility Functions

In [20]:
def remap_sent(sent): return ' '.join(sent)


def n_sents(idx, df):
    ''' Returns the surrounding sentences in rel to dataframe'''
    start = idx
    end = idx
    if idx > 0:
        start = idx-1
    if idx < len(df):
        end = idx+1
    return(start, end)


def n_upper(token, sentence):
    ''' returns uppercase tokens surrounding term '''
    span = ''
    idx = sentence.split().index(token)
    while idx > 0:
        idx = idx - 1
        if sentence.split()[idx][0].isupper():
            if len(span) > 1:
                span = sentence.split()[idx] + ' ' + span
            else:
                span = span + sentence.split()[idx]
        else:
            return span
    return span

## Finding Candidate Passages

In [21]:
cand_dam = np.zeros((len(df),), dtype=int)
cand_stream = np.zeros((len(df),), dtype=int)

for idx, i in enumerate(df['docid']):
    doc, sentid, wordidx, words, poses, ners = df.loc[idx]
    
    if 'Dam' in words or 'dam' in words and 'DATE' in ners:
        cand_dam[idx] = 1
    if 'stream' in words or 'Stream' in words or 'River' in words or 'river' in words:
        cand_stream[idx] = 1

# add to df
df['cand_stream'] = cand_stream
df['cand_dam'] = cand_dam

print('Candidate Dam sentences: %s' %np.unique(cand_dam, return_counts=True)[1][1])
print('Candidate Stream sentences: %s' %np.unique(cand_stream, return_counts=True)[1][1])

Candidate Dam sentences: 3150
Candidate Stream sentences: 13512


## Labeling functions

These are based loosely on Snorkel LFs. These are intended to filter down the candidates 
```python

CandidateExtractor(Dam_Removal_Year, [ngrams, ngrams], [DateMatcher(), DictionaryMatch(d=rm)])

def LF_timeframe(c):
    ''' LF to ensure the dam removal is within a timeframe'''
    try: 
        c = int(c.year.get_span())
        if c > 1890 and c < 2020:
            return 1
        else: return 0
    except:
        return 0
```

In [23]:
def dam_extract(sent):
    ''' Attempt to extract the name of the dam based on uppercase prior tokens '''
    for i in sent.split():
        if 'Dam' == i:
            term = 'Dam'
        if i == 'dam':
            term = 'dam'
    return n_upper(term, sent)


def removal_present(series):
    rm = ['remove', 'removal', 'breach', 'destroyed', 'destroy', 'failed', 
          'removed', 'breached', 'removing', 'post-dam', 'demolition', 'demolish',
          'demolished', 'razing', 'razed', 'raze']
    
    doc, sentid, wordidx, words, poses, ners, *_ = series
    
    for i in rm:
        if i in words:
            return 1
    return 0


def dam_nearby(sent):
    for i in sent.split():
        if i in dams['name'].tolist():
            return 1 
    return 0


removal_present(df.iloc[6])

1

#### Flagging Sample 

In [24]:
flagged_dam = np.zeros((len(df),), dtype=int)
flagged_stream = np.zeros((len(df),), dtype=int)

# Get candidate dams 
for idx, i in df[df['cand_dam'] == 1].iterrows():
    start, end = n_sents(idx, df)
    
    for sent in [start, idx, end]:
        if removal_present(df.iloc[sent]) == 1:
            flagged_dam[idx] = 1

# Get candidate rivers
for idx, i in df[df['cand_stream'] == 1].iterrows():
    start, end = n_sents(idx, df)
    
    for sent in [start, idx, end]:
        if dam_nearby(remap_sent(df.iloc[sent]['words'])) == 1:
            flagged_stream[idx] = 1


# Add to dataframe
df['flagged_dam'] = flagged_dam
df['flagged_stream'] = flagged_stream

# Display totals
print('Dam Flagged: %s labels found' %df[df['flagged_dam'] == 1].shape[0])
print('River Flagged: %s labels found' %df[df['flagged_stream'] == 1].shape[0])

sample_flagged_dam = df['words'][df['flagged_dam']==1].iloc[5]
sample_flagged_stream = df['words'][df['flagged_stream']==1].iloc[5]
print('Sample Dam Sentence: %s' %remap_sent(sample_flagged_dam))
print('Sample Stream Sentence: %s' % remap_sent(sample_flagged_stream))

Dam Flagged: 1290 labels found
River Flagged: 9 labels found
Sample Dam Sentence: Dam removal is becoming an increasingly common component of river restoration -LRB- Grant , 2001 ; Pizzuto , 2002 ; Graf , 2003 -RRB- .
Sample Stream Sentence: When signiﬁcance levels for statistical tests were provided , they are reported along with the conclusion of S or NS -LRB- signiﬁcant , non-signiﬁcant , respectively -RRB- Study Location Scale -LRB- # streams evaluated -RRB- land use Heterogeneity measure Signiﬁcance of heterogeneity effect Beisel et al. -LRB- 1998 -RRB- Harper et al. -LRB- 1997 -RRB- Minshall & Robinson -LRB- 1998 -RRB- Robson & Chester -LRB- 1999 -RRB- Buffagni et al. -LRB- 2000 -RRB- Brown -LRB- 2003 -RRB- Boyero & Bosch -LRB- 2004 -RRB- Urban et al. -LRB- 2006 -RRB- Northern France Ireland ; Czech Republic ID , U.S.A. Hobart , Tasmania North Italy NH , U.S.A. Panama CT , U.S.A. Reach and sub-reach -LRB- four streams -RRB- Forest Reach -LRB- multiple ` rivers ' -RRB- Forest Reac

### Test Removal Year Extraction

REDO THIS

In [None]:
for cand in [40:90]:
    doc, sentid, wordidx, words, poses, ners, *_ = df.loc[cand]
    if 'dam' in words or 'Dam' in words:
        dam = dam_extract(remap_sent(df['words'].iloc[cand]))
        try:
            if dam != '':
                dates = []
                for idx, i in enumerate(ners): 
                    if i == 'DATE':
                        dates.append(words[idx])
                if len(dates) > 0: print(dam, dates, remap_sent(df['words'].iloc[cand]), doc)
        except Exception as e:
            print(e)