# Select Data for Annotation

In [76]:
# Short name of the news outlet to sample from. It is interpolated into the
# '<source>-matched-sentences.db' filename when the database is opened.
source = 'nyt'

In [188]:
ls ../data/diffengine-diffs/spark-output/

ap-matched-sentences.db           independent-matched-sentences.db
bbc-2-matched-sentences.db        nyt-matched-sentences.db
guardian-matched-sentences.db     reuters-matched-sentences.db
guardian-matched-sentences.db.gz  wp-matched-sentences.db.gz


In [77]:
import sys
sys.path.insert(0, '..')
from util import util_refactorings as ur
import sqlite3
import pandas as pd 
import os 

# Path of the SQLite database holding the sentence tables for `source`.
db_filename = '../data/diffengine-diffs/spark-output/%s-matched-sentences.db' % source

# If the database is not on disk yet, decompress the '.gz' archive that sits next to it.
# NOTE(review): nothing checks that the archive exists or that gunzip succeeded.
# sqlite3.connect() creates an empty database when the file is missing, so a
# failed decompress would only show up later as missing tables.
if not os.path.exists(db_filename):
    db_zip = db_filename + '.gz'
    ! gunzip $db_zip

conn = sqlite3.connect(db_filename)
# Sanity check: list the user tables, skipping SQLite's internal 'sqlite_*' tables.
pd.read_sql('''SELECT 
                    name
                FROM 
                    sqlite_master 
                WHERE 
                    type ='table' AND 
                    name NOT LIKE 'sqlite_%';
''', con=conn)

Unnamed: 0,name
0,matched_sentences
1,split_sentences


In [2]:
# Pick article versions that are short enough to annotate: strictly more than 5
# and strictly fewer than 10 sentences per (entry_id, version).
short_version_query = '''
    with c1 as 
        (SELECT entry_id, version, COUNT(1) as c from split_sentences GROUP BY entry_id, version)
    SELECT entry_id, version from c1
    WHERE c < 10 and c > 5
'''
low_count_versions = pd.read_sql(short_version_query, con=conn)

# get join keys
# Comma-separated entry ids, plus a quoted SQL list of 'entry_id-version' keys.
low_count_entry_ids = ', '.join(
    str(entry_id) for entry_id in low_count_versions['entry_id'].unique()
)
version_keys = (
    low_count_versions['entry_id'].astype(str)
    + '-'
    + low_count_versions['version'].astype(str)
)
joint_keys = "'%s'" % "', '".join(version_keys.tolist())

# matched sentences
# Keep only arcs whose old (x) AND new (y) version are both in the short-version set.
matched_query = '''
    WITH c1 as ( 
    SELECT *, 
    entry_id || '-' || version_x as key_x,
    entry_id || '-' || version_y as key_y 
    FROM matched_sentences 
    )
    SELECT *
    FROM c1
    WHERE key_x in (%s) AND key_y  in (%s)
    ''' % (joint_keys, joint_keys)
matched_sentences = pd.read_sql(matched_query, con=conn)

# get split sentences
# Sentence text for every selected (entry_id, version).
split_query = '''
    with c1 AS (
        SELECT *, entry_id || '-' || version as key FROM split_sentences
    )
    SELECT * from c1
    WHERE key IN (%s)
''' % joint_keys
split_sentences = pd.read_sql(split_query, con=conn)

In [84]:
# Tag both frames with the outlet name; later cells use 'source' as a join and
# group key.
matched_sentences, split_sentences = (
    frame.assign(source=source)
    for frame in (matched_sentences, split_sentences)
)

In [89]:
# Flat list with one dict per matched sentence pair (arc).
# 'records' is the documented orient name; the 'rows' spelling used previously
# is not a valid orient and is rejected by current pandas.
doc_arcs_dict = matched_sentences.to_dict(orient='records')

# get HTML diffs
# Attach sentence text to both ends of every arc: first the version_x side, then
# the version_y side. The SQL helper columns ('key', 'key_x', 'key_y') are
# dropped up front; otherwise the second merge would suffix 'key' into
# 'key_x'/'key_y', colliding with the columns of the same name that already
# exist (current pandas raises a MergeError for such duplicates).
sentence_lookup = split_sentences.drop(columns=['key'], errors='ignore')
doc_arcs = (matched_sentences
 .drop(columns=['key_x', 'key_y'], errors='ignore')
 .merge(sentence_lookup, how='outer',
              right_on=['source', 'entry_id', 'version', 'sent_idx'],
              left_on=['source', 'entry_id', 'version_x', 'sent_idx_x'] ,
  ).drop(['version', 'sent_idx'], axis=1)
 .merge(sentence_lookup, how='outer',
              right_on=['source', 'entry_id', 'version', 'sent_idx'],
              left_on=['source', 'entry_id', 'version_y', 'sent_idx_y'] ,
  ).drop(['version', 'sent_idx'], axis=1)
)

# One row per (source, entry_id, version_x, version_y) pair holding the list of
# arcs between the two versions.
grouped_arcs = (matched_sentences
 .groupby(['source', 'entry_id', 'version_x', 'version_y'])
 .apply(lambda df:
    df[['version_x', 'version_y', 'sent_idx_x', 'sent_idx_y',
        'avg_sentence_distance_x', 'avg_sentence_distance_y'
       ]].to_dict(orient='records')
 )
 .to_frame('arcs')
)

# One row per (source, entry_id, version) holding that version's sentences.
grouped_nodes = (split_sentences
 .groupby(['source', 'entry_id', 'version'])
 .apply(lambda df: df[['version', 'sent_idx', 'sentence']].to_dict(orient='records'))
).to_frame('nodes').reset_index()

# Pair each version with its successor: the right-hand copy is shifted by one
# ('next_vers' = version - 1) so that left.version == right.version - 1, then
# the two sentence lists are concatenated into a single 'nodes' list.
matched_grouped_nodes = (grouped_nodes
 .merge(
     grouped_nodes.assign(next_vers=lambda df: df['version'] - 1),
     left_on=['source', 'entry_id', 'version'],
     right_on=['source', 'entry_id', 'next_vers']
 )
 .assign(nodes=lambda df: df['nodes_x'] + df['nodes_y'])
 [['source', 'entry_id', 'version_x', 'version_y', 'nodes']]
 .set_index(['source', 'entry_id', 'version_x', 'version_y'])
)

# Align nodes and arcs on the shared 4-level index. A pair present on only one
# side gets NaN for the other column.
output = (
    pd.concat([matched_grouped_nodes, grouped_arcs], axis=1)
    .to_dict(orient='index')
)

# JSON object keys must be strings, so the index tuples are stringified.
output = {str(k): v for k, v in output.items()}

import json
with open('../evaluation/data/sample_datum_small.json', 'w') as f:
    json.dump(output, f )

# Checks

In [135]:
# For every (entry_id, version_x, version_y) pair, count the arcs whose sentence
# index is missing on the x side and on the y side respectively.
null_sents = (
    matched_sentences
    .groupby(['entry_id', 'version_x', 'version_y'])[['sent_idx_x', 'sent_idx_y']]
    .apply(lambda pair_arcs: pair_arcs.isna().sum())
)

In [179]:
# null_sents.loc[lambda df: df['sent_idx_y'] > 0]

In [138]:
# Mean of the two avg_sentence_distance columns per
# (entry_id, version_x, version_y) pair.
distance_columns = ['avg_sentence_distance_x', 'avg_sentence_distance_y']
non_zero_sents = (
    matched_sentences
    .groupby(['entry_id', 'version_x', 'version_y'])[distance_columns]
    .mean()
)

In [180]:
# Side-by-side per-pair statistics: missing-index counts next to mean distances,
# aligned on the shared (entry_id, version_x, version_y) index.
doc_level_stats = pd.concat([null_sents, non_zero_sents], axis='columns')

In [159]:
# Preview the version pairs that have exactly 3 arcs with no x-side sentence.
doc_level_stats[doc_level_stats['sent_idx_x'] == 3]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,sent_idx_x,sent_idx_y,avg_sentence_distance_x,avg_sentence_distance_y
entry_id,version_x,version_y,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
604236,1,2,3,2,0.2979898,0.2947833
692496,0,1,3,3,0.09875585,0.09875585
852870,0,1,3,3,0.1193558,0.1188369
870103,0,1,3,1,0.2529579,0.2527408
927805,2,3,3,1,0.2149481,0.1650758
968656,0,1,3,0,0.03309153,0.03309153
1039526,3,4,3,2,0.3560334,0.3406636
1249462,0,1,3,1,0.1066311,0.1066311
1461204,0,1,3,0,0.1842804,0.1842804
1537365,0,1,3,1,0.1002909,0.1001699


In [181]:
# Version pairs selected for annotation: exactly 3 arcs without an x-side
# sentence. Despite its name this is a DataFrame; its `.index` carries the
# (entry_id, version_x, version_y) keys.
has_three_missing_x = doc_level_stats['sent_idx_x'] == 3
desired_index = doc_level_stats.loc[has_three_missing_x]

In [186]:
# Show the (entry_id, version_x, version_y) keys of the selected version pairs.
desired_index.index

MultiIndex([( 604236, 1, 2),
            ( 692496, 0, 1),
            ( 852870, 0, 1),
            ( 870103, 0, 1),
            ( 927805, 2, 3),
            ( 968656, 0, 1),
            (1039526, 3, 4),
            (1249462, 0, 1),
            (1461204, 0, 1),
            (1537365, 0, 1),
            (1786689, 0, 1),
            (1853390, 1, 2),
            (1862395, 0, 1),
            (1945539, 0, 1),
            (1976079, 7, 8)],
           names=['entry_id', 'version_x', 'version_y'])

In [93]:
# merge both
# merge both
# Left-join the sentence text onto each arc twice: once for the version_x side,
# once for the version_y side. The lookup columns and the SQL key helpers are
# dropped after each join.
merged_matched_sentences = (
    matched_sentences
    .merge(
        split_sentences,
        how='left',
        left_on=['entry_id', 'version_x', 'sent_idx_x'],
        right_on=['entry_id', 'version', 'sent_idx'],
    )
    .drop(columns=['version', 'sent_idx', 'key', 'key_x', 'key_y'])
    .merge(
        split_sentences,
        how='left',
        left_on=['entry_id', 'version_y', 'sent_idx_y'],
        right_on=['entry_id', 'version', 'sent_idx'],
    )
    .drop(columns=['version', 'sent_idx', 'key'])
)

In [7]:
# check how many additions there are
# Arcs with no x-side sentence index, i.e. the additions referred to above.
merged_matched_sentences[merged_matched_sentences['sent_idx_x'].isna()]

Unnamed: 0,entry_id,version_x,version_y,sent_idx_x,sent_idx_y,avg_sentence_distance_x,avg_sentence_distance_y,sentence_x,sentence_y
24,1651691,0,1,,6.0,,,,There has not been a proclamation about Mr. Mc...
42,1650749,0,1,,1.0,,,,A 55-year-old woman came forward to the police...
155,1165597,0,1,,3.0,,,,There was no immediate claim of responsibility...
245,1322807,1,2,,7.0,,,,The whale stranding was the largest in the cou...
307,1598136,3,4,,1.0,,,,"In each newsletter, our gender writer, Maya Sa..."
...,...,...,...,...,...,...,...,...,...
9672,1450768,0,1,,4.0,,,,"Zhaira Franco, 35, who works for Facebook in s..."
9676,1450768,0,1,,6.0,,,,An aftershock with a magnitude of 5.7 and an e...
9698,1223039,0,1,,7.0,,,,Schlafly’s obituary will be posted soon.
9709,1223039,0,1,,6.0,,,,” A full version of Mrs.


# Launch to MTurk

In [2]:
from tqdm.auto import tqdm
import boto3
import os
from boto.mturk.connection import MTurkConnection
from boto.mturk.question import HTMLQuestion
import pandas as pd 
from boto.mturk.question import ExternalQuestion
from IPython.display import display, HTML
import matplotlib.pyplot as plt
plt.rc('font', size=14)
from importlib import reload
from boto.mturk.qualification import (
    Qualifications,
    PercentAssignmentsApprovedRequirement, 
    NumberHitsApprovedRequirement
)

import sys
sys.path.insert(0, '../evaluation/')

import mturk as mturk
import mturk.utils_mturk as um

from importlib import reload
# Re-import the local helper modules so edits made on disk are picked up
# without restarting the kernel.
# NOTE(review): a later cell rebinds the name `mturk` to an MTurkHandler
# instance. Re-running this cell after that point passes a non-module to
# reload(), which raises TypeError.
reload(um)
reload(mturk)

<module 'mturk' (namespace)>

In [None]:
def describe_credential(name):
    """Return a log-safe status line for the environment variable `name`.

    The value itself is never included, so the line can be left in a saved
    notebook's output without leaking the credential.
    """
    if os.environ.get(name):
        return '%s: set (value hidden)' % name
    return '%s: NOT SET' % name


# Confirm the AWS credentials are configured without echoing them. Printing
# os.environ[...] directly would store the secret key in the notebook output.
print(describe_credential('AWS_ACCESS_KEY_ID'))
print(describe_credential('AWS_SECRET_ACCESS_KEY'))

In [96]:
from importlib import reload
reload(um)

# Which MTurk environment the handler talks to; swap the two lines below to go live.
# NOTE(review): `env` is rebound to a jinja2 Environment in a later cell, so any
# cell comparing `env` to 'production' must run before that one.
env = 'sandbox'
# env = 'production'
# NOTE(review): this rebinds `mturk`, which the import cell bound to the local
# `mturk` package; from here on `mturk` is the handler object.
mturk = um.MTurkHandler(environment=env) #=production/sandbox

In [97]:
# When True, the worker-requirements cell appends a custom (project-specific)
# qualification requirement for the selected environment.
CUSTOM_QUALIFICATION = False

In [98]:
# Standard worker filters, written as MTurk QualificationRequirement dicts.
# The ids are MTurk's built-in system qualification types:
#   00000000000000000040  Worker_NumberHITsApproved
#   00000000000000000071  Worker_Locale
#   000000000000000000L0  Worker_PercentAssignmentsApproved
worker_requirements = [
    ### number of hits approved
    {
        # Previously this reused the percent-approved id, so the "80 approved
        # HITs" filter would really have been a second percentage filter.
        'QualificationTypeId': '00000000000000000040',
        'Comparator': 'GreaterThanOrEqualTo',
        'IntegerValues': [80],
    },
    ## worker locale
    {
        'QualificationTypeId': '00000000000000000071',
        'Comparator': 'EqualTo',
        'LocaleValues': [{
            "Country":"US",
        }],
        'RequiredToPreview': True,
    },
    ## percent assignments approved
    {
        'QualificationTypeId': '000000000000000000L0',
        'Comparator': 'GreaterThanOrEqualTo',
        'IntegerValues': [90],
    },
]

# NOTE(review): the standard filters above are discarded here, so HITs are
# created with no built-in requirements in every environment. Kept as-is to
# preserve current behavior; remove this line to re-enable the filters.
worker_requirements = []

## custom qualification
if CUSTOM_QUALIFICATION:
    if env == 'production':
        custom_qualification_id = '3WZ6PU0JYXSTA4EIPF2M6S1CMZ7KL8'
    else:
        custom_qualification_id = '381R35RGJFFV6VLBBDX2MLZFNSH414'  ## UCLA students
        #'3FQWXCP5BDC6A66PD20NE8FM4G3H44' other workers
    worker_requirements.append({
        'QualificationTypeId': custom_qualification_id,
        'Comparator': 'GreaterThanOrEqualTo',
        'IntegerValues': [90],
    })

In [99]:
from jinja2 import Template, Environment, FileSystemLoader
import datetime
from IPython.display import display, HTML
# Jinja2 environment that loads the HIT page templates from disk.
# NOTE(review): this reuses the name `env`, which an earlier cell set to the
# MTurk environment string ('sandbox'/'production'); that string is no longer
# reachable through `env` after this line.
env = Environment(loader=FileSystemLoader('../evaluation/templates'))

In [170]:
# Start with an empty record of created HITs (re-running this cell clears it),
# then load the annotation page template.
created_hits = list()
template = env.get_template('match-sentences-from-scratch.html')

In [187]:
# Safety switch: the preview HTML files are always rendered and written, but
# HITs are only posted to MTurk when this is set to True.
LAUNCH_HITS = False

for i, idx in enumerate(desired_index.index):
    # Keys of `output` are stringified (source, entry_id, version_x, version_y)
    # tuples, so the same string is rebuilt here. `source` is used instead of a
    # hard-coded 'nyt' so the lookup follows whichever database was loaded.
    sample_key = str(tuple([source] + list(idx)))

    ## make the HTML 
    data = output[sample_key]
    html = template.render(
        data=data,
        doc_id=sample_key,
        do_mturk=True,
        start_time=datetime.datetime.now()
    )

    ## dump to disk for inspection
    # Explicit UTF-8 so non-ASCII article text is written the same way on every
    # platform instead of depending on the locale's default encoding.
    with open('../evaluation/mturk/templated-question-example-%s.html' % i, 'w', encoding='utf-8') as f:
        f.write(html)

    if LAUNCH_HITS:
        ## HTMLQuestion HIT
        description = '''
            We\'d like to match sentences from two edited versions of the same article. 
            Help us by drawing lines to connect blocks of text.
        '''
        title = 'Annotate some news article edits v3'
        new_hit = mturk.client.create_hit(
            Title = title,
            Description = description,
            Keywords = 'text, highlighting',
            Reward = '0.6',
            MaxAssignments = 1,
            LifetimeInSeconds = 17280000,           # 200 days
            AssignmentDurationInSeconds = 600000,   # just under 7 days
            AutoApprovalDelayInSeconds = 28800,     # 8 hours
            Question = html,
            QualificationRequirements=worker_requirements #if env == 'production' else []
        )
        created_hits.append(new_hit)

# Get Data from MTurk

In [108]:
# Pull the HIT id out of each create_hit response collected above.
hit_ids = [response['HIT']['HITId'] for response in created_hits]

In [109]:
# Fetch the submitted answers for the created HITs via the handler.
# NOTE(review): the recorded run of this cell failed with
# "ValueError: arrays must all be same length". The error is raised inside the
# helper, presumably while it assembles a DataFrame from columns of unequal
# length - confirm in utils_mturk.
mturk.get_answer_df_for_hit_list(hit_ids)

  0%|          | 0/2 [00:00<?, ?it/s]

ValueError: arrays must all be same length