In [6]:
from tqdm.auto import tqdm
import boto3
from boto.mturk.connection import MTurkConnection
from boto.mturk.question import HTMLQuestion
from boto.mturk.question import ExternalQuestion

from IPython.display import display, HTML
from boto.mturk.qualification import (
    Qualifications,
    PercentAssignmentsApprovedRequirement, 
    NumberHitsApprovedRequirement
)

import sys, os, json, re
sys.path.insert(0, '../')
from mturk import mturk_handler as um

In [7]:
sys.path.insert(0, '../scripts/')

In [8]:
import util_data_fetching_for_app as uda

In [None]:
import fasttext
# Load the fastText model stored next to the scripts; `category_model.predict` is
# used further down to assign a `category` to every candidate document.
category_model = fasttext.load_model('../scripts/fasttext_model__news-classification.bin')

# Generate new edits data

In [61]:
import sqlite3
import pandas as pd 

In [194]:
# Decompress the AP matched-sentences database with the shell's `gunzip`.
# NOTE(review): `db_pathway` is assigned in a later cell of this notebook, so on a
# fresh kernel this line expands an undefined variable — run the path cell first.
! gunzip $db_pathway/ap-matched-sentences.db.gz  

In [195]:
# Directory holding the matched-sentence SQLite databases. The default is the
# original author's checkout; set EDIT_PATHWAYS_DB_DIR to run on another machine.
db_pathway = os.environ.get(
    'EDIT_PATHWAYS_DB_DIR',
    '/Users/alex/Projects/usc-research/edit-pathways/data/diffengine-diffs/spark-output',
)
# Database to sample from. Swap the file name for 'bbc-2-matched-sentences.db'
# to work on the BBC data instead (previously a dead assignment that was
# immediately overwritten by the AP path).
db_path = os.path.join(db_pathway, 'ap-matched-sentences.db')

In [196]:
# Open the SQLite database at `db_path` and read the whole `doc_level_stats`
# table into a DataFrame. `conn` stays open because later cells query it again.
conn = sqlite3.connect(db_path)
doc_level_stats_df = pd.read_sql('select * from doc_level_stats', con=conn)

In [228]:
# How many rows of `doc_level_stats` each `entry_id` has, as a one-column frame
# indexed by `entry_id`.
num_versions = (
    doc_level_stats_df['entry_id']
    .value_counts()
    .to_frame('num_versions')
)

# Keep rows that grow moderately and barely shrink, measured against
# `num_sentences_x`:
#   * num_added   / num_sentences_x strictly between .2 and .4
#   * num_deleted / num_sentences_x below .05
# Filters that are currently switched off (kept for reference):
#   * (version_x / num_versions) < .2   -> articles early in their lifecycle
#   * num_sentences_x < 20              -> short articles
candidate_articles = (
    doc_level_stats_df
    .merge(num_versions, left_on='entry_id', right_index=True)
    .loc[lambda df: (
        (df['num_added'].div(df['num_sentences_x']) > .2)
        & (df['num_added'].div(df['num_sentences_x']) < .4)
        & (df['num_deleted'].div(df['num_sentences_x']) < .05)
    )]
)

In [229]:
# Derive join keys from the candidate rows (helper in `util_data_fetching_for_app`);
# they are handed to the sentence-level query that follows.
keys = uda.get_join_keys(candidate_articles)

In [230]:
# Fetch the sentence-level data for the selected keys over the open connection.
# NOTE(review): the first argument is 'bbc' although `db_path` points at the AP
# database — confirm this argument is not meant to follow the chosen dataset.
matched_sents, split_sents = uda.get_data_from_sqlite_by_sentence_criteria('bbc', conn, keys)

In [231]:
# Combine the two query results into a single frame via `uda.match_sentences`;
# later cells read its `nodes` column.
df_for_mturk = uda.match_sentences(matched_sents, split_sents)

In [233]:
# Assign a news category to every row, in four chained steps:
#   1. keep the nodes whose `version` equals the third element of the row's index
#      label (`x.name[2]`),
#   2. pull out their sentence text,
#   3. join the sentences with spaces and run the fastText classifier on the result,
#   4. take the first returned label and strip the `__label__` prefix.
# NOTE(review): assumes the third index level is the version and that sentences
# contain no newline characters (fastText predicts one line at a time) — confirm.
df_for_mturk['category'] = (df_for_mturk
 .apply(lambda x: list(filter(lambda y: y['version'] == x.name[2], x['nodes'])), axis=1 )
 .apply(lambda x: list(map(lambda y: y['sentence'], x)))
 .apply(lambda x: category_model.predict(' '.join(x)))
 .apply(lambda x: x[0][0].replace('__label__', ''))
)

In [234]:
# Discard every document the classifier labelled 'Other'.
df_for_mturk = df_for_mturk[df_for_mturk['category'].ne('Other')]

In [244]:
# Sampling parameters, named so the batch definition is visible and repeatable.
MAX_VERSION = 6          # only rows with version_x below this are considered
DOCS_PER_VERSION = 50    # cap on sampled rows per version
SAMPLE_SEED = 0          # fixed seed: re-running the cell yields the same sample

# For each `version_x` below MAX_VERSION, keep the whole group when it has at most
# DOCS_PER_VERSION rows, otherwise draw DOCS_PER_VERSION rows at random.
# `.squeeze()` is kept from the original: it is a no-op for multi-row groups and
# collapses a single-row group to a Series — TODO confirm the downstream helper
# relies on that.
sample_to_annotate = (df_for_mturk
 .reset_index()
 .loc[lambda df: df['version_x'] < MAX_VERSION]
 .groupby('version_x')
 .apply(
     lambda x: x.sample(n=DOCS_PER_VERSION, random_state=SAMPLE_SEED).squeeze()
     if len(x) > DOCS_PER_VERSION
     else x.squeeze()
 )
)

In [246]:
# Convert the sample into the structure the annotation app reads (see
# `uda.dump_output_to_app_readable`); the result is serialised with `json.dump` next.
to_annotate_json = uda.dump_output_to_app_readable(sample_to_annotate)

In [250]:
# Persist the sample; the "Make file from template" section reloads this exact file.
# Note: this overwrites any previous sample at the same path.
with open('../app/data/ap-sampled-data.json', 'w') as f:
    json.dump(to_annotate_json, f)

# Grant permissions

In [9]:
import datetime
from importlib import reload
# Re-import `mturk_handler` so local edits to the module are picked up without
# restarting the kernel.
reload(um)

# Environment the handler talks to. Only one of the two assignments should be
# active; 'production' is presumably the live marketplace — confirm before running.
# env = 'sandbox'
env = 'production'
mturk = um.MTurkHandler(environment=env) #=production/sandbox

In [21]:
# What appears to be the saved response of the (now commented-out) call below that
# created the 'edit-intention editor' qualification. Its `QualificationTypeId` is
# what later cells use to grant the qualification and to restrict HITs.
#  mturk.create_qualification('edit-intention editor', 'text-classification, journalism, editing', 'Ability to identify edit intentions.')

prod_edit = {'QualificationType': {'QualificationTypeId': '3G1QR9I4MNX655BXKMI2XU5T2G2OJM',
  'CreationTime': datetime.datetime(2023, 1, 24, 17, 37, 28),
  'Name': 'edit-intention editor',
  'Description': 'Ability to identify edit intentions.',
  'Keywords': 'text-classification, journalism, editing',
  'QualificationTypeStatus': 'Active',
  'IsRequestable': True,
  'AutoGranted': False},
 'ResponseMetadata': {'RequestId': '857f14d8-eb41-444f-8a4d-a7f2b55f81d6',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '857f14d8-eb41-444f-8a4d-a7f2b55f81d6',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '321',
   'date': 'Wed, 25 Jan 2023 01:37:28 GMT'},
  'RetryAttempts': 0}}

In [168]:
# Flag kept from an earlier workflow; nothing in the visible part of this notebook
# reads it.
CUSTOM_QUALIFICATION = True
# The commented calls below document how the 'journalist' qualifications were
# originally created; the saved responses follow in the next cells.
# sandbox editor qualification: 3H3KEN1OLUQQR02IYZSVMYM7ESCBIO
# sandbox_journalist_qual = mturk.create_qualification('journalist')
# production_journalist_qual = mturk.create_qualification(
#     'journalist',
#     qual_keywords='journalist, writer, editor', 
#     qual_description='Turkers with experience in newsrooms performing reporting and editing functions.'
# )

In [19]:
# Saved record of the 'journalist' qualification (labelled production by its
# variable name). Only `['QualificationType']['QualificationTypeId']` is referenced,
# and only from a commented-out grant call in this notebook.
production_journalist_qual = {'QualificationType': {'QualificationTypeId': '3YJP8DI8F7IJNZ5SWSN2GXBAZJF4Q2',
#   'CreationTime': datetime.datetime(2021, 11, 4, 23, 3, 23, tzinfo=tzlocal()),
  'Name': 'journalist',
  'Description': 'A custom qualification group given to workers we deem good.',
  'Keywords': 'custom-group filtering',
  'QualificationTypeStatus': 'Active',
  'IsRequestable': True,
  'AutoGranted': False},
 'ResponseMetadata': {'RequestId': '0edf8c83-69c6-446a-8d20-930712b2efa7',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '0edf8c83-69c6-446a-8d20-930712b2efa7',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '315',
   'date': 'Fri, 05 Nov 2021 06:03:22 GMT'},
  'RetryAttempts': 0}}

In [20]:
# Saved record of the 'journalist' qualification labelled sandbox by its variable
# name. Not referenced by any other visible cell; kept for reference.
sandbox_journalist_qual = {'QualificationType': {'QualificationTypeId': '3H3KEN1OLUQQR02IYZSVMYM7ESCBIO',
#   'CreationTime': datetime.datetime(2021, 11, 4, 23, 0, 7, tzinfo=tzlocal()),
  'Name': 'journalist',
  'Description': 'A custom qualification group given to workers we deem good.',
  'Keywords': 'custom-group filtering',
  'QualificationTypeStatus': 'Active',
  'IsRequestable': True,
  'AutoGranted': False},
 'ResponseMetadata': {'RequestId': '28302a92-3fef-47d4-8f02-1b93f1e08258',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '28302a92-3fef-47d4-8f02-1b93f1e08258',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '315',
   'date': 'Fri, 05 Nov 2021 06:00:07 GMT'},
  'RetryAttempts': 0}}

In [292]:
# MTurk worker ids that receive the qualification in the next cell.
# NOTE(review): the trailing comments pair worker ids with what look like real
# names (PII) — consider moving this list to a private, untracked file before
# sharing the notebook.
workers = [
    'A25JJA15CM494B', #'Margot Williams'
    'AEMJM5IATKDU8', #'Beau Collins'
    'A200N6ADPZGA9A', #'Eliza Billingham'
    'AKLN3G3G3NATU', #'Miacel Spotted Elk'
    'ARBK4601C0FHM', #'Matt Zdun'
    'A1G0DK990FW3HL', #'Antonio Jarne'
    'AL5YEU3OZGP4T', # Marina Villeneuve
    'A3SBGUREOSQP6G', # Steeve
]

# Earlier grant against the 'journalist' qualification, kept for reference:
# mturk.give_qualification_to_workers(workers, qualification_id=production_journalist_qual['QualificationType']['QualificationTypeId'])

In [315]:
# Grant the 'edit-intention editor' qualification (id taken from `prod_edit`) to
# every worker in `workers`. This calls the MTurk service through the handler, so
# it acts on whichever environment `mturk` was created for.
mturk.give_qualification_to_workers(
    workers, 
    qualification_id=prod_edit['QualificationType']['QualificationTypeId']
)

# Make file from template

In [10]:
from mturk import make_mturk_from_json as mj
from importlib import reload
reload(mj)

<module 'mturk.make_mturk_from_json' from '/Users/alex/Projects/usc-research/edit-intentions/notebooks/../mturk/make_mturk_from_json.py'>

In [11]:
import unidecode
import copy

def is_quote(x):
    """Tell whether a node's sentence consists solely of a quotation mark.

    ``x`` is a node dict with a ``'sentence'`` string. The text is first
    transliterated to ASCII with ``unidecode`` and then compared against the
    two accepted forms: two single quotes (``''``) or one double quote (``"``).
    """
    ascii_sentence = unidecode.unidecode(x['sentence'])
    return ascii_sentence == "''" or ascii_sentence == '"'

def arc_contains_one_of_nodelist(arc, nodes):
    """Return True if either endpoint of ``arc`` is one of ``nodes``.

    An arc's endpoints are ``(version_x, sent_idx_x)`` and
    ``(version_y, sent_idx_y)``; a node is identified by
    ``(version, sent_idx)``. An empty ``nodes`` list yields False.
    """
    def _touches(node):
        # Both endpoints are checked for every node, as in a plain loop.
        hits_x = (arc['version_x'] == node['version']) and (arc['sent_idx_x'] == node['sent_idx'])
        hits_y = (arc['version_y'] == node['version']) and (arc['sent_idx_y'] == node['sent_idx'])
        return hits_x or hits_y

    # Build the full list first so every node is inspected (no early exit).
    return any([_touches(node) for node in nodes])

def update_key(d, key, val=None):
    """Set ``d[key]`` to ``val`` in place and return ``d``.

    When ``val`` is None the existing ``d[key]`` is written back unchanged, which
    means ``key`` must already be present in that case (otherwise ``KeyError``).
    """
    d[key] = d[key] if val is None else val
    return d

def clean_data(datum):
    """Return a cleaned deep copy of one record with ``'nodes'`` and ``'arcs'``.

    The caller's ``datum`` is never modified. On the copy:

    1. Every ``"`` character is removed from each node's ``'sentence'``.
    2. Nodes flagged by ``is_quote`` are dropped, together with every arc that
       has one of them as an endpoint.
    3. Within the lowest and within the highest remaining ``version``, nodes get
       a new ``sent_idx`` equal to their position among the survivors.
    4. Each arc's ``sent_idx_x`` is rewritten through the lowest-version mapping
       and its ``sent_idx_y`` through the highest-version mapping; values absent
       from a mapping (e.g. ``None``) are left as they are.

    The returned ``'nodes'`` list holds the lowest-version nodes followed by the
    highest-version nodes. Raises ``ValueError`` when no node survives step 2.
    """
    def _renumbering(version_nodes):
        # old sent_idx -> position among this version's surviving nodes
        return {node['sent_idx']: position for position, node in enumerate(version_nodes)}

    cleaned = copy.deepcopy(datum)

    # generic cleaning
    for node in cleaned['nodes']:
        node['sentence'] = node['sentence'].replace('"', '')

    # find quote-only nodes, then drop them and the arcs attached to them
    quote_nodes = [node for node in cleaned['nodes'] if is_quote(node)]
    kept_arcs = [
        arc for arc in cleaned['arcs']
        if not arc_contains_one_of_nodelist(arc, quote_nodes)
    ]
    kept_nodes = [node for node in cleaned['nodes'] if not is_quote(node)]

    # split the survivors into the earliest and the latest version
    versions = [node['version'] for node in kept_nodes]
    first_version = min(versions)
    last_version = max(versions)
    first_nodes = [node for node in kept_nodes if node['version'] == first_version]
    last_nodes = [node for node in kept_nodes if node['version'] == last_version]

    # both mappings are built before any index is rewritten
    first_map = _renumbering(first_nodes)
    last_map = _renumbering(last_nodes)

    for node in first_nodes:
        update_key(node, 'sent_idx', first_map.get(node['sent_idx']))
    for node in last_nodes:
        update_key(node, 'sent_idx', last_map.get(node['sent_idx']))

    for arc in kept_arcs:
        update_key(arc, 'sent_idx_x', first_map.get(arc['sent_idx_x']))
    for arc in kept_arcs:
        update_key(arc, 'sent_idx_y', last_map.get(arc['sent_idx_y']))

    cleaned['nodes'] = first_nodes + last_nodes
    cleaned['arcs'] = kept_arcs
    return cleaned

In [12]:
data_file = '../app/data/ap-sampled-data.json'

# Switches for this cell: print a short summary / randomise the processing order.
PRINT_DETAILS = False
SHUFFLE_DATA = False

# Load the sampled documents; the rendering loop walks `keys` in order.
with open(data_file) as f:
    input_data = json.load(f)

keys = [*input_data.keys()]

if PRINT_DETAILS:
    print(
        'sample keys:',
        '\n'.join(keys[:5]),
        '\nnum documents:',
        len(keys),
        sep='\n',
    )

if SHUFFLE_DATA:
    import random
    random.shuffle(keys)

In [38]:
# Render one MTurk task page per sampled document and collect the results.
output_htmls = []

# The instructions are the same for every task, so read the file once instead of
# once per document.
instructions_file = '../app/static/assets/instructions.html'
with open(instructions_file) as f:
    instructions_html = f.read()

# Keys appear to be stringified tuples such as "('abc', 1, 2)". Splitting on the
# tuple punctuation and joining the non-empty parts with '-' gives "abc-1-2".
# A raw string is used so the backslashes reach the regex engine without relying
# on invalid string escapes (`\(`, `\,`), which newer Python versions warn about.
key_punctuation = re.compile(r"\('|', |, |\)")

for k in keys:
    datum = input_data[k]
    data_id = '-'.join(part for part in key_punctuation.split(k) if part != '')

    # `clean_data` works on a deep copy, so `input_data` stays untouched.
    output = mj.render_page(
        clean_data(datum),
        data_id,
        write=True,
        template_folder='../app/templates/',
        template_fn='visualize-doc-level-edits-d3.html',
        output_dir='../mturk/tasks_to_launch',
        instructions=instructions_html
    )
    output_htmls.append(output)

# Launch to MTurk

In [33]:
# Environment used for launching HITs. The launch cell attaches the qualification
# requirement only when this is 'production'.
# env = 'sandbox'
env = 'production'

In [34]:
# Handler bound to the environment named by `env`; replaces any handler created
# earlier in the notebook.
mturk = um.MTurkHandler(environment=env)

In [35]:
# Qualification requirement attached to production HITs: the worker must hold the
# 'edit-intention editor' qualification with a value of at least 90.
# NOTE(review): confirm that `give_qualification_to_workers` assigns a value >= 90,
# otherwise the granted workers will not see the HITs.
worker_requirements = [{
    'QualificationTypeId': prod_edit['QualificationType']['QualificationTypeId'],  ## 'edit-intention editor' qualification
    'Comparator': 'GreaterThanOrEqualTo',
    'IntegerValues': [90],      
}]

In [36]:
# Sanity check: number of rendered task pages available to launch.
len(output_htmls)

19

In [43]:
# Four hours expressed in seconds — the value used for AutoApprovalDelayInSeconds
# in the launch cell.
60 * 60 * 4

14400

In [46]:
# Launch one HIT per rendered task page in the selected slice.
#
# `created_hits` deliberately accumulates across batches, so it is created only
# when it does not exist yet (fresh kernel) rather than being reset on each run.
try:
    created_hits
except NameError:
    created_hits = []

title = 'Edit-Intentions Task v4.0.12'

# NOTE(review): the slice selects which pages are launched in this batch; adjust it
# to the batch you intend to publish (an out-of-range slice launches nothing).
for output in tqdm(output_htmls[32:34]):
    try:
        new_hit = mturk.client.create_hit(
            Title = title,
            Description = 'Help us annotate the reasons each edit was made',
            Keywords = 'classification',
            Reward = '3.50',
            MaxAssignments = 1,
            LifetimeInSeconds = 17_280_000,           # 200 days
            AssignmentDurationInSeconds = 600_000,    # just under 7 days
            AutoApprovalDelayInSeconds = 14_400,      # 4 hours
            Question = output['html'],
            # the qualification gate is applied in production only
            QualificationRequirements=worker_requirements if env == 'production' else []
        )
    except Exception as e:
        # Best effort: report the failure and carry on with the next task.
        print(e)
    else:
        # Recorded outside the `try` so that a bookkeeping error can never be
        # swallowed after a HIT has already been created.
        created_hits.append(new_hit)

  0%|          | 0/2 [00:00<?, ?it/s]

In [47]:
# Number of HIT responses recorded so far (the list is not reset between batches).
len(created_hits)

20

In [48]:
# Ids of all recorded HITs, in creation order.
hit_list = [hit['HIT']['HITId'] for hit in created_hits]

In [49]:
# Save the HIT ids so results can be fetched later without the in-memory list.
# Note: the file name is hard-coded per batch and is overwritten on re-run.
with open('cache/2023-02-01__second-batch-everyone-hit-list.json', 'w') as f:
    json.dump(hit_list, f)

In [None]:
# Fetch the submitted answers for the listed HITs through the handler; later cells
# read its `assignment_id` and `time_delta` columns.
answer_df = mturk.get_answer_df_for_hit_list(hit_list)

In [322]:
# Distinct assignment ids present in the fetched answers.
answer_df['assignment_id'].unique()

array(['3B4YI393VD7O2WX2DNW8L9HI6XDSSW', '3CP1TO84PXCURVIT7C4U1GUMU6C25H',
       '39GXDJN2OXPMB4ZXTKV8F51OTZG8VH', '3Y5140Z9D1R2VQEEGUY0M26KA6RPIQ',
       '3YWRV122CWAMYPAXOUL3JJZ5FR78UQ', '358010RM5I4CQ5051S70V4IHK4GVXV'],
      dtype=object)

In [323]:
# Cache the answers on disk.
# NOTE(review): this writes a pickle, but the file name ends in `.json` and carries
# the first-batch date while the HIT list above is named for the second batch —
# confirm the intended path and read it back with `pd.read_pickle`, not `json`.
answer_df.to_pickle('cache/2023-01-24__first-batch-results.json')

In [329]:
# `time_delta` for each assignment, one row per `assignment_id` (presumably the time
# between accepting and submitting — confirm against the handler).
answer_df.drop_duplicates('assignment_id')['time_delta']

0   0 days 01:04:41
0   0 days 00:29:14
0   4 days 23:22:20
0   0 days 00:03:25
0   0 days 00:10:52
0   0 days 00:15:46
Name: time_delta, dtype: timedelta64[ns]