# Select Data for Annotation

In [40]:
source = 'bbc-2'

In [41]:
ls ../data/diffengine-diffs/spark-output/

ap-matched-sentences.db           nyt-matched-sentences.db
bbc-2-matched-sentences.db        reuters-matched-sentences.db
guardian-matched-sentences.db     wp-matched-sentences.db
independent-matched-sentences.db


In [42]:
import sys
sys.path.insert(0, '..')
from util import util_refactorings as ur
import sqlite3
import pandas as pd 
import os 

db_filename = '../data/diffengine-diffs/spark-output/%s-matched-sentences.db' % source

if not os.path.exists(db_filename):
    db_zip = db_filename + '.gz'
    ! gunzip $db_zip

conn = sqlite3.connect(db_filename)
pd.read_sql('''SELECT 
                    name
                FROM 
                    sqlite_master 
                WHERE 
                    type ='table' AND 
                    name NOT LIKE 'sqlite_%';
''', con=conn)

Unnamed: 0,name
0,matched_sentences
1,split_sentences


In [43]:
low_count_versions = pd.read_sql('''
    with c1 as 
        (SELECT entry_id, CAST(version as INT) as version, COUNT(1) as c from split_sentences GROUP BY entry_id, version)
    SELECT entry_id, version from c1
    WHERE c < 15 and c > 3
''', con=conn)

# get join keys
low_count_entry_ids = ', '.join(list(map(str, low_count_versions['entry_id'].unique())))
joint_keys = low_count_versions.pipe(lambda df: df['entry_id'].astype(str) + '-' + df['version'].astype(str))
joint_keys = "'%s'" % "', '".join(joint_keys.tolist())

# matched sentences
matched_sentences = pd.read_sql('''
    WITH c1 as ( 
    SELECT *, 
    entry_id || '-' || version_x as key_x,
    entry_id || '-' || version_y as key_y 
    FROM matched_sentences 
    )
    SELECT *
    FROM c1
    WHERE key_x in (%s) AND key_y  in (%s)
    ''' % (joint_keys, joint_keys)
, con=conn)

# get split sentences
split_sentences = pd.read_sql('''
    with c1 AS (
        SELECT *, entry_id || '-' || CAST(version AS INT) as key FROM split_sentences
    )
    SELECT entry_id, CAST(version AS INT) as version, sent_idx, sentence 
    FROM c1
    WHERE key IN (%s)
''' % joint_keys, con=conn)

In [44]:
matched_sentences = matched_sentences.assign(source=source)
split_sentences = split_sentences.assign(source=source)

In [45]:
import unidecode

In [46]:
doc_arcs_dict = matched_sentences.to_dict(orient='rows')

# get HTML diffs
doc_arcs = (matched_sentences
 .merge(split_sentences, how='outer', 
              right_on=['source', 'entry_id', 'version', 'sent_idx'],
              left_on=['source', 'entry_id', 'version_x', 'sent_idx_x'] ,
  ).drop(['version', 'sent_idx'], axis=1)
 .merge(split_sentences, how='outer', 
              right_on=['source', 'entry_id', 'version', 'sent_idx'],
              left_on=['source', 'entry_id', 'version_y', 'sent_idx_y'] ,
  ).drop(['version', 'sent_idx'], axis=1) 
)

grouped_arcs = (matched_sentences
 .groupby(['source', 'entry_id', 'version_x', 'version_y'])
 .apply(lambda df: 
    df[['version_x', 'version_y', 'sent_idx_x', 'sent_idx_y',
        'avg_sentence_distance_x', 'avg_sentence_distance_y'
       ]].to_dict(orient='rows')
 )
 .to_frame('arcs')
)

split_sentences['sentence'] = split_sentences['sentence'].apply(unidecode.unidecode)
split_sentences['sentence'] = split_sentences['sentence'].str.replace('"', '\'\'')
split_sentences['sentence'] = split_sentences['sentence'].str.replace('<p>', '').str.replace('</p>', '').str.strip()

grouped_nodes = (split_sentences
 .groupby(['source', 'entry_id', 'version'])
 .apply(lambda df: df[['version', 'sent_idx', 'sentence']].to_dict(orient='rows'))
).to_frame('nodes').reset_index()

matched_grouped_nodes = (grouped_nodes
 .merge(
     grouped_nodes.assign(next_vers=lambda df: df['version'] - 1), 
     left_on=['source', 'entry_id', 'version'], 
     right_on=['source', 'entry_id', 'next_vers']
 )
 .assign(nodes=lambda df: df['nodes_x'] + df['nodes_y'])
 [['source', 'entry_id', 'version_x', 'version_y', 'nodes']]
 .set_index(['source', 'entry_id', 'version_x', 'version_y'])
)

output = (
    pd.concat([matched_grouped_nodes, grouped_arcs], axis=1)
    .to_dict(orient='index')
)

output = {str(k): v for k, v in output.items()}

import json
with open('../evaluation/data/sample_datum_small.json', 'w') as f:
    json.dump(output, f )

# Checks

In [47]:
matched_sentences.head()

Unnamed: 0,entry_id,version_x,version_y,sent_idx_x,sent_idx_y,avg_sentence_distance_x,avg_sentence_distance_y,key_x,key_y,source
0,498581,0,1,6.0,6.0,0.0,0.0,498581-0,498581-1,bbc-2
1,475648,1,2,8.0,8.0,0.0,0.0,475648-1,475648-2,bbc-2
2,498633,0,1,11.0,11.0,0.0,0.0,498633-0,498633-1,bbc-2
3,500021,5,6,12.0,,,,500021-5,500021-6,bbc-2
4,493139,0,1,3.0,3.0,0.0,0.0,493139-0,493139-1,bbc-2


In [48]:
null_sents = (
    matched_sentences
     .groupby(['entry_id', 'version_x', 'version_y'])
     [['sent_idx_x','sent_idx_y']]
    .apply(lambda df: df.isnull().sum())
    .rename(columns={'sent_idx_x': 'num_deleted', 'sent_idx_y': 'num_added'})
    #  .sort_values(ascending=False).loc[lambda s: s <4]
)

In [49]:
# null_sents.loc[lambda df: df['sent_idx_y'] > 0]

In [50]:
non_zero_sents = (
    matched_sentences
     .groupby(['entry_id', 'version_x', 'version_y'])
     [['avg_sentence_distance_x', 'avg_sentence_distance_y']]
     .mean()
)

In [51]:
doc_level_stats = pd.concat([
    null_sents, 
    non_zero_sents
], axis=1)

In [52]:
doc_level_stats.loc[lambda df: df['num_deleted'] == 3] ## choose all documents with 3 deletions
## num_deleted = number of deleted sentences 
## num_added = number of added sentences 
## avg_sentence_distance_x, avg_sentence_distance_y = overall, how different are the sentences across the document?



Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,num_deleted,num_added,avg_sentence_distance_x,avg_sentence_distance_y
entry_id,version_x,version_y,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
218743,1,2,3,1,0.000000e+00,0.000000e+00
218980,1,2,3,2,2.276138e-02,2.276138e-02
218980,2,3,3,3,2.503717e-01,2.470772e-01
219139,0,1,3,2,5.360667e-02,5.360667e-02
219146,1,2,3,0,9.434915e-08,9.434915e-08
...,...,...,...,...,...,...
1983184,1,2,3,1,6.911459e-02,6.911459e-02
1990374,0,1,3,0,0.000000e+00,0.000000e+00
1995636,0,1,3,3,1.844887e-01,1.844887e-01
1995813,1,2,3,0,3.626789e-02,3.626789e-02


In [53]:
doc_level_stats.loc[lambda df: df['avg_sentence_distance_x'] > 2e-01] ## choose all documents with 3 deletions

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,num_deleted,num_added,avg_sentence_distance_x,avg_sentence_distance_y
entry_id,version_x,version_y,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
218742,0,1,8,5,0.212504,0.209759
218761,2,3,5,3,0.288710,0.288710
218873,1,2,1,1,0.403236,0.401979
218980,2,3,3,3,0.250372,0.247077
219006,0,1,2,0,0.225663,0.223859
...,...,...,...,...,...,...
1995662,1,2,4,1,0.286460,0.248541
1995709,1,2,9,3,0.279626,0.253950
2012949,2,3,2,4,0.410263,0.410263
2018809,0,1,2,3,0.216414,0.216414


In [56]:
desired_index = doc_level_stats.loc[lambda df: df['avg_sentence_distance_x'] > 2e-01]

In [57]:
# x = 870103
# desired_index = list(filter(lambda x: x[0] == 870103, desired_index.index))

In [None]:
# merge both
merged_matched_sentences = (matched_sentences
 .merge(
    split_sentences, left_on=['entry_id', 'version_x', 'sent_idx_x'], right_on=['entry_id', 'version', 'sent_idx'],
    how='left'
 ).drop(['version', 'sent_idx', 'key', 'key_x', 'key_y'], axis=1)
 .merge(
    split_sentences, left_on=['entry_id', 'version_y', 'sent_idx_y'], right_on=['entry_id', 'version', 'sent_idx'],
    how='left'
 ).drop(['version', 'sent_idx', 'key',], axis=1)
)

In [None]:
# check how many additions there are
merged_matched_sentences.loc[lambda df: df['sent_idx_x'].isnull()]

# Launch to MTurk

In [121]:
from tqdm.auto import tqdm
import boto3
import os
from boto.mturk.connection import MTurkConnection
from boto.mturk.question import HTMLQuestion
import pandas as pd 
from boto.mturk.question import ExternalQuestion
from IPython.display import display, HTML
import matplotlib.pyplot as plt
plt.rc('font', size=14)
from importlib import reload
from boto.mturk.qualification import (
    Qualifications,
    PercentAssignmentsApprovedRequirement, 
    NumberHitsApprovedRequirement
)

import sys
sys.path.insert(0, '../evaluation/')

import mturk as mturk
import mturk.utils_mturk as um

from importlib import reload
reload(um)
reload(mturk)

<module 'mturk' (namespace)>

In [122]:
print(os.environ['AWS_ACCESS_KEY_ID'])
print(os.environ['AWS_SECRET_ACCESS_KEY'])

VN2M29BH4PCAJT9ABZKN
O9bcBpaprrXr6Q3dorn0XYI4Kp8go6oBDBYFYqeD


In [140]:
from importlib import reload
reload(um)

# env = 'sandbox'
env = 'production'
mturk = um.MTurkHandler(environment=env) #=production/sandbox

In [None]:
mturk.bonus_worker('A3PRQ2GSU42718', '3B837J3LDS7C1HXPYYW3VZCJESSSRO', '3.50')

In [148]:
ass = mturk.client.list_assignments_for_hit(HITId=created_hits[0]['HIT']['HITId'])

In [62]:
CUSTOM_QUALIFICATION = False

In [63]:
worker_requirements = [
    ### number of hits approved
    {
        'QualificationTypeId': '000000000000000000L0',
        'Comparator': 'GreaterThanOrEqualTo',
        'IntegerValues': [80],
    },
    ## worker local
    {
        'QualificationTypeId': '00000000000000000071',
        'Comparator': 'EqualTo',
        'LocaleValues': [{
            "Country":"US",
        }],
        'RequiredToPreview': True,
    },
    ## percent assignments approved
    {
        'QualificationTypeId': '000000000000000000L0',
        'Comparator': 'GreaterThanOrEqualTo',
        'IntegerValues': [90],
    },
]

## custom qualification
if CUSTOM_QUALIFICATION:
    worker_requirements = []
    if env == 'production':
        worker_requirements.append({
            'QualificationTypeId': '3WZ6PU0JYXSTA4EIPF2M6S1CMZ7KL8',
            'Comparator': 'GreaterThanOrEqualTo',
            'IntegerValues': [90],      
        })
    else:
        worker_requirements.append({
            'QualificationTypeId': '381R35RGJFFV6VLBBDX2MLZFNSH414',  ## UCLA students
            #'3FQWXCP5BDC6A66PD20NE8FM4G3H44' other workers
            'Comparator': 'GreaterThanOrEqualTo',
            'IntegerValues': [90],      
        })

In [118]:
from jinja2 import Template, Environment, FileSystemLoader
import datetime
from IPython.display import display, HTML
jinja_env = Environment(loader=FileSystemLoader('../evaluation/templates'))

In [119]:
template = jinja_env.get_template('match-sentences-from-scratch.html')
created_hits = []

In [120]:
import unidecode
def clean_temp(node):
    node['sentence'] =     node['sentence'].replace('.p T', ' -- t')
    node['sentence'] =     unidecode.unidecode(node['sentence'])
    return node
    
# data['nodes'] = list(map(clean_temp, data['nodes']))

In [139]:
for i, idx in enumerate(desired_index.sample(2).index):
    sample_key = str(tuple([source] + list(idx)))

    ## make the HTML 
    data = output[sample_key]
    html = template.render(
        data=data,
        doc_id=sample_key,
        do_mturk=True,
        production=env=='production',
        start_time=datetime.datetime.now()
    )

    ## dump to disk for inspection
    with open('../evaluation/mturk/templated-question-example-%s.html' % i, 'w') as f:
        f.write(html)

    if True:
        ## HTMLQuestion HIT
        description = '''
            We\'d like to match sentences from two edited versions of the same article. 
            Help us by drawing lines to connect blocks of text.
        '''
        title = 'Annotate some news article edits v8'
        new_hit = mturk.client.create_hit(
            Title = title,
            Description = description,
            Keywords = 'text, highlighting',
            Reward = '0.4',
            MaxAssignments = 1,
            LifetimeInSeconds = 17280000,
            AssignmentDurationInSeconds = 600000,
            AutoApprovalDelayInSeconds = 28800,
            Question = html,
            QualificationRequirements=worker_requirements #if env == 'production' else []
        )
        created_hits.append(new_hit)

In [150]:
mkdir hits_cache

In [151]:
import pickle

In [142]:
with open('hits_cache/2021-10-22__first-round.pkl', 'rb') as f:
    created_hits = pickle.load(f)

In [153]:
with open('hits_cache/2021-10-22__first-round.pkl', 'wb') as f:
    pickle.dump(created_hits, f)

In [174]:
with open('hits_cache/2021-10-22__first-round-sandbox.pkl', 'wb') as f:
    pickle.dump(created_hits, f)

In [320]:
with open('hits_cache/2021-10-23__second-round-sandbox.pkl', 'wb') as f:
    pickle.dump(created_hits, f)

In [365]:
with open('hits_cache/2021-10-23__third-round-sandbox.pkl', 'wb') as f:
    pickle.dump(created_hits, f)

In [367]:
with open('hits_cache/2021-10-23__fourth-round-sandbox.pkl', 'wb') as f:
    pickle.dump(created_hits, f)

In [454]:
with open('hits_cache/2021-10-23__fifth-round-sandbox.pkl', 'wb') as f:
    pickle.dump(created_hits, f)

In [482]:
with open('hits_cache/2021-10-23__sixth-round-sandbox.pkl', 'wb') as f:
    pickle.dump(created_hits, f)

In [521]:
with open('hits_cache/2021-10-23__seventh-round-sandbox.pkl', 'wb') as f:
    pickle.dump(created_hits, f)

In [523]:
with open('hits_cache/2021-10-23__eigth-round-sandbox.pkl', 'wb') as f:
    pickle.dump(created_hits, f)

In [33]:
import pickle
with open('hits_cache/2021-10-24__ninth-round-sandbox.pkl', 'wb') as f:
    pickle.dump(created_hits, f)

In [68]:
import pickle
with open('hits_cache/2021-10-24__tenth-round-sandbox.pkl', 'wb') as f:
    pickle.dump(created_hits, f)

In [211]:
with open('hits_cache/2021-10-22__second-round.pkl', 'wb') as f:
    pickle.dump(created_hits, f)

In [240]:
with open('hits_cache/2021-10-22__third-round.pkl', 'wb') as f:
    pickle.dump(created_hits, f)

# Get Data from MTurk

In [160]:
import glob
all_created_hits = []
files = glob.glob('hits_cache/*.pkl')
files = list(filter(lambda x: 'sandbox' not in x and 'answers' not in x, files))
for round_pkl in files:
    with open(round_pkl, 'rb') as f:
        created_hits = pickle.load(f)
        all_created_hits += created_hits

In [162]:
len(all_created_hits)

35

In [163]:
hit_ids = list(map(lambda x: x['HIT']['HITId'], all_created_hits))

In [164]:
len(hit_ids)

35

In [165]:
len(list(set(hit_ids)))

35

In [166]:
answers, answer_df = mturk.get_answers_for_hit_list(set(hit_ids))

  0%|          | 0/35 [00:00<?, ?it/s]

In [169]:
t = 'A3PRQ2GSU42718'
answer_df.loc[lambda df: df['worker_id'] == t]

Unnamed: 0,doc_id,question_class,sent_idx_x,sent_idx_y,version_x,version_y,worker_id,assignment_id,hit_id,time_delta


In [125]:
len(answers)

193

In [130]:
doc_ids = list(map(lambda x: x['doc_id'], answers))

In [132]:
len(set(doc_ids))

190

In [137]:
answer_df.drop_duplicates().to_csv('hits_cache/2021-10-24__all-answers-df.csv', index=False)

In [138]:
with open('hits_cache/2021-10-24__all-answers.pkl', 'wb') as f :
    pickle.dump(answers, f)

In [337]:
answer_df.to_csv('hits_cache/2021-10-23__first-second-round-sandbox-results.csv')

In [390]:
answer_df.to_csv('hits_cache/2021-10-23__third-fourth-round-sandbox-results.csv')

In [491]:
answer_df.to_csv('hits_cache/2021-10-23__fifth-sixth-round-sandbox-results.csv')

In [534]:
answer_df.to_csv('hits_cache/2021-10-23__seventh-eigth-round-sandbox-results.csv')

In [39]:
answer_df.to_csv('hits_cache/2021-10-23__ninth-round-sandbox-results.csv')

In [75]:
answer_df.to_csv('hits_cache/2021-10-23__tenth-round-sandbox-results.csv')

In [387]:
answer_df['doc_id'].drop_duplicates()

(40,)

In [None]:
answer_df['doc_id'].value_counts()

In [83]:
answer_dfs = []
answers = []
for hit_id in tqdm(hit_ids):
    ##
    assignmentsList = mturk.client.list_assignments_for_hit(
        HITId=hit_id,
        # AssignmentStatuses=['Submitted', 'Approved'],
        MaxResults=10
    )
    assignments = assignmentsList['Assignments']
    assignments_submitted_count = len(assignments)
    print(assignments_submitted_count)
    if assignments_submitted_count > 0:
        break 
    for assignment in assignments:
        # Retreive the attributes for each Assignment
        answer_dict = xmltodict.parse(assignment['Answer'])
        answer = json.loads(answer_dict['QuestionFormAnswers']['Answer'][1]['FreeText'])
        answers.append(answer)
        # formatted 
        answer_df = pd.DataFrame(answer['annotated connections'])
        answer_df['worker_id'] = assignment['WorkerId']
        answer_df['assignment_id'] = assignment['AssignmentId']
        answer_df['hit_id'] = assignment['HITId']
        answer_df['time_delta'] = assignment['SubmitTime'] - assignment['AcceptTime']
        answer_dfs.append(answer_df)

  0%|          | 0/30 [00:00<?, ?it/s]

1


In [231]:
for assignment in assignments:
    # Retreive the attributes for each Assignment
    answer_dict = xmltodict.parse(assignment['Answer'])
    answer = json.loads(answer_dict['QuestionFormAnswers']['Answer'][1]['FreeText'])

In [232]:
answer

{'doc_id': "('nyt', 752766, 1, 2)",
 'annotated connections': [],
 'nodes': [{'sent_idx': 0,
   'sentence': 'Crimea will vote on Sunday in a ballot referendum that leaders of the regional Parliament expect will ratify their decision to break away from Ukraine and become part of Russia.',
   'version': 1},
  {'sent_idx': 1,
   'sentence': 'The referendum will offer two choices, neither one of them “No”: 1) “Are you in favor of the reunification of Crimea with Russia as a part of the Russian Federation?” 2) “Are you in favor of restoring the 1992 Constitution and the status of Crimea as a part of Ukraine?” Voters will have to mark one option affirmatively, but they cannot vote for the status quo.',
   'version': 1},
  {'sent_idx': 2,
   'sentence': 'A return to the 1992 Constitution — adopted after the Soviet collapse but quickly thrown out by the post-Soviet Ukraine — would effectively provide for Crimea’s independence, while remaining part of Ukraine.',
   'version': 1},
  {'sent_idx':

In [282]:
hits = mturk.client.list_hits(MaxResults=100)

In [283]:
hits_df = pd.DataFrame(hits['HITs'])

In [284]:
hits_df['CreationTime'] = pd.to_datetime(hits_df['CreationTime'])
import datetime

In [285]:
hits_df['CreationTime']

0    2021-10-22 20:05:44-07:00
1    2021-10-22 20:05:43-07:00
2    2021-10-22 20:05:42-07:00
3    2021-10-22 20:05:41-07:00
4    2021-10-22 20:05:40-07:00
                ...           
93   2021-05-13 12:39:03-07:00
94   2021-05-13 12:39:03-07:00
95   2021-05-13 12:39:02-07:00
96   2021-05-13 12:39:01-07:00
97   2021-05-13 12:39:00-07:00
Name: CreationTime, Length: 98, dtype: datetime64[ns, tzlocal()]

In [260]:
hits_df.loc[lambda df: df['CreationTime']  > datetime.datetime(2021, 10, 1, tzinfo=datetime.tzinfo)]

TypeError: Cannot compare tz-naive and tz-aware datetime-like objects

In [181]:
from importlib import reload

In [183]:
reload(um)

<module 'mturk.utils_mturk' from '../evaluation/mturk/utils_mturk.py'>

In [92]:
list(filter(lambda x: x['version'] == 1 and x['sent_idx'] == 0, data['nodes']))

[{'version': 1,
  'sent_idx': 0,
  'sentence': 'TAIPEI, Taiwan — A man fatally stabbed six nurses and a manager on Thursday at a Chinese military hospital in a seaside area famous for gatherings of Communist Party leaders, police officials in China said.'}]

In [97]:
t1 = "TAIPEI, Taiwan — Police officials in China said Thursday that they arrested a man suspected of stabbing to death six nurses and a manager at a Chinese military sanitarium in a seaside area famous for gatherings of Communist Party leaders."
t2 = "TAIPEI, Taiwan — A man fatally stabbed six nurses and a manager on Thursday at a Chinese military hospital in a seaside area famous for gatherings of Communist Party leaders, police officials in China said."

t1 = 'ZABUL, Afghanistan — A complex attack on a military base in southern Afghanistan Monday left at least one member of U.S.-led coalition forces dead.'
t2 = 'ZABUL, Afghanistan — A complex attack on a military base in southern Afghanistan Monday killed at least one member of the U.S.-led coalition forces.'

In [98]:
import sys
sys.path.insert(0, '../')
import util.util_newssniffer_parsing as unp

In [99]:
unp.html_compare_sentences(t1, t2)

('ZABUL , Afghanistan — A complex attack on a military base in southern Afghanistan Monday <span style="background-color:rgba(255,0,0,0.3)">left</span>  at least one member of  U.S.-led coalition forces <span style="background-color:rgba(255,0,0,0.3)">dead</span> .',
 'ZABUL , Afghanistan — A complex attack on a military base in southern Afghanistan Monday  <span style="background-color:rgba(0,255,0,0.3)">killed </span> at least one member of <span style="background-color:rgba(0,255,0,0.3)">the </span> U.S.-led coalition forces  .')

In [101]:
t1= 'The police said the suspect in the killings Thursday morning in Beidaihe, a resort area in the Hebei Province city of Qinhuangdao, is a sanitarium employee, indicating the attack was likely not the result of a dispute over a patient’s treatment-- the suspect, a 27-year-old man, had no known conflict with the victims, who were killed in a women’s dormitory at the hospital, the Qinhuangdao City Public Security Bureau said in a statement online.'

t2 = 'The suspect, a 27-year-old hospital employee who had no known motive, was arrested after the attacks, the Qinhuangdao City Public Security Bureau said in a statement online.'
t3 = 'The stabbings took place Thursday morning in Beidaihe, a resort area in the Hebei Province city of Qinhuangdao, about 180 miles east of Beijing.'
t4 = 'But the fact that the suspect in the Beidaihe case is an employee indicates that the attack was probably not the result of a dispute over a patient’s treatment.'
t5 = 'The victims were killed in a women’s dormitory at the hospital for elderly military personnel, also known as People’s Liberation Army Hospital 281, the statement said.'

In [108]:
import pyperclip
pyperclip.copy(unp.html_compare_sentences(t1, t5)[1])