# Select Data for Annotation

In [1]:
import sys
sys.path.insert(0, '..')
from util import util_refactorings as ur
import sqlite3
import pandas as pd 
import os 

db_filename = '../data/diffengine-diffs/spark-output/nyt-matched-sentences.db'

if not os.path.exists(db_filename):
    db_zip = db_filename + '.gz'
    ! gunzip $db_zip

conn = sqlite3.connect(db_filename)
pd.read_sql('''SELECT 
                    name
                FROM 
                    sqlite_master 
                WHERE 
                    type ='table' AND 
                    name NOT LIKE 'sqlite_%';
''', con=conn)

Unnamed: 0,name
0,matched_sentences
1,split_sentences


In [2]:
# Pick the (entry_id, version) pairs whose sentence count c satisfies 5 < c < 10.
low_count_query = '''
    with c1 as 
        (SELECT entry_id, version, COUNT(1) as c from split_sentences GROUP BY entry_id, version)
    SELECT entry_id, version from c1
    WHERE c < 10 and c > 5
'''
low_count_versions = pd.read_sql(low_count_query, con=conn)

# Comma-separated list of the distinct entry ids.
low_count_entry_ids = ', '.join(
    str(entry_id) for entry_id in low_count_versions['entry_id'].unique()
)

# Build "<entry_id>-<version>" keys and render them as a quoted SQL IN-list.
version_keys = (
    low_count_versions['entry_id'].astype(str)
    + '-'
    + low_count_versions['version'].astype(str)
)
joint_keys = "'" + "', '".join(version_keys.tolist()) + "'"

# Matched sentence pairs where both the x and the y version are in the key list.
matched_query = '''
    WITH c1 as ( 
    SELECT *, 
    entry_id || '-' || version_x as key_x,
    entry_id || '-' || version_y as key_y 
    FROM matched_sentences 
    )
    SELECT *
    FROM c1
    WHERE key_x in (%s) AND key_y  in (%s)
    ''' % (joint_keys, joint_keys)
matched_sentences = pd.read_sql(matched_query, con=conn)

# Individual sentences belonging to the selected versions.
split_query = '''
    with c1 AS (
        SELECT *, entry_id || '-' || version as key FROM split_sentences
    )
    SELECT * from c1
    WHERE key IN (%s)
''' % joint_keys
split_sentences = pd.read_sql(split_query, con=conn)

In [4]:
import json

# One dict per matched-sentence row.
# NOTE: 'records' is the real orient name; the former 'rows' only worked through
# pandas' old prefix matching and raises ValueError on pandas >= 2.0.
doc_arcs_dict = matched_sentences.to_dict(orient='records')

# Attach the split_sentences row for the x side and then the y side of every arc.
# Outer joins keep sentences that have no counterpart on the other side.
# The helper 'key' column from split_sentences is dropped after each merge:
# otherwise the second merge would suffix it into 'key_x'/'key_y', colliding with
# the columns of the same name that matched_sentences already carries.
doc_arcs = (matched_sentences
 .merge(split_sentences, how='outer', 
              right_on=['entry_id', 'version', 'sent_idx'],
              left_on=['entry_id', 'version_x', 'sent_idx_x'] ,
  ).drop(['version', 'sent_idx', 'key'], axis=1)
 .merge(split_sentences, how='outer', 
              right_on=['entry_id', 'version', 'sent_idx'],
              left_on=['entry_id', 'version_y', 'sent_idx_y'] ,
  ).drop(['version', 'sent_idx', 'key'], axis=1) 
)

# For each (entry_id, version_x, version_y): the list of arc records.
grouped_arcs = (matched_sentences
 .groupby(['entry_id', 'version_x', 'version_y'])
 .apply(lambda df: df[['version_x', 'version_y', 'sent_idx_x', 'sent_idx_y']].to_dict(orient='records'))
 .to_frame('arcs')
)

# For each (entry_id, version): the list of sentence records.
grouped_nodes = (split_sentences
 .groupby(['entry_id', 'version'])
 .apply(lambda df: df[['version', 'sent_idx', 'sentence']].to_dict(orient='records'))
).to_frame('nodes').reset_index()

# Pair every version v (left, *_x) with version v + 1 (right, *_y) of the same
# entry by joining on next_vers = version - 1, then concatenate both node lists.
matched_grouped_nodes = (grouped_nodes
 .merge(
     grouped_nodes.assign(next_vers=lambda df: df['version'] - 1), 
     left_on=['entry_id', 'version'], 
     right_on=['entry_id', 'next_vers']
 )
 .assign(nodes=lambda df: df['nodes_x'] + df['nodes_y'])
 [['entry_id', 'version_x', 'version_y', 'nodes']]
 .set_index(['entry_id', 'version_x', 'version_y'])
)

# Align nodes and arcs on the (entry_id, version_x, version_y) index.
output = (
    pd.concat([matched_grouped_nodes, grouped_arcs], axis=1)
    .to_dict(orient='index')
)

# JSON object keys must be strings, so stringify the index tuples.
output = {str(k): v for k, v in output.items()}

with open('../evaluation/data/sample_datum_small.json', 'w') as f:
    json.dump(output, f )

# Checks

In [5]:
# Join the sentence rows onto both sides (x, then y) of every matched pair.
split_join_cols = ['entry_id', 'version', 'sent_idx']

merged_matched_sentences = (
    matched_sentences
    # x side: bring in the split_sentences row for (version_x, sent_idx_x)
    .merge(
        split_sentences,
        how='left',
        left_on=['entry_id', 'version_x', 'sent_idx_x'],
        right_on=split_join_cols,
    )
    # discard join helpers so the next merge does not clash on the key columns
    .drop(columns=['version', 'sent_idx', 'key', 'key_x', 'key_y'])
    # y side: bring in the split_sentences row for (version_y, sent_idx_y)
    .merge(
        split_sentences,
        how='left',
        left_on=['entry_id', 'version_y', 'sent_idx_y'],
        right_on=split_join_cols,
    )
    .drop(columns=['version', 'sent_idx', 'key'])
)

In [7]:
# Show the rows with no sent_idx_x, which this notebook treats as additions.
has_no_source_sentence = merged_matched_sentences['sent_idx_x'].isna()
merged_matched_sentences[has_no_source_sentence]

Unnamed: 0,entry_id,version_x,version_y,sent_idx_x,sent_idx_y,avg_sentence_distance_x,avg_sentence_distance_y,sentence_x,sentence_y
24,1651691,0,1,,6.0,,,,There has not been a proclamation about Mr. Mc...
42,1650749,0,1,,1.0,,,,A 55-year-old woman came forward to the police...
155,1165597,0,1,,3.0,,,,There was no immediate claim of responsibility...
245,1322807,1,2,,7.0,,,,The whale stranding was the largest in the cou...
307,1598136,3,4,,1.0,,,,"In each newsletter, our gender writer, Maya Sa..."
...,...,...,...,...,...,...,...,...,...
9672,1450768,0,1,,4.0,,,,"Zhaira Franco, 35, who works for Facebook in s..."
9676,1450768,0,1,,6.0,,,,An aftershock with a magnitude of 5.7 and an e...
9698,1223039,0,1,,7.0,,,,Schlafly’s obituary will be posted soon.
9709,1223039,0,1,,6.0,,,,” A full version of Mrs.
