# Generate Data to Score

In [1]:
import sys

In [2]:
sys.path.insert(0, '../eda/EventPlus/component/BETTER/joint/')

In [3]:
from event_pipeline_demo import BETTER_API
betterAPI = BETTER_API('../eda/EventPlus/component/BETTER/joint')

In [None]:
!pip uninstall -y transformers
!pip install -y transformers==3.5

In [None]:
from transformers import AutoTokenizer
import spacy

In [None]:
nlp = spacy.load('en_core_web_sm')

In [30]:
text = "The boy went outside."
text_tokenized = [i.text for i in nlp(text)]

In [None]:
betterAPI.pred(text_tokenized)

In [None]:
ls ../eda/EventPlus/component/BETTER/joint

In [None]:
import json
# Read the saved model arguments; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open('../eda/EventPlus/component/BETTER/joint/saved_args.json') as saved_args_file:
    saved_args = json.load(saved_args_file)
saved_args

In [1]:
import sqlite3
import pandas as pd 
import glob

In [2]:
files = glob.glob('../data/diffengine-diffs/spark-output/*')

In [4]:
all_matched_dfs = []

In [3]:
# Open the first spark-output database for interactive exploration.
f = files[0]
print(f)
# Source name = file-name prefix before the first '-'.
source = f.split('/')[-1].split('-')[0]
# Decompress in place through the shell when the path contains '.gz'.
if '.gz' in f:
    ! gunzip $f
# Drop '.gz' so the path names the decompressed file that sqlite3 opens.
f = f.replace('.gz', '')
conn = sqlite3.connect(f)

../data/diffengine-diffs/spark-output/ap-matched-sentences.db.gz


In [13]:
%%timeit
# Benchmark: fetch the matched-sentence pairs, build their keys in pandas, then
# filter split_sentences with an IN (...) list.
# Matched pairs where both sides exceed the 0.1 average-distance threshold.
matched_df = pd.read_sql('''
    SELECT entry_id, version_x, version_y, sent_idx_x, sent_idx_y
        FROM matched_sentences m
    WHERE (m.avg_sentence_distance_x > 0.1) AND (m.avg_sentence_distance_y > 0.1)
''', con=conn)

# Stack the x-side and y-side (entry_id, version, sent_idx) triples via an outer merge.
unique_v = (matched_df[['entry_id', 'version_x', 'sent_idx_x']]
 .rename(columns={'version_x': 'version', 'sent_idx_x': 'sent_idx'})
 .merge(right=matched_df[['entry_id', 'version_y', 'sent_idx_y']] 
                .rename(columns={'version_y': 'version', 'sent_idx_y': 'sent_idx'}),
        left_on=['entry_id', 'version', 'sent_idx'],
        right_on=['entry_id', 'version', 'sent_idx'],
        how='outer'
       )
)

# One 'entry_id-version-sent_idx' string per row.
keys = unique_v.apply(lambda x: '%s-%s-%s' % (x['entry_id'], x['version'], x['sent_idx']), axis=1)

# NOTE(review): the keys are interpolated into IN (...) without quotes, so SQLite
# appears to parse each one as arithmetic (a-b-c) rather than as text, and the text
# `key` column would then never match. Quoting alone may not be enough either: the
# Python rendering of each number must equal SQLite's `||` rendering — confirm
# against the table's column types.
sentences = pd.read_sql('''
    WITH c1 as ( 
        SELECT *, 
        entry_id || '-' || version || '-' || sent_idx as key
        FROM split_sentences
        )
        SELECT *
        FROM c1
        WHERE key in (%s)
''' % ', '.join(keys), con=conn)

6.17 s ± 115 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
# Collect, inside SQLite, the (entry_id, version, sent_idx) triples from both
# sides of every matched pair above the 0.1 distance threshold; UNION removes
# duplicate triples.
edited_sentence_keys_query = '''
    with c1 as (
        SELECT entry_id, version_x, version_y, sent_idx_x, sent_idx_y
        FROM matched_sentences m
        WHERE (m.avg_sentence_distance_x > 0.1) AND (m.avg_sentence_distance_y > 0.1)
    )
        SELECT entry_id, version_x as version, sent_idx_x as sent_idx FROM c1
        UNION
        SELECT entry_id, version_y as version, sent_idx_y as sent_idx FROM c1
        '''
sents = pd.read_sql(edited_sentence_keys_query, con=conn)

In [24]:
all_sents = []

In [None]:
rand() <= .3

In [48]:
# Per-source totals, keyed by source name, used to derive sampling fractions.
total = pd.Series(
    dict(
        ap=186667,
        independent=427339,
        bbc=1830932,
        guardian=1839144,
        nyt=1288130,
        reuters=448025,
        wp=391419,
    )
)

In [52]:
total

ap              186667
independent     427339
bbc            1830932
guardian       1839144
nyt            1288130
reuters         448025
wp              391419
dtype: int64

In [54]:
# Sampling fraction per source: 80,000 divided by that source's total.
sample_weights = 80_000 / total

In [63]:
# Compressed databases still to be downsampled, one per listed source.
files = [
    '../data/diffengine-diffs/spark-output/%s-matched-sentences.db.gz' % outlet
    for outlet in ('guardian', 'nyt', 'reuters', 'wp')
]

In [None]:
downsampled_sents = []

In [64]:
for f in files:
    print(f)
    source = f.split('/')[-1].split('-')[0]
    if '.gz' in f:
        ! gunzip $f
    f = f.replace('.gz', '')
    conn = sqlite3.connect(f)
    w = sample_weights[source]
    print('fetching sql')
    sents = pd.read_sql('''
        with c1 as (
            SELECT entry_id, version_x, version_y, sent_idx_x, sent_idx_y
            FROM matched_sentences m
            WHERE (m.avg_sentence_distance_x > 0.1) AND (m.avg_sentence_distance_y > 0.1)
            AND RANDOM() <= %s
        ), c2 as (
            SELECT entry_id, version_x as version, sent_idx_x as sent_idx FROM c1
            UNION
            SELECT entry_id, version_y as version, sent_idx_y as sent_idx FROM c1
        )
        SELECT s.* from split_sentences s
        JOIN c2 ON s.entry_id = c2.entry_id AND s.version=c2.version AND s.sent_idx = c2.sent_idx
    ''' % w, con=conn)
    print('parsing')
    sents['sentence'] = sents['sentence'].str.replace('<p>', '').str.replace('</p>', '').str.replace('\s+', ' ').str.strip()
    sents['source'] = source
    print(sents.shape)
    downsampled_sents.append(sents)
    ! gzip $f

../data/diffengine-diffs/spark-output/guardian-matched-sentences.db.gz
fetching sql
parsing




(940028, 5)
../data/diffengine-diffs/spark-output/nyt-matched-sentences.db.gz
fetching sql
parsing
(665271, 5)
../data/diffengine-diffs/spark-output/reuters-matched-sentences.db.gz
fetching sql
parsing
(236551, 5)
../data/diffengine-diffs/spark-output/wp-matched-sentences.db.gz
fetching sql
parsing
(202014, 5)


In [65]:
all_downsampled = pd.concat(downsampled_sents)

In [73]:
all_downsampled.to_csv('../eda/downsampled-event-sents.csv')

In [71]:
all_downsampled[['entry_id', 'version', 'source']].drop_duplicates()['source'].value_counts()

bbc            425763
guardian       371732
nyt            220470
independent     86216
reuters         68859
wp              50004
ap              33582
Name: source, dtype: int64

In [27]:
all_sents_df = pd.concat(all_sents)

In [30]:
ls ../eda/

[34mEventPlus[m[m/           edit_scores.txt
[34mcomponent[m[m/           wp_edits_scores.pkl


In [35]:
docs = all_sents_df[['source', 'entry_id', 'version']].drop_duplicates()

In [41]:
source_weight = docs['source'].value_counts().pipe(lambda s: 1/s)

In [44]:
source_weight.to_frame('weight')

Unnamed: 0,weight
bbc,2e-06
guardian,2e-06
nyt,4e-06
independent,9e-06
reuters,1.3e-05
wp,1.7e-05
ap,2.4e-05


In [45]:
# Draw 100,000 documents with probability inversely proportional to the size
# of their source. The `weight` column is merged in only for this purpose, so
# it has to be handed to sample(); without it the draw is uniform over rows and
# large sources dominate. A fixed seed makes the draw repeatable.
(docs
 .merge(source_weight.to_frame('weight'), right_index=True, left_on='source')
 .sample(100000, weights='weight', random_state=0)
)

Unnamed: 0,source,entry_id,version,weight
0,ap,16063,5.0,0.000024
1,ap,17769,1.0,0.000024
5,ap,17521,4.0,0.000024
14,ap,17516,3.0,0.000024
15,ap,19676,1.0,0.000024
...,...,...,...,...
391389,wp,1120215,5.0,0.000017
391394,wp,1377691,2.0,0.000017
391397,wp,1110888,3.0,0.000017
391398,wp,1944987,0.0,0.000017


In [31]:
all_sents_df.to_csv('../eda/events_sentences.csv')

In [76]:
import sys
sys.path.insert(0, '../util/')
import util_newssniffer_parsing as unp
from IPython.display import HTML, display

In [77]:
idx = 10
s1 = matched_df.iloc[idx]['sentence_x']
s2 = matched_df.iloc[idx]['sentence_y']

In [78]:
matched_df.iloc[idx][['avg_sentence_distance_x', 'avg_sentence_distance_y']]

avg_sentence_distance_x    0.305947
avg_sentence_distance_y    0.305947
Name: 10, dtype: object

In [79]:
h1, h2 = unp.html_compare_sentences(s1, s2)

In [83]:
s1

'Since the coup attempt, at least 100 news outlets have been forcibly closed in what Human Rights Watch has called a clampdown that has "all but silenced independent media." Yet Bozdag insisted any journalists in prison were there for drugs, trespassing or for "propagandizing for terrorist organizations."</p> <p>Turkish leaders have expressed exasperation that they are lambasted for the steps they took after the coup while France gets a "pass" for the state of emergency imposed after the 2015 Paris attacks.'

In [84]:
s2

'Since the failed coup, at least 100 news outlets have been forcibly closed in a clampdown Human Rights Watch says has "all but silenced independent media." Yet Bozdag insisted any journalists in prison were there for drugs, trespassing or for "propagandizing for terrorist organizations."</p> <p>Turkish leaders have expressed exasperation that they are lambasted for the steps they took after the coup while France gets a "pass" for the state of emergency imposed after the 2015 Paris attacks.'

In [80]:
display(HTML(h1))

In [81]:
display(HTML(h2))

# Analyze data

In [112]:
# Result pickles to fetch from pluslab: 'events-output-<shard>-<part>.pkl' for
# every part of each shard. The list is generated instead of pasted by hand:
# the pasted version contained a name with a trailing space, which can never
# match a directory listing and was therefore always re-downloaded.
# Sorting the part numbers by '<part>pkl' reproduces the order of the original
# pasted listing (0, 10, ..., 19, 1, 20, ...).
pluslab = [
    'events-output-%s-%d.pkl' % (shard, part)
    for shard, n_parts in [('0-1', 62), ('0-8', 62), ('1-8', 67), ('2-8', 1), ('3-8', 20)]
    for part in sorted(range(n_parts), key=lambda i: '%dpkl' % i)
]

In [126]:
# Non-edited result pickles: parts 0-19 of shards '0-2' and '1-2', in the same
# order as a directory listing (0, 10, ..., 19, 1, 2, ..., 9).
non_edited_events = [
    'non-edited-events-output-%s-%d.pkl' % (shard, part)
    for shard in ('0-2', '1-2')
    for part in sorted(range(20), key=lambda i: '%dpkl' % i)
]

In [111]:
# Result pickles to fetch from pluslab02: 'events-output-<shard>-<part>.pkl'
# for every part of each shard. Generated rather than pasted by hand: the
# pasted list contained two names with a trailing space, which never match a
# directory listing and were therefore always re-downloaded.
# Sorting the part numbers by '<part>pkl' reproduces the order of the original
# pasted listing (0, 10, ..., 19, 1, 20, ...).
plus_lab_02 = [
    'events-output-%s-%d.pkl' % (shard, part)
    for shard, n_parts in [('4-8', 69), ('5-8', 7), ('6-8', 62)]
    for part in sorted(range(n_parts), key=lambda i: '%dpkl' % i)
]

In [3]:
from tqdm.auto import tqdm

In [122]:
import os 
already_retrieved = os.listdir('../eda/output/')

In [123]:
len(already_retrieved)

280

In [None]:
for e in tqdm(pluslab):
    if e not in already_retrieved:
        ! sshpass -p "Pica_pic0Pica_pic0"  \
             scp -r spangher@pluslab.isi.edu:/lfs1/spangher/EventPlus/component/BETTER/joint/$e ../eda/output/

In [None]:
for e in tqdm(non_edited_events):
    if e not in already_retrieved:
        ! sshpass -p "Pica_pic0Pica_pic0"  \
             scp -r spangher@pluslab.isi.edu:/lfs1/spangher/EventPlus/component/BETTER/joint/$e ../eda/output-non-edited/

In [47]:
for e in tqdm(plus_lab_02):
    if e not in already_retrieved:
        ! sshpass -p "Pica_pic0Pica_pic0"  \
             scp -r spangher@pluslab02.isi.edu:/lfs1/spangher/EventPlus/component/BETTER/joint/$e ../eda/output/

  0%|          | 0/82 [00:00<?, ?it/s]

# Events in Edited Sentences

In [1]:
import glob
import pickle
import pandas as pd 

In [5]:
# Load every 'event-sents*' pickle from ../eda/output/.
# glob returns paths in arbitrary order; sorting keeps the order of the frames
# (and of anything concatenated from them) stable between runs.
# NOTE: unpickling can execute arbitrary code — only load files produced by
# our own pipeline.
all_event_s = [
    pd.read_pickle(path)
    for path in sorted(glob.glob('../eda/output/event-sents*'))
]

In [6]:
all_event_edited_sents = pd.concat(all_event_s)

In [23]:
# reset_index() exposes the index levels as level_0, level_1, ... columns; the
# data is then re-indexed on the first two of them and grouped by that pair.
# Presumably level_0 / level_1 hold the source and the entry id (see the
# get_group(('nyt', 2038157)) lookup further down) — TODO confirm.
grouped = (
    all_event_edited_sents
     .reset_index()
     .set_index(['level_0', 'level_1'])
     .groupby(level=[0,1])
)

In [22]:
t.apply(lambda x: list(set(x))).loc[lambda s: s.str.len() > 1]

level_0  level_1
ap       130                       [9.0, 2.0]
         688                      [10.0, 4.0]
         732                     [26.0, 14.0]
         1023                     [1.0, 15.0]
         1362                     [5.0, 15.0]
                              ...            
nyt      2038157                   [0.0, 1.0]
         2038190    [0.0, 1.0, 2.0, 3.0, 4.0]
         2038231                   [0.0, 1.0]
         2038254                   [1.0, 2.0]
         2038288                   [0.0, 1.0]
Name: level_2, Length: 156090, dtype: object

In [31]:
df = grouped.get_group(('nyt', 2038157))

In [32]:
def unstack_df(df):
    """Unfinished stub: looks up ``df['level_2']``, discards it, and returns None."""
    df['level_2']

Unnamed: 0_level_0,Unnamed: 1_level_0,level_2,level_3,0
level_0,level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
nyt,2038157,0.0,47,"[{'tokens': ['“', 'Never', 'allow', 'singing',..."
nyt,2038157,0.0,49,"[{'tokens': ['”', 'In', 'the', 'case', 'of', '..."
nyt,2038157,1.0,47,"[{'tokens': ['“', 'Never', 'allow', 'singing',..."
nyt,2038157,1.0,49,"[{'tokens': ['”', 'In', 'the', 'case', 'of', '..."


In [67]:
to_get = glob.glob('../data/diffengine-diffs/spark-output/*')

In [81]:
all_matched_events = []

In [86]:
'guardian' in all_event_edited_sents

True

In [87]:
# For every source database, pair the extracted events of the two sentences in
# each matched (x, y) sentence pair and collect the result per source.
for f in to_get:
    print(f)
    # Source name = file-name prefix before the first '-'.
    source = f.split('/')[-1].split('-')[0]
    # Skip sources without extracted events *before* opening the database, so
    # no connection is created (and leaked) for files that are not used.
    if source not in all_event_edited_sents:
        continue

    source_edited_events = all_event_edited_sents.loc[source]

    # Distinct values of the first remaining index level (used as entry ids in
    # the query below).
    entry_ids = (
        source_edited_events
        .reset_index(level=0)['level_0']
        .drop_duplicates()
        .tolist()
    )

    conn = sqlite3.connect(f)
    try:
        matched = pd.read_sql('''
            SELECT entry_id, version_x, version_y, sent_idx_x, sent_idx_y
                FROM matched_sentences
                WHERE (entry_id IN (%s))
                    AND (avg_sentence_distance_x > 0.1)
                    AND (avg_sentence_distance_y > 0.1)
        ''' % ', '.join(list(map(str, entry_ids))), con=conn)
    finally:
        # Close the handle even when the query fails.
        conn.close()

    # Attach the events of the x-side and y-side sentence to every pair; the
    # dropna() discards pairs where either side has no event row.
    matched_events = (matched
     .merge(source_edited_events.to_frame('events_x').reset_index(), how='left', right_on=['level_0', 'level_1', 'level_2'], left_on=['entry_id', 'version_x', 'sent_idx_x'])
     .drop(['level_0','level_1','level_2'], axis=1)
     .merge(source_edited_events.to_frame('events_y').reset_index(), how='left', right_on=['level_0', 'level_1', 'level_2'], left_on=['entry_id', 'version_y', 'sent_idx_y'])
     .dropna()
     .drop(['level_0','level_1','level_2'], axis=1)
    )
    all_matched_events.append(matched_events)

../data/diffengine-diffs/spark-output/wp-matched-sentences.db
../data/diffengine-diffs/spark-output/independent-matched-sentences.db
../data/diffengine-diffs/spark-output/reuters-matched-sentences.db
../data/diffengine-diffs/spark-output/guardian-matched-sentences.db
../data/diffengine-diffs/spark-output/nyt-matched-sentences.db
../data/diffengine-diffs/spark-output/bbc-2-matched-sentences.db
../data/diffengine-diffs/spark-output/ap-matched-sentences.db


In [89]:
all_matched_events_df = pd.concat(all_matched_events)

In [91]:
all_matched_events_df.iloc[0]

entry_id                                                1348692
version_x                                                     5
version_y                                                     6
sent_idx_x                                                  8.0
sent_idx_y                                                 11.0
events_x      [{'tokens': ['May', 'said', 'that', 'the', 'na...
events_y      [{'tokens': ['May', 'said', 'that', 'the', 'na...
Name: 17, dtype: object

In [161]:
def unroll_events(e):
    """Flatten one sentence-level event record into its trigger texts.

    Args:
        e: iterable of dicts, each with an ``'events'`` list; every event has a
            ``'triggers'`` list whose items carry a ``'text'`` entry.

    Returns:
        List of trigger ``'text'`` values, in encounter order.
    """
    return [
        trigger['text']
        for item in e
        for event in item['events']
        for trigger in event['triggers']
    ]

def match_events(e_x, e_y):
    """Pair up differing trigger texts from two event records.

    Args:
        e_x: event record of the first sentence (input to ``unroll_events``).
        e_y: event record of the second sentence (input to ``unroll_events``).

    Returns:
        List of ``(trigger_x, trigger_y)`` tuples for every combination whose
        two texts are not equal, ordered by ``e_x`` trigger, then ``e_y`` trigger.
    """
    triggers_x = unroll_events(e_x)
    triggers_y = unroll_events(e_y)
    return [
        (trigger_x, trigger_y)
        for trigger_x in triggers_x
        for trigger_y in triggers_y
        if trigger_x != trigger_y
    ]

In [193]:
unrolled_events = all_matched_events_df.apply(lambda x: match_events(x['events_x'], x['events_y']), axis=1)

In [189]:
all_matched_events_df['events'] = (
    all_matched_events_df
    .apply(lambda x: unroll_events(x['events_x']) + unroll_events(x['events_y']), axis=1)
)

In [198]:
all_matched_events_df['events'].str.len().pipe(lambda s: s > 0).value_counts()

False    244460
True     187869
Name: events, dtype: int64

In [192]:
all_matched_events_df['events'].str.len().pipe(lambda s: s > 0).value_counts().pipe(lambda s: s/s.sum())

False    0.565449
True     0.434551
Name: events, dtype: float64

In [199]:
69758/187869

0.3713119247986629

In [136]:
all_matched_events_df.shape

(432329, 7)

In [197]:
unrolled_events.str.len().pipe(lambda s: s > 0).value_counts().pipe(lambda s: s/s.sum())

False    0.838646
True     0.161354
dtype: float64

In [123]:
unrolled_events = unrolled_events.loc[lambda s: s.str.len() > 0]

In [135]:
unrolled_events.shape

(69758,)

In [126]:
all_unrolled_events = [y for x in unrolled_events.tolist() for y in x]

In [208]:
t = pd.Series(all_unrolled_events).value_counts().head(50)
t

(killed, attack)         799
(attack, killed)         789
(injured, killed)        603
(killed, injured)        578
(shot, dead)             476
(dead, shot)             468
(wounded, killed)        409
(killed, wounded)        399
(attacks, killed)        355
(killed, attacks)        350
(shot, killed)           324
(killed, shot)           321
(injured, attack)        272
(attack, injured)        265
(shooting, killed)       256
(killed, shooting)       250
(killed, died)           249
(died, killed)           247
(injured, died)          225
(attacks, attack)        225
(attack, attacks)        223
(died, death)            217
(died, injured)          216
(death, died)            208
(attack, wounded)        202
(suicide, killed)        197
(death, killed)          196
(killed, death)          190
(suicide, attack)        189
(election, won)          189
(dead, injured)          188
(injured, dead)          188
(killed, suicide)        185
(elections, election)    184
(wounded, atta

In [205]:
import pyperclip

In [206]:
pyperclip.copy(t.to_latex())

# Events in Added Sentences

In [137]:
ls ../eda/output/

event-sents-part-1.pkl      events-output-1-8-28.pkl
event-sents-part-2.pkl      events-output-1-8-29.pkl
events-output-0-1-0.pkl     events-output-1-8-3.pkl
events-output-0-1-1.pkl     events-output-1-8-30.pkl
events-output-0-1-10.pkl    events-output-1-8-31.pkl
events-output-0-1-11.pkl    events-output-1-8-32.pkl
events-output-0-1-12.pkl    events-output-1-8-33.pkl
events-output-0-1-13.pkl    events-output-1-8-34.pkl
events-output-0-1-14.pkl    events-output-1-8-35.pkl
events-output-0-1-15.pkl    events-output-1-8-36.pkl
events-output-0-1-16.pkl    events-output-1-8-37.pkl
events-output-0-1-17.pkl    events-output-1-8-38.pkl
events-output-0-1-18.pkl    events-output-1-8-39.pkl
events-output-0-1-19.pkl    events-output-1-8-4.pkl
events-output-0-1-2.pkl     events-output-1-8-40.pkl
events-output-0-1-20.pkl    events-output-1-8-41.pkl
events-output-0-1-21.pkl    events-output-1-8-42.pkl
events-output-0-1-22.pkl    events-output-1-8-43.pkl
events-output-0-1-23.pkl    ev

In [147]:
to_run = glob.glob('../data/diffengine-diffs/spark-output/*')

In [149]:
all_add_del_info = []
for f in to_run:
    print(f)
    if '.gz' in f:
        ! gunzip $f
    f = f.replace('.gz', '')
    conn = sqlite3.connect(f)

    add_del_info = pd.read_sql('''
        with added as (
                SELECT entry_id, 
                version_y as version, 
                sent_idx_y as sent_idx, 
                0 as is_deleted,
                CAST((sent_idx_x IS NULL) AS INT) AS is_added
            FROM matched_sentences
                WHERE is_added = 1
        ), deleted as (
            SELECT entry_id, 
                version_x as version, 
                sent_idx_x as sent_idx,
                CAST((sent_idx_y IS NULL) AS INT) AS is_deleted,
                0 as is_added
            FROM matched_sentences
                WHERE is_deleted = 1
        )
        SELECT * from added
        UNION ALL
        SELECT * from deleted
    ''', con=conn)
    add_del_info['source'] = f.split('/')[-1].split('-')[0]
    all_add_del_info.append(add_del_info)

../data/diffengine-diffs/spark-output/wp-matched-sentences.db
../data/diffengine-diffs/spark-output/independent-matched-sentences.db
../data/diffengine-diffs/spark-output/reuters-matched-sentences.db
../data/diffengine-diffs/spark-output/guardian-matched-sentences.db
../data/diffengine-diffs/spark-output/nyt-matched-sentences.db
../data/diffengine-diffs/spark-output/bbc-2-matched-sentences.db
../data/diffengine-diffs/spark-output/ap-matched-sentences.db


In [187]:
all_add_del_sents_df.to_csv('cache/add-del-sent-df.csv')

In [139]:
events_non_edited_sents = pd.read_pickle('../eda/output/non-edited-event-sents.pkl')

In [142]:
events_add_del_sents = pd.read_pickle('../eda/output/add-del-event-sents.pkl')

In [None]:
sentence_df_no_null = sentence_df.loc[lambda df: df['sentence'].notnull()]

In [150]:
all_add_del_sents_df = pd.concat(all_add_del_info)

In [154]:
all_add_del_sents_df.head(1)

Unnamed: 0,entry_id,version,sent_idx,is_deleted,is_added,source
0,1258519,1,5.0,0,1,wp


In [157]:
# Turn the event Series into a frame whose index levels become the columns
# level_0..level_3, join those against (source, entry_id, version, sent_idx)
# of the add/delete table, then drop the helper level_* columns.
add_del_event_rows = events_add_del_sents.to_frame('events').reset_index()
index_level_cols = ['level_0', 'level_1', 'level_2', 'level_3']
events_with_add_del = pd.merge(
    add_del_event_rows,
    all_add_del_sents_df,
    left_on=index_level_cols,
    right_on=['source', 'entry_id', 'version', 'sent_idx'],
).drop(columns=index_level_cols)

In [164]:
events_with_add_del['events_unrolled'] = events_with_add_del['events'].apply(unroll_events)

In [182]:
events_with_add_del

Unnamed: 0,events,entry_id,version,sent_idx,is_deleted,is_added,source,events_unrolled
0,"[{'tokens': ['<', '/p', '>', '<', 'p', '>', 'T...",19317,1,8.0,0,1,ap,[]
1,"[{'tokens': ['<', '/p', '>', '<', 'p', '>', 'T...",19317,1,9.0,0,1,ap,[]
2,"[{'tokens': ['<', '/p', '>', '<', 'p', '>', 'J...",19317,1,10.0,0,1,ap,[]
3,"[{'tokens': ['<', '/p', '>', '<', 'p>___</p', ...",19317,1,13.0,0,1,ap,[]
4,"[{'tokens': ['He', 'casts', 'Trump', 'as', 'a'...",17365,4,2.0,1,0,ap,[election]
...,...,...,...,...,...,...,...,...
386041,"[{'tokens': ['<', '/p', '>', '<', 'p', '>', 'M...",2615,7,57.0,1,0,ap,[]
386042,"[{'tokens': ['His', 'parents', 'have', 'anothe...",2615,7,58.0,1,0,ap,[]
386043,"[{'tokens': ['<', '/p', '>', '<', 'p>---</p', ...",2615,7,59.0,1,0,ap,[]
386044,"[{'tokens': ['<', 'p', '>', 'BOSTON', '(', 'AP...",2985,0,0.0,1,0,ap,[trial]


In [165]:
# Frame of the non-edited sentences' events plus their flattened trigger texts.
events_non_edited_sents_df = events_non_edited_sents.to_frame('events')
events_non_edited_sents_df['events_unrolled'] = (
    events_non_edited_sents_df['events'].apply(unroll_events)
)

In [183]:
# Share of added sentences with at least one extracted event trigger.
added_has_event = (
    events_with_add_del
    .loc[events_with_add_del['is_added'] == 1, 'events_unrolled']
    .str.len()
    .gt(0)
)
added_event_counts = added_has_event.value_counts()
added_event_counts / added_event_counts.sum()

False    0.614628
True     0.385372
Name: events_unrolled, dtype: float64

In [184]:
# Share of deleted sentences with at least one extracted event trigger.
deleted_has_event = (
    events_with_add_del
    .loc[events_with_add_del['is_deleted'] == 1, 'events_unrolled']
    .str.len()
    .gt(0)
)
deleted_event_counts = deleted_has_event.value_counts()
deleted_event_counts / deleted_event_counts.sum()

False    0.606982
True     0.393018
Name: events_unrolled, dtype: float64

In [178]:
# Share of non-edited sentences with at least one extracted event trigger.
non_edited_has_event = events_non_edited_sents_df['events_unrolled'].str.len().gt(0)
non_edited_event_counts = non_edited_has_event.value_counts()
non_edited_event_counts / non_edited_event_counts.sum()

False    0.685718
True     0.314282
Name: events_unrolled, dtype: float64

In [None]:
pd.DataFrame({'Events Added': ''})

In [12]:
import pandas as pd 
# Preview the processed discourse-label file: tab-separated, no header row,
# so the columns are numbered 0..3.
pd.read_csv(
    '../../controlled-sequence-gen/data/news-discourse-high-level-labels-processed.tsv',
    sep='\t',
    header=None,
)

Unnamed: 0,0,1,2,3
0,Main,"Mortgage Investors Corp., the beleaguered home...",../data/Discourse_Profiling/data/test/kbp/NYT_...,1
1,Distant,"''We had to pull in our wings, (but) my plans ...",../data/Discourse_Profiling/data/test/kbp/NYT_...,2
2,Cause,The company has been hammered with complaints ...,../data/Discourse_Profiling/data/test/kbp/NYT_...,3
3,Distant,"But Edwards, 68, flatly rejected allegations o...",../data/Discourse_Profiling/data/test/kbp/NYT_...,4
4,Distant,"It’s the same issue, he said, that has trigger...",../data/Discourse_Profiling/data/test/kbp/NYT_...,5
...,...,...,...,...
17926,Distant,Ms. Miers has worked with Mr. Bush since his d...,../data/Discourse_Profiling/data/validation/ny...,19
17927,Distant,Ms. Miers started at the White House as staff ...,../data/Discourse_Profiling/data/validation/ny...,20
17928,Distant,Mr. Bush's decision to nominate her to the Sup...,../data/Discourse_Profiling/data/validation/ny...,21
17929,Distant,She withdrew her name from consideration but r...,../data/Discourse_Profiling/data/validation/ny...,22


In [210]:
# Load the sampled training data that the label-export cells read from.
training_df = pd.read_csv(
    '../modeling/data/training_data_short_15__sampled_50000.csv'
)

In [219]:
# Export guard: the `if WRITE:` blocks in the export cells only write their
# TSV files when this flag is True.
WRITE = False

In [221]:
# Operation label per sentence: idxmax(axis=1) returns the name of whichever of
# the three label columns holds the row-wise maximum.
op_label_col = training_df[['deleted_label', 'edited_label', 'unchanged_label']].idxmax(axis=1)
text_col = training_df['sentence']
# Document id is "<source>-<entry_id>-<version>". Fields are read by label:
# integer keys (x[0]) on a label-indexed row rely on a deprecated positional
# fallback in pandas.
doc_id_col = training_df[['source', 'entry_id', 'version']].apply(
    lambda row: '%s-%s-%s' % (row['source'], row['entry_id'], row['version']),
    axis=1,
)
sent_id_col = training_df['sent_idx'].astype(int)

if WRITE:
    # Headerless, index-free TSV with columns: label, sentence, document id, sentence index.
    pd.concat([
        op_label_col,
        text_col,
        doc_id_col,
        sent_id_col,
    ], axis=1).to_csv(
        '../../controlled-sequence-gen/data/sentence-edits__operation-labels.tsv',
        index=False,
        header=False,
        sep='\t'
    )

In [222]:
def get_refactor_label(x):
    """Map a signed refactor value to its label string.

    Returns 'refactor_up' for x > 0, 'refactor_down' for x < 0 and
    'refactor_unchanged' for x == 0. When none of the comparisons hold
    (e.g. NaN) the result is None.
    """
    if x > 0:
        return 'refactor_up'
    elif x < 0:
        return 'refactor_down'
    elif x == 0:
        return 'refactor_unchanged'
    return None

In [223]:
# Refactor label per sentence, derived from the sign of `refactored_label`.
ref_label_col = training_df['refactored_label'].apply(get_refactor_label)
text_col = training_df['sentence']
# Document id is "<source>-<entry_id>-<version>". Fields are read by label:
# integer keys (x[0]) on a label-indexed row rely on a deprecated positional
# fallback in pandas.
doc_id_col = training_df[['source', 'entry_id', 'version']].apply(
    lambda row: '%s-%s-%s' % (row['source'], row['entry_id'], row['version']),
    axis=1,
)
sent_id_col = training_df['sent_idx'].astype(int)

if WRITE:
    # Headerless, index-free TSV with columns: label, sentence, document id, sentence index.
    pd.concat([
        ref_label_col,
        text_col,
        doc_id_col,
        sent_id_col,
    ], axis=1).to_csv(
        '../../controlled-sequence-gen/data/sentence-edits__refactor-labels.tsv',
        index=False,
        header=False,
        sep='\t'
    )

In [224]:
# Addition label per sentence. Each of the two columns becomes its tag when
# its value is > 0 and '' otherwise; the row-wise sum concatenates the tags.
# A row with both tags is collapsed to 'add_above' and a row with neither
# becomes 'unchanged'.
# The former trailing `.replace('{}')` was dropped: it passed no replacement
# value (a deprecated pad-fill form of replace) and matched none of the labels.
add_label_col = (training_df
 [['add_above_label', 'add_below_label']]
 .pipe(lambda df: df > 0)
 .assign(add_above_label=lambda df: df['add_above_label'].apply(lambda x: 'add_above' if x else ''))
 .assign(add_below_label=lambda df: df['add_below_label'].apply(lambda x: 'add_below' if x else ''))
 .sum(axis=1)
 .replace({'add_aboveadd_below': 'add_above', '': 'unchanged'})
)

text_col = training_df['sentence']
# Document id is "<source>-<entry_id>-<version>". Fields are read by label:
# integer keys (x[0]) on a label-indexed row rely on a deprecated positional
# fallback in pandas.
doc_id_col = training_df[['source', 'entry_id', 'version']].apply(
    lambda row: '%s-%s-%s' % (row['source'], row['entry_id'], row['version']),
    axis=1,
)
sent_id_col = training_df['sent_idx'].astype(int)

if WRITE:
    # Headerless, index-free TSV with columns: label, sentence, document id, sentence index.
    pd.concat([
        add_label_col,
        text_col,
        doc_id_col,
        sent_id_col,
    ], axis=1).to_csv(
        '../../controlled-sequence-gen/data/sentence-edits__add-labels.tsv',
        index=False,
        header=False,
        sep='\t'
    )

In [233]:
# Multitask label: the refactor, add and operation labels joined with "|||".
mt_label_col = ref_label_col + "|||" + add_label_col + "|||" + op_label_col
text_col = training_df['sentence']
# Document id is "<source>-<entry_id>-<version>". Fields are read by label:
# integer keys (x[0]) on a label-indexed row rely on a deprecated positional
# fallback in pandas.
doc_id_col = training_df[['source', 'entry_id', 'version']].apply(
    lambda row: '%s-%s-%s' % (row['source'], row['entry_id'], row['version']),
    axis=1,
)
sent_id_col = training_df['sent_idx'].astype(int)
# NOTE: this flips the notebook-wide WRITE flag, so any `if WRITE:` cell run
# after this one will also write its file.
WRITE = True

if WRITE:
    # Headerless, index-free TSV with columns: label, sentence, document id, sentence index.
    pd.concat([
        mt_label_col,
        text_col,
        doc_id_col,
        sent_id_col,
    ], axis=1).to_csv(
        '../../controlled-sequence-gen/data/sentence-edits__multitask.tsv',
        index=False,
        header=False,
        sep='\t'
    )