In [41]:
import s3fs
import sys
sys.path.append('../util')
import util_data_access as da
from tqdm.auto import tqdm
import numpy as np
import os, glob
import pandas as pd 

import sys
sys.path.append('../')
import spark_processing_scripts.util_spark as sus
import spark_processing_scripts.util_general as sug

fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': 'http://s3.dev.obdc.bcs.bloomberg.com'})

# Download Files

In [42]:
db_name = 'cnn'
if not os.path.exists('%s_output' % db_name):
     os.makedirs('%s_output' % db_name)

In [4]:
for f in tqdm(fs.ls('aspangher/edit-pathways/spark_processing_scripts-output/%s/' % db_name)):
    fs.get(f, '%s_output/%s' % (db_name, os.path.basename(f)))

  0%|          | 0/42 [00:00<?, ?it/s]

In [6]:
# da.download_file('guardian.db', 'edit-pathways/dbs/newssniffer-guardian.db.gz')
# ! gunzip guardian.db.gz
# fs.ls('aspangher/edit-pathways/spark_processing_scripts-output_sentences/nyt')

# Read Files

In [48]:
dfs = []
for f in tqdm(glob.glob('%s_output/*' % db_name)):
    df = pd.read_csv(f, compression=None, index_col=0)
    dfs.append(df)

  0%|          | 0/42 [00:00<?, ?it/s]

In [49]:
full_diffs_df = pd.concat(dfs)

In [9]:
full_diffs_df[['entry_id', 'version_x', 'version_y']].drop_duplicates().shape

(35858, 3)

In [20]:
full_diffs_df.head()

Unnamed: 0,entry_id,version_x,version_y,sent_idx_x,sent_idx_y,avg_sentence_distance_x,avg_sentence_distance_y
0,2550,2,3,28.0,28.0,0.0,0.0
1,3265,0,1,,0.0,,
2,2594,0,1,55.0,55.0,0.0,0.0
3,2584,0,1,18.0,,,
4,2951,0,1,37.0,36.0,0.0,


In [21]:
(full_diffs_df
 .assign(c=1)
 .groupby(['entry_id', 'version_x', 'version_y', 'sent_idx_x', 'sent_idx_y'])['c']
 .sum()
 .value_counts()
)

1    1797573
Name: c, dtype: int64

# Check Against Expected

In [43]:
remote_name = sug.conn_mapper_dict[db_name]
da.download_file('%s.db.gz' % db_name, 'edit-pathways/dbs/%s.db.gz' % remote_name)
! gunzip $db_name\.db\.gz

gzip: cnn.db already exists; do you wish to overwrite (y or n)? ^C


In [44]:
import sqlite3

con = sqlite3.connect('%s.db' % db_name)
# full_df = pd.read_sql('select * from entryversion where num_versions > 1 and num_versions < 40 ', con=con)
eligible_ids = pd.read_sql('select entry_id, version from entryversion where num_versions > 1 and num_versions < 40 ', con=con)

In [53]:
summs = pd.read_sql('select entry_id, version, summary from entryversion where num_versions > 1 and num_versions < 40 ', con=con)

In [58]:
summs.loc[lambda df: df['summary'] == '']['entry_id']

1555      1463
8697     11512
11066    11512
11276    15393
11308    15511
11507    15794
13431    11512
13695    11512
13990    18357
14704    19084
15093    19419
16337    19084
16510    19084
16601    20494
16942    21019
18717     9299
18718     9298
18719     9297
18724     9292
18725     9290
18730     9285
18733     9241
18736     9163
18737     9160
18738     9157
18740     9131
18741     9130
18742     9128
18744     9126
18746     9111
         ...  
29003    26041
29004    26038
29005    26037
29008    26082
29009    26077
29010    26075
29011    26073
29018    26097
29019    26096
29021    26094
34943    20494
35181    18357
35419    15794
39581    27218
39831    27693
42107    26717
49519    15794
50456     1463
51772    15511
53399    15393
54513    27693
54559    27218
55257    20494
55474    18357
58724    26717
59634    35762
62602    29379
62742    20011
64948    57979
64950    57979
Name: entry_id, Length: 6041, dtype: int64

In [45]:
expected = []
for entry_id, versions in (
    eligible_ids
        .groupby('entry_id')
        .aggregate(list)['version']
        .iteritems()
):
    for version_pair in zip(versions[:-1], versions[1:]):
        expected.append({'entry_id': entry_id, 'version_pair': version_pair})

expected_df = pd.DataFrame(expected)

In [46]:
expected_df.shape

(45902, 2)

In [50]:
(full_diffs_df
 [['entry_id', 'version_x', 'version_y']]
 .drop_duplicates()
 .shape
)

(35858, 3)

In [51]:
(full_diffs_df
 [['entry_id']]
 .drop_duplicates()
 .shape
)

(18135, 1)

In [52]:
expected_df['entry_id'].drop_duplicates().shape

(19965,)

In [89]:
output_full_diffs_df = full_diffs_df

In [90]:
output_full_diffs_df = (full_diffs_df
 .fillna('nan')
 .drop_duplicates(['entry_id', 'version_x', 'version_y', 'sent_idx_x', 'sent_idx_y'])
 .replace(to_replace='nan', value=np.nan)
)

In [91]:
output_full_diffs_df.head()

Unnamed: 0,entry_id,version_x,version_y,sent_idx_x,sent_idx_y,avg_sentence_distance_x,avg_sentence_distance_y
0,30458,0,1,17.0,19.0,0.396817,0.398281
1,29404,0,1,8.0,8.0,0.0,0.0
2,28893,0,1,12.0,12.0,0.0,0.0
3,29426,0,1,4.0,4.0,0.23022,0.23022
4,29439,0,1,,9.0,,


In [92]:
## final files
import sqlite3
with sqlite3.connect('%s-matched-sentences.db' % db_name) as con:
    output_full_diffs_df.to_sql('matched_sentences', con=con, index=False, chunksize=10000, if_exists='replace')

In [None]:
da.upload_file('2021-05-19__partial-nyt-output.pkl', 'edit-pathways/output_for_sheena/2021-05-25__partial-nyt-matched-output.pkl')

In [None]:
## intermediate files
fname = '2021-05-26__newssniffer-bbc-diffs-output.pkl'
output_full_diffs_df.to_pickle(fname, compression='gzip', chunksize=10000)

In [34]:
da.upload_file(fname, 'edit-pathways/output_for_sheena/%s' % fname)

True

In [36]:
da.upload_file('nyt_sent_output/df_nyt__start_0__end_20000__num_1.pkl', 'edit-pathways/output_for_sheena/df_nyt__start_0__end_20000__num_1.pkl')

True

# Sentences

In [27]:
import re
import sqlite3
import os 
import s3fs
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': 'http://s3.dev.obdc.bcs.bloomberg.com'})

In [29]:
## sentences
db_name = 'cnn'

In [30]:
if not os.path.exists('%s_sent_output' % db_name):
    os.makedirs('%s_sent_output' % db_name)

In [31]:
for f in tqdm(fs.ls('aspangher/edit-pathways/spark_processing_scripts-output_sentences/%s' % db_name)):
    fname = f.split('/')[-1]
    fs.get(f, '%s_sent_output/%s' % (db_name, fname))

  0%|          | 0/17 [00:00<?, ?it/s]

In [32]:
## check if expected entry_ids are there before dumping to SQLite
import glob
import pandas as pd 
from tqdm.auto import tqdm 

all_entry_ids = []
for f in tqdm(glob.glob('%s_sent_output/*' % db_name)):
    entry_ids = pd.read_pickle(f, compression='gzip')['entry_id'].drop_duplicates()
    all_entry_ids.append(entry_ids)

  0%|          | 0/17 [00:00<?, ?it/s]

In [33]:
pd.concat(all_entry_ids).drop_duplicates().shape

(19957,)

In [8]:
with sqlite3.connect('%s-matched-sentences.db' % db_name) as con:
    con.execute('DROP TABLE IF EXISTS split_sentences;')

In [9]:
import pandas as pd 
import pickle
import glob
from tqdm.auto import tqdm

sent_dfs = []
sent_file_list = glob.glob('%s_sent_output/*' % db_name)

# if db_name == 'guardian':
#     sent_file_list = list(filter(lambda x: int(re.search('end_(\d+)', x)[1]) - int(re.search('start_(\d+)', x)[1]) == 5000, sent_file_list))
#     sent_file_list = sorted(sent_file_list, key=lambda x: int(re.search('num_(\d+)', x)[1]))

for f in tqdm(sent_file_list):
    sent_df = pd.read_pickle(f, compression='gzip')
    ## final files
    import sqlite3
    with sqlite3.connect('%s-matched-sentences.db' % db_name) as con:
        sent_df.to_sql('split_sentences', con=con, index=False, chunksize=5000, if_exists='append')
#     break
#     sent_dfs.append(sent_df)

  0%|          | 0/6 [00:00<?, ?it/s]

In [10]:
with sqlite3.connect('%s-matched-sentences.db' % db_name) as con:
    print(pd.read_sql('SELECT count(distinct entry_id) from split_sentences', con=con))
    print(pd.read_sql('SELECT count(distinct entry_id) from matched_sentences', con=con))

   count(distinct entry_id)
0                     26530
   count(distinct entry_id)
0                     26529


In [11]:
with sqlite3.connect('%s-matched-sentences.db' % db_name) as con:
    con.execute('''
        DELETE   FROM split_sentences
        WHERE    rowid not in
                 (
                 select  min(rowid)
                 from split_sentences
                 group by
                     entry_id,
                     version,
                     sent_idx
                 )
    ''')

In [12]:
! gzip $db_name-matched-sentences\.db

In [13]:
dir_name = 'spark_output_final'
# dir_name = 'output_for_sheena'
fs.put(
    '%s-matched-sentences.db.gz' % db_name, 
    'aspangher/edit-pathways/%s/%s-matched-sentences.db.gz' % (dir_name, db_name)
)

In [None]:
# ! aws s3api put-object-acl --bucket aspangher --key edit-pathways/spark_output_final/nyt-matched-sentences.db.gz --acl public-read --endpoint http://s3.dev.obdc.bcs.bloomberg.com 

In [26]:
calc_entry_versions = (full_diffs_df
 .set_index('entry_id')[['version_x', 'version_y']]
 .unstack()
 .to_frame('version')
 .reset_index()
 .drop('level_0', axis=1)
 .drop_duplicates()
)

In [29]:
sent_df[['entry_id', 'version']].drop_duplicates().shape

(79292, 2)

In [27]:
(sent_df[['entry_id','version']]
 .drop_duplicates()
 .merge(
     calc_entry_versions, 
     right_on=['entry_id', 'version'], 
     left_on=['entry_id', 'version'], 
     how='inner'
 ).shape
)

(71073, 2)

# Check PQs

In [17]:
all_pqs = fs.ls('aspangher/edit-pathways/pqs')
pqs = list(filter(lambda x: db_name in x, all_pqs))

In [19]:
mkdir pqs

In [21]:
for pq in pqs:
    fs.get(pq, os.path.join('pqs', os.path.basename(pq)))

In [26]:
all_data = pd.concat(
    list(map(lambda x: pd.read_parquet(x), glob.glob('pqs/*')))
)

In [31]:
all_data['entry_id'].drop_duplicates().shape

(58569,)

In [37]:
expected = []
for entry_id, versions in (
    all_data
        .loc[lambda df: df['num_versions'] > 1]
        .loc[lambda df: df['num_versions'] < 40]
        .groupby('entry_id')
        .aggregate(list)['version']
        .iteritems()
):
    for version_pair in zip(versions[:-1], versions[1:]):
        expected.append({'entry_id': entry_id, 'version_pair': version_pair})

expected_df = pd.DataFrame(expected)

In [38]:
expected_df.shape 

(45902, 2)

# Look at fetching operation

In [1]:
sys.path.append('../spark_processing_scripts')
import util_general as sug

In [3]:
db_name = 'cnn'
s3_path = sug.s3_output_dir_main if not True else sug.s3_output_dir_sentences
file_count = len(sug.get_files(s3_path, db_name, sug.csv_pat)) if not True else len(sug.get_files(s3_path, db_name, sug.pkl_pat))

In [4]:
file_count

17

In [5]:
db_name = 'cnn'
t = sug.download_prefetched_data(db_name, format='csv', split_sentences=False, show_progress=True)

  0%|          | 0/42 [00:00<?, ?it/s]

In [8]:
prefetched_file_idx, last_one, to_fetch_df = sug.download_pq_to_df(db_name, t)

In [9]:
prefetched_file_idx

1

In [11]:
to_fetch_df.shape

(22884, 10)

In [13]:
to_fetch_this_round_df, left_to_fetch_df = sug.get_rows_to_process_df(
    500, 0, to_fetch_df, t
)

len(output_df): 1824
len(to_get_df): 500
len(left_to_process_df): 1324


In [33]:
to_fetch_this_round_df.loc[lambda df: df['entry_id'] == 11]['summary']

10       <p>But that is effectively what he had to do t...
33168                                                     
59493    <p>But that is effectively what he had to do t...
Name: summary, dtype: object

In [36]:
to_fetch_this_round_df.loc[lambda df: df['summary'].str.strip() == ''].shape

(497, 10)

In [44]:
import re

In [None]:
prefetched_entry_id_list = []
fname = sug.conn_mapper_dict[db_name]
file_list = sug.get_fs().ls(sug.s3_pq_dir)
file_pattern = re.compile(r'%s-\d+.pq' % fname)
file_list = list(enumerate(filter(lambda x: re.search(file_pattern, x), file_list)))
file_list = file_list[0:]
# if show_progress:
#     file_list = tqdm(file_list)
for f_idx, fname in file_list:
    with fs.open(fname) as f:
        full_df = pd.read_parquet(f)
    full_df = full_df.loc[lambda df: ~df['entry_id'].isin(prefetched_entry_id_list)]
    if len(full_df['entry_id'].drop_duplicates()) > 50:
        last_one = f_idx < (len(file_list) - 1)
#         return f_idx, last_one, full_df
if len(file_list) == 0:
    f_idx = 0
last_one = f_idx < (len(file_list) - 1)

In [24]:
to_fetch_df = ug.download_pq_to_df(db_name, t)

In [47]:
import re
db_name = 'cnn'
prefetched_entry_id_list = t.values
fname = ug.conn_mapper_dict[db_name]
file_list = ug.get_fs().ls(ug.s3_pq_dir)
file_pattern = re.compile(r'%s-\d+.pq' % fname)
file_list = list(filter(lambda x: re.search(file_pattern, x), file_list))

if False:
    all_full_dfs = []
    for f_idx, fname in enumerate(file_list):
        with ug.get_fs().open(fname) as f:
            full_df = pd.read_parquet(f)

        print(f_idx)
        print('pre-filtering')
        print(full_df.shape)
        print(full_df['entry_id'].drop_duplicates().shape)
        all_full_dfs.append(full_df.copy())
        full_df = full_df.loc[lambda df: ~df['entry_id'].isin(prefetched_entry_id_list)]
        print('post-filtering')
        print(full_df.shape)
        print(full_df['entry_id'].drop_duplicates().shape)
        if len(full_df['entry_id'].drop_duplicates()) > 50:
            print('here')#full_df

NameError: name 'ug' is not defined

In [46]:
full_df = pd.concat(all_full_dfs)

In [48]:
full_df.shape

(714873, 12)

In [49]:
full_df['entry_id'].drop_duplicates().shape

(180000,)

# Prepare data files

In [40]:
db_name = 'independent'
remote_name = sug.conn_mapper_dict[db_name]

# da.download_file('newssniffer-washpo.db.gz', 'edit-pathways/dbs/newssniffer-washpo.db.gz')
if not (os.path.exists('%s.db.gz' % db_name) or os.path.exists('%s.db' % db_name)):
    fs.get('aspangher/edit-pathways/dbs/%s.db.gz' % remote_name, '%s.db.gz' % db_name)
    ! gunzip $db_name\.db\.gz

In [41]:
import sqlite3
with sqlite3.connect('%s.db' % db_name) as con:
    entry_ids = pd.read_sql('select DISTINCT entry_id from entryversion', con=con)['entry_id']
#     full_df = pd.read_sql('select * from entryversion', chunksize=5000, con=con)

In [43]:
entry_ids.shape

(55009,)

In [53]:
if not os.path.exists('%s_pqs' % db_name):
    os.makedirs('%s_pqs' % db_name)

In [54]:
chunk_size = 20000
for chunk_idx, (s_idx, e_idx) in tqdm(enumerate(
    zip(
        range(0, len(entry_ids), chunk_size), 
        range(chunk_size, len(entry_ids) + chunk_size, chunk_size)
    )
)):
    chunk_ids = entry_ids[s_idx: e_idx].values.tolist()
    with sqlite3.connect('%s.db' % db_name) as con:
        chunk_df = pd.read_sql('''
                                SELECT * FROM entryversion
                                WHERE entry_id IN (%s)
        ''' % ', '.join(list(map(str, chunk_ids))), con=con)

    (chunk_df
     .to_parquet('%(db_name)s_pqs/%(db_name)s-%(num)s.pq' % ({'db_name': db_name, 'num': chunk_idx + 1}))
    )

|          | 0/? [00:00<?, ?it/s]

In [55]:
import os 
for f in tqdm(glob.glob('%s_pqs/*' % db_name)):
    remote_fname = os.path.basename(f).replace(db_name, remote_name)
    da.upload_file(f, 'edit-pathways/pqs/' + remote_fname)

  0%|          | 0/3 [00:00<?, ?it/s]

In [56]:
remote_fname

'newssniffer-independent-3.pq'

# Progress
                               edits       sentences
ap.db                          x           x
bbc.db                          ~~~~~ 
calgaryherald.db               
canadaland.db
cbc.db
cnn.db                         x           x
dailymail.db
fox.db
globemail.db
lapresse.db
nationalpost.db
newssniffer-bbc.db.gz          x           x  
newssniffer-guardian.db.gz     x           x
newssniffer-independent.db     x           x 
newssniffer-nytimes.db.gz      x           x 
newssniffer-washpo.db          x           x
reuters.db.gz                  x           x 
telegraph.db 
therebel.db
torontostar.db
torontosun.db

In [98]:
fs.ls('aspangher/edit-pathways/spark_processing_scripts-output/')

['aspangher/edit-pathways/spark_processing_scripts-output/db_nyt__start_0__end_50__num_1.csv.gz',
 'aspangher/edit-pathways/spark_processing_scripts-output/db_nyt__start_1000__end_1500__num_2.csv.gz',
 'aspangher/edit-pathways/spark_processing_scripts-output/db_nyt__start_50__end_100__num_2.csv.gz',
 'aspangher/edit-pathways/spark_processing_scripts-output/df_nyt__0_500.csv',
 'aspangher/edit-pathways/spark_processing_scripts-output/df_nyt__start_0__end_500__num_1.csv.gz',
 'aspangher/edit-pathways/spark_processing_scripts-output/ap',
 'aspangher/edit-pathways/spark_processing_scripts-output/bbc-2',
 'aspangher/edit-pathways/spark_processing_scripts-output/cnn',
 'aspangher/edit-pathways/spark_processing_scripts-output/db_nyt__start_0__end_500',
 "aspangher/edit-pathways/spark_processing_scripts-output/db_nyt__start_0__end_500__num_['aspangher",
 'aspangher/edit-pathways/spark_processing_scripts-output/guardian',
 'aspangher/edit-pathways/spark_processing_scripts-output/independent',
 

In [100]:
fs.ls('aspangher/edit-pathways/spark_processing_scripts-output_sentences')

['aspangher/edit-pathways/spark_processing_scripts-output_sentences/ap',
 'aspangher/edit-pathways/spark_processing_scripts-output_sentences/bbc-2',
 'aspangher/edit-pathways/spark_processing_scripts-output_sentences/guardian',
 'aspangher/edit-pathways/spark_processing_scripts-output_sentences/independent',
 'aspangher/edit-pathways/spark_processing_scripts-output_sentences/nyt',
 'aspangher/edit-pathways/spark_processing_scripts-output_sentences/wp']