In [1]:
import s3fs
import sys
sys.path.append('../util')
import util_data_access as da
from tqdm.auto import tqdm
import numpy as np
import os, glob
import pandas as pd 

import sys
sys.path.append('../')
import spark_processing_scripts.util_spark as sus
import spark_processing_scripts.util_general as sug

# S3 filesystem handle bound to a custom endpoint; the fs.ls / fs.get calls in the
# download cells go through it.
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': 'http://s3.dev.obdc.bcs.bloomberg.com'})

# Download Files

In [2]:
# Name of the source being processed; it is interpolated into local directory names.
db_name = 'nyt'
# exist_ok=True keeps the cell re-runnable without a separate (racy) existence check.
os.makedirs('%s_output' % db_name, exist_ok=True)

In [18]:
# Copy every object listed under the source's S3 output prefix into the local
# '<db_name>_output' directory, keeping the original base names.
remote_dir = 'aspangher/edit-pathways/spark_processing_scripts-output/%s/' % db_name
remote_paths = fs.ls(remote_dir)
for f in tqdm(remote_paths):
    local_path = '%s_output/%s' % (db_name, os.path.basename(f))
    fs.get(f, local_path)

In [19]:
## why does this not work?
# Single-call alternative to a per-file download: asks fs.get to copy the whole output
# prefix into '<db_name>_output' recursively. The question above is still open (TODO).
fs.get('aspangher/edit-pathways/spark_processing_scripts-output/%s/' % db_name, '%s_output' % db_name, recursive=True)

In [72]:
# da.download_file('guardian.db', 'edit-pathways/dbs/newssniffer-guardian.db.gz')
# ! gunzip guardian.db.gz
# fs.ls('aspangher/edit-pathways/spark_processing_scripts-output_sentences/nyt')

# Read Files

In [20]:
# Read each file in '<db_name>_output' as an uncompressed CSV (first column used as the
# index) and collect the resulting frames in `dfs`.
dfs = [
    pd.read_csv(f, compression=None, index_col=0)
    for f in tqdm(glob.glob('%s_output/*' % db_name))
]

  0%|          | 0/826 [00:00<?, ?it/s]

In [21]:
# Stack all per-file frames into one table; each file's own index labels are kept as-is.
full_diffs_df = pd.concat(dfs)

In [22]:
# Shape of the distinct (entry_id, version_x, version_y) combinations in the diff output.
pair_cols = ['entry_id', 'version_x', 'version_y']
full_diffs_df[pair_cols].drop_duplicates().shape

(278826, 3)

In [23]:
# Preview the first rows of the concatenated diff table.
full_diffs_df.head()

Unnamed: 0,entry_id,version_x,version_y,sent_idx_x,sent_idx_y,avg_sentence_distance_x,avg_sentence_distance_y
0,1348675,0,1,1.0,1.0,0.0,0.0
1,1350005,0,1,21.0,21.0,0.0,0.0
2,1351256,0,1,37.0,37.0,0.0,0.0
3,1348265,0,1,9.0,15.0,0.0,0.0
4,1351613,2,3,91.0,91.0,0.0,0.0


In [24]:
# Count rows per (entry, version pair, sentence pair) key, then tabulate how often each
# count occurs; if the only count is 1, no key is repeated.
match_key = ['entry_id', 'version_x', 'version_y', 'sent_idx_x', 'sent_idx_y']
rows_per_key = full_diffs_df.assign(c=1).groupby(match_key)['c'].sum()
rows_per_key.value_counts()

1    15674824
Name: c, dtype: int64

# Check Against Expected

In [78]:
remote_name = sug.conn_mapper_dict[db_name]
da.download_file('%s.db.gz' % db_name, 'edit-pathways/dbs/%s.db.gz' % remote_name)
! gunzip $db_name\.db\.gz

gzip: wp.db already exists; do you wish to overwrite (y or n)? ^C


In [14]:
import sqlite3

# Load (entry_id, version) for every entryversion row with more than 1 and fewer than
# 40 versions.
con = sqlite3.connect('%s.db' % db_name)
try:
    # full_df = pd.read_sql('select * from entryversion where num_versions > 1 and num_versions < 40 ', con=con)
    eligible_ids = pd.read_sql('select entry_id, version from entryversion where num_versions > 1 and num_versions < 40 ', con=con)
finally:
    # Release the database handle instead of leaving the connection open in the kernel.
    con.close()

In [15]:
# Build the consecutive version pairs expected for each eligible entry:
# versions [v0, v1, v2] yield (v0, v1) and (v1, v2).
expected = []
versions_by_entry = eligible_ids.groupby('entry_id')['version'].aggregate(list)
# Series.iteritems() was removed in pandas 2.0; .items() is the supported equivalent.
for entry_id, versions in versions_by_entry.items():
    # The query that produced eligible_ids has no ORDER BY, so sort before pairing
    # neighbours to guarantee the pairs really are consecutive versions.
    versions = sorted(versions)
    for version_pair in zip(versions[:-1], versions[1:]):
        expected.append({'entry_id': entry_id, 'version_pair': version_pair})

expected_df = pd.DataFrame(expected)

In [16]:
# Number of expected (entry, consecutive version pair) rows.
expected_df.shape

(48650, 2)

In [17]:
# Shape of the distinct (entry_id, version_x, version_y) combinations actually present
# in the diff output, for comparison with the expected pairs.
observed_pair_cols = ['entry_id', 'version_x', 'version_y']
full_diffs_df[observed_pair_cols].drop_duplicates().shape

(48238, 3)

In [18]:
# Shape of the distinct entry ids present in the diff output.
observed_entries = full_diffs_df[['entry_id']].drop_duplicates()
observed_entries.shape

(18997, 1)

In [19]:
# Number of distinct entry ids in the expected set.
expected_df['entry_id'].drop_duplicates().shape

(19176,)

In [71]:
# Load every parquet file in '<db_name>_pqs' and collect the frames in `all_dfs`.
all_dfs = [
    pd.read_parquet(f)
    for f in tqdm(glob.glob('%s_pqs/*' % db_name))
]

  0%|          | 0/12 [00:00<?, ?it/s]

In [73]:
# Stack the parquet shards into a single frame.
all_df = pd.concat(all_dfs)

In [20]:
# Plain alias (no copy): both names refer to the same DataFrame object.
output_full_diffs_df = full_diffs_df

In [31]:
# Keep the first row for each (entry, version pair, sentence pair) key.
# drop_duplicates already treats missing values in the key columns as equal to one
# another, so the former fillna('nan') -> drop_duplicates -> replace('nan', NaN)
# round-trip is unnecessary. That detour could also leave numeric columns with object
# dtype and would have turned any genuine 'nan' string into a missing value.
output_full_diffs_df = full_diffs_df.drop_duplicates(
    ['entry_id', 'version_x', 'version_y', 'sent_idx_x', 'sent_idx_y']
)

In [22]:
# Preview the first rows of the table that will be exported.
output_full_diffs_df.head()

Unnamed: 0,entry_id,version_x,version_y,sent_idx_x,sent_idx_y,avg_sentence_distance_x,avg_sentence_distance_y
0,1265845,0,1,44.0,44.0,0.0,0.0
1,1343259,0,1,25.0,25.0,0.0,0.0
2,1317302,1,2,3.0,3.0,0.0,0.0
3,1361393,0,1,37.0,37.0,0.0,0.0
4,1291614,0,1,46.0,45.0,0.125,0.125


In [32]:
## final files
import sqlite3
# Write the match table to '<db_name>-matched-sentences.db', replacing any existing
# 'matched_sentences' table.
con = sqlite3.connect('%s-matched-sentences.db' % db_name)
try:
    # `with con` only commits or rolls back the transaction; it does not close the handle.
    with con:
        output_full_diffs_df.to_sql('matched_sentences', con=con, index=False, chunksize=10000, if_exists='replace')
finally:
    con.close()

In [None]:
# Upload a local pickle to the 'edit-pathways/output_for_sheena/' prefix.
# NOTE(review): the local name is dated 2021-05-19 while the remote key is dated
# 2021-05-25 and adds "matched" — confirm the intended file is being uploaded.
da.upload_file('2021-05-19__partial-nyt-output.pkl', 'edit-pathways/output_for_sheena/2021-05-25__partial-nyt-matched-output.pkl')

In [None]:
## intermediate files
# NOTE(review): the file name mentions 'bbc' independently of db_name — confirm it
# matches the data currently held in output_full_diffs_df.
fname = '2021-05-26__newssniffer-bbc-diffs-output.pkl'
# DataFrame.to_pickle accepts no `chunksize` argument (passing one raises TypeError),
# so the frame is written in a single gzip-compressed pickle.
output_full_diffs_df.to_pickle(fname, compression='gzip')

In [34]:
# Upload the file named by `fname` under the same name in 'edit-pathways/output_for_sheena/'.
da.upload_file(fname, 'edit-pathways/output_for_sheena/%s' % fname)

True

In [36]:
# Upload one hard-coded sentence shard from 'nyt_sent_output/' to the
# 'edit-pathways/output_for_sheena/' prefix, keeping its base name.
da.upload_file('nyt_sent_output/df_nyt__start_0__end_20000__num_1.pkl', 'edit-pathways/output_for_sheena/df_nyt__start_0__end_20000__num_1.pkl')

True

# Sentences

In [25]:
import re
import sqlite3
import os 
import s3fs
# Re-create the S3 filesystem handle so this section can run without the earlier cells.
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': 'http://s3.dev.obdc.bcs.bloomberg.com'})

In [27]:
## sentences
# Source whose sentence-level output is processed in the cells of this section.
db_name = 'nyt'

In [28]:
# Create the local directory for sentence shards; exist_ok=True makes the cell safely
# re-runnable without a separate (racy) existence check.
os.makedirs('%s_sent_output' % db_name, exist_ok=True)

In [29]:
# Copy every object under the sentence-output prefix into '<db_name>_sent_output',
# naming each local file after the last path component of the remote key.
sentence_prefix = 'aspangher/edit-pathways/spark_processing_scripts-output_sentences/%s' % db_name
for f in tqdm(fs.ls(sentence_prefix)):
    fname = f.rsplit('/', 1)[-1]
    local_target = '%s_sent_output/%s' % (db_name, fname)
    fs.get(f, local_target)

  0%|          | 0/5 [00:00<?, ?it/s]

In [33]:
## check if expected entry_ids are there before dumping to SQLite
import glob
import pandas as pd 
from tqdm.auto import tqdm 

# For each gzip-compressed pickle in '<db_name>_sent_output', keep only its distinct
# entry ids; the per-file Series are gathered in `all_entry_ids`.
all_entry_ids = [
    pd.read_pickle(f, compression='gzip')['entry_id'].drop_duplicates()
    for f in tqdm(glob.glob('%s_sent_output/*' % db_name))
]

  0%|          | 0/5 [00:00<?, ?it/s]

In [32]:
# Number of distinct entry ids across all sentence files.
pd.concat(all_entry_ids).drop_duplicates().shape

(19176,)

In [34]:
# Remove any previous 'split_sentences' table so the following load starts clean.
con = sqlite3.connect('%s-matched-sentences.db' % db_name)
try:
    # `with con` only commits or rolls back the transaction; it does not close the handle.
    with con:
        con.execute('DROP TABLE IF EXISTS split_sentences;')
finally:
    con.close()

In [35]:
import pandas as pd 
import pickle
import glob
from tqdm.auto import tqdm

# Append every sentence shard to the 'split_sentences' table of the output database.
sent_dfs = []  # left empty: frames are streamed into SQLite rather than kept in memory
sent_file_list = glob.glob('%s_sent_output/*' % db_name)

# if db_name == 'guardian':
#     sent_file_list = list(filter(lambda x: int(re.search('end_(\d+)', x)[1]) - int(re.search('start_(\d+)', x)[1]) == 5000, sent_file_list))
#     sent_file_list = sorted(sent_file_list, key=lambda x: int(re.search('num_(\d+)', x)[1]))

## final files
import sqlite3
# One connection for the whole load (closed at the end) instead of opening, and never
# closing, a new connection for every file.
con = sqlite3.connect('%s-matched-sentences.db' % db_name)
try:
    for f in tqdm(sent_file_list):
        # NOTE: unpickling can execute arbitrary code; only load shards from a trusted source.
        sent_df = pd.read_pickle(f, compression='gzip')
        # Commit after each file, as the original per-file `with` block did.
        with con:
            sent_df.to_sql('split_sentences', con=con, index=False, chunksize=5000, if_exists='append')
finally:
    con.close()

  0%|          | 0/5 [00:00<?, ?it/s]

In [35]:
# Compare how many distinct entry ids ended up in each table of the output database.
con = sqlite3.connect('%s-matched-sentences.db' % db_name)
try:
    print(pd.read_sql('SELECT count(distinct entry_id) from split_sentences', con=con))
    print(pd.read_sql('SELECT count(distinct entry_id) from matched_sentences', con=con))
finally:
    # Read-only check: nothing to commit, just release the handle.
    con.close()

   count(distinct entry_id)
0                     19176
   count(distinct entry_id)
0                     18997


In [36]:
# Delete duplicate sentence rows, keeping the lowest rowid for each
# (entry_id, version, sent_idx) group.
dedupe_sql = '''
        DELETE   FROM split_sentences
        WHERE    rowid not in
                 (
                 select  min(rowid)
                 from split_sentences
                 group by
                     entry_id,
                     version,
                     sent_idx
                 )
    '''
con = sqlite3.connect('%s-matched-sentences.db' % db_name)
try:
    # `with con` commits the DELETE (or rolls it back on error) but does not close the
    # connection; close it explicitly so no handle is open when the file is compressed.
    with con:
        con.execute(dedupe_sql)
finally:
    con.close()

In [37]:
# Shell out to gzip to compress '<db_name>-matched-sentences.db' in place.
! gzip $db_name-matched-sentences\.db

In [38]:
# Upload the compressed database to 'aspangher/edit-pathways/<dir_name>/'.
dir_name = 'spark_output_final'
# dir_name = 'output_for_sheena'
local_archive = '%s-matched-sentences.db.gz' % db_name
remote_archive = 'aspangher/edit-pathways/%s/%s-matched-sentences.db.gz' % (dir_name, db_name)
fs.put(local_archive, remote_archive)

In [None]:
# ! aws s3api put-object-acl --bucket aspangher --key edit-pathways/spark_output_final/nyt-matched-sentences.db.gz --acl public-read --endpoint http://s3.dev.obdc.bcs.bloomberg.com 

In [26]:
# Collect every distinct (entry_id, version) pair that appears on either side of a diff.
# Step 1: index by entry_id and keep just the two version columns.
versions_by_entry_id = full_diffs_df.set_index('entry_id')[['version_x', 'version_y']]
# Step 2: unstack the two columns into one long Series, named 'version'.
long_versions = versions_by_entry_id.unstack().to_frame('version')
# Step 3: bring entry_id back as a column, discard the source-column level, de-duplicate.
calc_entry_versions = (
    long_versions
    .reset_index()
    .drop('level_0', axis=1)
    .drop_duplicates()
)

In [29]:
# Distinct (entry_id, version) pairs in `sent_df`, which holds the last sentence file
# read by the loading loop.
sent_df[['entry_id', 'version']].drop_duplicates().shape

(79292, 2)

In [27]:
# Inner-join the distinct (entry_id, version) pairs from the sentence data with those
# derived from the diff output and report the size of the overlap.
sent_entry_versions = sent_df[['entry_id', 'version']].drop_duplicates()
overlap = sent_entry_versions.merge(
    calc_entry_versions,
    on=['entry_id', 'version'],
    how='inner',
)
overlap.shape

(71073, 2)

# Look at fetching operation

In [37]:
# Put the scripts directory itself on sys.path and import util_general as a top-level
# module. This rebinds `sug`, which the first cell bound to
# spark_processing_scripts.util_general.
sys.path.append('../spark_processing_scripts')
import util_general as sug

In [40]:
# Count the remote output files for this source. The original conditionals were written
# as `a if not True else b`, which always selects `b` (the sentence/pickle variant);
# the flag below spells that choice out.
use_sentence_output = True
if use_sentence_output:
    s3_path = sug.s3_output_dir_sentences
    file_count = len(sug.get_files(s3_path, db_name, sug.pkl_pat))
else:
    s3_path = sug.s3_output_dir_main
    file_count = len(sug.get_files(s3_path, db_name, sug.csv_pat))

In [41]:
# Display the number of matching remote files.
file_count

55

In [2]:
# Switch the working source to 'wp' and call sug.download_prefetched_data for the CSV,
# non-sentence output.
# NOTE(review): `t` is later passed to download_pq_to_df and read via `t.values` —
# presumably the already-processed entry ids; confirm against util_general.
db_name = 'wp'
t = sug.download_prefetched_data(db_name, format='csv', split_sentences=False, show_progress=True)

In [5]:
# Unpack the three values returned by sug.download_pq_to_df for this source.
prefetched_file_idx, last_one, to_fetch_df = sug.download_pq_to_df(db_name, t)

In [16]:
# Scratch cell: the bare `fname` is not the last statement, so it is not displayed.
fname
import re

In [37]:
# Display the compiled pattern used to select parquet files.
file_pattern

re.compile(r'newssniffer-washpo-\d+.pq', re.UNICODE)

In [36]:
# Scratch copy of a function body pasted into a cell for debugging.
# NOTE(review): this cannot run as a cell as written. It uses `return` at top level,
# reads `show_progress` without assigning it, calls `get_fs()` unqualified although the
# listing line uses `ug.get_fs()`, and refers to the module as `ug` while the imports
# visible in this notebook bind it as `sug`. Presumably it mirrors the body of
# download_pq_to_df — TODO confirm against util_general.
prefetched_entry_id_list = []
fname = ug.conn_mapper_dict[db_name]
file_list = ug.get_fs().ls(ug.s3_pq_dir)
# NOTE(review): the '.' before 'pq' is unescaped, so it matches any character.
file_pattern = re.compile(r'%s-\d+.pq' % fname)
# Keep only the matching paths, paired with their position in the filtered list.
file_list = list(enumerate(filter(lambda x: re.search(file_pattern, x), file_list)))
file_list = file_list[0:]
if show_progress:
    file_list = tqdm(file_list)
for f_idx, fname in file_list:
    with get_fs().open(fname) as f:
        full_df = pd.read_parquet(f)
    # Drop rows whose entry_id is already in the prefetched list (empty in this cell).
    full_df = full_df.loc[lambda df: ~df['entry_id'].isin(prefetched_entry_id_list)]
    # Stop at the first file that still has more than 50 distinct unprocessed entry ids.
    if len(full_df['entry_id'].drop_duplicates()) > 50:
        # NOTE(review): despite its name, this is True when f_idx is BEFORE the final index.
        last_one = f_idx < (len(file_list) - 1)
        return f_idx, last_one, full_df
# With no matching files the loop never binds f_idx, so give it a value here.
if len(file_list) == 0:
    f_idx = 0
last_one = f_idx < (len(file_list) - 1)
return f_idx, last_one, []

NameError: name 'show_progress' is not defined

In [24]:
# NOTE(review): uses the alias `ug` and binds the whole return value to to_fetch_df,
# whereas the earlier call goes through `sug` and unpacks three values.
to_fetch_df = ug.download_pq_to_df(db_name, t)

In [62]:
# Debug cell: list the parquet files for 'guardian' and (when enabled) print how many
# rows and distinct entry ids each file has before and after removing the ids in `t`.
import re
db_name = 'guardian'
prefetched_entry_id_list = t.values
fname = ug.conn_mapper_dict[db_name]
file_list = ug.get_fs().ls(ug.s3_pq_dir)
# NOTE(review): the '.' before 'pq' is unescaped, so it matches any character.
file_pattern = re.compile(r'%s-\d+.pq' % fname)
file_list = list(filter(lambda x: re.search(file_pattern, x), file_list))

# `if False:` disables the loop, so `all_full_dfs` is not created by this cell as written.
if False:
    all_full_dfs = []
    for f_idx, fname in enumerate(file_list):
        with ug.get_fs().open(fname) as f:
            full_df = pd.read_parquet(f)

        print(f_idx)
        print('pre-filtering')
        print(full_df.shape)
        print(full_df['entry_id'].drop_duplicates().shape)
        # Keep an unfiltered copy of every file for later concatenation.
        all_full_dfs.append(full_df.copy())
        full_df = full_df.loc[lambda df: ~df['entry_id'].isin(prefetched_entry_id_list)]
        print('post-filtering')
        print(full_df.shape)
        print(full_df['entry_id'].drop_duplicates().shape)
        # Marks files that still have more than 50 distinct entry ids after filtering.
        if len(full_df['entry_id'].drop_duplicates()) > 50:
            print('here')#full_df

In [46]:
# Stack the unfiltered per-file frames; relies on `all_full_dfs` from an earlier run
# of the debug loop.
full_df = pd.concat(all_full_dfs)

In [48]:
# Rows and columns of the combined frame.
full_df.shape

(714873, 12)

In [49]:
# Number of distinct entry ids in the combined frame.
full_df['entry_id'].drop_duplicates().shape

(180000,)

# Prepare data files

In [40]:
# Select the source to prepare and resolve its remote database name.
db_name = 'independent'
remote_name = sug.conn_mapper_dict[db_name]

# da.download_file('newssniffer-washpo.db.gz', 'edit-pathways/dbs/newssniffer-washpo.db.gz')
# Download and decompress only when neither the archive nor the database is present.
# NOTE(review): if only '<db_name>.db.gz' exists, this guard skips the gunzip as well.
if not (os.path.exists('%s.db.gz' % db_name) or os.path.exists('%s.db' % db_name)):
    fs.get('aspangher/edit-pathways/dbs/%s.db.gz' % remote_name, '%s.db.gz' % db_name)
    ! gunzip $db_name\.db\.gz

In [41]:
import sqlite3
# Read the distinct entry ids from the source database.
con = sqlite3.connect('%s.db' % db_name)
try:
    entry_ids = pd.read_sql('select DISTINCT entry_id from entryversion', con=con)['entry_id']
    #     full_df = pd.read_sql('select * from entryversion', chunksize=5000, con=con)
finally:
    # A `with sqlite3.connect(...)` block would not close the handle; do it explicitly.
    con.close()

In [43]:
# Number of distinct entry ids read from the database.
entry_ids.shape

(55009,)

In [53]:
# Create the local parquet directory; exist_ok=True makes the cell safely re-runnable
# without a separate (racy) existence check.
os.makedirs('%s_pqs' % db_name, exist_ok=True)

In [54]:
# Export the entryversion table as parquet shards of up to `chunk_size` entry ids each,
# written to '<db_name>_pqs/<db_name>-<n>.pq' with n starting at 1.
chunk_size = 20000
# A range has a length, so tqdm can show a real total (wrapping the bare
# enumerate/zip generator left the bar at "0/?").
chunk_starts = range(0, len(entry_ids), chunk_size)
# One connection for the whole export, closed at the end, instead of one leaked
# connection per chunk.
con = sqlite3.connect('%s.db' % db_name)
try:
    for chunk_idx, s_idx in enumerate(tqdm(chunk_starts)):
        # .iloc makes the positional slice explicit regardless of the Series index.
        chunk_ids = entry_ids.iloc[s_idx: s_idx + chunk_size].values.tolist()
        # The ids were read from this same database, so they are interpolated directly.
        chunk_df = pd.read_sql('''
                                SELECT * FROM entryversion
                                WHERE entry_id IN (%s)
        ''' % ', '.join(map(str, chunk_ids)), con=con)

        chunk_df.to_parquet(
            '%(db_name)s_pqs/%(db_name)s-%(num)s.pq' % ({'db_name': db_name, 'num': chunk_idx + 1})
        )
finally:
    con.close()

|          | 0/? [00:00<?, ?it/s]

In [55]:
import os 
# Upload each local parquet shard to 'edit-pathways/pqs/', swapping the local source
# name in the file name for the remote database name.
local_shards = glob.glob('%s_pqs/*' % db_name)
for f in tqdm(local_shards):
    shard_basename = os.path.basename(f)
    remote_fname = shard_basename.replace(db_name, remote_name)
    da.upload_file(f, 'edit-pathways/pqs/' + remote_fname)

  0%|          | 0/3 [00:00<?, ?it/s]

In [56]:
# Display the remote name computed for the last uploaded shard.
remote_fname

'newssniffer-independent-3.pq'

# Progress
                                edits       sentences
ap.db                           in-prog
bbc.db                          ~~~~~ 
calgaryherald.db              
canadaland.db
cbc.db
cnn.db
dailymail.db
fox.db
globemail.db
lapresse.db
nationalpost.db
newssniffer-bbc.db.gz          in-prog             
newssniffer-guardian.db.gz     in-prog
newssniffer-independent.db
newssniffer-nytimes.db.gz      x           x 
newssniffer-washpo.db          in-prog
reuters.db.gz                  x 
telegraph.db 
therebel.db
torontostar.db
torontosun.db