In [13]:
import s3fs
import sys
sys.path.append('../util')
import util_data_access as da
from tqdm.auto import tqdm
import numpy as np
import os, glob
import pandas as pd 

# S3 filesystem client bound to the custom endpoint URL below; the `fs.ls(...)` listings
# in this notebook go through this handle.
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': 'http://s3.dev.obdc.bcs.bloomberg.com'})

In [6]:
# List the entries directly under the edit-pathways prefix of the bucket.
fs.ls('aspangher/edit-pathways/')

['aspangher/edit-pathways/csvs',
 'aspangher/edit-pathways/db-dumps',
 'aspangher/edit-pathways/dbs',
 'aspangher/edit-pathways/output_for_sheena',
 'aspangher/edit-pathways/pkls',
 'aspangher/edit-pathways/pqs',
 'aspangher/edit-pathways/sample_files_for_sheena',
 'aspangher/edit-pathways/spacy',
 'aspangher/edit-pathways/spark_output_final',
 'aspangher/edit-pathways/spark_processing_scripts-output',
 'aspangher/edit-pathways/spark_processing_scripts-output_sentences',
 'aspangher/edit-pathways/temp',
 'aspangher/edit-pathways/tmp']

# Download Files

In [39]:
# Name of the outlet database being processed; it also prefixes the local output directory.
db_name = 'guardian'
# exist_ok=True makes the cell idempotent: without it, re-running the cell raises
# FileExistsError as soon as the directory has been created once.
os.makedirs('%s_output' % db_name, exist_ok=True)

In [40]:
# Mirror every object under the per-outlet Spark output prefix into the local
# `<db_name>_output/` directory, keeping only the final path component as the file name.
remote_listing = fs.ls('aspangher/edit-pathways/spark_processing_scripts-output/%s/' % db_name)
for remote_path in tqdm(remote_listing):
    # fs.ls returns bucket-qualified paths; strip the leading bucket component to get the key.
    object_key = remote_path.partition('/')[2]
    local_name = object_key.rsplit('/', 1)[-1]
    da.download_file(db_name + '_output/' + local_name, object_key)

  0%|          | 0/120 [00:00<?, ?it/s]

In [52]:
# Decompress the local database archive with the shell's gunzip.
# NOTE(review): the commented-out download below would save the archive as 'guardian.db',
# not 'guardian.db.gz', so gunzip would not find it under that name — confirm the intended
# local file name before re-enabling it.
# da.download_file('guardian.db', 'edit-pathways/dbs/newssniffer-guardian.db.gz')
! gunzip guardian.db.gz
# fs.ls('aspangher/edit-pathways/spark_processing_scripts-output_sentences/nyt')

# Read Files

In [42]:
# Load every downloaded diff file as an uncompressed CSV, using its first column as the index.
dfs = [
    pd.read_csv(csv_path, compression=None, index_col=0)
    for csv_path in tqdm(glob.glob('%s_output/*' % db_name))
]

  0%|          | 0/120 [00:00<?, ?it/s]

In [43]:
# Stack the per-file frames into a single DataFrame; each frame's row labels are kept
# as read (the index is not reset), so index values may repeat.
full_diffs_df = pd.concat(dfs)

In [44]:
# Number of distinct (entry_id, version_x, version_y) triples present in the diff rows.
full_diffs_df[['entry_id', 'version_x', 'version_y']].drop_duplicates().shape

(155016, 3)

In [45]:
# Preview the first rows of the concatenated diff frame.
full_diffs_df.head()

Unnamed: 0,entry_id,version_x,version_y,sent_idx_x,sent_idx_y,avg_sentence_distance_x,avg_sentence_distance_y
0,519015,2,3,19.0,18.0,0.0,
1,519864,0,1,27.0,27.0,0.0,0.0
2,519154,9,10,27.0,30.0,0.0,0.0
3,519325,7,8,,3.0,,
4,520036,0,1,10.0,10.0,0.0,0.0


In [46]:
# Uniqueness check: count rows per (entry, version pair, sentence pair) key and tabulate
# how often each count occurs. A single bucket at 1 means no key is repeated.
# Rows with a missing value in any key column are left out by groupby's default NaN handling.
sentence_pair_keys = ['entry_id', 'version_x', 'version_y', 'sent_idx_x', 'sent_idx_y']
rows_per_key = full_diffs_df.assign(c=1).groupby(sentence_pair_keys)['c'].sum()
rows_per_key.value_counts()

1    5536029
Name: c, dtype: int64

# Check Against Expected

In [53]:
import sqlite3
from contextlib import closing

# Read the (entry_id, version) rows of every entry with 2..39 versions from the local
# `<db_name>.db` SQLite file.
# `closing` releases the database handle as soon as the query has been read; a bare
# `sqlite3.connect(...)` assigned to a name stays open for the life of the kernel.
# After this cell `con` refers to a closed connection.
with closing(sqlite3.connect('%s.db' % db_name)) as con:

    # full_df = pd.read_sql('select * from entryversion where num_versions > 1 and num_versions < 40 ', con=con)

    eligible_ids = pd.read_sql(
        'select entry_id, version from entryversion where num_versions > 1 and num_versions < 40 ',
        con=con,
    )

In [54]:
# Build the version pairs we expect diffs for: for each entry, every pair of adjacent
# versions (v[i], v[i+1]) in the order the rows were returned by the query.
# NOTE(review): the query has no ORDER BY, so adjacency relies on the row order of the
# database — confirm versions come back sorted per entry.
versions_by_entry = eligible_ids.groupby('entry_id')['version'].aggregate(list)

# `Series.items()` replaces `Series.iteritems()`, which was removed in pandas 2.0.
expected = [
    {'entry_id': entry_id, 'version_pair': version_pair}
    for entry_id, versions in versions_by_entry.items()
    for version_pair in zip(versions[:-1], versions[1:])
]

expected_df = pd.DataFrame(expected)

In [55]:
# Row count here is the number of expected adjacent-version pairs.
expected_df.shape

(612059, 2)

In [56]:
# Observed side of the comparison: distinct (entry, version pair) triples that actually
# appear in the diff output.
version_pair_cols = ['entry_id', 'version_x', 'version_y']
full_diffs_df.loc[:, version_pair_cols].drop_duplicates().shape

(155016, 3)

In [57]:
# Distinct entries that appear in the diff output (kept as a one-column frame so the
# result is a 2-tuple shape).
full_diffs_df.loc[:, ['entry_id']].drop_duplicates().shape

(59900, 1)

In [59]:
# Distinct entries among the expected version pairs, for comparison with the observed count.
expected_df['entry_id'].drop_duplicates().shape

(231051,)

In [7]:
# Keep the first row for each (entry, version pair, sentence pair) key.
# `drop_duplicates` already treats NaN keys as equal to each other, so the previous
# fillna('nan') / replace('nan' -> NaN) round-trip was unnecessary. It also pushed the
# numeric columns through object dtype and would have turned any genuine 'nan' string
# into a missing value.
dedup_keys = ['entry_id', 'version_x', 'version_y', 'sent_idx_x', 'sent_idx_y']
output_full_diffs_df = full_diffs_df.drop_duplicates(subset=dedup_keys)

In [8]:
# Preview the first rows of the de-duplicated diff frame.
output_full_diffs_df.head()

Unnamed: 0,entry_id,version_x,version_y,sent_idx_x,sent_idx_y,avg_sentence_distance_x,avg_sentence_distance_y
0,704716,4,5,8.0,8.0,0.136447,0.136447
1,694571,6,7,,8.0,,
2,703260,0,1,72.0,71.0,0.0,0.0
3,702729,0,1,28.0,28.0,0.0,0.0
4,692624,3,4,4.0,4.0,0.0,0.0


In [10]:
## final files
import sqlite3
from contextlib import closing

# Write the de-duplicated diffs to the `matched_sentences` table, replacing any existing
# table of that name.
# A sqlite3 connection used as a context manager only commits or rolls back; it does not
# close. `closing(...)` releases the file handle, and the inner `con` keeps the original
# commit-on-success behaviour.
with closing(sqlite3.connect('nyt-matched-sentences.db')) as con, con:
    output_full_diffs_df.to_sql('matched_sentences', con=con, index=False, chunksize=10000, if_exists='replace')

In [47]:
# List which outlets have sentence-level Spark output available.
fs.ls('aspangher/edit-pathways/spark_processing_scripts-output_sentences')

['aspangher/edit-pathways/spark_processing_scripts-output_sentences/nyt']

In [None]:
# Upload a local pickle (first argument) to the output_for_sheena prefix (second argument).
# NOTE(review): the local file is dated 2021-05-19 while the remote key is dated 2021-05-25,
# and the same call appears again in a later cell — confirm this is the intended file.
da.upload_file('2021-05-19__partial-nyt-output.pkl', 'edit-pathways/output_for_sheena/2021-05-25__partial-nyt-matched-output.pkl')

In [None]:
## intermediate files
# Persist the de-duplicated diff frame locally as a gzip-compressed pickle.
# NOTE(review): no visible cell uploads this file name — confirm whether it should be.
output_full_diffs_df.to_pickle('2021-05-25__full-nyt-diffs-output.pkl', compression='gzip')

In [34]:
# Upload a local pickle (first argument) to the output_for_sheena prefix (second argument).
# NOTE(review): the local name is dated 2021-05-19 and the remote key 2021-05-25 — confirm
# the pairing is intentional.
da.upload_file('2021-05-19__partial-nyt-output.pkl', 'edit-pathways/output_for_sheena/2021-05-25__partial-nyt-matched-output.pkl')

True

In [36]:
# Upload one sentence-output pickle to the output_for_sheena prefix under the same file name.
da.upload_file('nyt_sent_output/df_nyt__start_0__end_20000__num_1.pkl', 'edit-pathways/output_for_sheena/df_nyt__start_0__end_20000__num_1.pkl')

True

In [35]:
# Show the contents of the local nyt_sent_output directory (IPython `ls` automagic).
ls nyt_sent_output

df_nyt__start_0__end_20000__num_1.pkl


In [33]:
# List the entries directly under the edit-pathways prefix of the bucket.
fs.ls('aspangher/edit-pathways')

['aspangher/edit-pathways/csvs',
 'aspangher/edit-pathways/db-dumps',
 'aspangher/edit-pathways/dbs',
 'aspangher/edit-pathways/pkls',
 'aspangher/edit-pathways/pqs',
 'aspangher/edit-pathways/sample_files_for_sheena',
 'aspangher/edit-pathways/spacy',
 'aspangher/edit-pathways/spark_processing_scripts-output',
 'aspangher/edit-pathways/spark_processing_scripts-output_sentences',
 'aspangher/edit-pathways/temp',
 'aspangher/edit-pathways/tmp']

# Sentences

In [22]:
## sentences

In [48]:
# Create the local directory for the sentence-level downloads.
# os.makedirs(..., exist_ok=True) is idempotent and portable, whereas the `mkdir` shell
# automagic errors once the directory already exists.
os.makedirs('nyt_sent_output', exist_ok=True)

In [49]:
# Copy every sentence-level output object for nyt into the local nyt_sent_output/ directory,
# named after the final component of its remote path.
for remote_path in fs.ls('aspangher/edit-pathways/spark_processing_scripts-output_sentences/nyt'):
    # fs.ls returns bucket-qualified paths; strip the leading bucket component to get the key.
    object_key = remote_path.partition('/')[2]
    local_name = object_key.rsplit('/', 1)[-1]
    da.download_file('nyt_sent_output/' + local_name, object_key)

In [3]:
# Open the local SQLite file and remove the split_sentences table if present
# (IF EXISTS makes this a no-op when the table is absent), so a later append-mode load
# starts from an empty table.
# NOTE(review): this connection is left open and bound to the global `con`.
con = sqlite3.connect('nyt-matched-sentences.db')
con.execute('DROP TABLE IF EXISTS split_sentences;')

In [None]:
# Peek at the first 5 rows of split_sentences using the global `con`.
# NOTE(review): this fails if run right after the table has been dropped and before it is
# repopulated — run it only after the load cell.
pd.read_sql('select * from split_sentences limit 5', con=con)

In [6]:
import glob
import pickle
import sqlite3
from contextlib import closing

import pandas as pd
from tqdm.auto import tqdm

# Append every downloaded sentence pickle to the `split_sentences` table of the final
# SQLite file. After the loop `sent_df` holds the last file read.
#
# One connection is opened for the whole load and closed at the end. The previous version
# re-imported sqlite3 and opened a fresh connection on every iteration, and none of them
# were closed, because `with sqlite3.connect(...)` only commits.
# NOTE(review): pd.read_pickle executes arbitrary code embedded in the file — only load
# pickles produced by our own pipeline.
sent_dfs = []  # kept for compatibility; nothing is accumulated here
with closing(sqlite3.connect('nyt-matched-sentences.db')) as con:
    for f in tqdm(glob.glob('nyt_sent_output/*')):
        sent_df = pd.read_pickle(f, compression='gzip')
        ## final files
        with con:  # commit after each file, as before
            sent_df.to_sql('split_sentences', con=con, index=False, chunksize=5000, if_exists='append')

  0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# Compress the final SQLite file with the shell's gzip; the following upload cell sends
# the resulting nyt-matched-sentences.db.gz.
! gzip nyt-matched-sentences.db

In [12]:
# Upload the compressed SQLite file to the spark_output_final prefix.
da.upload_file('nyt-matched-sentences.db.gz', 'edit-pathways/spark_output_final/nyt-matched-sentences.db.gz')

True

In [None]:
# ! aws s3api put-object-acl --bucket aspangher --key edit-pathways/spark_output_final/nyt-matched-sentences.db.gz --acl public-read --endpoint http://s3.dev.obdc.bcs.bloomberg.com 

In [26]:
# Distinct (entry_id, version) pairs touched by the diff output: version_x and version_y
# are stacked into one `version` column (all version_x rows first, then all version_y rows)
# and repeated pairs are dropped.
calc_entry_versions = (
    full_diffs_df
    .melt(
        id_vars='entry_id',
        value_vars=['version_x', 'version_y'],
        var_name='level_0',
        value_name='version',
    )
    .drop(columns='level_0')
    .drop_duplicates()
)

In [29]:
# Distinct (entry_id, version) pairs present in `sent_df`.
sent_df[['entry_id', 'version']].drop_duplicates().shape

(79292, 2)

In [27]:
# Overlap check: how many distinct (entry_id, version) pairs in the sentence output also
# occur in the diff output.
sent_entry_versions = sent_df[['entry_id', 'version']].drop_duplicates()
sent_entry_versions.merge(
    calc_entry_versions,
    on=['entry_id', 'version'],
    how='inner',
).shape

(71073, 2)

# Look at fetching operation

In [5]:
# Make the modules in ../spark_processing_scripts importable by bare module name.
sys.path.append('../spark_processing_scripts')

In [6]:
import util_general as ug

In [7]:
# Fetch the prefetched data for 'nyt' through a private helper of util_general.
# NOTE(review): the meaning of the two positional booleans is not visible from here —
# prefer keyword arguments once confirmed against the helper's signature.
t = ug._download_prefetched_data_csv('nyt', False, True)

  0%|          | 0/826 [00:00<?, ?it/s]

In [10]:
# NOTE(review): `full_df` is never assigned in the visible cells — its only definitions
# are commented out — so this cell raises NameError on a fresh kernel (see recorded output).
full_df.shape

NameError: name 'full_df' is not defined

In [101]:
# Ask util_general which rows still need processing, given the prefetched data `t`.
# NOTE(review): depends on `full_df`, which is not defined in the visible cells, and the
# meaning of the leading 500 and 0 arguments is not visible here — confirm against the helper.
df = ug.get_rows_to_process_df(
    500, 0, t, full_df
)

In [3]:
import sys
sys.path.append('../')
import spark_processing_scripts.util_spark as sus
import spark_processing_scripts.util_general as sug

In [None]:
# Load the 'nyt' data as a DataFrame via the project helper.
# NOTE(review): presumably reads the parquet chunks uploaded under edit-pathways/pqs — confirm.
full_db = sug.download_pq_to_df('nyt')

In [47]:
# Load the already-computed (prefetched) output for 'nyt', with split_sentences disabled.
prefetched_df = sug.download_prefetched_data('nyt', split_sentences=False)

In [74]:
# Distinct (entry_id, version) pairs covered by the prefetched output: version_x and
# version_y are stacked into one `version` column (all version_x rows first, then all
# version_y rows) and repeated pairs are dropped.
t = (
    prefetched_df
    .melt(
        id_vars='entry_id',
        value_vars=['version_x', 'version_y'],
        var_name='level_0',
        value_name='version',
    )
    .drop(columns='level_0')
    .drop_duplicates()
)

In [88]:
# Distinct (entry_id, version) pairs present in full_db.
t2 = full_db[['entry_id', 'version']].drop_duplicates()

In [96]:
# Number of rows in prefetched_df (length of its entry_id column).
prefetched_df['entry_id'].shape

(2560285,)

In [100]:
# Sanity check that prefetched_df has an entry_id column.
'entry_id' in prefetched_df

True

In [59]:
# Number of distinct entries in full_db.
full_db['entry_id'].drop_duplicates().shape

(20000,)

In [None]:
# Ask util_general which rows still need processing, given the prefetched output and full_db.
# NOTE(review): the meaning of the leading 500 and 0 arguments is not visible here —
# confirm against the helper's signature.
df = sug.get_rows_to_process_df(
    500, 0, prefetched_df, full_db
)

In [1]:
import sqlite3
from contextlib import closing

import pandas as pd

# Read every entryversion row belonging to an entry with 2..39 versions from nyt.db.
# `closing` releases the database handle once the frame is loaded; a sqlite3 connection
# used directly as a context manager only commits or rolls back and stays open.
with closing(sqlite3.connect('nyt.db')) as con:
    full_nyt_df = pd.read_sql('''
        SELECT * FROM entryversion
        WHERE num_versions > 1
        AND num_versions < 40
    ''', con=con)

In [None]:
# Shape of the eligible rows whose entry_id does NOT appear in full_db.
full_nyt_df.loc[lambda df: ~df['entry_id'].isin(full_db['entry_id'])].shape

In [107]:
# Shape of the eligible rows whose entry_id DOES appear in full_db.
full_nyt_df.loc[lambda df: df['entry_id'].isin(full_db['entry_id'])].shape

(79294, 12)

# Prepare data files

In [60]:
db_name = 'guardian'
# da.download_file('newssniffer-washpo.db.gz', 'edit-pathways/dbs/newssniffer-washpo.db.gz')
if False or not os.path.exists('%s.db.gz' % db_name):
    da.download_file('%s.db.gz' % db_name, 'edit-pathways/dbs/%s.db.gz' % db_name)
    ! gunzip ap.db.gz

INFO:botocore.vendored.requests.packages.urllib3.connectionpool:Resetting dropped connection: s3.dev.obdc.bcs.bloomberg.com
ERROR:root:An error occurred (404) when calling the HeadObject operation: Not Found


gzip: ap.db.gz: No such file or directory


In [61]:
import sqlite3
with sqlite3.connect('%s.db' % db_name) as con:
    entry_ids = pd.read_sql('select DISTINCT entry_id from entryversion', con=con)['entry_id']
#     full_df = pd.read_sql('select * from entryversion', chunksize=5000, con=con)

In [35]:
# Create the local `<db_name>_pqs` directory that the parquet export writes into and the
# upload cell globs from. The previous line was not valid Python (unterminated string) and
# named a hard-coded `ap_pqs` directory. exist_ok=True keeps the cell re-runnable.
os.makedirs('%s_pqs' % db_name, exist_ok=True)

In [36]:
import sqlite3
from contextlib import closing

# Export the entryversion table to parquet in chunks of `chunk_size` entries. Chunk k
# (1-based) holds all rows for entry_ids[(k-1)*chunk_size : k*chunk_size] and is written
# to `<db_name>_pqs/<db_name>-<k>.pq`.
chunk_size = 20000
chunk_starts = range(0, len(entry_ids), chunk_size)

# Make sure the target directory exists so this cell does not depend on a separate mkdir.
os.makedirs('%s_pqs' % db_name, exist_ok=True)

# One connection for the whole export, closed at the end. The earlier per-chunk
# `with sqlite3.connect(...)` never closed its connections.
with closing(sqlite3.connect('%s.db' % db_name)) as con:
    # tqdm cannot infer a length from enumerate(...), so pass the total to get a real bar.
    for chunk_idx, s_idx in tqdm(enumerate(chunk_starts), total=len(chunk_starts)):
        e_idx = s_idx + chunk_size
        chunk_ids = entry_ids.iloc[s_idx:e_idx].values.tolist()
        # The ids are read from the database itself and rendered with str(); they are
        # inlined because a 20000-element list can exceed SQLite's bound-parameter limit.
        chunk_df = pd.read_sql('''
                                SELECT * FROM entryversion
                                WHERE entry_id IN (%s)
        ''' % ', '.join(list(map(str, chunk_ids))), con=con)

        (chunk_df
         .to_parquet('%(db_name)s_pqs/%(db_name)s-%(num)s.pq' % ({'db_name': db_name, 'num': chunk_idx + 1}))
        )

|          | 0/? [00:00<?, ?it/s]

In [37]:
import os
# Push every locally written parquet chunk to the shared pqs prefix under its own file name.
for local_path in tqdm(glob.glob('%s_pqs/*' % db_name)):
    da.upload_file(local_path, 'edit-pathways/pqs/' + os.path.basename(local_path))

  0%|          | 0/2 [00:00<?, ?it/s]

# Progress
                                edits       sentences
ap.db                           in-prog
bbc.db                          ~~~~~ 
calgaryherald.db              
canadaland.db
cbc.db
cnn.db
dailymail.db
fox.db
globemail.db
lapresse.db
nationalpost.db
newssniffer-bbc.db.gz          in-prog             
newssniffer-guardian.db.gz     in-prog
newssniffer-independent.db
newssniffer-nytimes.db.gz      x           x 
newssniffer-washpo.db          in-prog
reuters.db.gz                  x 
telegraph.db 
therebel.db
torontostar.db
torontosun.db