In [120]:
import pandas as pd
from tqdm.auto import tqdm
import re

In [141]:
# Translate raw database file stems into the short source names used in this notebook.
# Lookups go through `name_mapper.get(x, x)`, so stems not listed here pass through unchanged.
name_mapper = dict([
    ('bbc', 'bbc-1'),
    ('newssniffer-bbc', 'bbc-2'),
    ('newssniffer-guardian', 'guardian'),
    ('newssniffer-independent', 'independent'),
    ('newssniffer-nytimes', 'nyt'),
    ('newssniffer-washpo', 'wp'),
])

# Check Running Programs

In [134]:
# Collect the raw `docker ps --no-trunc` output of each box, one single-column frame per box.
all_running_docker_programs = []
for i in tqdm([2, 3]):  # box numbers, interpolated into the instance name "edit-parser-1-$i"
    # IPython `!` capture: `a` receives the command's output lines.
    a = ! gcloud compute ssh --zone "us-central1-a" "edit-parser-1-$i" --project "usc-research" --command "docker ps --no-trunc"
    a_s = pd.Series(a).to_frame('jobs')  # one row per output line
    a_s['box'] = i  # remember which box each line came from
    all_running_docker_programs.append(a_s)

  0%|          | 0/2 [00:00<?, ?it/s]

In [135]:
# Stack the per-box frames into one frame of raw `docker ps` output lines.
# NOTE: this rebinds the name from a list of frames to a single DataFrame, so
# re-running this cell on its own would hand a DataFrame to pd.concat; re-run the
# collection cell first.
all_running_docker_programs = pd.concat(all_running_docker_programs)

# Keep lines that mention the edit-parser image or the parsing script, and drop
# anything that mentions 'klt-edit-parser'.
jobs_of_interest = (
    all_running_docker_programs
        .loc[lambda df: df['jobs'].apply(
            lambda x: (
                'us.gcr.io/usc-research/edit-parser' in x
                or 'python3 parsing_script.py' in x
            ) and ('klt-edit-parser' not in x)
        )]
)

jobs_of_interest = (
    jobs_of_interest
        # Raw-string pattern (no invalid '\d' escape); '\d+' keeps every digit of a
        # multi-digit split number instead of only the first one. Lines without a
        # --split_num argument yield a missing value.
        .assign(split_num=lambda df: df['jobs'].apply(lambda x: re.findall(r'--split_num (\d+)', x)).str.get(0))
        # The db name is taken as the sixth whitespace-separated token of the line.
        .assign(db=lambda df: df['jobs'].str.split().str.get(5))
)

In [136]:
# Show, for every matching job, which database and split it works on and on which box.
status_columns = ['db', 'split_num', 'box']
jobs_of_interest.sort_values(status_columns)[status_columns]

Unnamed: 0,db,split_num,box
1,guardian,,2


In [130]:
# Databases to check against the running jobs; commented-out entries are
# excluded from the check below.
full_db_list = [
#     'ap',
#     'bbc-1',
    'bbc-2',
#     'calgaryherald',
#     'canadaland',
    'cbc',
#     'cnn',
#     'dailymail',
#     'fox',
#     'globemail',
    'guardian',
#     'independent',
#     'lapresse',
#     'nationalpost',
#     'nyt',
#     'reuters',
#     'telegraph',
#     'therebel',
    'torontostar',
#     'torontosun',
#     'whitehouse',
#     'wp'
]

# Listed databases that have no matching running job.
set(full_db_list) - set(jobs_of_interest['db'])

{'bbc-2', 'cbc', 'guardian', 'torontostar'}

In [131]:
# Number of matching running jobs per database.
jobs_of_interest['db'].value_counts()

Series([], Name: db, dtype: int64)

In [132]:
# Number of matching running jobs per box.
jobs_of_interest['box'].value_counts()

Series([], Name: box, dtype: int64)

# Check Status vs. All Articles

In [13]:
from google.cloud import datastore
import os
from tqdm.auto import tqdm
import pandas as pd

def get_table(table, source=None):
    """Fetch every entity of a Datastore kind, optionally restricted to one source.

    The query is run through the module-level ``client`` and a ``tqdm`` counter
    is shown while the results are consumed.

    Parameters
    ----------
    table : str
        Datastore kind to query.
    source : optional
        When not ``None``, only entities whose ``source`` property equals this
        value are fetched.

    Returns
    -------
    list
        The fetched entities, in the order the query yielded them.
    """
    q = client.query(kind=table)
    if source is not None:
        # Rely on add_filter updating the query in place rather than on its return
        # value, which older google-cloud-datastore releases leave as None
        # (TODO confirm against the installed version).
        q.add_filter('source', '=', source)
    # Drain the result iterator eagerly so callers get a plain list.
    return list(tqdm(q.fetch()))

In [164]:
# Point the Google client libraries at a credentials JSON file, then open the
# Datastore client used by get_table.
# NOTE(review): the credentials path is absolute and machine-specific — consider
# making it configurable.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/Users/alex/.google-cloud/usc-research-data-access.json'
client = datastore.Client()

In [None]:
t = 'edit-paths-sentence-stats'

# Pull the whole stats table (no source filter) into a DataFrame.
sentence_stats = get_table(t)
sentence_stats_df = pd.DataFrame(sentence_stats)

# Per source, side by side: the number of rows in the stats table, the number of
# distinct a_id values, and the columns of `articles_and_versions` (its index
# renamed through name_mapper). Missing values become 0 and every column is cast to int.
# NOTE(review): `articles_and_versions` is not defined in the visible part of this
# notebook — confirm which cell provides it.
articles_and_version_counts_plus_processed = pd.concat([
    sentence_stats_df['source'].value_counts().to_frame('processed version pairs'),
    sentence_stats_df.groupby('source')['a_id'].aggregate(set).str.len().to_frame('processed articles'),
    articles_and_versions.rename(index=lambda x: name_mapper.get(x, x))
], axis=1).fillna(0).apply(lambda s: s.astype(int))

articles_and_version_counts_plus_processed.head()

# Share of articles processed per source, least complete first.
# NOTE(review): assumes `articles_and_versions` supplies the 'version_thresh_counts'
# and 'article_thresh_counts' columns — verify.
articles_and_version_counts_plus_processed[[
    'processed version pairs',
    'version_thresh_counts',
    'processed articles', 
    'article_thresh_counts'
]].assign(a_id_finished=lambda df: df['processed articles'] / df['article_thresh_counts']).sort_values('a_id_finished')

# Frequency of each num_added_sents value.
sentence_stats_df['num_added_sents'].value_counts()
# sentence_stats_df['num_changed'].value_counts()

# Frequency of each num_changed_sents value, ordered by the value itself.
sentence_stats_df['num_changed_sents'].value_counts().sort_index()

# Check Status for Specific Articles

In [None]:
import sqlite3  # imported here so this cell does not depend on a later cell having run

t = 'edit-paths-sentence-stats'
outlet = ['nyt']
outlet_sentence_stats = []

for o in outlet:
    # Fetch only this outlet's rows and accumulate them across outlets.
    outlet_sentence_stats.extend(get_table(t, source=o))

outlet_sentence_stats_df = pd.DataFrame(outlet_sentence_stats)

outlet_sentence_stats_df['a_id'].unique().shape

nyt_conn = sqlite3.connect('../data/diffengine-diffs/db/newssniffer-nytimes.db')

# Distinct (entry_id, num_versions) pairs, restricted to fewer than 40 versions.
nyt_num_versions = pd.read_sql('''
    SELECT DISTINCT entry_id, num_versions 
    FROM entryversion 
    WHERE num_versions < 40
''', nyt_conn)

In [None]:
# Join each entry's version count to the number of stats rows retrieved for it
# (default inner join: only entries present on both sides remain).
retrieved_per_article = outlet_sentence_stats_df['a_id'].value_counts().to_frame('retrieved')
versions_vs_retrieved = nyt_num_versions.merge(
    retrieved_per_article,
    left_on='entry_id',
    right_index=True,
)

# Total shortfall over entries whose expected pair count (num_versions - 1)
# differs from the number of rows retrieved.
with_pairs = versions_vs_retrieved.assign(
    num_version_pairs=versions_vs_retrieved['num_versions'] - 1
)
mismatched = with_pairs[with_pairs['num_version_pairs'] != with_pairs['retrieved']]
(mismatched['num_version_pairs'] - mismatched['retrieved']).sum()

versions_vs_retrieved

# Get Sentence Diffs

In [142]:
# Show the stem-to-source-name mapping for reference.
name_mapper

{'bbc': 'bbc-1',
 'newssniffer-bbc': 'bbc-2',
 'newssniffer-guardian': 'guardian',
 'newssniffer-independent': 'independent',
 'newssniffer-nytimes': 'nyt',
 'newssniffer-washpo': 'wp'}

In [146]:
import glob, os
# Names of all entries in the local db directory.
dbs = os.listdir('../data/diffengine-diffs/db/')

In [157]:
# Reduce each name to the part before its first '.', de-duplicated.
dbs = {filename.split('.')[0] for filename in dbs}

In [159]:
# Translate stems through name_mapper; stems without a mapping are kept as they are.
dbs = [name_mapper.get(stem, stem) for stem in dbs]

In [160]:
# Show the resulting list of source names.
dbs

['torontostar',
 'dailymail',
 'canadaland',
 'fox',
 'reuters',
 'nationalpost',
 'lapresse',
 'calgaryherald',
 'guardian',
 'therebel',
 'ap',
 'whitehouse',
 'globemail',
 'cbc',
 'telegraph',
 'nyt',
 'cnn',
 'wp',
 'independent',
 'torontosun',
 'bbc-2',
 'bbc-1']

In [166]:
# Sources whose sentence diffs should be downloaded; commented-out entries are skipped.
# NOTE(review): the commented-out 'washpo' does not match the 'wp' name that
# name_mapper produces — confirm the spelling before re-enabling it.
to_get = [
    'nyt',
    #'guardian',
#     'washpo',
    'bbc-2',
]

In [167]:
# Datastore kind passed to get_table when downloading sentence diffs.
table_name = 'edit-paths-sentence-diffs'

In [None]:
import sqlite3  # imported here so this cell does not depend on a later cell having run
from contextlib import closing

# Download the sentence diffs of every requested source and store each one in
# its own local SQLite file, replacing any existing 'sentence_diffs' table.
for db in to_get:
    print('fetching from %s...' % db)
    sentence_diffs = get_table(table=table_name, source=db)
    sentence_diffs_df = pd.DataFrame(sentence_diffs)

    # A sqlite3 connection used as a context manager only commits or rolls back;
    # closing() additionally releases the connection once the table is written.
    with closing(sqlite3.connect('../data/diffengine-diffs/output/%s.db' % db)) as conn, conn:
        sentence_diffs_df.to_sql('sentence_diffs', con=conn, if_exists='replace')

fetching from nyt...


0it [00:00, ?it/s]

# Examine Sentence Diffs

In [None]:
t = 'edit-paths-sentence-diffs'
sentence_diffs = get_table(t, source='wp')
wp_sentence_diffs_df = pd.DataFrame(sentence_diffs)
# Count diff rows per (a_id, version_old, version_new) on the frame built just
# above; the cell previously referred to `wp_sentence_stats_df`, a name that is
# not defined in the visible part of this notebook.
(wp_sentence_diffs_df
 .assign(c=1)
 .groupby(['a_id', 'version_old', 'version_new'])
 ['c']
 .sum()
)

In [19]:
import sqlite3
from contextlib import closing

# Load the 'sentence_diffs' table from the local wp output database. closing()
# releases the connection afterwards, which sqlite3's own context manager does not do.
with closing(sqlite3.connect('../data/diffengine-diffs/output/wp.db')) as conn:
    wp_sentence_diffs_df = pd.read_sql('select * from sentence_diffs', con=conn)

In [21]:
# Rows tagged '-' on the old side and '+' on the new side.
removed_on_old_side = wp_sentence_diffs_df['tag_old'] == '-'
added_on_new_side = wp_sentence_diffs_df['tag_new'] == '+'
changed_sent_diffs = wp_sentence_diffs_df[removed_on_old_side & added_on_new_side]

# How many (a_id, version_old, version_new) groups contain 1, 2, 3, ... such rows;
# only the five smallest row counts are shown.
rows_per_version_pair = (
    changed_sent_diffs
    .assign(c=1)
    .groupby(['a_id', 'version_old', 'version_new'])['c']
    .sum()
)
rows_per_version_pair.value_counts().sort_index().head()

1    9992
2    5413
3    4314
4    3660
5    2776
Name: c, dtype: int64

In [22]:
import sys
sys.path.insert(0, '..')
from util import util_newssniffer_parsing as unp 

In [None]:
changed_sent_diffs.head(2)

# Run the word-level diff helper over every (sent_old, sent_new) pair.
# NOTE(review): s_old_diff / s_new_diff are overwritten on each iteration and never
# stored. The commented-out block below looks like the intended accumulation step,
# but it refers to names (v_old, v_new, a_id, s_idx, word_stat_items) that this
# loop does not define — confirm before re-enabling it.
for s_old, s_new in tqdm(changed_sent_diffs[['sent_old', 'sent_new']].itertuples(index=False), total=len(changed_sent_diffs)):
    s_old_diff, s_new_diff = unp.get_word_diffs(s_old, s_new)
#     word_stat_output = {
#         'num_removed_words': sum(map(lambda x: x['tag'] == '-', s_old)),
#         'num_added_words': sum(map(lambda x: x['tag'] == '+', s_new)),
#         'len_old_sent': len(list(filter(lambda x: x['text'] != '', s_old))),
#         'len_new_sent': len(list(filter(lambda x: x['text'] != '', s_new))),
#         'version_nums': (v_old, v_new),
#         's_old': s_old,
#         's_new': s_new,
#         'a_id': a_id,
#         's_idx': s_idx
#     }
#     word_stat_items.append(word_stat_output)