In [11]:
import pandas as pd 
import sqlite3
import sys
sys.path.insert(0, '../util')
import util_refactorings as ur
from importlib import reload
reload(ur)
import glob
from tqdm.auto import tqdm

from util_label import get_split_and_matched_dfs, label_sentences_add, get_sentence_and_doc_labels

In [7]:
import multiprocessing

In [38]:
files = glob.glob('../data/diffengine-diffs/spark-output/*')

In [45]:
files = glob.glob('../modeling/data/doc*.csv')

In [52]:
for f in files:
    with open(f) as fr:
        header= fr.readlines()[:1]
        if 'entry_id' not in header[0]:
            print(f)

../modeling/data/doc-data-independent.csv
../modeling/data/doc-data-wp.csv
../modeling/data/doc-data-reuters.csv
../modeling/data/doc-data-small.csv


In [39]:
files

['../data/diffengine-diffs/spark-output/wp-matched-sentences.db',
 '../data/diffengine-diffs/spark-output/independent-matched-sentences.db',
 '../data/diffengine-diffs/spark-output/reuters-matched-sentences.db',
 '../data/diffengine-diffs/spark-output/guardian-matched-sentences.db',
 '../data/diffengine-diffs/spark-output/nyt-matched-sentences.db',
 '../data/diffengine-diffs/spark-output/bbc-2-matched-sentences.db',
 '../data/diffengine-diffs/spark-output/ap-matched-sentences.db']

In [51]:
all_stats = []
for f in files:
    source = f.split('/')[-1].split('-')[0]
    if '.gz' in f:
        ! gunzip $f
    f = f.replace('.gz', '')
    conn = sqlite3.connect(f)
    print('fetching data for %s ...' % source)
    stats = pd.read_sql('''
        with c1 as (
            SELECT 
                sent_idx_x IS NULL as addition,
                sent_idx_y IS NULL as deletion,
                avg_sentence_distance_x > 0 or avg_sentence_distance_y > 0 as edit,
                1 as sentence,
                entry_id,
                (entry_id || '-' || version_x) as version
            FROM matched_sentences
        )
        SELECT 
        COUNT(DISTINCT entry_id) as num_docs,
        COUNT(DISTINCT version) as num_versions,
        sum(addition) as num_additions,
        sum(deletion) as num_deletions,
        sum(edit) as num_edits,
        sum(sentence) as num_sentences
        from c1
    ''', con=conn)
    stats['source'] = source
    all_stats.append(stats)
#     ! gzip f

fetching data for wp ...
fetching data for independent ...
fetching data for reuters ...
fetching data for guardian ...
fetching data for nyt ...
fetching data for bbc ...
fetching data for ap ...


In [58]:
pd.concat(all_stats).sum().iloc[:-1] * (4_200_000 / 2_138_637)

num_docs           1449605.332742
num_versions            4200000.0
num_additions     10250006.522846
num_deletions      5399573.560169
num_edits          26573277.65301
num_sentences    150987955.319206
dtype: object

In [67]:
(pd.concat(all_stats)
 .sum().iloc[:-1]
 .pipe(lambda s: s * (4_200_000 / 2_138_637))
 .pipe(lambda s: s / s['num_sentences'])
 [['num_edits', 'num_additions', 'num_deletions']]
 .to_dict()
)

{'num_edits': 0.17599600972694093,
 'num_additions': 0.06788625292114472,
 'num_deletions': 0.03576161786384939}

In [83]:
perc = pd.Series(
{'num_edits': 0.17599600972694093,
 'num_additions': 0.06788625292114472,
 'num_deletions': 0.03576161786384939,
 'num_refactors': 1_600_000 / 150987955.319206
}).pipe(lambda s: s * 100).round(1).to_frame('% of Sents.')

In [84]:
num = pd.Series(
{'num_edits': 26.6,
 'num_additions': 10.2,
 'num_deletions': 5.4,
 'num_refactors': 1.6
}).to_frame('Total Num.')

In [87]:
print(pd.concat([num, perc], axis=1).to_latex())

\begin{tabular}{lrr}
\toprule
{} &  Total Num. &  \% of Sents. \\
\midrule
num\_edits     &        26.6 &         17.6 \\
num\_additions &        10.2 &          6.8 \\
num\_deletions &         5.4 &          3.6 \\
num\_refactors &         1.6 &          1.1 \\
\bottomrule
\end{tabular}



In [68]:
for f in files:
    source = f.split('/')[-1].split('-')[0]
    if '.gz' in f:
        ! gunzip $f
    f = f.replace('.gz', '')
    conn = sqlite3.connect(f)
    print('fetching data for %s ...' % source)
    matched_sentences, split_sentences = get_split_and_matched_dfs(conn, sents_max=60)

    print('calculating statistics...')
    edit_statistics = (matched_sentences
     .groupby(['entry_id', 'version_x', 'version_y'])
     .apply(lambda df: pd.Series({
         'mean x dist': df['avg_sentence_distance_x'].mean(),
         'mean y dist': df['avg_sentence_distance_y'].mean(),
         'num_deleted' : df['sent_idx_y'].isnull().sum(),
         'num_added' : df['sent_idx_x'].isnull().sum(),
         'refactors': ur.find_refactors_for_doc(
             df[['entry_id', 'version_x', 'version_y', 'sent_idx_x', 'sent_idx_y']].dropna().astype(int)
         ),
     }))
     .assign(num_refactors=lambda df: df['refactors'].str.len())
     .assign(overall_mean=lambda df: df[['mean x dist', 'mean y dist']].mean(axis=1))
    )

    desired_docs = (edit_statistics
     .loc[lambda df:
          (df['overall_mean'] > .01) |
          (df['num_deleted'] > 0) |
          (df['num_added'] > 0) |
          (df['num_refactors'] > 0)]
    )

    print('processing documents...')
    all_sentence_labels = []
    all_doc_labels = []
    # for sanity checking
    all_matched_sentences = []
    all_split_sentences = []

    for entry_id, v_x, v_y in tqdm(desired_docs.index):
        doc = (matched_sentences
         .loc[lambda df: (df['entry_id'] == entry_id) & (df['version_x'] == v_x) & (df['version_y'] == v_y)]
         .sort_values(['sent_idx_x', 'sent_idx_y'])
        )

        doc_sentences = (split_sentences
         .loc[lambda df: (df['entry_id'] == entry_id) & (df['version'] == v_x) ]
                         .sort_values('sent_idx')
        )

        all_matched_sentences.append(doc)
        all_split_sentences.append(doc_sentences)
        sentence_label_df, doc_label_df = get_sentence_and_doc_labels(doc, doc_sentences)
        all_sentence_labels.append(sentence_label_df)
        all_doc_labels.append(doc_label_df)

    print('writing to disk...')
    all_doc_labels_df = pd.concat(all_doc_labels)
    all_doc_labels_df.to_csv('../modeling/data/doc-data-%s.csv' % source, index=False)
    all_sentence_labels_df = pd.concat(all_sentence_labels)
    ## check 
    assert (all_sentence_labels_df[['edited_label', 'unchanged_label', 'deleted_label']]
            .sum(axis=1)
            .pipe(lambda s: s == 1)
            .all()
           )
    all_sentence_labels_df.to_csv('../modeling/data/sentence-data-%s.csv' % source, index=False)
    ! gzip f

fetching data for nyt ...
calculating statistics...


KeyboardInterrupt: 

In [28]:
for entry_id, v_x, v_y in tqdm(desired_docs.index[61071:]):
    doc = (matched_sentences
     .loc[lambda df: (df['entry_id'] == entry_id) & (df['version_x'] == v_x) & (df['version_y'] == v_y)]
     .sort_values(['sent_idx_x', 'sent_idx_y'])
    )

    doc_sentences = (split_sentences
     .loc[lambda df: (df['entry_id'] == entry_id) & (df['version'] == v_x) ]
                     .sort_values('sent_idx')
    )

    all_matched_sentences.append(doc)
    all_split_sentences.append(doc_sentences)
    sentence_label_df, doc_label_df = get_sentence_and_doc_labels(doc, doc_sentences)
    all_sentence_labels.append(sentence_label_df)
    all_doc_labels.append(doc_label_df)

print('writing to disk...')
all_doc_labels_df = pd.concat(all_doc_labels)
all_doc_labels_df.to_csv('../modeling/data/doc-data-%s.csv' % source, index=False)
all_sentence_labels_df = pd.concat(all_sentence_labels)
## check 
assert (all_sentence_labels_df[['edited_label', 'unchanged_label', 'deleted_label']]
        .sum(axis=1)
        .pipe(lambda s: s == 1)
        .all()
       )
all_sentence_labels_df.to_csv('../modeling/data/sentence-data-%s.csv' % source, index=False)

  0%|          | 0/123814 [00:00<?, ?it/s]

writing to disk...


AssertionError: 

In [31]:
(all_sentence_labels_df
 .loc[lambda df: df[['edited_label', 'unchanged_label', 'deleted_label']].sum(axis=1).pipe(lambda s: s != 1)]
)

Unnamed: 0,entry_id,version,sent_idx,sentence,deleted_label,add_above_label,add_below_label,edited_label,unchanged_label,refactored_label
0,823187,0,0,South Korea’s much-loved K-pop star Lee Seung-...,1,0,0,0,1,0
0,823187,1,0,South Korea’s much-loved K-pop star Lee Seung-...,1,0,0,0,1,0
0,1665344,0,0,Doing something “for the ‘gram” is not typical...,1,0,0,0,1,0
0,1665344,1,0,Doing something “for the ‘gram” is not typical...,1,0,0,0,1,0
0,1668925,0,0,Harry Potter author JK Rowling has defended he...,1,0,0,0,1,0
0,1668925,1,0,Harry Potter author JK Rowling has defended he...,1,0,0,0,1,0
0,1668925,2,0,Harry Potter author JK Rowling has defended he...,1,0,0,0,1,0
8,1800363,0,8,More to come North Korea Asia Pacific news Sha...,1,2,0,1,0,0


In [33]:
all_sentence_labels_df.to_csv('../modeling/data/sentence-data-%s.csv' % source, index=False)