In [1]:
import pandas as pd 
import sqlite3
import sys
sys.path.insert(0, '../util')
import util_refactorings as ur

In [25]:
from importlib import reload

In [26]:
# Re-import util_refactorings so that edits made to the module on disk are
# picked up without restarting the kernel.
reload(ur)

<module 'util_refactorings' from '../util/util_refactorings.py'>

In [2]:
import glob
# glob.glob returns matches in filesystem-dependent order, so sort them:
# later cells open `files[0]`, and without sorting that could be a different
# database from one machine (or run) to the next.
files = sorted(glob.glob('../data/diffengine-diffs/spark-output/*'))

In [3]:
# Open the first entry of `files` as a SQLite database.
# NOTE(review): raises IndexError when the glob matched nothing, and the
# connection is never closed in the cells visible here.
conn = sqlite3.connect(files[0])

In [4]:
# (entry_id, version) pairs whose sentence count is strictly between 3 and 20.
low_count_query = '''
    with c1 as 
        (SELECT entry_id, 
            CAST(version as INT) as version, 
            COUNT(1) as c from split_sentences 
            GROUP BY entry_id, version)
    SELECT entry_id, version from c1
    WHERE c < 20 and c > 3
'''
low_count_versions = pd.read_sql(low_count_query, con=conn)

# get join keys
low_count_entry_ids = ', '.join(
    str(entry_id) for entry_id in low_count_versions['entry_id'].unique()
)
# "<entry_id>-<version>" strings, rendered as a quoted, comma-separated SQL list.
version_keys = (
    low_count_versions['entry_id'].astype(str)
    + '-'
    + low_count_versions['version'].astype(str)
)
joint_keys = "'" + "', '".join(version_keys.tolist()) + "'"

# matched sentences
matched_query_template = '''
    WITH c1 as ( 
    SELECT *, 
    entry_id || '-' || version_x as key_x,
    entry_id || '-' || version_y as key_y 
    FROM matched_sentences 
    )
    SELECT *
    FROM c1
    WHERE key_x in (%s) AND key_y  in (%s)
    '''
matched_sentences = pd.read_sql(
    matched_query_template % (joint_keys, joint_keys),
    con=conn,
)

# get split sentences
split_query_template = '''
    with c1 AS (
        SELECT *, entry_id || '-' || CAST(version AS INT) as key FROM split_sentences
    )
    SELECT entry_id, CAST(version AS INT) as version, sent_idx, sentence 
    FROM c1
    WHERE key IN (%s)
'''
split_sentences = pd.read_sql(split_query_template % joint_keys, con=conn)

In [5]:
matched_sentences.head()

Unnamed: 0,entry_id,version_x,version_y,sent_idx_x,sent_idx_y,avg_sentence_distance_x,avg_sentence_distance_y,key_x,key_y
0,1135398,0,1,7.0,8.0,0.0,0.0,1135398-0,1135398-1
1,1136793,1,2,11.0,15.0,6.026512e-07,6.026512e-07,1136793-1,1136793-2
2,1136865,0,1,1.0,11.0,0.3060372,0.3060372,1136865-0,1136865-1
3,1135569,0,1,6.0,8.0,0.185578,0.185578,1135569-0,1135569-1
4,1136062,0,1,1.0,1.0,0.1791577,0.1791577,1136062-0,1136062-1


In [13]:
def _summarize_version_pair(pair_df):
    """Summarize the matched rows of one (entry_id, version_x, version_y) group."""
    # Only fully matched rows (no NaN index on either side) are handed to the
    # refactor finder, cast to integers.
    aligned_idxs = (
        pair_df[['entry_id', 'version_x', 'version_y', 'sent_idx_x', 'sent_idx_y']]
        .dropna()
        .astype(int)
    )
    return pd.Series({
        'mean x dist': pair_df['avg_sentence_distance_x'].mean(),
        'mean y dist': pair_df['avg_sentence_distance_y'].mean(),
        # A row with no sent_idx_y has no counterpart in the new version.
        'num_deleted': pair_df['sent_idx_y'].isnull().sum(),
        # A row with no sent_idx_x has no counterpart in the old version.
        'num_added': pair_df['sent_idx_x'].isnull().sum(),
        'refactors': ur.find_refactors_for_doc(aligned_idxs),
    })


version_pair_groups = matched_sentences.groupby(['entry_id', 'version_x', 'version_y'])
edit_statistics = version_pair_groups.apply(_summarize_version_pair)
edit_statistics = edit_statistics.assign(
    num_refactors=edit_statistics['refactors'].str.len(),
    overall_mean=edit_statistics[['mean x dist', 'mean y dist']].mean(axis=1),
)

In [14]:
edit_statistics['num_refactors'].value_counts()

0    5167
1      78
2       8
3       3
6       1
Name: num_refactors, dtype: int64

In [15]:
edit_statistics.loc[lambda df: df['num_refactors'] > 0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mean x dist,mean y dist,num_deleted,num_added,refactors,num_refactors,overall_mean
entry_id,version_x,version_y,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
743901,0,1,0.127152,0.097867,8,3,"[(13, 7)]",1,0.112510
748239,2,3,0.000000,0.000000,0,0,"[(13, 12)]",1,0.000000
749079,2,3,0.165933,0.151510,0,0,"[(16, 6)]",1,0.158721
753327,0,1,0.035911,0.009719,0,0,"[(11, 10)]",1,0.022815
762189,0,1,0.050644,0.050644,0,0,"[(10, 10)]",1,0.050644
...,...,...,...,...,...,...,...,...,...
1880778,2,3,0.141510,0.112455,1,1,"[(8, 5)]",1,0.126982
1882121,2,3,0.037494,0.039123,0,0,"[(10, 9)]",1,0.038309
1882924,0,1,0.170068,0.170922,1,3,"[(7, 8)]",1,0.170495
1884356,0,1,0.232532,0.212893,7,4,"[(4, 6)]",1,0.222712


In [67]:
# Pull out a single example pair (entry 743901, version 0 -> 1) for inspection.
is_example_pair = (
    (matched_sentences['entry_id'] == 743901)
    & (matched_sentences['version_x'] == 0)
    & (matched_sentences['version_y'] == 1)
)
doc = matched_sentences.loc[is_example_pair].sort_values(['sent_idx_x', 'sent_idx_y'])

# Sentences of the old version (version 0) of the same entry.
is_example_old_version = (
    (split_sentences['entry_id'] == 743901)
    & (split_sentences['version'] == 0)
)
doc_sentences = split_sentences.loc[is_example_old_version].sort_values('sent_idx')

In [353]:
def label_sentences_add(doc):
    """Find runs of added sentences and the neighbouring sentences around them.

    A row of ``doc`` whose ``sent_idx_x`` is null is treated as an addition
    (a new-version sentence with no old-version counterpart). Consecutive
    additions, in ``sent_idx_y`` order, are grouped into one cluster.

    Parameters
    ----------
    doc : pandas.DataFrame
        Must contain the columns ``sent_idx_x`` and ``sent_idx_y``.

    Returns
    -------
    (add_aboves, add_belows) : tuple of two lists of dicts
        ``add_aboves`` holds ``{'add_above': idx, 'cluster_size': n}``, where
        ``idx`` is the ``sent_idx_y`` of the first non-added sentence after a
        cluster of ``n`` additions. A cluster that runs to the end of the
        document has no such sentence and contributes no entry.
        ``add_belows`` holds ``{'add_below': idx, 'cluster_size': n}``, where
        ``idx`` is the ``sent_idx_y`` of the sentence right before the cluster.
        A cluster that starts the document contributes no entry.
    """
    sent_idxs = doc['sent_idx_y'].dropna().sort_values().tolist()
    # Sort explicitly: skipping ahead by `cluster_size` below is only correct
    # when the additions are visited in sent_idx_y order.
    additions = (
        doc.loc[doc['sent_idx_x'].isnull(), 'sent_idx_y']
        .dropna()
        .sort_values()
        .tolist()
    )
    added = set(additions)  # O(1) membership tests while scanning a cluster

    add_aboves = []
    add_belows = []
    idx_in_add_l = 0
    while idx_in_add_l < len(additions):
        cluster_start = sent_idxs.index(additions[idx_in_add_l])

        # Walk forward to the first sentence that is not an addition. The
        # bounds check matters: a cluster reaching the end of the document
        # previously indexed past the end of `sent_idxs` (IndexError).
        cluster_end = cluster_start + 1
        while cluster_end < len(sent_idxs) and sent_idxs[cluster_end] in added:
            cluster_end += 1
        cluster_size = cluster_end - cluster_start

        if cluster_end < len(sent_idxs):
            add_aboves.append({
                'add_above': sent_idxs[cluster_end],
                'cluster_size': cluster_size
            })
        if cluster_start > 0:
            add_belows.append({
                'add_below': sent_idxs[cluster_start - 1],
                'cluster_size': cluster_size
            })
        idx_in_add_l += cluster_size

    return add_aboves, add_belows

In [370]:
## label each sentence in the old version as:
# 1. Deleted in the new version
# 2. Sentence added above/sentence added below  
# 3. Sentence edited
# 4. Sentence refactored

# 5. Sentence split (?)
# 6. Sentence merge (?)

def get_sentence_and_doc_labels(doc, doc_sentences):
    """Label each old-version sentence of one document pair and summarize per document.

    Parameters
    ----------
    doc : pandas.DataFrame
        Matched-sentence rows for one (entry_id, version_x, version_y). Reads
        the columns entry_id, version_x, version_y, sent_idx_x, sent_idx_y and
        avg_sentence_distance_x.
    doc_sentences : pandas.DataFrame
        Sentences of the old version, with columns entry_id, version,
        sent_idx and sentence.

    Returns
    -------
    sentence_label_df : pandas.DataFrame
        ``doc_sentences`` left-joined with integer label columns:
        deleted_label (0/1), add_above_label and add_below_label (size of the
        adjacent cluster of added sentences, 0 if none), edited_label and
        unchanged_label (0/1, split at avg_sentence_distance_x of .01), and
        refactored_label (signed displacement sent_idx_y - sent_idx_x,
        0 when the sentence was not refactored).
    doc_label_df : pandas.DataFrame
        Indexed by (entry_id, version) with columns num_deleted, num_added,
        num_edited, num_refactored and sentences (the sentences joined with
        '<SENT>').
    """
    # 1. Detect deletions: old sentences that never appear on the x side of a
    #    match, plus matched rows that have no y-side counterpart.
    deleted_labeled_sentences = pd.concat([
        (doc_sentences
         .loc[lambda df: ~df['sent_idx'].isin(doc['sent_idx_x'].dropna())]
         .assign(deleted_label=True)
         .rename(columns={'version':'version_x', 'sent_idx':'sent_idx_x'})
         [['entry_id', 'version_x', 'sent_idx_x', 'deleted_label']]
        )
        ,
        (doc
         .loc[lambda df: df['sent_idx_y'].isnull()]
          .assign(deleted_label=True)
          [['entry_id', 'version_x', 'version_y', 'sent_idx_x', 'deleted_label']]
        )
    ]).drop_duplicates()

    # 2. Sentence additions above/below. The label value is the cluster size,
    #    attached to the old-version sentence matched to the neighbouring
    #    new-version sentence.
    add_aboves, add_belows = label_sentences_add(doc)
    if len(add_aboves) > 0:
        add_above_labeled_sentences = (pd.DataFrame(add_aboves)
         .rename(columns={'cluster_size': 'add_above_label'})
         .merge(doc, how='inner', right_on='sent_idx_y', left_on='add_above')
         [['entry_id', 'version_x', 'version_y', 'sent_idx_x', 'add_above_label']]
        )
    else:
        add_above_labeled_sentences = pd.DataFrame()

    if len(add_belows) > 0:
        add_below_labeled_sentences = (pd.DataFrame(add_belows)
         .rename(columns={'cluster_size': 'add_below_label'})
         .merge(doc, how='inner', right_on='sent_idx_y', left_on='add_below')
         [['entry_id', 'version_x', 'version_y', 'sent_idx_x', 'add_below_label']]
        )
    else:
        add_below_labeled_sentences = pd.DataFrame()

    # 3. Sentence edits: matched on both sides, split by distance threshold.
    edited_sentences = (doc
     .loc[lambda df: df['sent_idx_y'].notnull() & df['sent_idx_x'].notnull() & (df['avg_sentence_distance_x'] > .01)]
     .assign(edited_label=True)
      [['entry_id', 'version_x', 'version_y', 'sent_idx_x', 'edited_label']]
    )
    unchanged_sentences = (doc
     .loc[lambda df: df['sent_idx_y'].notnull() & df['sent_idx_x'].notnull() & (df['avg_sentence_distance_x'] <= .01)]
     .assign(unchanged_label=True)
      [['entry_id', 'version_x', 'version_y', 'sent_idx_x', 'unchanged_label']]
    )

    # 4. Sentence refactors: label is the signed displacement
    #    (negative when sent_idx_y < sent_idx_x, positive otherwise).
    refactors = ur.find_refactors_for_doc(doc)
    refactored_sentences = (doc
     .loc[lambda df: df.apply(lambda x: (x['sent_idx_x'], x['sent_idx_y']) in refactors, axis=1)]
     .assign(refactored_label=lambda df: df['sent_idx_y'] - df['sent_idx_x'])
       [['entry_id', 'version_x', 'version_y', 'sent_idx_x', 'refactored_label']]
    )

    ## Concat to make Sentence-Level DF
    sentence_label_df = (pd.concat([
        deleted_labeled_sentences,
        add_above_labeled_sentences,
        add_below_labeled_sentences,
        edited_sentences,
        unchanged_sentences,
        refactored_sentences,
    ])
     # Series.bfill() replaces the deprecated fillna(method='bfill').
     .assign(version_y=lambda df: df['version_y'].bfill())
     .fillna(False)
     .astype(int)
    )
    # The add_* columns are absent when no additions were found.
    if 'add_below_label' not in sentence_label_df:
        sentence_label_df['add_below_label'] = 0
    if 'add_above_label' not in sentence_label_df:
        sentence_label_df['add_above_label'] = 0

    # Collapse the stacked label rows to one row per old-version sentence.
    sentence_label_df = (sentence_label_df
         .groupby(['entry_id', 'version_x', 'sent_idx_x'])
         .agg({
             'deleted_label': 'max',
             'add_above_label': 'max',
             'add_below_label': 'max',
             'edited_label': 'max',
             'unchanged_label': 'max',
             # Rows coming from the other label frames carry 0 here, so a plain
             # min() would wipe out every positive (moved-down) displacement.
             # Keep the value with the largest magnitude instead.
             'refactored_label': lambda s: max(s, key=abs)
         })
         .reset_index()
        )

    sentence_label_df = doc_sentences.merge(
            sentence_label_df,
            how='left',
            left_on=['entry_id', 'version', 'sent_idx'],
            right_on=['entry_id', 'version_x', 'sent_idx_x']
        ).drop(['version_x', 'sent_idx_x'], axis=1).fillna(0)

    ## Make Doc-Label DF
    # NOTE(review): num_added sums add_above_label only, so a cluster of
    # additions at the very end of the document (which only produces an
    # add_below label) is not counted -- confirm this is intended.
    doc_label_df = (sentence_label_df
     .assign(refactored_label=lambda df: (df['refactored_label'] != 0).astype(int))
     .groupby(['entry_id', 'version'])
     .aggregate({
         'deleted_label':'sum',
         'add_above_label':'sum',
         'edited_label': 'sum',
         'refactored_label': 'sum',
         'sentence': lambda s: '<SENT>'.join(s)
     })
     .rename(columns={
         'deleted_label': 'num_deleted',
         'add_above_label': 'num_added',
         'edited_label': 'num_edited',
         'refactored_label': 'num_refactored',
         'sentence': 'sentences'
     })
    )

    return sentence_label_df, doc_label_df

In [391]:
all_sentence_labels = []
all_doc_labels = []
# for sanity checking
all_matched_sentences = []
all_split_sentences = []

# First five distinct (entry_id, version_x, version_y) combinations.
version_pairs = (
    matched_sentences[['entry_id', 'version_x', 'version_y']]
    .drop_duplicates()
    .head()
)

for entry_id, v_x, v_y in version_pairs.itertuples(index=False):
    # Matched rows for this version pair.
    pair_mask = (
        (matched_sentences['entry_id'] == entry_id)
        & (matched_sentences['version_x'] == v_x)
        & (matched_sentences['version_y'] == v_y)
    )
    doc = matched_sentences.loc[pair_mask].sort_values(['sent_idx_x', 'sent_idx_y'])

    # Sentences of the old version of the same entry.
    old_version_mask = (
        (split_sentences['entry_id'] == entry_id)
        & (split_sentences['version'] == v_x)
    )
    doc_sentences = split_sentences.loc[old_version_mask].sort_values('sent_idx')

    all_matched_sentences.append(doc)
    all_split_sentences.append(doc_sentences)

    sentence_label_df, doc_label_df = get_sentence_and_doc_labels(doc, doc_sentences)
    all_sentence_labels.append(sentence_label_df)
    all_doc_labels.append(doc_label_df)

In [392]:
all_doc_labels_df = pd.concat(all_doc_labels)

In [393]:
# The per-document frames come out of a groupby on (entry_id, version), so
# those identifiers live in the index. Move them into columns before writing;
# with index=False alone the CSV would carry no document identifiers.
all_doc_labels_df.reset_index().to_csv('../modeling/data/doc-data-small.csv', index=False)

In [394]:
# Stack the per-document sentence label frames into one frame.
all_sentence_labels_df = pd.concat(all_sentence_labels)
## Sanity check: for every sentence row, edited_label + unchanged_label +
## refactored_label must add up to exactly 1.
# NOTE(review): get_sentence_and_doc_labels stores refactored_label as a signed
# displacement (sent_idx_y - sent_idx_x), not a 0/1 flag, and deleted sentences
# get 0 in all three columns, so this check fails as soon as the sample contains
# a refactored or deleted sentence -- confirm which invariant was intended.
assert (all_sentence_labels_df[['edited_label', 'unchanged_label', 'refactored_label']]
        .sum(axis=1)
        .pipe(lambda s: s == 1)
        .all()
       )

In [395]:
all_sentence_labels_df.to_csv('../modeling/data/sentence-data-small.csv', index=False)

In [410]:
# Scratch check: elementwise product of a tensor and a list, summed
# (1*2 + 2*3 + 3*4 = 20).
# The names are imported here so the cell runs on a fresh kernel; in file
# order the `operator` import only appears in a later cell.
from operator import mul

import torch

sum(map(mul, torch.tensor([1, 2, 3]), [2, 3, 4]))

tensor(20)

In [409]:
from operator import add, mul