# Filter single node differences
We have tons of single node differences - how can we filter them?

In [1]:
import json
import difflib
from IPython.core.display import display, HTML
from diff_match_patch import diff_match_patch

from colorama import Fore
import ipysheet

import re

In [2]:
SINGLE_NODE_DIFF_PATH = "/projects/bdata/datasets/kaggle-competitions/processed/matches.jsonl"

In [3]:
!head $SINGLE_NODE_DIFF_PATH -n 5

[{"slug": "allen21huang", "version_id": "23387521", "source": "# check and change our directory\nos.chdir('/kaggle/input/natural-images/data/natural_images')\nprint(os.listdir())"}, {"slug": "allen21huang", "version_id": "23383256", "source": "os.chdir('/kaggle/input')\nprint(os.listdir())"}]
[{"slug": "allen21huang", "version_id": "23387521", "source": "# check and change our directory\nos.chdir('/kaggle/input/natural-images/data/natural_images')\nprint(os.listdir())"}, {"slug": "allen21huang", "version_id": "23383256", "source": "os.chdir('/kaggle/input')\nprint(os.listdir())"}]
[{"slug": "allen21huang", "version_id": "23387521", "source": "# check and change our directory\nos.chdir('/kaggle/input/natural-images/data/natural_images')\nprint(os.listdir())"}, {"slug": "allen21huang", "version_id": "23383639", "source": "os.chdir('/kaggle/input')\nprint(os.listdir())"}]
[{"slug": "allen21huang", "version_id": "23387521", "source": "# check and change our directory\nos.chdir('/kaggle/inp

In [4]:
!wc -l $SINGLE_NODE_DIFF_PATH

23610029 /projects/bdata/datasets/kaggle-competitions/processed/matches.jsonl


In [5]:
single_node_samples = !shuf -n 10000 $SINGLE_NODE_DIFF_PATH
single_node_samples = [json.loads(x) for x in single_node_samples]

In [6]:
from io import StringIO
import tokenize
import token
import sys

def remove_comments(src):
    """
    This reads tokens using tokenize.generate_tokens and recombines them
    using tokenize.untokenize, and skipping comment/docstring tokens in between
    """
    f = StringIO(src)
    class SkipException(Exception): pass
    processed_tokens = []
    last_token = None
    # go thru all the tokens and try to skip comments and docstrings
    for tok in tokenize.generate_tokens(f.readline):
        t_type, t_string, t_srow_scol, t_erow_ecol, t_line = tok

        try:
            if t_type == tokenize.COMMENT:
                raise SkipException()

            elif t_type == tokenize.STRING:

                if last_token is None or last_token[0] in [tokenize.INDENT]:
                    # FIXEME: this may remove valid strings too?
                    #raise SkipException()
                    pass

        except SkipException:
            pass
        else:
            processed_tokens.append(tok)

        last_token = tok

    return tokenize.untokenize(processed_tokens)


def remove_comments_from_src(src):
    """
    This reads tokens using tokenize.generate_tokens and recombines them
    using tokenize.untokenize, and skipping comment/docstring tokens in between
    """
    try:
        f = StringIO(src)
        class SkipException(Exception): pass
        processed_tokens = []
        last_token = None
        # go thru all the tokens and try to skip comments and docstrings
        for tok in tokenize.generate_tokens(f.readline):
            t_type, t_string, t_srow_scol, t_erow_ecol, t_line = tok

            try:
                if t_type == tokenize.COMMENT:
                    raise SkipException()

                elif t_type == tokenize.STRING:

                    if last_token is None or last_token[0] in [tokenize.INDENT]:
                        # FIXEME: this may remove valid strings too?
                        #raise SkipException()
                        pass

            except SkipException:
                pass
            else:
                processed_tokens.append(tok)

            last_token = tok

        return tokenize.untokenize(processed_tokens)
    #I belive this happens with invalid python
    except (ValueError, tokenize.TokenError, IndentationError) as e:
        return src


def remove_comments_and_docstrings(src):
    
    source = StringIO(src)
    mod = StringIO()
    
    prev_toktype = token.INDENT
    first_line = None
    last_lineno = -1
    last_col = 0

    tokgen = tokenize.generate_tokens(source.readline)
    for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
        if 0:   # Change to if 1 to see the tokens fly by.
            print("%10s %-14s %-20r %r" % (
                tokenize.tok_name.get(toktype, toktype),
                "%d.%d-%d.%d" % (slineno, scol, elineno, ecol),
                ttext, ltext
                ))
        if slineno > last_lineno:
            last_col = 0
        if scol > last_col:
            mod.write(" " * (scol - last_col))
#         if toktype == token.STRING and prev_toktype == token.INDENT:
#             # Docstring
#             continue
#         elif toktype == tokenize.COMMENT:
#             # Comment
#             continue
        elif not(toktype == tokenize.COMMENT) and not(toktype == token.STRING and prev_toktype == token.INDENT):
            mod.write(ttext)
        prev_toktype = toktype
        last_col = ecol
        last_lineno = elineno
    return mod.getvalue()

In [7]:
test = """
#train_images = train_dogs[:2000] + train_cats[:2000]
"""

In [8]:
print(remove_comments(test))


                                                     



In [9]:
print(remove_comments_and_docstrings(test))






The following diffs use were generated using:
`python tree_matches --length_threshold 3 --ignore_function_args --remove_exact_duplicates --ignore_strings --sequential_matches`

In [10]:
examples = !cat /homes/gws/mikeam/RobustDataScience/matches.jsonl
examples = [json.loads(x) for x in examples]

In [11]:
# def html_diff(a,b):
#     differ = difflib.HtmlDiff(wrapcolumn = 80)
#     return differ.make_table(a.split("\n"),b.split("\n"), context = True)


def pprint_code_diff(a,b):
    dmp = diff_match_patch()
    diffs = dmp.diff_main(a,b)
    dmp.diff_cleanupSemantic(diffs)
    display(HTML(dmp.diff_prettyHtml(diffs)))
    
def compare_diff(examples,index):
    orig = examples[index][0]
    nn = examples[index][1]
    pprint_code_diff(orig["source"],nn["source"])

In [12]:
html_break = '<br><span style="color:red">------------------------------</span>'
for i in range(150,170):
    compare_diff(examples,i)
    display(HTML(html_break))

# What about the 'Kaggle Diffs'?

In [13]:
diff_examples = !shuf -n 10000 /homes/gws/mikeam/RobustDataScience/diffs_new.jsonl
diff_examples = [json.loads(x) for x in diff_examples]

In [14]:
def red(text):
    return (Fore.RED + text + Fore.RESET)

def green(text):
    return (Fore.GREEN + text + Fore.RESET)
    
def display_kaggle_diff(diff,return_string=False):
    diff_disp = get_diff_visual(diff)
    if return_string:
        return diff_disp
    else:
        print(diff_disp)
        
def get_diff_visual(diff):
    
    string_lines = []
    
    string_lines.append(Fore.BLUE + diff["original_path"])
    string_lines.append("-"*100 + Fore.RESET)
    
    for line in diff["cell_diff"].split("\n"):
        if len(line) == 0:
            string_lines.append("")
            continue
        if line[0] == "+":
            string_lines.append(green(line))
        elif line[0] == "-":
            string_lines.append(red(line))
        else:
            string_lines.append(line)
    return "\n".join(string_lines)


for diff in diff_examples[:40]:
    display_kaggle_diff(diff)

[34mdata/processed/competitions/imet-2019-fgvc6/mathormad/14614696.json
----------------------------------------------------------------------------------------------------[39m

[31m-# train all layers[39m
[32m+                  [39m
 for layer in model.layers:
     layer.trainable = True
 
 callbacks_list = [checkpoint, csv_logger, reduceLROnPlat]
 model.compile(loss='binary_crossentropy',
[31m-            # loss=focal_loss,[39m
[32m+                              [39m
             optimizer=Adam(lr=1e-4))
[31m-            # optimizer=AdamAccumulate(lr=1e-4, accum_iters=2))[39m
[32m+                                                               [39m
 
 model.fit_generator(
     train_mixup,
     steps_per_epoch=np.ceil(float(len(train_indexes)) / float(batch_size)),
     validation_data=validation_generator,
     validation_steps=np.ceil(float(len(valid_indexes)) / float(batch_size)),
     epochs=epochs,
     verbose=1,
[31m-    max_queue_size=16, workers=WORKERS, use_mu

## Adding some new features...
The following was done with:
`python src/data/tree_diffs.py  data/processed/competitions/ . --ignore_comments --python_only --git_context 3 --ignore_empty_lines --ignore_line_shuffle --n_workers 1 `

In [41]:
MORE_FEATURES_PATH = "/homes/gws/mikeam/RobustDataScience/diffs_test.jsonl"
!wc -l $MORE_FEATURES_PATH

224532 /homes/gws/mikeam/RobustDataScience/diffs_test.jsonl


Note how many more examples we got, I think by decreasing the context between lines such that we have smaller hunks.

In [42]:
more_features_examples = !shuf -n 100000 $MORE_FEATURES_PATH
more_features_examples = [json.loads(x) for x in more_features_examples]

In [43]:
for diff in more_features_examples[:40]:
    display_kaggle_diff(diff)

[34mdata/processed/competitions/zillow-prize-1/jaccojurg/30688082.json
----------------------------------------------------------------------------------------------------[39m
 plt.figure(figsize=(15,10))
 
[31m-Countries = ['Italy', 'Netherlands','Sweden','Germany', 'China','US','Belgium', 'Japan','France'][39m
[32m+Countries = ['Italy', 'Netherlands - Netherlands'][39m
 #Countries = ['Italy', 'Netherlands', 'France'] #, 'Belgium'] #, 'Belgium','France']
 for country in Countries:
 
[31m-#for r in range(len(info)):[39m
[31m-#    plt.annotate(info.loc[r]['event'],(info.loc[r]['dayssince'],info.loc[r]['CasesPerM']))[39m
[32m+for r in range(len(info)):[39m
[32m+    plt.annotate(info.loc[r]['event'],(info.loc[r]['dayssince'],info.loc[r]['CasesPerM']),[39m
[32m+                 xytext = (-10+info.loc[r]['dayssince'],info.loc[r]['CasesPerM']-1),[39m
[32m+                 #color=c[info.loc[r]['location']],[39m
[32m+                 arrowprops=dict(arrowstyle="->",connectio

In [18]:
def show_diffs_that_match_regex(diffs,regex,limit=100):
    matches = 0
    for diff in diffs:
        if re.search(regex,diff["cell_diff"]):
            display_kaggle_diff(diff)
            matches += 1
        if matches == limit:
            break

In [19]:
show_diffs_that_match_regex(more_features_examples, "ttest_ind", limit=100)

[34mdata/processed/competitions/santander-customer-transaction-prediction/aaronl87/16375570.json
----------------------------------------------------------------------------------------------------[39m

 import pandas as pd
[31m-from pandas.plotting import scatter_matrix[39m
 import numpy as np
[31m-[39m
 from scipy.stats import ttest_ind, levene
 
[31m-%matplotlib inline[39m
[31m-[39m


[34mdata/processed/competitions/titanic/andyyang/1342951.json
----------------------------------------------------------------------------------------------------[39m

[31m-# Statitic Test[39m
[32m+# Statitic Test, variable is continuous, so we choose T-test[39m
 # H0: People survived and not survived have same fare, mean(survive_fare)=mean(non_survive_fare)
[32m+from scipy.stats import ttest_ind[39m
 


[34mdata/processed/competitions/m5-forecasting-accuracy/armenabnousi/29611576.json
---------------------------------------------------------------------------------------------------

## Consolidated Diff Filters
The following were created using the consolidated diff filters:

In [46]:
CONSOLIDATED_DIFF_FILTERS_PATH = "/homes/gws/mikeam/RobustDataScience/data/processed/filtered_diffs.jsonl"
consolidated_examples = !shuf $CONSOLIDATED_DIFF_FILTERS_PATH
consolidated_examples = [json.loads(x) for x in consolidated_examples]

In [48]:
len(consolidated_examples)

93032

In [47]:
for diff in consolidated_examples[100:200]:
    display_kaggle_diff(diff)

[34mdata/processed/competitions/tmdb-box-office-prediction/takedown/13477503.json
----------------------------------------------------------------------------------------------------[39m

[31m-test['revenue'] =  np.expm1(test["xgbfinal"])[39m
[32m+test['revenue'] =  np.expm1(test["lgbfinal"])[39m
 test[['id','revenue']].head()
[34mdata/processed/competitions/severstal-steel-defect-detection/watanabe2362/20644177.json
----------------------------------------------------------------------------------------------------[39m
 def rle2mask(rle):
 def mask2rle(mask):
[31m-    if np.sum(mask) == 0: return '1 1'[39m
[32m+    if np.sum(mask) == 0: return ''[39m
     ar = mask.flatten(order='F')
[34mdata/processed/competitions/aerial-cactus-identification/kshashankrao/15278097.json
----------------------------------------------------------------------------------------------------[39m
train_dataset = Dataset(data = X_train, transform=image_transform)
[31m-test_dataset = Dataset(dat

In [59]:
show_diffs_that_match_regex(consolidated_examples, "^[\+-].*ttest_ind", limit=100)

In [60]:
show_diffs_that_match_regex(consolidated_examples, "^[\+-].*KMeans", limit=100)

[34mdata/processed/competitions/new-york-city-taxi-fare-prediction/yairhadad1/12395801.json
----------------------------------------------------------------------------------------------------[39m
[31m-cls_k_means = KMeans(n_clusters=2)[39m
[32m+cls_k_means = KMeans(n_clusters=3)[39m
 cls_k_means.fit(k_mean_X)
[34mdata/processed/competitions/sf-crime/nguyenbaopc/22348067.json
----------------------------------------------------------------------------------------------------[39m
[31m-kmeans = KMeans(n_clusters=4)[39m
[32m+kmeans = KMeans(n_clusters=14)[39m
 kmeans.fit(df_input)
[34mdata/processed/competitions/nyc-taxi-trip-duration/priyanka13/1441720.json
----------------------------------------------------------------------------------------------------[39m
[31m-kmeans = KMeans(n_clusters=2, random_state=0).fit(xkdt)[39m
[32m+kmeans = KMeans(n_clusters=3, random_state=0).fit(xkdt)[39m
 test['dropoff_id']=kmeans.labels_
[34mdata/processed/competitions/bosch-productio

In [61]:
show_diffs_that_match_regex(consolidated_examples, "^[\+-].*dropna", limit=100)

[34mdata/processed/competitions/house-prices-advanced-regression-techniques/plasticgrammer/4889718.json
----------------------------------------------------------------------------------------------------[39m
[31m-skewed = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness[39m
[32m+skewed = all_data[numeric_feats].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)[39m
 skewed_feats = skewed[skewed > 1].index
 print(skewed_feats)
[34mdata/processed/competitions/titanic/headsortails/1151431.json
----------------------------------------------------------------------------------------------------[39m
[31m-df = train.loc[:,tcols].dropna()[39m
[32m+df = training.loc[:,tcols].dropna()[39m
 X = df.loc[:,cols]
[34mdata/processed/competitions/ga-customer-revenue-prediction/danofer/5722315.json
----------------------------------------------------------------------------------------------------[39m
[32m+    df.dropna(how="all",axis=1,inplace=True)[

In [62]:
show_diffs_that_match_regex(consolidated_examples, "^[\+-].*Dropout", limit=100)

[34mdata/processed/competitions/tweet-sentiment-extraction/lomen0857/36358298.json
----------------------------------------------------------------------------------------------------[39m
[31m-    x2 = tf.keras.layers.Dropout(0.13)(x[0]) [39m
[32m+    x2 = tf.keras.layers.Dropout(0.15)(x[0]) [39m
     x2 = tf.keras.layers.Conv1D(1,1)(x2)
[34mdata/processed/competitions/jigsaw-unintended-bias-in-toxicity-classification/cevangelist/12729100.json
----------------------------------------------------------------------------------------------------[39m
[31m-    drop_0 = L.SpatialDropout1D(0.2)(emb)[39m
[32m+    drop_0 = L.Dropout(0.5, seed=42)(emb)[39m
[32m+    bi_lstm_0 = L.Bidirectional(L.CuDNNLSTM(RECURRENT_UNITS, return_sequences=False))(drop_0)[39m
[34mdata/processed/competitions/Kannada-MNIST/mak4alex/21546456.json
----------------------------------------------------------------------------------------------------[39m
[31m-    con_drop_layer1 = Dropout(0.25)(relu_layer

In [63]:
show_diffs_that_match_regex(consolidated_examples, "^[\+-].*chisquare", limit=100)

In [64]:
show_diffs_that_match_regex(consolidated_examples, "^[\+-].*LinearRegression", limit=100)

[34mdata/processed/competitions/house-prices-advanced-regression-techniques/y2kshehan/2007135.json
----------------------------------------------------------------------------------------------------[39m
[31m-lin_reg_pl = LinearRegression()[39m
 #Predicting the SalePrice using cross validation (KFold method)
[31m-y_pred_pl = cross_val_predict(lin_reg_pl, X_poly, y, cv=6 )[39m
[32m+y_pred_pl = cross_val_predict(lin_reg_pl, X_poly, y, cv=10 )[39m
 #Polynominal Regression Accuracy with cross validation
 accuracy_pl = metrics.r2_score(y, y_pred_pl)
[34mdata/processed/competitions/restaurant-revenue-prediction/ani310/574131.json
----------------------------------------------------------------------------------------------------[39m
[31m-cls = linear_model.LinearRegression()[39m
[32m+cls = RandomForestRegressor()[39m
 cls.fit(xTrain, yTrain)
[34mdata/processed/competitions/covid19-global-forecasting-week-3/letili0417/31396638.json
----------------------------------------------

In [65]:
show_diffs_that_match_regex(consolidated_examples, "^[\+-].*wilcoxon", limit=100)

In [66]:
show_diffs_that_match_regex(consolidated_examples, "^[\+-].*SGD", limit=100)

[34mdata/processed/competitions/online-sales/sgrsgrsgr/6969291.json
----------------------------------------------------------------------------------------------------[39m
[31m-clf = SGDClassifier(loss='log', penalty='none', max_iter=1000, fit_intercept=True, random_state=1234)[39m
[32m+clf = SGDClassifier(loss='log', penalty='L2', max_iter=5000, fit_intercept=True, random_state=1234)[39m
 clf.fit(X_train, y_train)
[34mdata/processed/competitions/histopathologic-cancer-detection/artgor/7509653.json
----------------------------------------------------------------------------------------------------[39m
[31m-optimizer = optim.SGD(model_conv.parameters(), lr=0.1, momentum=0.9)[39m
[32m+optimizer = optim.SGD(model_conv.parameters(), lr=0.001, momentum=0.9)[39m
 exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)
[34mdata/processed/competitions/plant-seedlings-classification/allunia/18786594.json
-----------------------------------------------------------

In [67]:
show_diffs_that_match_regex(consolidated_examples, "^[\+-].*BERT", limit=100)

[34mdata/processed/competitions/tweet-sentiment-extraction/drhouse3/33634205.json
----------------------------------------------------------------------------------------------------[39m
[31m-model_config = transformers.AlbertConfig.from_pretrained(config.BERT_PATH)[39m
[32m+model_config = transformers.RobertaConfig.from_pretrained(config.BERT_PATH)[39m
 model_config.output_hidden_states = True
[34mdata/processed/competitions/tweet-sentiment-extraction/viswajithkn/32086267.json
----------------------------------------------------------------------------------------------------[39m
[31m-    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)[39m
[32m+    model_config = transformers.RobertaConfig.from_pretrained(config.roberta_path)[39m
     model = BertBaseQA(768, 2,model_config).to(device)
[34mdata/processed/competitions/jigsaw-multilingual-toxic-comment-classification/bamps53/30818481.json
--------------------------------------------------------------

## Wait, do we want to keep positional args afterall?


In [70]:
POSITIONAL_ARGS_PATH="~/RobustDataScience/data/processed/filtered_diffs_with_args.jsonl"
positional_args_examples = !shuf $POSITIONAL_ARGS_PATH
positional_args_examples = [json.loads(x) for x in positional_args_examples]

In [69]:
!shuf $POSITIONAL_ARGS_PATH

shuf: ./data/processed/filtered_diffs_with_args.jsonl: No such file or directory


In [71]:
show_diffs_that_match_regex(positional_args_examples, "^[\+-].*ttest_ind", limit=100)

## Using keywords from libraries
The following examples use libraries mined with `/homes/gws/mikeam/RobustDataScience/src/data/scrape_library_structures.py`:

In [84]:
USING_LIBRARY_ARGS = "/homes/gws/mikeam/RobustDataScience/data/processed/filtered_with_lib_structure.txt"
using_lib_examples = !shuf $USING_LIBRARY_ARGS
using_lib_examples = [json.loads(x) for x in using_lib_examples]

In [85]:
for diff in using_lib_examples[:40]:
    display_kaggle_diff(diff)

[34mdata/processed/competitions/microsoft-malware-prediction/adityaecdrid/8717145.json
----------------------------------------------------------------------------------------------------[39m

[31m-train['first_4'] = train['MachineIdentifier'].apply(lambda x: x[:4])[39m
[32m+train['first_4'] = train['MachineIdentifier'].apply(lambda x: x[:4]).astype('category')[39m
 


[31m-test['first_4'] = test['MachineIdentifier'].apply(lambda x: x[:4])[39m
[32m+test['first_4'] = test['MachineIdentifier'].apply(lambda x: x[:4]).astype('category')[39m
 


[34mdata/processed/competitions/pubg-finish-placement-prediction/praneethvarmaalluri/8130448.json
----------------------------------------------------------------------------------------------------[39m
 config = tf.contrib.learn.RunConfig(tf_random_seed=42)
 feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(x_train_squad_fpp_scaled)
[31m-dnn_reg_squad_fpp = tf.contrib.learn.DNNRegressor(hidden_units=[2500,2500,250