## Notebook Template to Quickly Test Things Out

In [1]:
# General imports
# import torch
import numpy as np
import os, sys
import json
from tqdm import tqdm
import pandas as pd

In [3]:
# Local imports
sys.path.insert(0, 'src')
from utils import read_json, read_lists
# from utils.model_utils import prepare_device
# # from parse_config import ConfigParser
# from data_loader import data_loaders
# import model.model as module_arch

In [55]:
# Define constants, paths
config_path = 'configs/'
timestamp = '0112_121958'
csv_path = os.path.join('saved', 'edit', 'trials', 'CINIC10_ImageNet-VGG_16', timestamp, 'results_table.csv')


In [96]:
# Load CSV as pandas dataframe
df = pd.read_csv(csv_path)
n_total = len(df)
print("CSV loaded from {}".format(csv_path))
print("{} rows".format(n_total))

CSV loaded from saved/edit/trials/CINIC10_ImageNet-VGG_16/0112_121958/results_table.csv
37 rows


In [114]:
# Round all numbers to 3 decimal places
df.round(3)
mean_df = df.mean()
std_df = df.std()

In [119]:
metrics = [['{} Accuracy', '{} Mean Precision', '{} Mean Recall', '{} Mean F1'], 
           ['{} Target Precision', '{} Target Recall', '{} Target F1'],
           ['{} Orig Pred Precision', '{} Orig Pred Recall', '{} Orig Pred F1']]
print("{:<30} {:<15} {:<20}".format("Metric", "Pre-Edit", "Post-Edit"))
for row in metrics:
    for metric in row:
        print("{:<30} {:<15.3f} {:.3f}({:.3f})".format(
            metric, mean_df[metric.format("Pre")],
            mean_df[metric.format("Post")], std_df[metric.format("Post")]))
    print("")

Metric                         Pre-Edit        Post-Edit           
{} Accuracy                    0.687           0.677(0.018)
{} Mean Precision              0.692           0.682(0.032)
{} Mean Recall                 0.687           0.677(0.018)
{} Mean F1                     0.684           0.672(0.025)

{} Target Precision            0.702           0.590(0.107)
{} Target Recall               0.430           0.495(0.049)
{} Target F1                   0.533           0.528(0.036)

{} Orig Pred Precision         0.655           0.671(0.264)
{} Orig Pred Recall            0.656           0.484(0.214)
{} Orig Pred F1                0.655           0.554(0.226)



## Hypothesis 1: Masked modifications will have greater changes than noise

In [90]:
# Compare mean post edit accuracy, precision, recall, and f1
mean_masked = masked_rows.mean()
mean_gaussian = gaussian_rows.mean()
std_masked = masked_rows.std()
std_gaussian = gaussian_rows.std()
# print(mean_masked)

metrics = [['{} Accuracy', '{} Mean Precision', '{} Mean Recall', '{} Mean F1'], 
           ['{} Target Precision', '{} Target Recall', '{} Target F1'],
           ['{} Orig Pred Precision', '{} Orig Pred Recall', '{} Orig Pred F1']]
print("{:<30} {:<15} {:<20} {:<20}".format("Metric", "Pre-Edit", "Masked", "Gaussian"))
for row in metrics:
    for metric in row:
        print("{:<30} {:<15.3f} {:.3f}({:.3f}) {:<6} {:.3f}({:.3f})".format(
            metric, mean_masked[metric.format("Pre")],
            mean_masked[metric.format("Post")], std_masked[metric.format("Post")], "",
            mean_gaussian[metric.format("Post")], std_gaussian[metric.format("Post")]))
    print("")

Metric                         Pre-Edit        Masked               Gaussian            
{} Accuracy                    0.687           0.674(0.020)        0.685(0.006)
{} Mean Precision              0.692           0.677(0.037)        0.694(0.002)
{} Mean Recall                 0.687           0.674(0.020)        0.685(0.006)
{} Mean F1                     0.684           0.668(0.029)        0.682(0.007)

{} Target Precision            0.706           0.566(0.112)        0.669(0.045)
{} Target Recall               0.427           0.506(0.047)        0.454(0.032)
{} Target F1                   0.532           0.524(0.041)        0.539(0.009)

{} Orig Pred Precision         0.654           0.653(0.300)        0.720(0.124)
{} Orig Pred Recall            0.661           0.449(0.231)        0.580(0.126)
{} Orig Pred F1                0.657           0.523(0.249)        0.637(0.120)



### Hypothesis 2: How many edits actually improved all three metrics for target class?

Will these also incur larger harm in overall metrics?

In [91]:
improve_target_rows = df[
    (df['Post Target Precision'] > df['Pre Target Precision']) &
    (df['Post Target Recall'] > df ['Pre Target Recall']) & 
    (df['Post Target F1'] > df['Pre Target F1'])]      

improve_target_recall = df[df['Post Target Recall'] > df ['Pre Target Recall']] 
improve_target_f1 = df[df['Post Target F1'] > df ['Pre Target F1']] 
improve_target_precision = df[df['Post Target Precision'] > df ['Pre Target Precision']] 
    
improve_target_precision_f1 = pd.merge(improve_target_precision, improve_target_f1, how='inner', on=['ID'])
improve_target_recall_f1 = pd.merge(improve_target_recall, improve_target_f1, how='inner', on=['ID'])
improve_target_recall_and_precision = pd.merge(improve_target_recall, improve_target_precision, how='inner', on=['ID'])

print("{} edits improved all target metrics ".format(len(improve_target_rows)))
print("{} edits improved target recall".format(len(improve_target_recall)))
print("{} edits improved target F1".format(len(improve_target_f1)))
print("{} edits improved target precision".format(len(improve_target_precision)))
print("{} edits improved target precision + f1".format(len(improve_target_precision_f1)))
print("{} edits improved target recall + f1".format(len(improve_target_recall_f1)))
print("{} edits improved target precision + recall".format(len(improve_target_recall_and_precision)))


0 edits improved all target metrics 
34 edits improved target recall
26 edits improved target F1
3 edits improved target precision
1 edits improved target precision + f1
25 edits improved target recall + f1
0 edits improved target precision + recall


## Hypothesis 3: If the edit improved the target class (let's say F1), then the metrics of the originally predicted class will be worse

In [92]:
orig_metrics = ['{} Orig Pred Precision', '{} Orig Pred Recall', '{} Orig Pred F1']
mean_improve_target_f1 = improve_target_f1.mean()
std_improve_target_f1 = improve_target_f1.std()

print("Average mean metrics in original predicted class for rows that improved f1 in target class")
print("{:<30} {:<15} {:<20}".format("Metric", "Pre-Edit", "Post-Edit"))
for metric in orig_metrics:
    print("{:<30} {:.3f}({:.3f}) {:<6} {:.3f}({:.3f})".format(
        metric, 
        mean_improve_target_f1[metric.format("Pre")], 
        std_improve_target_f1[metric.format("Pre")], 
        "",
        mean_improve_target_f1[metric.format("Post")],
        std_improve_target_f1[metric.format("Post")]))
    
not_improve_target_f1 = df[~df.isin(improve_target_f1)].dropna()
mean_not_improve_target_f1 = not_improve_target_f1.mean()
std_not_improve_target_f1 = not_improve_target_f1.std()

print("Average mean metrics in original predicted class for rows that did NOT improve f1 in target class")
print("{:<30} {:<15} {:<20}".format("Metric", "Pre-Edit", "Post-Edit"))
for metric in orig_metrics:
    print("{:<30} {:.3f}({:.3f}) {:<6} {:.3f}({:.3f})".format(
        metric, 
        mean_not_improve_target_f1[metric.format("Pre")], 
        std_not_improve_target_f1[metric.format("Pre")], 
        "",
        mean_not_improve_target_f1[metric.format("Post")],
        std_not_improve_target_f1[metric.format("Post")]))

Average mean metrics in original predicted class for rows that improved f1 in target class
Metric                         Pre-Edit        Post-Edit           
{} Orig Pred Precision         0.645(0.106)        0.695(0.176)
{} Orig Pred Recall            0.644(0.107)        0.532(0.177)
{} Orig Pred F1                0.644(0.104)        0.596(0.174)
Average mean metrics in original predicted class for rows that did NOT improve f1 in target class
Metric                         Pre-Edit        Post-Edit           
{} Orig Pred Precision         0.678(0.112)        0.615(0.411)
{} Orig Pred Recall            0.686(0.089)        0.372(0.258)
{} Orig Pred F1                0.680(0.092)        0.456(0.305)


## Hypothesis 4: If a gaussian noise segment is producing sucessful change, will the masked segment as well?

Result: not necessarily

In [93]:
# Count number of rows that are gaussian noise and are masked
masked_rows = df[df['ID'].str.contains('masked')]
n_masked = len(masked_rows)

gaussian_rows = df[df['ID'].str.contains('gaussian')]
n_gaussian = len(gaussian_rows)
print("{} masked modifications\n{} Gaussian modifications".format(n_masked, n_gaussian))

# For gaussian rows, are their corresponding masked segment also there?
gaussian_IDs = gaussian_rows['ID']
corresponding_masked_IDs = gaussian_IDs.replace('gaussian', 'masked', regex=True)
# print(corresponding_masked_IDs)

segments_with_both_gaussian_and_masked = list(set(corresponding_masked_IDs) & set(df['ID']))
n_both = len(segments_with_both_gaussian_and_masked)
print("{}/{} gaussian modifications have corresponding masked segment as success:".format(n_both, n_gaussian))
# print(segments_with_both_gaussian_and_masked)

27 masked modifications
10 Gaussian modifications
3/10 gaussian modifications have corresponding masked segment as success:


## Hypothesis 5: Smaller segments will produce smaller changes


## Hypothesis 6: The neighbors of the value images should be less affected than the neighbors of the key images

Compare distance between key-keyN and val-valN before and after the edit. The difference should be smaller for val-valN

In [109]:
# mean_diff_key_keyN = (df['Post key-keyN (F)'] - df['Pre key-keyN (F)']).mean()
mean_df = df.mean()
for data_type in ["F", "L"]:
    mean_pre_key_keyN = mean_df['Pre key-keyN ({})'.format(data_type)]
    mean_post_key_keyN = mean_df['Post key-keyN ({})'.format(data_type)]
    diff_key_keyN = mean_post_key_keyN - mean_pre_key_keyN
    percent_diff_key_keyN = diff_key_keyN / mean_pre_key_keyN * 100
    # mean_diff_val_valN = (df['Post val-valN (F)'] - df['Pre val-valN (F)']).mean()
    mean_pre_val_valN = mean_df['Pre val-valN ({})'.format(data_type)]
    mean_post_val_valN = mean_df['Post val-valN ({})'.format(data_type)]
    diff_val_valN = mean_post_val_valN - mean_pre_val_valN
    percent_diff_val_valN = diff_val_valN / mean_pre_val_valN * 100

    print("{}".format("Features" if data_type == 'F' else "Logits"))
    print("\tKey -> Key Neighbors: {:.3f} ==> {:.3f}".format(mean_pre_key_keyN, mean_post_key_keyN))
    print("\t\tMean difference: {:.3f} ({:.2f}%)".format(diff_key_keyN, percent_diff_key_keyN))
    print("\tVal -> Val Neighbors: {:.3f} ==> {:.3f}".format(mean_pre_val_valN, mean_post_val_valN))
    print("\t\tMean difference: {:.3f} ({:.2f}%)".format(diff_val_valN, percent_diff_val_valN))

Features
	Key -> Key Neighbors: 0.782 ==> 0.755
		Mean difference: -0.027 (-3.43%)
	Val -> Val Neighbors: 0.979 ==> 0.895
		Mean difference: -0.084 (-8.57%)
Logits
	Key -> Key Neighbors: 1.065 ==> 0.990
		Mean difference: -0.075 (-7.04%)
	Val -> Val Neighbors: 1.342 ==> 1.134
		Mean difference: -0.208 (-15.52%)


## Hypothesis 6: Edits that shared an original image will have smaller spread in post edit metrics


In [157]:
# Obtain unique key images
ids = list(df['ID'])
unique_keys = set()
for image_id in ids:
    key_id = image_id.split('/')[0]
    unique_keys.add(key_id)
    
metrics = [['{} Accuracy', '{} Mean Precision', '{} Mean Recall', '{} Mean F1'], 
           ['{} Target Precision', '{} Target Recall', '{} Target F1'],
           ['{} Orig Pred Precision', '{} Orig Pred Recall', '{} Orig Pred F1'],
           ['{} key-val (F)']]


key_image_stds = pd.DataFrame()
small_spread_keys = []
for unique_key in unique_keys:
    cur_rows = df[df['ID'].str.contains(unique_key)]
    mean_cur_rows = cur_rows.mean()
    std_cur_rows = cur_rows.std()
    # print("Accuracy STD: {:.3f}".format(std_cur_rows['Post Accuracy']))
    # print("Precision STD: {:.3f}".format(std_cur_rows['Post Mean Precision']))
    # print("Recall STD: {:.3f}".format(std_cur_rows['Post Mean Recall']))
    if std_cur_rows['Post Accuracy'] < 0.01:
        small_spread_keys.append(unique_key)
    else:
        print(cur_rows['Post Accuracy'])
    if len(cur_rows) > 1:
        key_image_stds = key_image_stds.append(std_cur_rows, ignore_index=True)
        

mean_key_image_stds = key_image_stds.mean()
print("{:<30} {:<15} {:<20}".format("Metric", "STD Overall", "Avg STD Grouped by Key Image"))
for row in metrics:
    for metric in row:
        print("{:<30} {:<15.3f} {:.3f}".format(
            metric.format("Post"), std_df[metric.format("Post")],
            mean_key_image_stds[metric.format("Post")]))
    print("")

# print("Average accuracy standard deviation for an image: {}".format(key_image_stds['Post Accuracy']))
print(small_spread_keys)

16    0.687371
17    0.688529
18    0.669086
19    0.620000
20    0.685500
21    0.685386
22    0.620614
23    0.687300
Name: Post Accuracy, dtype: float64
14    0.641871
Name: Post Accuracy, dtype: float64
36    0.672886
Name: Post Accuracy, dtype: float64
15    0.675343
Name: Post Accuracy, dtype: float64
24    0.687429
25    0.636157
26    0.687714
27    0.685071
28    0.688229
Name: Post Accuracy, dtype: float64
6    0.668386
Name: Post Accuracy, dtype: float64
Metric                         STD Overall     Avg STD Grouped by Key Image
Post Accuracy                  0.018           0.014
Post Mean Precision            0.032           0.019
Post Mean Recall               0.018           0.014
Post Mean F1                   0.025           0.019

Post Target Precision          0.107           0.088
Post Target Recall             0.049           0.044
Post Target F1                 0.036           0.025

Post Orig Pred Precision       0.264           0.178
Post Orig Pred Recall       

## Hypothesis 9: Edits that share a original prediction will have more similar post edit metric results

In [139]:
unique_original_predictions = set(df['Pre key Prediction'])
print(unique_original_predictions)

for og_prediction in unique_original_predictions:
    cur_rows = df[df['Pre key Prediction'] == og_prediction]
    # print(len(cur_rows))
    mean_cur_rows = cur_rows.mean()
    std_cur_rows = cur_rows.std()
    print(std_cur_rows['Post Accuracy'])

{2, 3, 4, 6, 7}
0.004919442891969285
0.006148371534599434
0.024926112445238086
0.006064829466900532
0.028172920852970768
