In [2]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('..')

from src.config import *
from src.data_loader import *
from src.features import *
import networkx as nx

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Worked Example for `num_of_<edge_type>_edge_type`

1. load graph metadata
2. sample any individual graph
3. load graph dataframe(s) and manifests and explore via pandas

In [3]:
# Load the graph metadata table. filter_nas and uq are forwarded unchanged to
# get_graph_metadata.
# NOTE(review): presumably these drop NA rows and apply an upper-quantile
# cutoff — confirm against the helper's implementation.
meta_df = get_graph_metadata(filter_nas=True, uq=0.99)

# Print the table's shape, then show the first five rows as the cell output.
print("meta_df.shape", meta_df.shape)
meta_df.head(5)

meta_df.shape (1945, 9)


Unnamed: 0_level_0,G_pre_nodes,G_pre_edges,G_post_nodes,G_post_edges,G_delta_nodes,G_delta_edges,id,label,edge_node_ratio
graph_id_with_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
10-1,6919.0,24176.0,6796.0,23614.0,60.0,224.0,10,1,3.494147
1000-0,1706.0,5810.0,1707.0,5811.0,6.0,10.0,1000,0,3.405627
1001-1,2733.0,10458.0,814.0,2853.0,161.0,322.0,1001,1,3.826564
1005-1,7486.0,28481.0,6627.0,23157.0,950.0,4471.0,1005,1,3.804569
1010b-0,4551.0,17920.0,,,6.0,10.0,1010b,0,3.937596


In [4]:
# Draw one graph at random from the metadata table. random_state pins the draw
# so that Restart & Run All selects the same graph on every run; change the
# seed to explore a different graph.
sample = meta_df.sample(n=1, random_state=42)
# The sampled row's index value is the identifier handed to load_df and
# load_manifest in the following cells.
sample_id = sample.index.values[0]

# Print the chosen identifier, then show the sampled metadata row.
print("sample_id", sample_id)
sample

sample_id 887-1


Unnamed: 0_level_0,G_pre_nodes,G_pre_edges,G_post_nodes,G_post_edges,G_delta_nodes,G_delta_edges,id,label,edge_node_ratio
graph_id_with_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
887-1,3340.0,15865.0,3329.0,16246.0,19.0,56.0,887,1,4.75


In [5]:
# Load the pre-intervention dataframe for the sampled graph.
# NOTE(review): the recorded output suggests one row per edge, with edge_*,
# source_* and target_* columns — confirm against load_df.
df = load_df(sample_id, GraphType.PRE_INTERVENTION)

# Print the dataframe's shape, then show the first five rows as the cell output.
print("df.shape", df.shape)
df.head(5)

df.shape (15865, 46)


Unnamed: 0,edge_source,edge_target,edge_args,edge_script_id,edge_parent,edge_value,edge_edge_type,edge_timestamp,edge_status,edge_attr_name,...,target_is_deleted,target_tag_name,target_url,target_script_id,target_script_type,target_text,target_method,target_rule,target_is_alt,edge_response_hash
0,n1,n20,,,,,create node,102.0,,,...,False,html,,,,,,,,
1,n1,n20,,,2.0,,insert node,120.0,,,...,False,html,,,,,,,,
2,n1,n23,,,,,create node,148.0,,,...,False,html,,,,,,,,
3,n1,n23,,,,http://www.w3.org/1999/xhtml,set attribute,157.0,,,...,False,html,,,,,,,,
4,n1,n23,,,,de,set attribute,209.0,,,...,False,html,,,,,,,,


In [6]:
# NOTE(review): presumably the prefix for per-edge-type feature names; it is
# assigned here but not used within this cell.
key_prefix = 'num_of_edge_type_'

# Count the rows for each edge type, then display the counts as a plain dict.
edge_type_counts = df['edge_edge_type'].value_counts()
edge_type_counts.to_dict()

{'create node': 3148,
 'set attribute': 3025,
 'insert node': 2864,
 'structure': 2311,
 'js result': 955,
 'js call': 955,
 'add event listener': 625,
 'remove node': 564,
 'event listener': 478,
 'request start': 207,
 'request complete': 188,
 'storage read result': 158,
 'read storage call': 158,
 'remove event listener': 147,
 'request error': 19,
 'execute': 15,
 'storage set': 11,
 'resource block': 9,
 'delete attribute': 8,
 'filter': 8,
 'shield': 4,
 'storage bucket': 3,
 'delete storage': 2,
 'cross DOM': 2,
 'execute from attribute': 1}

In [7]:
# Inspect manifest
manifest = load_manifest(sample_id)

# Show the top-level keys first, then the keys nested under each section.
print(manifest.keys())
for section in ('validation', 'result'):
    print(manifest[section].keys())

# The validation section carries the URL reported alongside the message below.
print("Blocked resource potentially resulting in web compat issue", manifest['validation']['url'])

dict_keys(['validation', 'result'])
dict_keys(['entry', 'url', 'result', 'filterMismatches'])
dict_keys(['snapshot', 'dump', 'screenshot'])
Blocked resource potentially resulting in web compat issue https://www.br.de/nachrichten/verkehr/index.html


In [14]:
# Extract single feature for graph
# Load the pre-intervention and delta dataframes of the sampled graph together
# with its manifest. df_pre and manifest are not used in this cell; they feed
# the feature-extraction cell that follows.
df_pre = load_df(sample_id, GraphType.PRE_INTERVENTION)
df_delta = load_df(sample_id, GraphType.DELTA)
manifest = load_manifest(sample_id)

# Run one feature extractor on the delta graph; its result is the cell output.
get_total_no_of_nodes(df_delta)

{'total_no_of_nodes': 19}

In [16]:
# Run the combined general + specific feature extraction on the pre-intervention
# dataframe and the manifest; the returned value is displayed as the cell output.
# NOTE(review): local_scope=True presumably limits extraction to a sub-region of
# the graph — confirm against src.features.
extract_general_and_specific_features(df_pre, manifest, local_scope=True)

{'auto_total_no_of_nodes': 11,
 'auto_total_no_of_edges': 323,
 'auto_no_of_unique_edge_types': 6,
 'auto_no_of_unique_source_node_types': 5,
 'auto_no_of_unique_target_node_type': 5,
 'auto_no_of_edges_of_type_storage_read_result': 158,
 'auto_no_of_edges_of_type_read_storage_call': 158,
 'auto_no_of_edges_of_type_structure': 3,
 'auto_no_of_edges_of_type_cross_dom': 2,
 'auto_no_of_edges_of_type_create_node': 1,
 'auto_no_of_edges_of_type_insert_node': 1,
 'auto_no_of_source_nodes_of_type_cookie_jar': 158,
 'auto_no_of_source_nodes_of_type_script': 158,
 'auto_no_of_source_nodes_of_type_dom_root': 4,
 'auto_no_of_source_nodes_of_type_parser': 2,
 'auto_no_of_source_nodes_of_type_html_element': 1,
 'auto_no_of_target_nodes_of_type_script': 158,
 'auto_no_of_target_nodes_of_type_cookie_jar': 158,
 'auto_no_of_target_nodes_of_type_text_node': 3,
 'auto_no_of_target_nodes_of_type_html_element': 2,
 'auto_no_of_target_nodes_of_type_dom_root': 2,
 'expert_no_of_text_nodes_created': 1,
 'ex

In [13]:
# Extract all features for graph
# Load the pre-intervention and delta dataframes of the sampled graph together
# with its manifest; all three are inputs to the extraction call below.
df_pre = load_df(sample_id, GraphType.PRE_INTERVENTION)
df_delta = load_df(sample_id, GraphType.DELTA)
manifest = load_manifest(sample_id)

# Compute the full feature set for this graph; the result is the cell output.
extract_features_for_all_graph_types_and_scopes(sample_id, df_pre, df_delta, manifest)

{'graph_id': '887-1',
 'did_break': 1,
 'global_pre_absolute_auto_total_no_of_nodes': 3335,
 'global_pre_absolute_auto_total_no_of_edges': 15865,
 'global_pre_absolute_auto_no_of_unique_edge_types': 25,
 'global_pre_absolute_auto_no_of_unique_source_node_types': 13,
 'global_pre_absolute_auto_no_of_unique_target_node_type': 16,
 'global_pre_absolute_auto_no_of_edges_of_type_create_node': 3148,
 'global_pre_absolute_auto_no_of_edges_of_type_set_attribute': 3025,
 'global_pre_absolute_auto_no_of_edges_of_type_insert_node': 2864,
 'global_pre_absolute_auto_no_of_edges_of_type_structure': 2311,
 'global_pre_absolute_auto_no_of_edges_of_type_js_result': 955,
 'global_pre_absolute_auto_no_of_edges_of_type_js_call': 955,
 'global_pre_absolute_auto_no_of_edges_of_type_add_event_listener': 625,
 'global_pre_absolute_auto_no_of_edges_of_type_remove_node': 564,
 'global_pre_absolute_auto_no_of_edges_of_type_event_listener': 478,
 'global_pre_absolute_auto_no_of_edges_of_type_request_start': 207,
