# 🧪 Data Validation & CFG Pair Inspection

Validates the CSV dataset generated from Boa (`job-*.csv`) and checks:
- PRE/POST balance
- CFG parseability (DOT format)
- Sample method-level pairing

Based on data in `../assets/data-samples/`

In [1]:
import pandas as pd
import os
import re
from IPython.display import display, SVG
import graphviz
import warnings
warnings.filterwarnings('ignore', category=pd.errors.ParserWarning)

In [2]:
# üîç Find available job CSVs
# Directory (relative to this notebook) that holds the exported job CSVs.
data_dir = "../assets/data-samples/"

# Keep every directory entry whose name ends in ".csv", in listing order.
csv_files = [entry for entry in os.listdir(data_dir) if entry.endswith('.csv')]

print(f"Found {len(csv_files)} CSV files:")
# Report the files alphabetically, each with its on-disk size in MiB.
for csv_name in sorted(csv_files):
    csv_path = os.path.join(data_dir, csv_name)
    size_mb = os.path.getsize(csv_path) / (1024**2)
    print(f"  - {csv_name} ({size_mb:.1f} MB)")

Found 1 CSV files:
  - job-113755.csv (1046.2 MB)


In [None]:
# üì• Load smallest sample (or prompt for job ID)
# Nothing to validate without at least one CSV export.
if not csv_files:
    raise FileNotFoundError("No CSV files found in assets/data-samples/")

# Pick the file with the smallest on-disk size; min() keeps the first entry
# when several files tie.
sample_file = min(
    csv_files,
    key=lambda name: os.path.getsize(os.path.join(data_dir, name)),
)
sample_path = os.path.join(data_dir, sample_file)
df = pd.read_csv(sample_path)
print(f"‚úì Loaded {len(df)} rows from {sample_file}")

In [None]:
# üìä Basic stats
print("=== Dataset Overview ===")
print(f"- Total rows: {len(df):,}")

# Distinct-value counts for the identifying columns, reported in a fixed order.
for label, column in (
    ("projects", "project_name"),
    ("commits", "commit_url"),
    ("methods", "method_name"),
):
    print(f"- Unique {label}: {df[column].nunique()}")

state_counts = df['CFG State'].value_counts().to_dict()
print(f"- CFG States: {state_counts}")

# Any state other than PRE/POST is treated as malformed.
is_known_state = df['CFG State'].isin(['PRE', 'POST'])
invalid_states = df[~is_known_state]
if len(invalid_states) > 0:
    print(f"‚ö†Ô∏è {len(invalid_states)} rows with invalid CFG State (e.g. blank/NaN)")

In [None]:
# üîó Validate PRE/POST pairing
# Group by commit + file + method
# Columns that together identify one method within one commit.
pair_key = ['commit_url', 'file_path', 'method_name']

# Collapse every group to the set of CFG states it contains. A group counts as
# paired only when that set is exactly {'PRE', 'POST'}.
grouped = df.groupby(pair_key)['CFG State'].apply(set).reset_index()
grouped['has_pair'] = grouped['CFG State'].apply(lambda s: s == {'PRE', 'POST'})

paired_count = grouped['has_pair'].sum()
total_groups = len(grouped)

print("=== Pairing Integrity ===")
print(f"- {paired_count:,} / {total_groups:,} method-commit groups have both PRE & POST")
if total_groups:
    print(f"- Pairing rate: {100 * paired_count / total_groups:.1f}%")
else:
    # No groups at all: a rate is undefined, so avoid dividing by zero.
    print("- Pairing rate: n/a (no method-commit groups found)")

if paired_count > 0:
    # Keep every row of the first paired group for inspection.
    first_paired = grouped[grouped['has_pair']].iloc[0]

    # Build the row filter from pair_key so it cannot drift from the grouping.
    in_first_group = pd.Series(True, index=df.index)
    for key_column in pair_key:
        in_first_group &= df[key_column] == first_paired[key_column]

    sample_pair = df.loc[in_first_group, ['CFG State', 'cfg_dot']]
    display(sample_pair)

In [None]:
# üõ†Ô∏è Helper: Render DOT as SVG (safe for malformed DOT)
def render_cfg(dot_str, title="CFG"):
    """Render a Graphviz DOT string as an inline SVG.

    Args:
        dot_str: DOT source of one graph. Input that does not begin with a
            graph header is wrapped in ``digraph G { ... }`` before rendering.
        title: Label for the graph. Accepted for caller convenience; the
            current implementation does not use it.

    Returns:
        An ``IPython.display.SVG`` on success. If anything raises an
        ``Exception`` (non-string input, malformed DOT, Graphviz failure), a
        string describing the failure is returned instead.
    """
    try:
        # Rewrite the left-justified line break escape (\l) as \n, then drop
        # any whitespace that directly follows a \n escape.
        dot_clean = re.sub(r'\\l', r'\\n', dot_str)
        dot_clean = re.sub(r'\\n\s*', r'\\n', dot_clean)

        # Wrap only header-less input. A DOT header is an optional "strict"
        # followed by "graph" or "digraph" (keywords are case-insensitive);
        # wrapping such input again would nest one graph inside another.
        has_header = re.match(
            r'\s*(?:strict\s+)?(?:di)?graph\b', dot_clean, re.IGNORECASE
        )
        if not has_header:
            dot_clean = 'digraph G {' + dot_clean + '}'

        src = graphviz.Source(dot_clean, format='svg')
        svg = src.pipe(format='svg').decode('utf-8')
        return SVG(svg)
    except Exception as e:
        # Best-effort helper: report the problem instead of aborting the cell.
        return f"‚ùå Failed to render CFG: {e}"

In [None]:
# üñºÔ∏è Visualize a PRE/POST pair (if available)
# Show the first PRE and the first POST graph of the sample group. The group
# may hold more than two rows (for example when several rows share the same
# method name), so test for the presence of both states rather than for
# exactly two rows.
if 'sample_pair' in locals():
    pre_cfgs = sample_pair.loc[sample_pair['CFG State'] == 'PRE', 'cfg_dot']
    post_cfgs = sample_pair.loc[sample_pair['CFG State'] == 'POST', 'cfg_dot']
    pair_available = not pre_cfgs.empty and not post_cfgs.empty
else:
    pair_available = False

if pair_available:
    pre_cfg = pre_cfgs.iloc[0]
    post_cfg = post_cfgs.iloc[0]

    print("### üîç Sample CFG Pair (PRE ‚Üí POST)")
    print("**PRE (vulnerable version):**")
    display(render_cfg(pre_cfg, "PRE"))

    print("**POST (fixed version):**")
    display(render_cfg(post_cfg, "POST"))
else:
    print("‚ö†Ô∏è No full PRE/POST pair found in sample. Try a larger dataset.")

In [None]:
# üìâ Optional: Basic CFG metrics (node/edge count)
def count_nodes_edges(dot_str):
    """Estimate the node and edge counts of a CFG from its DOT text.

    The count is deliberately naive: a node is any bracketed integer such as
    the "[0]" in "[0] ENTRY", and an edge is any "->" occurrence.

    Args:
        dot_str: DOT source of one CFG. Non-string values (``None``, or the
            NaN pandas produces for an empty CSV cell) are treated as an
            empty graph instead of raising ``TypeError``.

    Returns:
        A ``(nodes, edges)`` tuple of ints.
    """
    if not isinstance(dot_str, str):
        return 0, 0
    nodes = len(re.findall(r'\[\d+\]', dot_str))
    # A literal substring needs no regex; the count is identical.
    edges = dot_str.count('->')
    return nodes, edges

# Count once per row in a plain loop and build both columns in a single
# DataFrame. This avoids allocating a temporary pandas Series for every row,
# which dominates the runtime on large exports.
cfg_counts = [count_nodes_edges(dot) for dot in df['cfg_dot']]
df[['node_count', 'edge_count']] = pd.DataFrame(
    cfg_counts,
    index=df.index,
    columns=['node_count', 'edge_count'],
    dtype='int64',
)

print("=== CFG Complexity (Sample Stats) ===")
# Per-state summary of graph size: mean, median and max of each count.
complexity = df.groupby('CFG State')[['node_count', 'edge_count']].agg(['mean', 'median', 'max'])
print(complexity.round(1))

## ✅ Next Steps

- [ ] Run on full dataset (e.g., `job-113755.csv`) to get robust stats
- [ ] Compute cyclomatic complexity: `CC = edges - nodes + 2`
- [ ] Export metrics to `../reports/cfg_metrics.csv`
- [ ] Identify commits with largest CFG delta (for case studies)

➡️ Proceed to `02_cfg_analysis.ipynb` for deeper analysis.