# Data Exploration Notebook
## Smart Secrets Scanner - Training Dataset Analysis

**Purpose:** Explore, validate, and visualize the training dataset for the Smart Secrets Scanner fine-tuned model.

**Dataset:** v3 (1000 examples - 500 ALERT + 500 SAFE)

**Sections:**
1. Introduction & Setup
2. Load Training Data
3. Data Quality Checks
4. Class Balance Analysis
5. Secret Type Distribution
6. Sample Examples
7. Token Length Analysis
8. Summary & Recommendations

## 1. Introduction & Setup

Import required libraries and configure paths.

In [None]:
# Import libraries
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from collections import Counter
import numpy as np

# Configure plotting
# Apply a global matplotlib style sheet and a seaborn colour palette; both
# calls mutate process-wide plotting defaults used by every later cell.
# NOTE(review): the 'seaborn-v0_8-*' style names appear to depend on the
# installed matplotlib version — confirm against the pinned environment.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

print("‚úÖ Libraries imported successfully")

In [None]:
# Set paths
# Resolve the project root. When the notebook is launched from the
# `notebooks/` directory itself, step up one level; otherwise treat the
# current working directory as the project root. The directory *name* is
# compared (instead of searching the whole path string) so that an unrelated
# ancestor such as `/home/me/notebooks-archive/repo` does not shift the root.
_cwd = Path.cwd()
PROJECT_ROOT = _cwd.parent if _cwd.name == 'notebooks' else _cwd
DATA_DIR = PROJECT_ROOT / 'data' / 'processed'

# v3 dataset splits, expected under <project root>/data/processed.
TRAIN_FILE = DATA_DIR / 'smart-secrets-scanner-train-v3.jsonl'
VAL_FILE = DATA_DIR / 'smart-secrets-scanner-val-v3.jsonl'

# Echo the resolved locations and whether the files are present, so a wrong
# working directory is visible before any loading is attempted.
print(f"Project Root: {PROJECT_ROOT}")
print(f"Data Directory: {DATA_DIR}")
print(f"Training File: {TRAIN_FILE}")
print(f"Validation File: {VAL_FILE}")
print(f"\nTrain file exists: {TRAIN_FILE.exists()}")
print(f"Val file exists: {VAL_FILE.exists()}")

## 2. Load Training Data

Load JSONL files into pandas DataFrames for analysis.

In [None]:
def load_jsonl(file_path):
    """Load a JSONL file into a list of parsed records.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of being passed to the JSON parser, which
    would reject them.

    Args:
        file_path: Path (string or path-like) of the JSONL file to read.

    Returns:
        A list with one parsed JSON value per non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # json.loads tolerates surrounding whitespace, so only the emptiness
        # test needs the stripped form.
        return [json.loads(line) for line in f if line.strip()]

# Load datasets
# Read both splits first; the DataFrames are only built when both files were
# found. A missing file is reported with a hint instead of a traceback.
try:
    train_data = load_jsonl(TRAIN_FILE)
    val_data = load_jsonl(VAL_FILE)
except FileNotFoundError as e:
    print(f"‚ùå Error: {e}")
    print("\nPlease ensure you've generated the v3 dataset using:")
    print("  python scripts/generate_simple_training_data.py")
else:
    # Convert to DataFrames
    train_df = pd.DataFrame(train_data)
    val_df = pd.DataFrame(val_data)

    # Report the size of each split and of the combined dataset.
    print(f"‚úÖ Training data loaded: {len(train_df)} examples")
    print(f"‚úÖ Validation data loaded: {len(val_df)} examples")
    print(f"\nTotal dataset size: {len(train_df) + len(val_df)} examples")

In [None]:
# Display basic info
print("Training Dataset Info:")
print("=" * 50)
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
train_df.info()
print("\nFirst 3 examples:")
# Last expression of the cell: the notebook renders it as a table.
train_df.head(3)

## 3. Data Quality Checks

Validate data schema, check for issues, and ensure quality.

In [None]:
# Schema validation
# Columns every record is expected to carry; reused by later cells.
required_fields = ['instruction', 'input', 'output']

print("Schema Validation:")
print("=" * 50)

for field in required_fields:
    # Presence of the column in each split, in (train, val) order.
    train_has, val_has = (field in frame.columns for frame in (train_df, val_df))
    if train_has and val_has:
        status = "‚úÖ"
    else:
        status = "‚ùå"
    print(f"{status} Field '{field}': Train={train_has}, Val={val_has}")

# Check for null values
print("\nNull Values Check:")
print("=" * 50)
for heading, frame in (("Training data:", train_df), ("\nValidation data:", val_df)):
    print(heading)
    # Per-column count of null cells.
    print(frame.isnull().sum())

In [None]:
# Check for empty strings
print("Empty String Check:")
print("=" * 50)

for field in required_fields:
    # Number of cells that are exactly the empty string, per split.
    train_empty = train_df[field].eq('').sum()
    val_empty = val_df[field].eq('').sum()
    print(f"Field '{field}': Train={train_empty}, Val={val_empty}")

# Check for duplicates
print("\nDuplicate Check:")
print("=" * 50)
# Rows whose 'input' repeats an earlier row of the same split.
train_dupes, val_dupes = (
    frame.duplicated(subset=['input']).sum() for frame in (train_df, val_df)
)
print(f"Training duplicates (by input): {train_dupes}")
print(f"Validation duplicates (by input): {val_dupes}")

## 4. Class Balance Analysis

Analyze the distribution of ALERT vs. SAFE examples in the training and validation sets.

In [None]:
# Extract labels from output field
def extract_label(output_text):
    """Derive a coarse class label from a model-output string.

    The check is a case-insensitive substring search. 'ALERT' is tested
    first, so a text containing both keywords is labelled 'ALERT'.

    Args:
        output_text: The value of a record's ``output`` field. Non-string
            values (such as ``None`` or NaN from a missing field) are
            accepted and labelled 'UNKNOWN' instead of raising.

    Returns:
        'ALERT', 'SAFE', or 'UNKNOWN' when neither keyword is present.
    """
    # Missing values surface as None/NaN in a DataFrame column and have no
    # .upper(); treat them as unlabelled rather than crashing the apply().
    if not isinstance(output_text, str):
        return 'UNKNOWN'

    text_upper = output_text.upper()
    if 'ALERT' in text_upper:
        return 'ALERT'
    if 'SAFE' in text_upper:
        return 'SAFE'
    return 'UNKNOWN'

# Attach a label column to each split, derived from the 'output' text.
train_df['label'] = train_df['output'].map(extract_label)
val_df['label'] = val_df['output'].map(extract_label)

# Count labels
train_counts = train_df['label'].value_counts()
val_counts = val_df['label'].value_counts()

# Print the per-label counts and the ALERT share for each split.
for heading, counts, frame in (
    ("Training Set Distribution:", train_counts, train_df),
    ("\nValidation Set Distribution:", val_counts, val_df),
):
    print(heading)
    print("=" * 50)
    print(counts)
    alert_share = counts.get('ALERT', 0) / len(frame) * 100
    print(f"\nBalance: {alert_share:.1f}% ALERT")

In [None]:
# Visualize class distribution
# Fixed colour per label. value_counts() orders rows by frequency, so a
# positional colour list would swap red/green whenever SAFE outnumbers ALERT.
label_colors = {'ALERT': '#e74c3c', 'SAFE': '#2ecc71'}
fallback_color = '#95a5a6'  # any other label, e.g. 'UNKNOWN'

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (axes, counts, title, vertical offset of the count text above each bar)
panels = [
    (axes[0], train_counts, 'Training Set Distribution', 10),
    (axes[1], val_counts, 'Validation Set Distribution', 2),
]

for ax, counts, title, text_offset in panels:
    bar_colors = [label_colors.get(label, fallback_color) for label in counts.index]
    counts.plot(kind='bar', ax=ax, color=bar_colors)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xlabel('Label')
    ax.set_ylabel('Count')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
    ax.grid(axis='y', alpha=0.3)

    # Add count labels on bars
    for i, v in enumerate(counts):
        ax.text(i, v + text_offset, str(v), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

# Use .get so a split that lacks one of the labels is reported as imbalanced
# instead of raising KeyError.
class_gap = abs(train_counts.get('ALERT', 0) - train_counts.get('SAFE', 0))
print(f"\n‚úÖ Dataset is {'balanced' if class_gap < 50 else 'imbalanced'}")

## 5. Secret Type Distribution

Analyze which types of secrets appear among the ALERT examples of the training set.

In [None]:
# Extract secret types from ALERT examples
def extract_secret_type(text):
    """Classify a text into a coarse secret category by keyword lookup.

    The text is upper-cased and scanned for the keywords of each category in
    table order; the first category with any keyword present is returned.

    Args:
        text: String to inspect (input and/or output text of an example).

    Returns:
        The matching category name, or 'Other' when no keyword is found.
    """
    haystack = text.upper()

    # Ordered (category, keywords) pairs: earlier rows win on multiple hits.
    keyword_table = (
        ('AWS', ('AWS', 'AKIA', 'ACCESS_KEY', 'SECRET_KEY')),
        ('Stripe', ('STRIPE', 'SK_LIVE', 'PK_LIVE')),
        ('GitHub', ('GITHUB', 'GHP_', 'GHSA_')),
        ('Database', ('DATABASE', 'DB_PASSWORD', 'POSTGRES', 'MYSQL')),
        ('API Key', ('API_KEY', 'APIKEY', 'API-KEY')),
        ('OAuth', ('OAUTH', 'CLIENT_SECRET', 'CLIENT_ID')),
        ('JWT', ('JWT', 'TOKEN', 'BEARER')),
        ('Firebase', ('FIREBASE', 'GOOGLE')),
    )

    hits = (
        category
        for category, keywords in keyword_table
        if any(keyword in haystack for keyword in keywords)
    )
    # Fall back to the catch-all bucket when nothing matched.
    return next(hits, 'Other')

# Extract types for ALERT examples only
is_alert = train_df['label'] == 'ALERT'
alert_examples = train_df[is_alert].copy()  # copy so the new column stays off train_df


def _row_secret_type(row):
    """Classify one example from its input and output text joined by a space."""
    return extract_secret_type(row['input'] + ' ' + row['output'])


alert_examples['secret_type'] = alert_examples.apply(_row_secret_type, axis=1)

# Frequency of each category among the ALERT examples.
secret_type_counts = alert_examples['secret_type'].value_counts()

print("Secret Type Distribution (ALERT examples):")
print("=" * 50)
print(secret_type_counts)
print(f"\nTotal ALERT examples: {len(alert_examples)}")

In [None]:
# Visualize secret types
# Horizontal bar chart: one bar per secret category, annotated with its count.
fig, ax = plt.subplots(figsize=(10, 6))

secret_type_counts.plot(kind='barh', ax=ax, color='#3498db')
ax.set_title('Secret Type Distribution in Training Data', fontsize=14, fontweight='bold')
ax.set(xlabel='Count', ylabel='Secret Type')
ax.grid(axis='x', alpha=0.3)

# Add count labels
for bar_pos, count in enumerate(secret_type_counts):
    # Place the text slightly to the right of the bar end.
    ax.text(count + 2, bar_pos, str(count), va='center', fontweight='bold')

plt.tight_layout()
plt.show()

## 6. Sample Examples

Display representative examples from the dataset.

In [None]:
# Display ALERT examples
print("Sample ALERT Examples:")
print("=" * 80)

alert_pool = train_df[train_df['label'] == 'ALERT']
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the pool size.
alert_samples = alert_pool.sample(min(3, len(alert_pool)), random_state=42)

if alert_samples.empty:
    print("\nNo ALERT examples found in the training set.")

for idx, row in alert_samples.iterrows():
    # idx is the row's index label in train_df, shown 1-based
    # (assumes the default integer index).
    print(f"\nExample {idx + 1}:")
    # Truncate long texts to keep the listing readable.
    print(f"Input: {row['input'][:100]}...")
    print(f"Output: {row['output'][:150]}...")
    print("-" * 80)

In [None]:
# Display SAFE examples
print("Sample SAFE Examples:")
print("=" * 80)

safe_pool = train_df[train_df['label'] == 'SAFE']
# DataFrame.sample raises ValueError when asked for more rows than exist
# (sampling without replacement), so cap the request at the pool size.
safe_samples = safe_pool.sample(min(3, len(safe_pool)), random_state=42)

if safe_samples.empty:
    print("\nNo SAFE examples found in the training set.")

for idx, row in safe_samples.iterrows():
    # idx is the row's index label in train_df, shown 1-based
    # (assumes the default integer index).
    print(f"\nExample {idx + 1}:")
    # Truncate long texts to keep the listing readable.
    print(f"Input: {row['input'][:100]}...")
    print(f"Output: {row['output'][:150]}...")
    print("-" * 80)

## 7. Token Length Analysis

Analyze the character-length distribution of inputs and outputs. Character counts are used here as a rough proxy for token counts.

In [None]:
# Calculate character lengths (rough proxy for tokens)
for length_column, text_column in (('input_length', 'input'), ('output_length', 'output')):
    train_df[length_column] = train_df[text_column].str.len()
train_df['total_length'] = train_df['input_length'].add(train_df['output_length'])

# Statistics
# One describe() summary per length column, in input / output / total order.
for heading, length_column in (
    ("Input Length Statistics:", 'input_length'),
    ("\nOutput Length Statistics:", 'output_length'),
    ("\nTotal Length Statistics:", 'total_length'),
):
    print(heading)
    print("=" * 50)
    print(train_df[length_column].describe())

In [None]:
# Visualize length distributions
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# The three histograms differ only in column, title and colour.
# (axes, column, title, colour)
histograms = [
    (axes[0, 0], 'input_length', 'Input Length Distribution', '#3498db'),
    (axes[0, 1], 'output_length', 'Output Length Distribution', '#2ecc71'),
    (axes[1, 0], 'total_length', 'Total Length Distribution (Input + Output)', '#e74c3c'),
]

for ax, column, title, color in histograms:
    ax.hist(train_df[column], bins=30, color=color, edgecolor='black', alpha=0.7)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel('Characters')
    ax.set_ylabel('Frequency')
    ax.grid(axis='y', alpha=0.3)

# Length by label
train_df.boxplot(column='total_length', by='label', ax=axes[1, 1])
# boxplot(by=...) adds an automatic figure-level suptitle; on a 2x2 grid it
# sits over the top row of panels, so clear it and rely on the axes title.
fig.suptitle('')
axes[1, 1].set_title('Total Length by Label', fontweight='bold')
axes[1, 1].set_xlabel('Label')
axes[1, 1].set_ylabel('Total Characters')
# Make the boxplot axes current so plt.xticks targets it.
plt.sca(axes[1, 1])
plt.xticks(rotation=0)

plt.tight_layout()
plt.show()

## 8. Summary & Recommendations

Summary of findings and recommendations for model training.

In [None]:
# Generate summary report
# All figures quoted below are computed from the loaded data, so the report
# cannot contradict the dataset it describes.
print("Dataset Summary Report")
print("=" * 80)

n_train, n_val = len(train_df), len(val_df)
n_total = n_train + n_val

print(f"\nüìä Dataset Size:")
print(f"   - Training examples: {n_train}")
print(f"   - Validation examples: {n_val}")
print(f"   - Total: {n_total}")

alert_count = train_counts.get('ALERT', 0)
safe_count = train_counts.get('SAFE', 0)
# Guard the percentages against an empty training set.
alert_pct = alert_count / n_train * 100 if n_train else 0.0
safe_pct = safe_count / n_train * 100 if n_train else 0.0
# Same threshold as the class-balance cell: fewer than 50 examples apart.
is_balanced = abs(alert_count - safe_count) < 50

print(f"\n‚öñÔ∏è  Class Balance:")
print(f"   - Training ALERT: {alert_count} ({alert_pct:.1f}%)")
print(f"   - Training SAFE: {safe_count} ({safe_pct:.1f}%)")
balance_status = "‚úÖ Balanced" if is_balanced else "‚ö†Ô∏è Imbalanced"
print(f"   - Status: {balance_status}")

print(f"\nüîç Data Quality:")
# Each entry: (label shown in the report, whether the training set passes).
quality_checks = [
    ("Schema validation", all(f in train_df.columns for f in required_fields)),
    ("No null values", train_df.isnull().sum().sum() == 0),
    ("No empty strings", all((train_df[f] != '').all() for f in required_fields)),
    ("No duplicates", train_df.duplicated(subset=['input']).sum() == 0)
]

for check_name, passed in quality_checks:
    status = "‚úÖ" if passed else "‚ùå"
    print(f"   {status} {check_name}")

all_checks_passed = all(passed for _, passed in quality_checks)

print(f"\nüìè Length Statistics:")
print(f"   - Avg input length: {train_df['input_length'].mean():.0f} chars")
print(f"   - Avg output length: {train_df['output_length'].mean():.0f} chars")
# ~4 characters per token is the rough conversion used for the estimate.
print(f"   - Max total length: {train_df['total_length'].max()} chars (~{train_df['total_length'].max() / 4:.0f} tokens)")

print(f"\nüéØ Secret Type Coverage:")
for secret_type, count in secret_type_counts.head(5).items():
    print(f"   - {secret_type}: {count} examples")

# Measured train share of the whole dataset, reported as "train/val".
train_pct = n_train / n_total * 100 if n_total else 0.0

print(f"\nüí° Recommendations:")
print(f"   ‚úÖ Dataset size ({n_total} examples) is good for fine-tuning")
print(f"   ‚úÖ {train_pct:.0f}/{100 - train_pct:.0f} train/val split is appropriate")
# Branch on the boolean, not on the display string built above.
if is_balanced:
    print(f"   ‚úÖ Class balance is excellent for binary classification")
print(f"   ‚úÖ Diverse secret types will improve model generalization")
# Only give the go-ahead when nothing above was flagged.
if is_balanced and all_checks_passed:
    print(f"   ‚úÖ Ready to proceed with fine-tuning!")
else:
    print("   ‚ö†Ô∏è Resolve the issues flagged above before fine-tuning")

print("\n" + "=" * 80)
print("Next Steps: Proceed to notebook 02_fine_tuning_interactive.ipynb")
print("=" * 80)