# Data Exploration Notebook

This notebook explores the collected Python functions dataset.

**Contents:**
1. Load and inspect raw data
2. Class distribution (buggy vs clean)
3. Code length analysis
4. Repository analysis
5. Sample functions
6. Processed features analysis (after preprocessing)
7. Data split verification

In [None]:
# Standard imports
import sys
sys.path.append('..')  # Add project root to path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
from pathlib import Path

# Plot appearance: named here so the look of every figure can be changed in one place
PLOT_STYLE = 'seaborn-v0_8-whitegrid'
COLOR_PALETTE = 'husl'

plt.style.use(PLOT_STYLE)
sns.set_palette(COLOR_PALETTE)

print('Libraries loaded successfully!')

## 1. Load Raw Data

In [None]:
# Read the collected functions CSV, or explain how to generate it when absent.
# NOTE: `df` is only defined when the file exists; later cells rely on it.
data_path = Path('../data/raw/functions.csv')

if not data_path.exists():
    print(f'Data file not found: {data_path}')
    print('Please run data collection first: python -m src.data_collection')
else:
    df = pd.read_csv(data_path)
    print(f'Loaded {len(df)} samples')
    print(f'\nColumns: {list(df.columns)}')

In [None]:
# Basic info: print pandas' concise summary of the raw frame
# (requires `df` from the loading cell above)
df.info()

In [None]:
# First few rows, with long code bodies shortened for display.
# Only cells that are actually longer than the preview width get the '...'
# marker, so short functions are not shown as if they had been cut off.
# Missing values compare as "not long" and are therefore left untouched.
PREVIEW_CHARS = 100
is_long = df['code'].str.len() > PREVIEW_CHARS

df_display = df.copy()
df_display['code'] = df['code'].where(~is_long, df['code'].str[:PREVIEW_CHARS] + '...')
df_display.head()

## 2. Class Distribution

In [None]:
# Class counts: how many samples carry each has_bug label.
# `class_counts` is reused by the plotting cell below.
class_counts = df['has_bug'].value_counts()
n_clean = int(class_counts.get(0, 0))
n_buggy = int(class_counts.get(1, 0))

print('Class Distribution:')
print(f'  Clean (has_bug=0): {n_clean}')
print(f'  Buggy (has_bug=1): {n_buggy}')

# A clean:buggy ratio is undefined without buggy samples; say so explicitly
# instead of dividing by a stand-in 1 and reporting a misleading "N:1".
if n_buggy > 0:
    print(f'\nClass Ratio: {n_clean / n_buggy:.2f}:1')
else:
    print('\nClass Ratio: n/a (no buggy samples)')

In [None]:
# Visualize class distribution: absolute counts (bar) and proportions (pie)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

labels = ['Clean', 'Buggy']
counts = [int(class_counts.get(0, 0)), int(class_counts.get(1, 0))]
colors = ['#2ecc71', '#e74c3c']

# Bar chart
ax1 = axes[0]
bars = ax1.bar(labels, counts, color=colors)
ax1.set_ylabel('Count')
ax1.set_title('Class Distribution')
# bar_label pads in points rather than data units, so the annotation sits just
# above each bar whatever the dataset size (a fixed "+ 50" offset floats far
# above small bars and overlaps very tall ones).
ax1.bar_label(bars, labels=[str(count) for count in counts], padding=3, fontsize=12)
ax1.margins(y=0.1)  # headroom so the top annotation is not clipped

# Pie chart
ax2 = axes[1]
if sum(counts) > 0:
    ax2.pie(counts, labels=labels, colors=colors, autopct='%1.1f%%', startangle=90)
else:
    # All-zero wedges cannot be normalised into proportions; show a placeholder.
    ax2.text(0.5, 0.5, 'No samples', ha='center', va='center', transform=ax2.transAxes)
    ax2.axis('off')
ax2.set_title('Class Proportion')

plt.tight_layout()
plt.show()

## 3. Code Length Analysis

In [None]:
# Summary statistics for function length (in lines)
print('Lines of Code Statistics:')
loc_summary = df['lines_of_code'].describe()
print(loc_summary)

In [None]:
# Distribution of function length, split by class
CLASS_STYLES = {0: ('Clean', '#2ecc71'), 1: ('Buggy', '#e74c3c')}  # has_bug -> (name, colour)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Histogram: one semi-transparent layer per class that actually has samples
ax1 = axes[0]
for label, (name, color) in CLASS_STYLES.items():
    subset = df.loc[df['has_bug'] == label, 'lines_of_code']
    if len(subset) > 0:
        ax1.hist(subset, bins=50, alpha=0.6, label=name, color=color)
ax1.set_xlabel('Lines of Code')
ax1.set_ylabel('Frequency')
ax1.set_title('Distribution of Function Length')
ax1.legend()

# Box plot
ax2 = axes[1]
df.boxplot(column='lines_of_code', by='has_bug', ax=ax2)
ax2.set_xlabel('Has Bug')
ax2.set_ylabel('Lines of Code')
ax2.set_title('Function Length by Class')
# boxplot draws one box per has_bug value present, in sorted order. Build the
# tick labels from those values so a dataset with a single class (e.g. no buggy
# functions collected yet) neither mismatches the tick count nor mislabels a box.
present_classes = sorted(df['has_bug'].dropna().unique())
ax2.set_xticklabels(
    [CLASS_STYLES[cls][0] if cls in CLASS_STYLES else str(cls) for cls in present_classes]
)
plt.suptitle('')  # Remove auto-generated title

plt.tight_layout()
plt.show()

## 4. Repository Analysis

In [None]:
# How many source repositories there are, and which contribute the most rows
repo_counts = df['repo'].value_counts()

print(f'Unique repositories: {df["repo"].nunique()}')
print('\nTop 10 repositories:')
repo_counts.head(10)

In [None]:
# Histogram of the `stars` column
fig, ax = plt.subplots(figsize=(10, 5))
df['stars'].hist(ax=ax, bins=50, color='steelblue', edgecolor='white')
ax.set(
    xlabel='Repository Stars',
    ylabel='Frequency',
    title='Distribution of Repository Stars',
)
plt.show()

## 5. Sample Functions

In [None]:
def display_function(code, title='Function'):
    """Print a function's source under a titled banner.

    Output is plain text on stdout: a blank line, a 60-character ``=`` rule,
    the title, another rule, the code itself, and a closing rule.

    Args:
        code: Source text to show (anything ``print`` can render).
        title: Heading placed between the first two rules.
    """
    rule = '=' * 60
    # The leading empty item combined with sep='\n' yields the blank line
    # that precedes the banner.
    print('', rule, f'{title}', rule, code, rule, sep='\n')

In [None]:
# Show up to three clean functions (fixed seed, so reruns pick the same rows)
clean_pool = df[df['has_bug'] == 0]
clean_samples = clean_pool.sample(min(3, len(clean_pool)), random_state=42)

print('SAMPLE CLEAN FUNCTIONS')
for position, (_, row) in enumerate(clean_samples.iterrows(), start=1):
    heading = f"Clean Function {position}: {row['function_name']} (from {row['repo']})"
    display_function(row['code'], heading)

In [None]:
# Show up to three buggy functions, or a hint when none have been collected
buggy_samples = df[df['has_bug'] == 1]

if len(buggy_samples) == 0:
    print('No buggy functions in dataset yet.')
    print('Run data collection to collect buggy functions from bug fix commits.')
else:
    # Fixed seed keeps the displayed examples stable between runs
    buggy_samples = buggy_samples.sample(min(3, len(buggy_samples)), random_state=42)
    print('SAMPLE BUGGY FUNCTIONS')
    for position, (_, row) in enumerate(buggy_samples.iterrows(), start=1):
        heading = f"Buggy Function {position}: {row['function_name']} (from {row['repo']})"
        display_function(row['code'], heading)

## 6. Processed Features Analysis

Run this section after preprocessing: `python -m src.preprocessing`

In [None]:
# Locate the preprocessing outputs and print the metadata file if present.
# `processed_dir` is reused by the feature and split cells below.
processed_dir = Path('../data/processed')
metadata_path = processed_dir / 'metadata.json'

if not metadata_path.exists():
    print('Processed data not found.')
    print('Run preprocessing first: python -m src.preprocessing')
else:
    with metadata_path.open() as f:
        metadata = json.load(f)
    print('Preprocessing Metadata:')
    print(json.dumps(metadata, indent=2))

In [None]:
# Load the feature matrix and its column names
features_path = processed_dir / 'features.npz'
feature_names_path = processed_dir / 'feature_names.json'

if features_path.exists() and feature_names_path.exists():
    # np.load on an .npz returns an archive object that keeps the file open;
    # the with-block closes it once the array has been read into memory.
    with np.load(features_path) as archive:
        features = archive['features']
    with open(feature_names_path) as f:
        feature_names = json.load(f)

    print(f'Feature matrix shape: {features.shape}')
    print(f'Number of features: {len(feature_names)}')
    print(f'\nFeature names: {feature_names}')
else:
    print('Feature files not found. Run preprocessing first.')

In [None]:
# Feature statistics
# Guard on both files, matching the loading cell: `features` and
# `feature_names` only exist when both were found.
if features_path.exists() and feature_names_path.exists():
    features_df = pd.DataFrame(features, columns=feature_names)
    print('Feature Statistics (first 10):')
    # A bare expression inside an `if` block is not rendered by Jupyter, so the
    # table must be shown explicitly. `display` is provided by the IPython kernel.
    display(features_df.iloc[:, :10].describe())

In [None]:
# Feature distributions
# Guard on both files, matching the loading cell: `features` and
# `feature_names` only exist when both were found.
if features_path.exists() and feature_names_path.exists():
    # Select first 9 features for visualization (3x3 grid)
    shown_names = feature_names[:9]
    fig, axes = plt.subplots(3, 3, figsize=(15, 12))
    axes = axes.flatten()

    for i, (ax, name) in enumerate(zip(axes, shown_names)):
        ax.hist(features[:, i], bins=50, edgecolor='white')
        ax.set_title(name)
        ax.set_xlabel('Value')
        ax.set_ylabel('Frequency')

    # With fewer than 9 features the remaining panels would render as empty
    # axes; hide them so the grid only shows real data.
    for ax in axes[len(shown_names):]:
        ax.set_visible(False)

    plt.suptitle('Feature Distributions', fontsize=14, y=1.02)
    plt.tight_layout()
    plt.show()

In [None]:
# Feature correlation heatmap
# Guard on both files, matching the loading cell: `features` and
# `feature_names` only exist when both were found.
if features_path.exists() and feature_names_path.exists():
    # Calculate correlation matrix (first 15 features)
    n_features = min(15, len(feature_names))
    # corrcoef collapses to a 0-d value for a single feature; atleast_2d keeps
    # a matrix shape so the heatmap can still be drawn.
    corr_matrix = np.atleast_2d(np.corrcoef(features[:, :n_features].T))

    plt.figure(figsize=(12, 10))
    sns.heatmap(
        corr_matrix,
        xticklabels=feature_names[:n_features],
        yticklabels=feature_names[:n_features],
        cmap='RdBu_r',
        center=0,
        annot=True,
        fmt='.2f',
        square=True
    )
    plt.title('Feature Correlation Matrix')
    plt.tight_layout()
    plt.show()

## 7. Data Split Verification

In [None]:
# Report the size and class balance of the train/val/test split
train_idx_path = processed_dir / 'train_indices.npy'
val_idx_path = processed_dir / 'val_indices.npy'
test_idx_path = processed_dir / 'test_indices.npy'
labels_path = processed_dir / 'labels.npy'

required_files = (train_idx_path, val_idx_path, test_idx_path, labels_path)

if not all(path.exists() for path in required_files):
    print('Split files not found. Run preprocessing first.')
else:
    train_idx, val_idx, test_idx, labels = (np.load(path) for path in required_files)
    total = len(labels)

    # Share of the full dataset that lands in each split
    print('Data Split:')
    for title, idx in [('Train', train_idx), ('Validation', val_idx), ('Test', test_idx)]:
        print(f'  {title}: {len(idx)} samples ({len(idx)/total*100:.1f}%)')

    # Summing the labels selected by each index array gives the buggy count;
    # the remainder of the split is reported as clean.
    print('\nClass balance in each split:')
    for name, idx in [('Train', train_idx), ('Val', val_idx), ('Test', test_idx)]:
        split_size = len(idx)
        buggy = labels[idx].sum()
        clean = split_size - buggy
        print(f'  {name}: {buggy} buggy ({buggy/split_size*100:.1f}%), {clean} clean ({clean/split_size*100:.1f}%)')

## Summary

This notebook explored:
- Raw data structure and content
- Class distribution (buggy vs clean)
- Code length patterns
- Repository sources
- Sample functions
- Extracted features (if preprocessing completed)
- Data split verification

**Next Steps:**
1. If the dataset contains no buggy functions yet: keep running data collection (`python -m src.data_collection`)
2. If data collected: Run preprocessing (`python -m src.preprocessing`)
3. After preprocessing: Train the model (`python -m src.train`)