# Batch Analysis of Multiple Codebases

This notebook demonstrates how to analyze multiple projects and compare their health metrics.

Topics covered:
1. Analyzing multiple projects
2. Comparing health scores
3. Identifying patterns across projects
4. Generating comparative reports
5. Tracking metrics over time

In [None]:
from pathlib import Path
from falkor.graph import Neo4jClient
from falkor.pipeline import IngestionPipeline
from falkor.detectors import AnalysisEngine
from falkor.config import load_config
import pandas as pd
import matplotlib.pyplot as plt
import json
from datetime import datetime

## 1. Define Projects to Analyze

List the repositories you want to analyze and compare.

In [None]:
# Repositories to analyze. Each entry carries a display name, the location of
# the repository on disk, and the glob patterns selecting files to ingest.
projects = [
    {"name": label, "path": location, "patterns": list(globs)}
    for label, location, globs in (
        ("Project A", "/path/to/project-a", ("**/*.py",)),
        ("Project B", "/path/to/project-b", ("**/*.py",)),
        ("Project C", "/path/to/project-c", ("**/*.py", "**/*.js")),
    )
]

print(f"Configured {len(projects)} projects for analysis")

## 2. Analyze Each Project

Run the complete analysis workflow for each project.

In [None]:
def analyze_project(project_config, config, clear_graph=False):
    """
    Ingest one project into Neo4j, analyze it and save a JSON health report.

    Args:
        project_config: Mapping with the keys "name" (display name),
            "path" (repository location) and "patterns" (glob patterns
            handed to the ingestion pipeline).
        config: Loaded configuration; ``config.neo4j.uri``, ``.user`` and
            ``.password`` are used to open the database connection.
        clear_graph: When True, ``db.clear_graph()`` is called before
            ingesting. WARNING: this clears the entire graph. Defaults to
            False, which leaves existing graph data untouched.
            NOTE(review): ``get_stats()`` and ``AnalysisEngine`` receive no
            project argument, so without clearing, the numbers for a project
            presumably include whatever is already in the graph - confirm.

    Returns:
        dict with the keys "name", "health" (the analysis result),
        "stats" (the value of ``db.get_stats()``) and "report_file"
        (path of the JSON report that was written).

    Raises:
        KeyError: If ``project_config`` lacks one of the required keys.
        Exception: Errors from ingestion, analysis or report writing
            propagate to the caller; the database connection is closed first.
    """
    name = project_config["name"]
    repo_path = project_config["path"]
    patterns = project_config["patterns"]

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Analyzing: {name}")
    print(banner)

    # Build a file-name-safe slug. Spaces become underscores as before; path
    # separators are replaced too so a name like "Team/API" cannot point the
    # report into a directory that does not exist.
    slug = name.lower()
    for unsafe in (" ", "/", "\\"):
        slug = slug.replace(unsafe, "_")
    output_file = f"reports/{slug}_report.json"

    # Connect to Neo4j
    db = Neo4jClient(
        uri=config.neo4j.uri,
        username=config.neo4j.user,
        password=config.neo4j.password
    )

    try:
        if clear_graph:
            # Opt-in only: this wipes ALL data in the graph, not just the
            # data belonging to this project.
            db.clear_graph()

        # Ingest
        print(f"\n1. Ingesting {name}...")
        pipeline = IngestionPipeline(
            repo_path=repo_path,
            neo4j_client=db,
            follow_symlinks=False,
            max_file_size_mb=10,
            batch_size=100
        )
        pipeline.ingest(patterns=patterns)

        stats = db.get_stats()
        print(f"   Files: {stats['total_files']}")
        print(f"   Classes: {stats['total_classes']}")
        print(f"   Functions: {stats['total_functions']}")

        # Analyze
        print("\n2. Running analysis...")
        engine = AnalysisEngine(db)
        health = engine.analyze()

        print(f"   Grade: {health.grade}")
        print(f"   Score: {health.overall_score:.1f}/100")
        print(f"   Findings: {health.findings_summary.total}")

        # Build the complete payload BEFORE opening the file, so a failure in
        # to_dict() cannot leave an empty or truncated report behind.
        report_data = health.to_dict()
        report_data["project_name"] = name
        # str() keeps the payload JSON-serializable when the path is a Path.
        report_data["project_path"] = str(repo_path)

        # Save report
        Path("reports").mkdir(exist_ok=True)
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(report_data, f, indent=2)

        print(f"   Report saved: {output_file}")

        return {
            "name": name,
            "health": health,
            "stats": stats,
            "report_file": output_file
        }

    finally:
        # Always release the connection, including on failure.
        db.close()

In [None]:
# Load configuration
config = load_config()

# Analyze all projects. A failure in one project must not abort the batch, so
# each error is reported, the project is remembered as failed, and the loop
# continues with the next entry.
results = []
failed_projects = []
for project in projects:
    try:
        result = analyze_project(project, config)
    except Exception as e:
        # .get() keeps the handler itself from raising when the entry has no
        # "name" key (which may be the very reason analysis failed).
        project_name = project.get("name", "<unnamed>")
        failed_projects.append(project_name)
        # The exception type is included because str(e) alone can be empty
        # or ambiguous (e.g. a bare KeyError only prints the missing key).
        print(f"\n❌ Error analyzing {project_name}: {type(e).__name__}: {e}")
    else:
        results.append(result)

print(f"\n{'='*60}")
print(f"✓ Analysis complete for {len(results)}/{len(projects)} projects")
if failed_projects:
    print(f"  Failed: {', '.join(failed_projects)}")
print(f"{'='*60}")

## 3. Compare Health Scores

Create a comparative view of all projects.

In [None]:
# Column order of the comparison table. It is passed to the DataFrame
# explicitly so the columns exist even when `results` is empty; otherwise the
# frame would have no columns and later cells would fail with a KeyError.
comparison_columns = [
    "Project", "Grade", "Overall Score",
    "Structure", "Quality", "Architecture",
    "Files", "Classes", "Functions",
    "Critical", "High", "Medium",
]

# Build comparison dataframe: one flat record per analyzed project.
comparison_data = []
for result in results:
    health = result["health"]
    stats = result["stats"]

    comparison_data.append({
        "Project": result["name"],
        "Grade": health.grade,
        "Overall Score": round(health.overall_score, 1),
        "Structure": round(health.structure_score, 1),
        "Quality": round(health.quality_score, 1),
        "Architecture": round(health.architecture_score, 1),
        "Files": stats["total_files"],
        "Classes": stats["total_classes"],
        "Functions": stats["total_functions"],
        "Critical": health.findings_summary.critical,
        "High": health.findings_summary.high,
        "Medium": health.findings_summary.medium
    })

df = pd.DataFrame(comparison_data, columns=comparison_columns)
print("\n📊 Project Comparison:")
print(df.to_string(index=False))

# Save comparison. The directory is created here as well, so this cell does
# not depend on a successful analysis run having created it.
Path("reports").mkdir(exist_ok=True)
df.to_csv("reports/comparison.csv", index=False)
print("\n✓ Saved to reports/comparison.csv")

## 4. Visualize Comparisons

In [None]:
# Score comparison chart: a 2x2 dashboard with overall score, per-category
# scores, codebase size and findings by severity.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_overall, ax_categories), (ax_size, ax_findings) = axes

# Overall scores
ax_overall.barh(df["Project"], df["Overall Score"], color='skyblue')
ax_overall.set_xlabel('Score')
ax_overall.set_title('Overall Health Score')
ax_overall.set_xlim([0, 100])

# Category scores: three bars per project, centered on the project's tick.
x = range(len(df))
width = 0.25
category_styles = [
    ("Structure", 'lightgreen'),
    ("Quality", 'lightcoral'),
    ("Architecture", 'lightyellow'),
]
for slot, (column, color) in enumerate(category_styles):
    # slot 0/1/2 -> shift of -width / 0 / +width around the tick position.
    shift = (slot - 1) * width
    ax_categories.bar([i + shift for i in x], df[column], width, label=column, color=color)
ax_categories.set_ylabel('Score')
ax_categories.set_title('Category Scores')
ax_categories.set_xticks(x)
ax_categories.set_xticklabels(df["Project"], rotation=45, ha='right')
ax_categories.legend()
ax_categories.set_ylim([0, 100])

# Codebase size
ax_size.bar(df["Project"], df["Files"], color='mediumpurple')
ax_size.set_ylabel('Count')
ax_size.set_title('Number of Files')
ax_size.tick_params(axis='x', rotation=45)

# Findings: stacked bars, each severity drawn on top of the previous ones.
severity_styles = [("Critical", 'red'), ("High", 'orange'), ("Medium", 'yellow')]
stack_base = pd.Series(0, index=df.index)
for column, color in severity_styles:
    ax_findings.bar(df["Project"], df[column], bottom=stack_base, label=column, color=color)
    stack_base = stack_base + df[column]
ax_findings.set_ylabel('Count')
ax_findings.set_title('Findings by Severity')
ax_findings.tick_params(axis='x', rotation=45)
ax_findings.legend()

plt.tight_layout()
# Create the output directory here too, so saving does not rely on an
# earlier cell having created it. Save before show().
Path("reports").mkdir(exist_ok=True)
plt.savefig('reports/comparison_charts.png', dpi=150, bbox_inches='tight')
plt.show()

print("\n✓ Charts saved to reports/comparison_charts.png")

## 5. Identify Best and Worst Projects

In [None]:
# Ranking needs at least one analyzed project. Fail with a clear message
# rather than an opaque pandas error raised from inside idxmax()/idxmin().
if df.empty:
    raise ValueError("No projects were analyzed successfully; nothing to compare.")

# Best project (on a tie, idxmax returns the first row with the top score)
best_project = df.loc[df["Overall Score"].idxmax()]
print("\n🏆 Best Project:")
print(f"   {best_project['Project']} - Grade {best_project['Grade']} ({best_project['Overall Score']}/100)")

# Worst project (identical to the best one when only one project was analyzed)
worst_project = df.loc[df["Overall Score"].idxmin()]
print("\n⚠️  Needs Attention:")
print(f"   {worst_project['Project']} - Grade {worst_project['Grade']} ({worst_project['Overall Score']}/100)")

# Project with most issues. "Total Issues" counts only the three severities
# present in the table (critical, high, medium).
df["Total Issues"] = df["Critical"] + df["High"] + df["Medium"]
most_issues = df.loc[df["Total Issues"].idxmax()]
print("\n🔧 Most Issues:")
print(f"   {most_issues['Project']} - {most_issues['Total Issues']} issues ({most_issues['Critical']} critical)")

# Averages
print("\n📈 Average Metrics:")
print(f"   Overall Score: {df['Overall Score'].mean():.1f}")
print(f"   Files per project: {df['Files'].mean():.0f}")
print(f"   Issues per project: {df['Total Issues'].mean():.0f}")

## 6. Generate Executive Summary Report

In [None]:
# Create executive summary.
# Values taken from DataFrame rows or aggregations can be NumPy scalars, and
# json cannot serialize NumPy integers, so every numeric value is converted
# to a native int/float before it goes into the summary.
summary = {
    "analysis_date": datetime.now().isoformat(),
    "projects_analyzed": len(results),
    "summary": {
        "average_score": round(float(df["Overall Score"].mean()), 1),
        "grade_distribution": {
            grade: int(count) for grade, count in df["Grade"].value_counts().items()
        },
        "total_files": int(df["Files"].sum()),
        "total_classes": int(df["Classes"].sum()),
        "total_functions": int(df["Functions"].sum()),
        "total_critical_issues": int(df["Critical"].sum()),
        "total_high_issues": int(df["High"].sum())
    },
    "best_project": {
        "name": best_project["Project"],
        "grade": best_project["Grade"],
        "score": float(best_project["Overall Score"])
    },
    "needs_attention": {
        "name": worst_project["Project"],
        "grade": worst_project["Grade"],
        "score": float(worst_project["Overall Score"]),
        "critical_issues": int(worst_project["Critical"])
    },
    "projects": comparison_data
}

# Save summary. The directory is created here as well, so this cell does not
# depend on an earlier cell having created it.
Path("reports").mkdir(exist_ok=True)
with open("reports/executive_summary.json", "w", encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

print("\n✓ Executive summary saved to reports/executive_summary.json")
print("\n" + "="*60)
print("EXECUTIVE SUMMARY")
print("="*60)
print(json.dumps(summary, indent=2))

## 7. Track Metrics Over Time (Optional)

Save results with timestamps to track improvements over time.

In [None]:
# Append to historical data (JSON Lines: one record per project per run)
history_file = "reports/history.jsonl"

# One timestamp for the whole run, so all records written together share the
# same value and can be grouped by run later. Calling now() per record would
# give every row a slightly different timestamp.
run_timestamp = datetime.now().isoformat()

# Create the directory here too, so appending does not rely on an earlier
# cell having created it.
Path("reports").mkdir(exist_ok=True)
with open(history_file, "a", encoding="utf-8") as f:
    for project_data in comparison_data:
        record = {
            "timestamp": run_timestamp,
            **project_data
        }
        f.write(json.dumps(record) + "\n")

print(f"✓ Historical data appended to {history_file}")
print("\nRun this analysis regularly to track progress over time!")

## Summary

This notebook demonstrated how to:

1. **Batch Analysis**: Analyze multiple projects efficiently
2. **Comparison**: Compare health scores and metrics
3. **Patterns**: Identify common issues across projects
4. **Reporting**: Generate executive summaries
5. **Tracking**: Monitor improvements over time

## Use Cases

- **Portfolio Management**: Track health of all company projects
- **Team Comparisons**: Compare team code quality
- **Migration Planning**: Identify which projects need refactoring first
- **Progress Tracking**: Monitor improvements after refactoring initiatives
- **Best Practices**: Learn from highest-quality projects

## Tips for Production Use

1. Run analysis on a schedule (daily/weekly)
2. Store results in a time-series database
3. Set up alerts for critical issues
4. Create dashboards for stakeholders
5. Integrate with CI/CD pipelines

## Next Steps

- Automate with a cron job or CI/CD pipeline
- Create custom dashboards with your favorite BI tool
- Set quality gates based on scores
- Share reports with stakeholders